mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 23:37:58 +10:00
switch to external plugins
This commit is contained in:
@@ -30,7 +30,8 @@
|
||||
"WebFetch(domain:python-statemachine.readthedocs.io)",
|
||||
"Bash(./bin/run_plugin_tests.sh:*)",
|
||||
"Bash(done)",
|
||||
"Bash(coverage erase:*)"
|
||||
"Bash(coverage erase:*)",
|
||||
"Bash(gh api:*)"
|
||||
]
|
||||
},
|
||||
"hooks": {
|
||||
|
||||
@@ -491,6 +491,7 @@ docker run -it -v $PWD:/data archivebox/archivebox help
|
||||
# optional: import your browser cookies into a persona for logged-in archiving
|
||||
archivebox persona create --import=chrome personal
|
||||
# supported: chrome/chromium/brave/edge (Chromium-based only)
|
||||
# use --profile to target a specific profile (e.g. Default, Profile 1)
|
||||
# re-running import merges/dedupes cookies.txt (by domain/path/name) but replaces chrome_user_data
|
||||
```
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ from pathlib import Path
|
||||
# Import uuid_compat early to monkey-patch uuid.uuid7 before Django loads migrations
|
||||
# This fixes migrations generated on Python 3.14+ that reference uuid.uuid7 directly
|
||||
from archivebox import uuid_compat # noqa: F401
|
||||
from abx_plugins import get_plugins_dir
|
||||
|
||||
# Force unbuffered output for real-time logs
|
||||
if hasattr(sys.stdout, 'reconfigure'):
|
||||
@@ -56,9 +57,13 @@ check_io_encoding()
|
||||
# Install monkey patches for third-party libraries
|
||||
from .misc.monkey_patches import * # noqa
|
||||
|
||||
# Built-in plugin directories
|
||||
BUILTIN_PLUGINS_DIR = PACKAGE_DIR / 'plugins'
|
||||
USER_PLUGINS_DIR = Path(os.getcwd()) / 'plugins'
|
||||
# Plugin directories
|
||||
BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve()
|
||||
USER_PLUGINS_DIR = Path(
|
||||
os.environ.get('ARCHIVEBOX_USER_PLUGINS_DIR')
|
||||
or os.environ.get('USER_PLUGINS_DIR')
|
||||
or os.environ.get('DATA_DIR', os.getcwd())
|
||||
) / 'custom_plugins'
|
||||
|
||||
# These are kept for backwards compatibility with existing code
|
||||
# that checks for plugins. The new hook system uses discover_hooks()
|
||||
|
||||
@@ -33,6 +33,7 @@ import shutil
|
||||
import platform
|
||||
import subprocess
|
||||
import tempfile
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Optional, Iterable
|
||||
from collections import OrderedDict
|
||||
@@ -138,6 +139,55 @@ def get_edge_user_data_dir() -> Optional[Path]:
|
||||
return None
|
||||
|
||||
|
||||
def get_browser_binary(browser: str) -> Optional[str]:
|
||||
system = platform.system()
|
||||
home = Path.home()
|
||||
browser = browser.lower()
|
||||
|
||||
if system == 'Darwin':
|
||||
candidates = {
|
||||
'chrome': ['/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'],
|
||||
'chromium': ['/Applications/Chromium.app/Contents/MacOS/Chromium'],
|
||||
'brave': ['/Applications/Brave Browser.app/Contents/MacOS/Brave Browser'],
|
||||
'edge': ['/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'],
|
||||
}.get(browser, [])
|
||||
elif system == 'Linux':
|
||||
candidates = {
|
||||
'chrome': ['/usr/bin/google-chrome', '/usr/bin/google-chrome-stable', '/usr/bin/google-chrome-beta', '/usr/bin/google-chrome-unstable'],
|
||||
'chromium': ['/usr/bin/chromium', '/usr/bin/chromium-browser'],
|
||||
'brave': ['/usr/bin/brave-browser', '/usr/bin/brave-browser-beta', '/usr/bin/brave-browser-nightly'],
|
||||
'edge': ['/usr/bin/microsoft-edge', '/usr/bin/microsoft-edge-stable', '/usr/bin/microsoft-edge-beta', '/usr/bin/microsoft-edge-dev'],
|
||||
}.get(browser, [])
|
||||
elif system == 'Windows':
|
||||
local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
|
||||
candidates = {
|
||||
'chrome': [
|
||||
str(local_app_data / 'Google' / 'Chrome' / 'Application' / 'chrome.exe'),
|
||||
'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
|
||||
'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
|
||||
],
|
||||
'chromium': [str(local_app_data / 'Chromium' / 'Application' / 'chrome.exe')],
|
||||
'brave': [
|
||||
str(local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'Application' / 'brave.exe'),
|
||||
'C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe',
|
||||
'C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe',
|
||||
],
|
||||
'edge': [
|
||||
str(local_app_data / 'Microsoft' / 'Edge' / 'Application' / 'msedge.exe'),
|
||||
'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe',
|
||||
'C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe',
|
||||
],
|
||||
}.get(browser, [])
|
||||
else:
|
||||
candidates = []
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate and Path(candidate).exists():
|
||||
return candidate
|
||||
|
||||
return None
|
||||
|
||||
|
||||
BROWSER_PROFILE_FINDERS = {
|
||||
'chrome': get_chrome_user_data_dir,
|
||||
'chromium': get_chrome_user_data_dir, # Same locations
|
||||
@@ -194,7 +244,12 @@ def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
|
||||
_write_netscape_cookies(existing_file, existing)
|
||||
|
||||
|
||||
def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
|
||||
def extract_cookies_via_cdp(
|
||||
user_data_dir: Path,
|
||||
output_file: Path,
|
||||
profile_dir: str | None = None,
|
||||
chrome_binary: str | None = None,
|
||||
) -> bool:
|
||||
"""
|
||||
Launch Chrome with the given user data dir and extract cookies via CDP.
|
||||
|
||||
@@ -218,6 +273,8 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
|
||||
env['NODE_MODULES_DIR'] = str(node_modules_dir)
|
||||
env['CHROME_USER_DATA_DIR'] = str(user_data_dir)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
if chrome_binary:
|
||||
env['CHROME_BINARY'] = str(chrome_binary)
|
||||
output_path = output_file
|
||||
temp_output = None
|
||||
temp_dir = None
|
||||
@@ -225,6 +282,23 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
|
||||
temp_dir = Path(tempfile.mkdtemp(prefix='ab_cookies_'))
|
||||
temp_output = temp_dir / 'cookies.txt'
|
||||
output_path = temp_output
|
||||
if profile_dir:
|
||||
extra_arg = f'--profile-directory={profile_dir}'
|
||||
existing_extra = env.get('CHROME_ARGS_EXTRA', '').strip()
|
||||
args_list = []
|
||||
if existing_extra:
|
||||
if existing_extra.startswith('['):
|
||||
try:
|
||||
parsed = json.loads(existing_extra)
|
||||
if isinstance(parsed, list):
|
||||
args_list.extend(str(x) for x in parsed)
|
||||
except Exception:
|
||||
args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()])
|
||||
else:
|
||||
args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()])
|
||||
args_list.append(extra_arg)
|
||||
env['CHROME_ARGS_EXTRA'] = json.dumps(args_list)
|
||||
|
||||
env['COOKIES_OUTPUT_FILE'] = str(output_path)
|
||||
|
||||
try:
|
||||
@@ -322,6 +396,7 @@ def ensure_path_within_personas_dir(persona_path: Path) -> bool:
|
||||
def create_personas(
|
||||
names: Iterable[str],
|
||||
import_from: Optional[str] = None,
|
||||
profile: Optional[str] = None,
|
||||
) -> int:
|
||||
"""
|
||||
Create Personas from names.
|
||||
@@ -360,6 +435,15 @@ def create_personas(
|
||||
|
||||
rprint(f'[dim]Found {import_from} profile: {source_profile_dir}[/dim]', file=sys.stderr)
|
||||
|
||||
if profile is None and (source_profile_dir / 'Default').exists():
|
||||
profile = 'Default'
|
||||
|
||||
browser_binary = get_browser_binary(import_from)
|
||||
if browser_binary:
|
||||
rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr)
|
||||
else:
|
||||
browser_binary = None
|
||||
|
||||
created_count = 0
|
||||
for name in name_list:
|
||||
name = name.strip()
|
||||
@@ -414,7 +498,12 @@ def create_personas(
|
||||
# Extract cookies via CDP
|
||||
rprint(f'[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
|
||||
|
||||
if extract_cookies_via_cdp(persona_chrome_dir, cookies_file):
|
||||
if extract_cookies_via_cdp(
|
||||
persona_chrome_dir,
|
||||
cookies_file,
|
||||
profile_dir=profile,
|
||||
chrome_binary=browser_binary,
|
||||
):
|
||||
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
|
||||
else:
|
||||
rprint(f'[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
|
||||
@@ -652,9 +741,10 @@ def main():
|
||||
@main.command('create')
|
||||
@click.argument('names', nargs=-1)
|
||||
@click.option('--import', 'import_from', help='Import profile from browser (chrome, chromium, brave, edge)')
|
||||
def create_cmd(names: tuple, import_from: Optional[str]):
|
||||
@click.option('--profile', help='Profile directory name under the user data dir (e.g. Default, Profile 1)')
|
||||
def create_cmd(names: tuple, import_from: Optional[str], profile: Optional[str]):
|
||||
"""Create Personas, optionally importing from a browser profile."""
|
||||
sys.exit(create_personas(names, import_from=import_from))
|
||||
sys.exit(create_personas(names, import_from=import_from, profile=profile))
|
||||
|
||||
|
||||
@main.command('list')
|
||||
|
||||
@@ -277,7 +277,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
# Show a helpful message when no plugins found
|
||||
rows['Name'].append('(no plugins found)')
|
||||
rows['Source'].append('-')
|
||||
rows['Path'].append(mark_safe('<code>archivebox/plugins/</code> or <code>data/plugins/</code>'))
|
||||
rows['Path'].append(mark_safe('<code>abx_plugins/plugins/</code> or <code>data/custom_plugins/</code>'))
|
||||
rows['Hooks'].append('-')
|
||||
rows['Config'].append('-')
|
||||
|
||||
|
||||
@@ -140,6 +140,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', TagNameListFilter)
|
||||
|
||||
fieldsets = (
|
||||
('Actions', {
|
||||
'fields': ('admin_actions',),
|
||||
'classes': ('card', 'wide', 'actions-card'),
|
||||
}),
|
||||
('URL', {
|
||||
'fields': ('url', 'title'),
|
||||
'classes': ('card', 'wide'),
|
||||
@@ -168,10 +172,6 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
'fields': ('output_dir',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Actions', {
|
||||
'fields': ('admin_actions',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Archive Results', {
|
||||
'fields': ('archiveresults_list',),
|
||||
'classes': ('card', 'wide'),
|
||||
@@ -179,7 +179,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
)
|
||||
|
||||
ordering = ['-created_at']
|
||||
actions = ['add_tags', 'remove_tags', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
|
||||
actions = ['add_tags', 'remove_tags', 'resnapshot_snapshot', 'update_snapshots', 'overwrite_snapshots', 'delete_snapshots']
|
||||
inlines = [] # Removed TagInline, using TagEditorWidget instead
|
||||
list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)
|
||||
|
||||
@@ -301,6 +301,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
# obj.pk,
|
||||
# )
|
||||
|
||||
@admin.display(description='')
|
||||
def admin_actions(self, obj):
|
||||
summary_url = build_web_url(f'/{obj.archive_path}')
|
||||
results_url = build_web_url(f'/{obj.archive_path}/index.html#all')
|
||||
@@ -311,13 +312,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
href="{}"
|
||||
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||
📄 Summary Page
|
||||
📄 View Snapshot
|
||||
</a>
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="{}"
|
||||
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||
📁 Result Files
|
||||
📁 All files
|
||||
</a>
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="{}"
|
||||
@@ -329,19 +330,19 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
|
||||
<span style="border-left: 1px solid #e2e8f0; height: 24px; margin: 0 4px;"></span>
|
||||
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/admin/core/snapshot/?id__exact={}"
|
||||
title="Get missing extractors"
|
||||
onmouseover="this.style.background='#d1fae5';"
|
||||
onmouseout="this.style.background='#ecfdf5';">
|
||||
⬇️ Finish
|
||||
</a>
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #eff6ff; border: 1px solid #bfdbfe; border-radius: 8px; color: #1e40af; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/admin/core/snapshot/?id__exact={}"
|
||||
title="Create a fresh new snapshot of this URL"
|
||||
onmouseover="this.style.background='#dbeafe';"
|
||||
onmouseout="this.style.background='#eff6ff';">
|
||||
🆕 Archive Again
|
||||
🆕 Archive Now
|
||||
</a>
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/admin/core/snapshot/?id__exact={}"
|
||||
title="Redo failed extractors (missing outputs)"
|
||||
onmouseover="this.style.background='#d1fae5';"
|
||||
onmouseout="this.style.background='#ecfdf5';">
|
||||
🔁 Redo Failed
|
||||
</a>
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fffbeb; border: 1px solid #fde68a; border-radius: 8px; color: #92400e; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/admin/core/snapshot/?id__exact={}"
|
||||
@@ -707,7 +708,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
# return super().changelist_view(request, extra_context=None)
|
||||
|
||||
@admin.action(
|
||||
description="⏯️ Finish"
|
||||
description="🔁 Redo Failed"
|
||||
)
|
||||
def update_snapshots(self, request, queryset):
|
||||
count = queryset.count()
|
||||
@@ -721,7 +722,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
|
||||
|
||||
@admin.action(
|
||||
description="⬇️ Fresh"
|
||||
description="🆕 Archive Now"
|
||||
)
|
||||
def resnapshot_snapshot(self, request, queryset):
|
||||
for snapshot in queryset:
|
||||
|
||||
@@ -1704,8 +1704,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
Create ArchiveResult records for all enabled hooks.
|
||||
|
||||
Uses the hooks system to discover available hooks from:
|
||||
- archivebox/plugins/*/on_Snapshot__*.{py,sh,js}
|
||||
- data/plugins/*/on_Snapshot__*.{py,sh,js}
|
||||
- abx_plugins/plugins/*/on_Snapshot__*.{py,sh,js}
|
||||
- data/custom_plugins/*/on_Snapshot__*.{py,sh,js}
|
||||
|
||||
Creates one ArchiveResult per hook (not per plugin), with hook_name set.
|
||||
This enables step-based execution where all hooks in a step can run in parallel.
|
||||
@@ -2486,7 +2486,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
@property
|
||||
def plugin_module(self) -> Any | None:
|
||||
# Hook scripts are now used instead of Python plugin modules
|
||||
# The plugin name maps to hooks in archivebox/plugins/{plugin}/
|
||||
# The plugin name maps to hooks in abx_plugins/plugins/{plugin}/
|
||||
return None
|
||||
|
||||
def output_exists(self) -> bool:
|
||||
|
||||
@@ -349,15 +349,6 @@ def plugin_name(value: str) -> str:
|
||||
return get_plugin_name(value)
|
||||
|
||||
|
||||
@register.filter
|
||||
def plugin_display_name(value: str) -> str:
|
||||
"""
|
||||
Human-friendly plugin name overrides for UI display.
|
||||
"""
|
||||
name = get_plugin_name(value)
|
||||
if name == 'merkletree':
|
||||
return 'hashes'
|
||||
return name
|
||||
|
||||
|
||||
@register.simple_tag(takes_context=True)
|
||||
|
||||
@@ -1145,13 +1145,31 @@ def live_progress_view(request):
|
||||
for proc in running_workers:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
continue
|
||||
env = {}
|
||||
|
||||
cmd = proc.cmd or []
|
||||
if proc.worker_type == 'crawl':
|
||||
crawl_id = env.get('CRAWL_ID')
|
||||
if not crawl_id:
|
||||
for i, part in enumerate(cmd):
|
||||
if part == '--crawl-id' and i + 1 < len(cmd):
|
||||
crawl_id = cmd[i + 1]
|
||||
break
|
||||
if part.startswith('--crawl-id='):
|
||||
crawl_id = part.split('=', 1)[1]
|
||||
break
|
||||
if crawl_id:
|
||||
crawl_worker_pids[str(crawl_id)] = proc.pid
|
||||
elif proc.worker_type == 'snapshot':
|
||||
snapshot_id = env.get('SNAPSHOT_ID')
|
||||
if not snapshot_id:
|
||||
for i, part in enumerate(cmd):
|
||||
if part == '--snapshot-id' and i + 1 < len(cmd):
|
||||
snapshot_id = cmd[i + 1]
|
||||
break
|
||||
if part.startswith('--snapshot-id='):
|
||||
snapshot_id = part.split('=', 1)[1]
|
||||
break
|
||||
if snapshot_id:
|
||||
snapshot_worker_pids[str(snapshot_id)] = proc.pid
|
||||
|
||||
@@ -1243,7 +1261,7 @@ def live_progress_view(request):
|
||||
'plugin': ar.plugin,
|
||||
'status': status,
|
||||
}
|
||||
if ar.process_id and ar.process and ar.process.status == Process.StatusChoices.RUNNING:
|
||||
if status == ArchiveResult.StatusChoices.STARTED and ar.process_id and ar.process:
|
||||
plugin_payload['pid'] = ar.process.pid
|
||||
if status == ArchiveResult.StatusChoices.STARTED:
|
||||
plugin_payload['progress'] = progress_value
|
||||
|
||||
@@ -6,8 +6,8 @@ with ArchiveBox via CLI arguments and stdout JSON output. This keeps the plugin
|
||||
system simple and language-agnostic.
|
||||
|
||||
Directory structure:
|
||||
archivebox/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (built-in)
|
||||
data/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (user)
|
||||
abx_plugins/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (built-in package)
|
||||
data/custom_plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (user)
|
||||
|
||||
Hook contract:
|
||||
Input: --url=<url> (and other --key=value args)
|
||||
@@ -66,14 +66,20 @@ from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional, TypedDict
|
||||
|
||||
from abx_plugins import get_plugins_dir
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from django.utils.safestring import mark_safe
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
|
||||
# Plugin directories
|
||||
BUILTIN_PLUGINS_DIR = Path(__file__).parent / 'plugins'
|
||||
USER_PLUGINS_DIR = Path(getattr(settings, 'DATA_DIR', Path.cwd())) / 'plugins'
|
||||
BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve()
|
||||
USER_PLUGINS_DIR = Path(
|
||||
os.environ.get('ARCHIVEBOX_USER_PLUGINS_DIR')
|
||||
or getattr(settings, 'USER_PLUGINS_DIR', '')
|
||||
or str(CONSTANTS.USER_PLUGINS_DIR)
|
||||
).expanduser()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -197,11 +203,11 @@ def discover_hooks(
|
||||
|
||||
for hook in hooks:
|
||||
# Get plugin name from parent directory
|
||||
# e.g., archivebox/plugins/wget/on_Snapshot__50_wget.py -> 'wget'
|
||||
# e.g., abx_plugins/plugins/wget/on_Snapshot__50_wget.py -> 'wget'
|
||||
plugin_name = hook.parent.name
|
||||
|
||||
# Check if this is a plugin directory (not the root plugins dir)
|
||||
if plugin_name in ('plugins', '.'):
|
||||
if hook.parent.resolve() in (BUILTIN_PLUGINS_DIR.resolve(), USER_PLUGINS_DIR.resolve()):
|
||||
# Hook is in root plugins directory, not a plugin subdir
|
||||
# Include it by default (no filtering for non-plugin hooks)
|
||||
enabled_hooks.append(hook)
|
||||
@@ -581,7 +587,7 @@ def get_plugins() -> List[str]:
|
||||
The plugin name is the plugin directory name, not the hook script name.
|
||||
|
||||
Example:
|
||||
archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js
|
||||
abx_plugins/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js
|
||||
-> plugin = 'chrome'
|
||||
|
||||
Sorted alphabetically (plugins control their hook order via numeric prefixes in hook names).
|
||||
@@ -728,7 +734,7 @@ def discover_plugins_that_provide_interface(
|
||||
try:
|
||||
# Import the module dynamically
|
||||
spec = importlib.util.spec_from_file_location(
|
||||
f'archivebox.plugins.{plugin_name}.{module_name}',
|
||||
f'archivebox.dynamic_plugins.{plugin_name}.{module_name}',
|
||||
module_path
|
||||
)
|
||||
if spec is None or spec.loader is None:
|
||||
@@ -942,7 +948,7 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
|
||||
# Plugins can provide custom templates for rendering their output in the UI.
|
||||
# Templates are discovered by filename convention inside each plugin's templates/ dir:
|
||||
#
|
||||
# archivebox/plugins/<plugin_name>/
|
||||
# abx_plugins/plugins/<plugin_name>/
|
||||
# templates/
|
||||
# icon.html # Icon for admin table view (small inline HTML)
|
||||
# card.html # Preview card for snapshot header
|
||||
|
||||
318
archivebox/ideas/process_plugin.py
Normal file
318
archivebox/ideas/process_plugin.py
Normal file
@@ -0,0 +1,318 @@
|
||||
__package__ = 'archivebox.ideas'
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import shlex
|
||||
import signal
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Mapping, MutableMapping, Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
try:
|
||||
from bubus import BaseEvent, EventBus
|
||||
except Exception as exc: # pragma: no cover - optional dependency
|
||||
raise ImportError('ProcessPlugin requires bubus to be installed') from exc
|
||||
|
||||
try:
|
||||
from bubus.service import uuid7str
|
||||
except Exception: # pragma: no cover - optional dependency
|
||||
from uuid import uuid4 as _uuid4
|
||||
|
||||
def uuid7str() -> str:
|
||||
return str(_uuid4())
|
||||
|
||||
|
||||
def _utcnow() -> datetime:
|
||||
return datetime.now(timezone.utc)
|
||||
|
||||
|
||||
class ProcessRecord(BaseModel):
|
||||
id: str = Field(default_factory=uuid7str)
|
||||
cmd: list[str]
|
||||
cwd: str | None = None
|
||||
env: dict[str, str] = Field(default_factory=dict)
|
||||
pid: int | None = None
|
||||
started_at: datetime | None = None
|
||||
ended_at: datetime | None = None
|
||||
exit_code: int | None = None
|
||||
stdout_path: str | None = None
|
||||
stderr_path: str | None = None
|
||||
cmd_path: str | None = None
|
||||
pid_path: str | None = None
|
||||
is_background: bool = False
|
||||
parent_process_id: str | None = None
|
||||
|
||||
|
||||
class ProcessLaunch(BaseEvent[ProcessRecord]):
|
||||
cmd: list[str]
|
||||
cwd: str | None = None
|
||||
env: dict[str, str] | None = None
|
||||
timeout: float | None = None
|
||||
output_dir: str | None = None
|
||||
log_prefix: str | None = None
|
||||
is_background: bool = False
|
||||
parent_process_id: str | None = None
|
||||
parse_stdout_events: bool = True
|
||||
|
||||
|
||||
class ProcessStarted(BaseEvent[None]):
|
||||
process: ProcessRecord
|
||||
|
||||
|
||||
class ProcessExited(BaseEvent[None]):
|
||||
process: ProcessRecord
|
||||
|
||||
|
||||
class ProcessKill(BaseEvent[ProcessRecord]):
|
||||
process_id: str
|
||||
signal: int = signal.SIGTERM
|
||||
timeout: float | None = 10.0
|
||||
|
||||
|
||||
@dataclass
|
||||
class _RunningProcess:
|
||||
process: asyncio.subprocess.Process
|
||||
record: ProcessRecord
|
||||
stdout_task: asyncio.Task[None] | None
|
||||
stderr_task: asyncio.Task[None] | None
|
||||
watcher_task: asyncio.Task[None] | None
|
||||
parent_event_id: str | None
|
||||
|
||||
|
||||
JsonEventAdapter = Callable[[dict[str, Any], str | None], Optional[BaseEvent[Any]]]
|
||||
|
||||
|
||||
class ProcessPlugin:
|
||||
"""Spawn and monitor processes using events (no Django required)."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
bus: EventBus,
|
||||
*,
|
||||
env: Mapping[str, str] | None = None,
|
||||
json_event_adapter: JsonEventAdapter | None = None,
|
||||
) -> None:
|
||||
self.bus = bus
|
||||
self.env = dict(env or os.environ)
|
||||
self.json_event_adapter = json_event_adapter
|
||||
self._running: MutableMapping[str, _RunningProcess] = {}
|
||||
|
||||
def register_event_handlers(self) -> None:
|
||||
self.bus.on(ProcessLaunch, self.on_ProcessLaunch)
|
||||
self.bus.on(ProcessKill, self.on_ProcessKill)
|
||||
|
||||
async def on_ProcessLaunch(self, event: ProcessLaunch) -> ProcessRecord:
|
||||
parent_event_id = event.event_id
|
||||
proc_id = uuid7str()
|
||||
cwd = event.cwd or event.output_dir or os.getcwd()
|
||||
output_dir = Path(event.output_dir or cwd)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
env = {**self.env, **(event.env or {})}
|
||||
|
||||
log_prefix = event.log_prefix or proc_id
|
||||
stdout_path = output_dir / f'{log_prefix}.stdout.log'
|
||||
stderr_path = output_dir / f'{log_prefix}.stderr.log'
|
||||
cmd_path = output_dir / f'{log_prefix}.sh'
|
||||
pid_path = output_dir / f'{log_prefix}.pid'
|
||||
|
||||
self._write_cmd_file(cmd_path, event.cmd)
|
||||
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*event.cmd,
|
||||
cwd=str(cwd),
|
||||
env=env,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
start_new_session=True,
|
||||
)
|
||||
|
||||
self._write_pid_file(pid_path, proc.pid)
|
||||
|
||||
record = ProcessRecord(
|
||||
id=proc_id,
|
||||
cmd=event.cmd,
|
||||
cwd=str(cwd),
|
||||
env=env,
|
||||
pid=proc.pid,
|
||||
started_at=_utcnow(),
|
||||
stdout_path=str(stdout_path),
|
||||
stderr_path=str(stderr_path),
|
||||
cmd_path=str(cmd_path),
|
||||
pid_path=str(pid_path),
|
||||
is_background=event.is_background,
|
||||
parent_process_id=event.parent_process_id,
|
||||
)
|
||||
|
||||
await event.event_bus.dispatch(
|
||||
ProcessStarted(process=record, event_parent_id=parent_event_id)
|
||||
)
|
||||
|
||||
stdout_task = asyncio.create_task(
|
||||
self._consume_stream(
|
||||
proc.stdout, stdout_path, parent_event_id, event.parse_stdout_events
|
||||
)
|
||||
)
|
||||
stderr_task = asyncio.create_task(
|
||||
self._consume_stream(proc.stderr, stderr_path, parent_event_id, False)
|
||||
)
|
||||
|
||||
running = _RunningProcess(
|
||||
process=proc,
|
||||
record=record,
|
||||
stdout_task=stdout_task,
|
||||
stderr_task=stderr_task,
|
||||
watcher_task=None,
|
||||
parent_event_id=parent_event_id,
|
||||
)
|
||||
self._running[proc_id] = running
|
||||
|
||||
if event.is_background:
|
||||
running.watcher_task = asyncio.create_task(
|
||||
self._watch_process(proc_id, event.timeout)
|
||||
)
|
||||
return record
|
||||
|
||||
await self._watch_process(proc_id, event.timeout)
|
||||
return self._running.get(proc_id, running).record
|
||||
|
||||
async def on_ProcessKill(self, event: ProcessKill) -> ProcessRecord:
|
||||
running = self._running.get(event.process_id)
|
||||
if not running:
|
||||
raise RuntimeError(f'Process not found: {event.process_id}')
|
||||
|
||||
proc = running.process
|
||||
self._terminate_process(proc, event.signal)
|
||||
|
||||
if event.timeout is not None:
|
||||
try:
|
||||
await asyncio.wait_for(proc.wait(), timeout=event.timeout)
|
||||
except asyncio.TimeoutError:
|
||||
self._terminate_process(proc, signal.SIGKILL)
|
||||
else:
|
||||
await proc.wait()
|
||||
|
||||
await self._finalize_process(event.process_id)
|
||||
return self._running.get(event.process_id, running).record
|
||||
|
||||
async def _watch_process(self, process_id: str, timeout: float | None) -> None:
|
||||
running = self._running.get(process_id)
|
||||
if not running:
|
||||
return
|
||||
proc = running.process
|
||||
try:
|
||||
if timeout is not None:
|
||||
await asyncio.wait_for(proc.wait(), timeout=timeout)
|
||||
else:
|
||||
await proc.wait()
|
||||
except asyncio.TimeoutError:
|
||||
self._terminate_process(proc, signal.SIGTERM)
|
||||
await asyncio.sleep(2)
|
||||
if proc.returncode is None:
|
||||
self._terminate_process(proc, signal.SIGKILL)
|
||||
await proc.wait()
|
||||
await self._finalize_process(process_id)
|
||||
|
||||
async def _finalize_process(self, process_id: str) -> None:
|
||||
running = self._running.get(process_id)
|
||||
if not running:
|
||||
return
|
||||
|
||||
proc = running.process
|
||||
record = running.record
|
||||
|
||||
if running.stdout_task:
|
||||
await running.stdout_task
|
||||
if running.stderr_task:
|
||||
await running.stderr_task
|
||||
|
||||
record.exit_code = proc.returncode
|
||||
record.ended_at = _utcnow()
|
||||
|
||||
await self.bus.dispatch(
|
||||
ProcessExited(process=record, event_parent_id=running.parent_event_id)
|
||||
)
|
||||
|
||||
self._running.pop(process_id, None)
|
||||
|
||||
async def _consume_stream(
|
||||
self,
|
||||
stream: asyncio.StreamReader | None,
|
||||
path: Path,
|
||||
parent_event_id: str | None,
|
||||
parse_events: bool,
|
||||
) -> None:
|
||||
if stream is None:
|
||||
return
|
||||
with path.open('w', encoding='utf-8') as fh:
|
||||
while True:
|
||||
line = await stream.readline()
|
||||
if not line:
|
||||
break
|
||||
text = line.decode('utf-8', errors='replace')
|
||||
fh.write(text)
|
||||
fh.flush()
|
||||
if parse_events:
|
||||
await self._maybe_dispatch_json_event(text, parent_event_id)
|
||||
|
||||
async def _maybe_dispatch_json_event(self, line: str, parent_event_id: str | None) -> None:
|
||||
text = line.strip()
|
||||
if not text.startswith('{') or not text.endswith('}'):
|
||||
return
|
||||
try:
|
||||
data = json.loads(text)
|
||||
except json.JSONDecodeError:
|
||||
return
|
||||
|
||||
event = None
|
||||
if self.json_event_adapter:
|
||||
event = self.json_event_adapter(data, parent_event_id)
|
||||
elif isinstance(data, dict) and 'event_type' in data:
|
||||
try:
|
||||
event = BaseEvent.model_validate(data)
|
||||
except Exception:
|
||||
event = None
|
||||
|
||||
if event is None:
|
||||
return
|
||||
|
||||
if not getattr(event, 'event_parent_id', None) and parent_event_id:
|
||||
event.event_parent_id = parent_event_id
|
||||
await self.bus.dispatch(event)
|
||||
|
||||
@staticmethod
|
||||
def _write_cmd_file(path: Path, cmd: list[str]) -> None:
|
||||
cmd_line = ' '.join(shlex.quote(part) for part in cmd)
|
||||
path.write_text(cmd_line + '\n', encoding='utf-8')
|
||||
|
||||
@staticmethod
|
||||
def _write_pid_file(path: Path, pid: int) -> None:
|
||||
path.write_text(str(pid), encoding='utf-8')
|
||||
ts = datetime.now().timestamp()
|
||||
os.utime(path, (ts, ts))
|
||||
|
||||
@staticmethod
|
||||
def _terminate_process(proc: asyncio.subprocess.Process, sig: int) -> None:
|
||||
if proc.returncode is not None:
|
||||
return
|
||||
try:
|
||||
os.killpg(proc.pid, sig)
|
||||
except Exception:
|
||||
try:
|
||||
os.kill(proc.pid, sig)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
__all__ = [
|
||||
'ProcessRecord',
|
||||
'ProcessLaunch',
|
||||
'ProcessStarted',
|
||||
'ProcessExited',
|
||||
'ProcessKill',
|
||||
'ProcessPlugin',
|
||||
]
|
||||
@@ -1,21 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required_plugins": ["chrome"],
|
||||
"properties": {
|
||||
"ACCESSIBILITY_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_ACCESSIBILITY", "USE_ACCESSIBILITY"],
|
||||
"description": "Enable accessibility tree capture"
|
||||
},
|
||||
"ACCESSIBILITY_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 30,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for accessibility capture in seconds"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,288 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Extract accessibility tree and page outline from a URL.
|
||||
*
|
||||
* Extracts:
|
||||
* - Page outline (headings h1-h6, sections, articles)
|
||||
* - Iframe tree
|
||||
* - Accessibility snapshot
|
||||
* - ARIA labels and roles
|
||||
*
|
||||
* Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes accessibility/accessibility.json
|
||||
*
|
||||
* Environment variables:
|
||||
* SAVE_ACCESSIBILITY: Enable accessibility extraction (default: true)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'accessibility';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'accessibility.json';
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
const args = {};
|
||||
process.argv.slice(2).forEach(arg => {
|
||||
if (arg.startsWith('--')) {
|
||||
const [key, ...valueParts] = arg.slice(2).split('=');
|
||||
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
|
||||
}
|
||||
});
|
||||
return args;
|
||||
}
|
||||
|
||||
// Get environment variable with default
|
||||
function getEnv(name, defaultValue = '') {
|
||||
return (process.env[name] || defaultValue).trim();
|
||||
}
|
||||
|
||||
function getEnvBool(name, defaultValue = false) {
|
||||
const val = getEnv(name, '').toLowerCase();
|
||||
if (['true', '1', 'yes', 'on'].includes(val)) return true;
|
||||
if (['false', '0', 'no', 'off'].includes(val)) return false;
|
||||
return defaultValue;
|
||||
}
|
||||
|
||||
// Wait for chrome tab to be fully loaded
|
||||
// Poll for the chrome plugin's navigation.json marker, which signals that
// page navigation finished. Resolves true once the file appears, or false
// if timeoutMs elapses first.
async function waitForChromeTabLoaded(timeoutMs = 60000) {
  const markerFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
  const deadline = Date.now() + timeoutMs;

  while (Date.now() < deadline) {
    if (fs.existsSync(markerFile)) {
      return true;
    }
    // Re-check every 100ms until the deadline.
    await new Promise(resolve => setTimeout(resolve, 100));
  }

  return false;
}
|
||||
|
||||
// Get CDP URL from chrome plugin
|
||||
// Read the CDP websocket URL written by the chrome plugin, or null when
// no session file exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
|
||||
|
||||
// Validate that a live Chrome session (started by the chrome plugin) exists:
// all three session files must be present, the recorded pid must belong to a
// running process, and a CDP URL must be readable. Returns the CDP URL or
// throws CHROME_SESSION_REQUIRED_ERROR.
function assertChromeSession() {
  const sessionFile = name => path.join(CHROME_SESSION_DIR, name);
  const requiredFiles = [sessionFile('cdp_url.txt'), sessionFile('target_id.txt'), sessionFile('chrome.pid')];
  if (!requiredFiles.every(f => fs.existsSync(f))) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }
  try {
    const pid = parseInt(fs.readFileSync(sessionFile('chrome.pid'), 'utf8').trim(), 10);
    if (!pid || Number.isNaN(pid)) throw new Error('Invalid pid');
    // Signal 0 performs an existence check without actually signalling.
    process.kill(pid, 0);
  } catch (e) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }
  const cdpUrl = getCdpUrl();
  if (!cdpUrl) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }
  return cdpUrl;
}
|
||||
|
||||
// Extract accessibility info
|
||||
// Extract the accessibility snapshot, a heading/landmark outline, and the
// iframe tree from the page currently open in the shared Chrome session.
// Writes accessibility.json into the current working directory and returns
// a { success, output?, accessibilityData?, error? } result object.
async function extractAccessibility(url) {
  // Output directory is current directory (hook already runs in output dir)
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

  let browser = null;

  try {
    // Connect to existing Chrome session (throws if the chrome plugin didn't run)
    const cdpUrl = assertChromeSession();

    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });

    // Get the page: prefer the tab showing an http(s) URL, else the first tab
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];

    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }

    // Get accessibility snapshot (interestingOnly prunes nodes with no a11y relevance)
    const accessibilityTree = await page.accessibility.snapshot({ interestingOnly: true });

    // Extract page outline (headings, sections, etc.) inside the page context
    const outline = await page.evaluate(() => {
      const headings = [];
      const elements = document.querySelectorAll(
        'h1, h2, h3, h4, h5, h6, a[name], header, footer, article, main, aside, nav, section, figure, summary, table, form, iframe'
      );

      elements.forEach(elem => {
        // Skip unnamed anchors
        if (elem.tagName.toLowerCase() === 'a' && !elem.name) return;

        const tagName = elem.tagName.toLowerCase();
        // Best available identifier: id, name, aria-label, or role
        const elemId = elem.id || elem.name || elem.getAttribute('aria-label') || elem.role || '';
        // First three CSS classes, joined for a ".a .b .c"-style display
        const elemClasses = (elem.className || '').toString().trim().split(/\s+/).slice(0, 3).join(' .');
        // Last path segment of a form's action, if any
        const action = elem.action?.split('/').pop() || '';

        // Truncated text preview of the element's content
        let summary = (elem.innerText || '').slice(0, 128);
        if (summary.length >= 128) summary += '...';

        let prefix = '';
        let title = '';

        // Format headings with # prefix (markdown-style, one # per level)
        const level = parseInt(tagName.replace('h', ''));
        if (!isNaN(level)) {
          prefix = '#'.repeat(level);
          title = elem.innerText || elemId || elemClasses;
        } else {
          // For other elements, create breadcrumb path of up to 5 ancestors,
          // eliding generic containers (div/span/p/body/html) as empty slots
          const parents = [tagName];
          let node = elem.parentNode;
          while (node && parents.length < 5) {
            if (node.tagName) {
              const tag = node.tagName.toLowerCase();
              if (!['div', 'span', 'p', 'body', 'html'].includes(tag)) {
                parents.unshift(tag);
              } else {
                parents.unshift('');
              }
            }
            node = node.parentNode;
          }
          prefix = parents.join('>');

          title = elemId ? `#${elemId}` : '';
          if (!title && elemClasses) title = `.${elemClasses}`;
          if (action) title += ` /${action}`;
          if (summary && !title.includes(summary)) title += `: ${summary}`;
        }

        // Clean up title: collapse whitespace runs
        title = title.replace(/\s+/g, ' ').trim();

        if (prefix) {
          headings.push(`${prefix} ${title}`);
        }
      });

      return headings;
    });

    // Get iframe tree as indented ">"-prefixed URLs (depth = number of ">")
    const iframes = [];
    function dumpFrameTree(frame, indent = '>') {
      iframes.push(indent + frame.url());
      for (const child of frame.childFrames()) {
        dumpFrameTree(child, indent + '>');
      }
    }
    dumpFrameTree(page.mainFrame(), '');

    const accessibilityData = {
      url,
      headings: outline,
      iframes,
      tree: accessibilityTree,
    };

    // Write output
    fs.writeFileSync(outputPath, JSON.stringify(accessibilityData, null, 2));

    return { success: true, output: outputPath, accessibilityData };

  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Disconnect (not close): the shared Chrome session stays alive for other plugins
    if (browser) {
      browser.disconnect();
    }
  }
}
|
||||
|
||||
// CLI entry point: validates args, honours the ACCESSIBILITY_ENABLED flag,
// waits for the chrome plugin's navigation to finish, runs the extraction,
// and emits a single ArchiveResult JSONL line on stdout. Exit code 0 on
// success/skip, 1 on failure.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  // NOTE(review): startTs/endTs are captured but never used in the emitted
  // result -- confirm whether duration reporting was intended here.
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';

  try {
    // Check if enabled
    if (!getEnvBool('ACCESSIBILITY_ENABLED', true)) {
      console.log('Skipping accessibility (ACCESSIBILITY_ENABLED=False)');
      // Output clean JSONL (no RESULT_JSON= prefix)
      console.log(JSON.stringify({
        type: 'ArchiveResult',
        status: 'skipped',
        output_str: 'ACCESSIBILITY_ENABLED=False',
      }));
      process.exit(0);
    }

    // Check if Chrome session exists, then wait for page load
    assertChromeSession();
    const pageLoaded = await waitForChromeTabLoaded(60000);
    if (!pageLoaded) {
      throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
    }

    const result = await extractAccessibility(url);

    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const headingCount = result.accessibilityData.headings.length;
      const iframeCount = result.accessibilityData.iframes.length;
      console.log(`Accessibility extracted: ${headingCount} headings, ${iframeCount} iframes`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  const endTs = new Date();

  if (error) console.error(`ERROR: ${error}`);

  // Output clean JSONL (no RESULT_JSON= prefix)
  console.log(JSON.stringify({
    type: 'ArchiveResult',
    status,
    output_str: output || error || '',
  }));

  process.exit(status === 'succeeded' ? 0 : 1);
}
|
||||
|
||||
main().catch(e => {
|
||||
console.error(`Fatal error: ${e.message}`);
|
||||
process.exit(1);
|
||||
});
|
||||
@@ -1 +0,0 @@
|
||||
<span class="abx-output-icon abx-output-icon--accessibility" title="Accessibility"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="4.5" r="2" fill="currentColor" stroke="none"/><path d="M4 7.5h16"/><path d="M12 7.5v12"/><path d="M7 20l5-6 5 6"/></svg></span>
|
||||
@@ -1,195 +0,0 @@
|
||||
"""
|
||||
Tests for the accessibility plugin.
|
||||
|
||||
Tests the real accessibility hook with an actual URL to verify
|
||||
accessibility tree and page outline extraction.
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from django.test import TestCase
|
||||
|
||||
# Import chrome test helpers
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
|
||||
from chrome_test_helpers import (
|
||||
chrome_session,
|
||||
get_test_env,
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
)
|
||||
|
||||
|
||||
def chrome_available() -> bool:
    """Return True when any known Chrome/Chromium executable is on PATH."""
    candidates = ('chromium', 'chromium-browser', 'google-chrome', 'chrome')
    return any(shutil.which(candidate) for candidate in candidates)
|
||||
|
||||
|
||||
# Get the path to the accessibility hook
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
ACCESSIBILITY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_accessibility.*')
|
||||
|
||||
|
||||
class TestAccessibilityPlugin(TestCase):
    """Sanity checks for the accessibility plugin's hook discovery."""

    def test_accessibility_hook_exists(self):
        """The accessibility hook script must be discoverable and on disk."""
        self.assertIsNotNone(
            ACCESSIBILITY_HOOK,
            "Accessibility hook not found in plugin directory",
        )
        self.assertTrue(
            ACCESSIBILITY_HOOK.exists(),
            f"Hook not found: {ACCESSIBILITY_HOOK}",
        )
|
||||
|
||||
|
||||
class TestAccessibilityWithChrome(TestCase):
    """Integration tests for accessibility plugin with Chrome.

    Each test runs the real hook script via ``node`` in an isolated temp
    directory; the first test additionally spins up a shared Chrome session
    via the chrome plugin's ``chrome_session`` helper.
    """

    def setUp(self):
        """Set up test environment."""
        # Fresh scratch dir per test; used as the hook's working directory.
        self.temp_dir = Path(tempfile.mkdtemp())

    def tearDown(self):
        """Clean up."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_accessibility_extracts_page_outline(self):
        """Accessibility hook should extract headings and accessibility tree."""
        test_url = 'https://example.com'
        snapshot_id = 'test-accessibility-snapshot'

        try:
            with chrome_session(
                self.temp_dir,
                crawl_id='test-accessibility-crawl',
                snapshot_id=snapshot_id,
                test_url=test_url,
                navigate=True,
                timeout=30,
            ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
                # Use the environment from chrome_session (already has CHROME_HEADLESS=true)

                # Run accessibility hook with the active Chrome session
                result = subprocess.run(
                    ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                    cwd=str(snapshot_chrome_dir),
                    capture_output=True,
                    text=True,
                    timeout=60,
                    env=env
                )

                # Check for output file
                accessibility_output = snapshot_chrome_dir / 'accessibility.json'

                accessibility_data = None

                # Try parsing from file first
                if accessibility_output.exists():
                    with open(accessibility_output) as f:
                        try:
                            accessibility_data = json.load(f)
                        except json.JSONDecodeError:
                            pass

                # Verify hook ran successfully
                self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
                self.assertNotIn('Traceback', result.stderr)

                # example.com has headings, so we should get accessibility data
                self.assertIsNotNone(accessibility_data, "No accessibility data was generated")

                # Verify we got page outline data
                self.assertIn('headings', accessibility_data, f"Missing headings: {accessibility_data}")
                self.assertIn('url', accessibility_data, f"Missing url: {accessibility_data}")

        except RuntimeError:
            # Re-raise setup failures (e.g. no Chrome binary) unchanged so
            # they surface in the test output rather than being swallowed.
            raise

    def test_accessibility_disabled_skips(self):
        """Test that ACCESSIBILITY_ENABLED=False skips without error."""
        test_url = 'https://example.com'
        snapshot_id = 'test-disabled'

        env = get_test_env()
        env['ACCESSIBILITY_ENABLED'] = 'False'

        result = subprocess.run(
            ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
            cwd=str(self.temp_dir),
            capture_output=True,
            text=True,
            timeout=30,
            env=env
        )

        # Should exit 0 even when disabled
        self.assertEqual(result.returncode, 0, f"Should succeed when disabled: {result.stderr}")

        # Should NOT create output file when disabled
        accessibility_output = self.temp_dir / 'accessibility.json'
        self.assertFalse(accessibility_output.exists(), "Should not create file when disabled")

    def test_accessibility_missing_url_argument(self):
        """Test that missing --url argument causes error."""
        snapshot_id = 'test-missing-url'

        result = subprocess.run(
            ['node', str(ACCESSIBILITY_HOOK), f'--snapshot-id={snapshot_id}'],
            cwd=str(self.temp_dir),
            capture_output=True,
            text=True,
            timeout=30,
            env=get_test_env()
        )

        # Should fail with non-zero exit code
        self.assertNotEqual(result.returncode, 0, "Should fail when URL missing")

    def test_accessibility_missing_snapshot_id_argument(self):
        """Test that missing --snapshot-id argument causes error."""
        test_url = 'https://example.com'

        result = subprocess.run(
            ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}'],
            cwd=str(self.temp_dir),
            capture_output=True,
            text=True,
            timeout=30,
            env=get_test_env()
        )

        # Should fail with non-zero exit code
        self.assertNotEqual(result.returncode, 0, "Should fail when snapshot-id missing")

    def test_accessibility_with_no_chrome_session(self):
        """Test that hook fails gracefully when no Chrome session exists."""
        test_url = 'https://example.com'
        snapshot_id = 'test-no-chrome'

        result = subprocess.run(
            ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
            cwd=str(self.temp_dir),
            capture_output=True,
            text=True,
            timeout=30,
            env=get_test_env()
        )

        # Should fail when no Chrome session
        self.assertNotEqual(result.returncode, 0, "Should fail when no Chrome session exists")
        # Error should mention CDP or Chrome
        err_lower = result.stderr.lower()
        self.assertTrue(
            any(x in err_lower for x in ['chrome', 'cdp', 'cannot find', 'puppeteer']),
            f"Should mention Chrome/CDP in error: {result.stderr}"
        )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
@@ -1,83 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install a binary using apt package manager.
|
||||
|
||||
Usage: on_Binary__install_using_apt_provider.py --binary-id=<uuid> --machine-id=<uuid> --name=<name>
|
||||
Output: Binary JSONL record to stdout after installation
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
from abx_pkg import Binary, AptProvider, BinProviderOverrides
|
||||
|
||||
# Fix pydantic forward reference issue
|
||||
AptProvider.model_rebuild()
|
||||
|
||||
|
||||
@click.command()
@click.option('--binary-id', required=True, help="Binary UUID")
@click.option('--machine-id', required=True, help="Machine UUID")
@click.option('--name', required=True, help="Binary name to install")
@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)")
@click.option('--overrides', default=None, help="JSON-encoded overrides dict")
def main(binary_id: str, machine_id: str, name: str, binproviders: str, overrides: str | None):
    """Install binary using apt package manager.

    Emits a Binary JSONL record on stdout on success; all human-readable
    progress goes to stderr. Exit codes: 0 = installed or skipped
    (apt not in allowed providers), 1 = apt unavailable or install failed.
    """

    # Check if apt provider is allowed
    if binproviders != '*' and 'apt' not in binproviders.split(','):
        click.echo(f"apt provider not allowed for {name}", err=True)
        sys.exit(0)  # Not an error, just skip

    # Use abx-pkg AptProvider to install binary
    provider = AptProvider()
    if not provider.INSTALLER_BIN:
        click.echo("apt not available on this system", err=True)
        sys.exit(1)

    click.echo(f"Installing {name} via apt...", err=True)

    try:
        # Parse overrides if provided; malformed JSON is only a warning
        # (installation proceeds without overrides).
        overrides_dict = None
        if overrides:
            try:
                overrides_dict = json.loads(overrides)
                # Extract apt-specific overrides
                overrides_dict = overrides_dict.get('apt', {})
                click.echo(f"Using apt install overrides: {overrides_dict}", err=True)
            except json.JSONDecodeError:
                click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)

        binary = Binary(name=name, binproviders=[provider], overrides={'apt': overrides_dict} if overrides_dict else {}).install()
    except Exception as e:
        click.echo(f"apt install failed: {e}", err=True)
        sys.exit(1)

    if not binary.abspath:
        click.echo(f"{name} not found after apt install", err=True)
        sys.exit(1)

    # Output Binary JSONL record to stdout
    record = {
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'apt',
        'machine_id': machine_id,
        'binary_id': binary_id,
    }
    print(json.dumps(record))

    # Log human-readable info to stderr
    click.echo(f"Installed {name} at {binary.abspath}", err=True)
    click.echo(f"  version: {binary.version}", err=True)

    sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,154 +0,0 @@
|
||||
"""
|
||||
Tests for the apt binary provider plugin.
|
||||
|
||||
Tests cover:
|
||||
1. Hook script execution
|
||||
2. apt package availability detection
|
||||
3. JSONL output format
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from django.test import TestCase
|
||||
|
||||
|
||||
# Get the path to the apt provider hook
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_apt_install.py'), None)
|
||||
|
||||
|
||||
def apt_available() -> bool:
    """Return True when either ``apt`` or ``apt-get`` is on PATH."""
    return any(shutil.which(tool) is not None for tool in ('apt', 'apt-get'))
|
||||
|
||||
|
||||
def is_linux() -> bool:
    """Return True when the current platform is Linux."""
    import platform  # local import, mirroring the original module's style
    system_name = platform.system()
    return system_name.lower() == 'linux'
|
||||
|
||||
|
||||
class TestAptProviderHook(TestCase):
    """Test the apt binary provider installation hook.

    Each test invokes the hook script as a subprocess, the same way the
    plugin runner would, and inspects its exit code and stderr.
    """

    def setUp(self):
        """Set up test environment."""
        # Scratch directory (string path, matching tempfile.mkdtemp's return).
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Clean up."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_hook_script_exists(self):
        """Hook script should exist."""
        self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")

    def test_hook_skips_when_apt_not_allowed(self):
        """Hook should skip when apt not in allowed binproviders."""
        result = subprocess.run(
            [
                sys.executable, str(INSTALL_HOOK),
                '--name=wget',
                '--binary-id=test-uuid',
                '--machine-id=test-machine',
                '--binproviders=pip,npm',  # apt not allowed
            ],
            capture_output=True,
            text=True,
            timeout=30
        )

        # Should exit cleanly (code 0) when apt not allowed
        self.assertIn('apt provider not allowed', result.stderr)
        self.assertEqual(result.returncode, 0)

    @pytest.mark.skipif(not is_linux(), reason="apt only available on Linux")
    def test_hook_detects_apt(self):
        """Hook should detect apt binary when available."""
        assert apt_available(), "apt not installed"
        # A nonsense package name exercises apt detection without expecting
        # a successful install.
        result = subprocess.run(
            [
                sys.executable, str(INSTALL_HOOK),
                '--name=nonexistent-pkg-xyz123',
                '--binary-id=test-uuid',
                '--machine-id=test-machine',
            ],
            capture_output=True,
            text=True,
            timeout=30
        )

        # Should not say apt is not available
        self.assertNotIn('apt not available', result.stderr)

    def test_hook_handles_overrides(self):
        """Hook should accept overrides JSON."""
        overrides = json.dumps({
            'apt': {'packages': ['custom-package-name']}
        })

        result = subprocess.run(
            [
                sys.executable, str(INSTALL_HOOK),
                '--name=test-pkg',
                '--binary-id=test-uuid',
                '--machine-id=test-machine',
                f'--overrides={overrides}',
            ],
            capture_output=True,
            text=True,
            timeout=30
        )

        # Should not crash parsing overrides
        self.assertNotIn('Traceback', result.stderr)
|
||||
|
||||
|
||||
@pytest.mark.skipif(not is_linux(), reason="apt only available on Linux")
class TestAptProviderSystemBinaries(TestCase):
    """Test apt provider with system binaries."""

    def test_detect_existing_binary(self):
        """apt provider should detect already-installed system binaries."""
        assert apt_available(), "apt not installed"
        # Check for a binary that's almost certainly installed (like 'ls' or 'bash')
        result = subprocess.run(
            [
                sys.executable, str(INSTALL_HOOK),
                '--name=bash',
                '--binary-id=test-uuid',
                '--machine-id=test-machine',
            ],
            capture_output=True,
            text=True,
            timeout=60
        )

        # Parse JSONL output: accept the first well-formed Binary record
        # for bash, verifying its abspath points at a real file.
        for line in result.stdout.split('\n'):
            line = line.strip()
            if line.startswith('{'):
                try:
                    record = json.loads(line)
                    if record.get('type') == 'Binary' and record.get('name') == 'bash':
                        # Found bash
                        self.assertTrue(record.get('abspath'))
                        self.assertTrue(Path(record['abspath']).exists())
                        return
                except json.JSONDecodeError:
                    continue

        # apt may not be able to "install" bash (already installed)
        # Just verify no crash
        self.assertNotIn('Traceback', result.stderr)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
@@ -1,26 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"ARCHIVEDOTORG_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_ARCHIVEDOTORG", "USE_ARCHIVEDOTORG", "SUBMIT_ARCHIVEDOTORG"],
|
||||
"description": "Submit URLs to archive.org Wayback Machine"
|
||||
},
|
||||
"ARCHIVEDOTORG_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"minimum": 10,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for archive.org submission in seconds"
|
||||
},
|
||||
"ARCHIVEDOTORG_USER_AGENT": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "USER_AGENT",
|
||||
"description": "User agent string"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,154 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Submit a URL to archive.org for archiving.
|
||||
|
||||
Usage: on_Snapshot__archivedotorg.bg.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Writes archive.org.txt to $PWD with the archived URL
|
||||
|
||||
Environment variables:
|
||||
ARCHIVEDOTORG_TIMEOUT: Timeout in seconds (default: 60)
|
||||
USER_AGENT: User agent string
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if ARCHIVEDOTORG_* not set:
|
||||
TIMEOUT: Fallback timeout
|
||||
|
||||
Note: This extractor uses the 'requests' library which is bundled with ArchiveBox.
|
||||
It can run standalone if requests is installed: pip install requests
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
PLUGIN_NAME = 'archivedotorg'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'archive.org.txt'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Return the environment variable *name*, stripped, or *default*."""
    value = os.environ.get(name, default)
    return value.strip()
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
    """Return *name* parsed as an int, or *default* when unset or unparseable."""
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
|
||||
|
||||
|
||||
def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
    """
    Submit URL to archive.org Wayback Machine.

    Writes the resulting (or retry) archive URL to OUTPUT_FILE in the
    current working directory on success.

    Returns: (success, output_path, error_message)
    """
    def log(message: str) -> None:
        # All progress/diagnostics go to stderr; stdout is reserved for JSONL.
        print(f'[archivedotorg] {message}', file=sys.stderr)

    try:
        import requests
    except ImportError:
        return False, None, 'requests library not installed'

    # ARCHIVEDOTORG_TIMEOUT wins; unset/0 falls back to TIMEOUT, then 60.
    timeout = get_env_int('ARCHIVEDOTORG_TIMEOUT') or get_env_int('TIMEOUT', 60)
    # NOTE(review): the config schema defines ARCHIVEDOTORG_USER_AGENT with a
    # USER_AGENT fallback, but this code reads USER_AGENT directly -- confirm.
    user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')

    submit_url = f'https://web.archive.org/save/{url}'
    log(f'Submitting to Wayback Machine (timeout={timeout}s)')
    log(f'GET {submit_url}')

    try:
        response = requests.get(
            submit_url,
            timeout=timeout,
            headers={'User-Agent': user_agent},
            allow_redirects=True,
        )
        log(f'HTTP {response.status_code} final_url={response.url}')

        # Check for successful archive via the response headers archive.org sets
        content_location = response.headers.get('Content-Location', '')
        x_archive_orig_url = response.headers.get('X-Archive-Orig-Url', '')
        if content_location:
            log(f'Content-Location: {content_location}')
        if x_archive_orig_url:
            log(f'X-Archive-Orig-Url: {x_archive_orig_url}')

        # Build archive URL
        if content_location:
            archive_url = f'https://web.archive.org{content_location}'
            Path(OUTPUT_FILE).write_text(archive_url, encoding='utf-8')
            log(f'Saved archive URL -> {archive_url}')
            return True, OUTPUT_FILE, ''
        elif 'web.archive.org' in response.url:
            # We were redirected to an archive page
            Path(OUTPUT_FILE).write_text(response.url, encoding='utf-8')
            log(f'Redirected to archive page -> {response.url}')
            return True, OUTPUT_FILE, ''
        else:
            # Check for errors in response
            if 'RobotAccessControlException' in response.text:
                # Blocked by robots.txt - save submit URL for manual retry
                Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
                log('Blocked by robots.txt, saved submit URL for manual retry')
                return True, OUTPUT_FILE, ''  # Consider this a soft success
            elif response.status_code >= 400:
                return False, None, f'HTTP {response.status_code}'
            else:
                # Save submit URL anyway
                Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
                log('No archive URL returned, saved submit URL for manual retry')
                return True, OUTPUT_FILE, ''

    except requests.Timeout:
        return False, None, f'Request timed out after {timeout} seconds'
    except requests.RequestException as e:
        return False, None, f'{type(e).__name__}: {e}'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL to submit to archive.org')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Submit a URL to archive.org for archiving.

    On success emits one ArchiveResult JSONL line to stdout and exits 0.
    Transient failures (network/HTTP errors) emit NO JSONL and exit 1 so the
    system can retry later; a disabled config also emits no JSONL but exits 0.

    NOTE(review): snapshot_id is accepted for interface consistency with
    other hooks but is not used by the submission itself -- confirm intended.
    """

    # Check if feature is enabled
    if get_env('ARCHIVEDOTORG_ENABLED', 'True').lower() in ('false', '0', 'no', 'off'):
        print('Skipping archive.org submission (ARCHIVEDOTORG_ENABLED=False)', file=sys.stderr)
        # Temporary failure (config disabled) - NO JSONL emission
        sys.exit(0)

    try:
        # Run extraction
        success, output, error = submit_to_archivedotorg(url)

        if success:
            # Success - emit ArchiveResult with output file
            result = {
                'type': 'ArchiveResult',
                'status': 'succeeded',
                'output_str': output or '',
            }
            print(json.dumps(result))
            sys.exit(0)
        else:
            # Transient error (network, timeout, HTTP error) - emit NO JSONL
            # System will retry later
            print(f'ERROR: {error}', file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        # Unexpected error - also transient, emit NO JSONL
        print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,12 +0,0 @@
|
||||
{% load config_tags %}
|
||||
{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
|
||||
{% if enabled %}
|
||||
<!-- Archive.org thumbnail - iframe preview of archived page -->
|
||||
<div class="extractor-thumbnail archivedotorg-thumbnail" style="width: 100%; height: 100px; overflow: hidden;">
|
||||
<iframe src="{{ output_path }}"
|
||||
style="width: 100%; height: 100px; border: none; pointer-events: none;"
|
||||
loading="lazy"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
</div>
|
||||
{% endif %}
|
||||
@@ -1 +0,0 @@
|
||||
<span class="abx-output-icon abx-output-icon--archivedotorg" title="Archive.org"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M3 7h18"/><rect x="3" y="7" width="18" height="13" rx="2"/><path d="M9 12h6"/></svg></span>
|
||||
@@ -1,93 +0,0 @@
|
||||
"""
|
||||
Integration tests for archivedotorg plugin
|
||||
|
||||
Tests verify standalone archive.org extractor execution.
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archivedotorg.*'), None)
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
def test_hook_script_exists():
    """The plugin must ship an on_Snapshot__*_archivedotorg.* hook script."""
    # BUGFIX: ARCHIVEDOTORG_HOOK is ``next(glob, None)`` — when the hook is
    # missing, the old bare ``.exists()`` call crashed with AttributeError on
    # None instead of failing with a readable assertion.
    assert ARCHIVEDOTORG_HOOK is not None, f'No archivedotorg hook script found in {PLUGIN_DIR}'
    assert ARCHIVEDOTORG_HOOK.exists()
|
||||
|
||||
def test_submits_to_archivedotorg():
    """Run the hook end-to-end and verify its exit-code/JSONL contract."""
    with tempfile.TemporaryDirectory() as tmpdir:
        proc = subprocess.run(
            [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir, capture_output=True, text=True, timeout=60,
        )

        assert proc.returncode in (0, 1)

        # Scan stdout for the first valid ArchiveResult JSONL record, if any.
        archive_result = None
        for raw in proc.stdout.strip().split('\n'):
            candidate = raw.strip()
            if not candidate.startswith('{'):
                continue
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if parsed.get('type') == 'ArchiveResult':
                archive_result = parsed
                break

        if proc.returncode == 0:
            # Success - should have ArchiveResult
            assert archive_result, "Should have ArchiveResult JSONL output on success"
            assert archive_result['status'] == 'succeeded', f"Should succeed: {archive_result}"
        else:
            # Transient error - no JSONL output, just stderr
            assert not archive_result, "Should NOT emit JSONL on transient error"
            assert proc.stderr, "Should have error message in stderr"
|
||||
|
||||
def test_config_save_archivedotorg_false_skips():
    """Setting ARCHIVEDOTORG_ENABLED=False must exit 0 without emitting JSONL."""
    import os

    with tempfile.TemporaryDirectory() as tmpdir:
        child_env = {**os.environ, 'ARCHIVEDOTORG_ENABLED': 'False'}

        proc = subprocess.run(
            [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=tmpdir, capture_output=True, text=True, env=child_env, timeout=30,
        )

        assert proc.returncode == 0, f"Should exit 0 when feature disabled: {proc.stderr}"

        # Feature disabled - temporary failure, should NOT emit JSONL
        assert 'Skipping' in proc.stderr or 'False' in proc.stderr, "Should log skip reason to stderr"

        # Should NOT emit any JSONL
        jsonl_lines = [ln for ln in proc.stdout.strip().split('\n') if ln.strip().startswith('{')]
        assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
|
||||
|
||||
def test_handles_timeout():
    """With TIMEOUT=1 the hook must finish promptly; a timeout emits no JSONL."""
    import os

    with tempfile.TemporaryDirectory() as tmpdir:
        child_env = {**os.environ, 'TIMEOUT': '1'}

        proc = subprocess.run(
            [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=tmpdir, capture_output=True, text=True, env=child_env, timeout=30,
        )

        # Timeout is a transient error - should exit 1 with no JSONL
        assert proc.returncode in (0, 1), "Should complete without hanging"

        # If it timed out (exit 1), should have no JSONL output
        if proc.returncode == 1:
            jsonl_lines = [ln for ln in proc.stdout.strip().split('\n') if ln.strip().startswith('{')]
            assert len(jsonl_lines) == 0, "Should not emit JSONL on timeout (transient error)"
|
||||
|
||||
# Allow running this test module directly (outside the plugin test runner).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
@@ -1,87 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install a binary using Homebrew package manager.
|
||||
|
||||
Usage: on_Binary__install_using_brew_provider.py --binary-id=<uuid> --machine-id=<uuid> --name=<name> [--custom-cmd=<cmd>]
|
||||
Output: Binary JSONL record to stdout after installation
|
||||
|
||||
Environment variables:
|
||||
MACHINE_ID: Machine UUID (set by orchestrator)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
from abx_pkg import Binary, BrewProvider, BinProviderOverrides
|
||||
|
||||
# Fix pydantic forward reference issue
|
||||
BrewProvider.model_rebuild()
|
||||
|
||||
|
||||
@click.command()
@click.option('--machine-id', required=True, help="Machine UUID")
@click.option('--binary-id', required=True, help="Dependency UUID")
@click.option('--name', required=True, help="Binary name to install")
@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command")
@click.option('--overrides', default=None, help="JSON-encoded overrides dict")
def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_cmd: str | None, overrides: str | None):
    """Install a binary using Homebrew and emit a Binary JSONL record to stdout.

    Exit codes: 0 on success (or when brew is not in the allowed provider
    list), 1 when brew is unavailable or the install fails.
    NOTE(review): --custom-cmd is accepted for CLI parity with other provider
    hooks but is not used by the brew provider.
    """
    # Tolerate whitespace in the comma-separated provider list (e.g. "env, brew").
    if binproviders != '*' and 'brew' not in [p.strip() for p in binproviders.split(',')]:
        click.echo(f"brew provider not allowed for {name}", err=True)
        sys.exit(0)

    # Use abx-pkg BrewProvider to install binary
    provider = BrewProvider()
    if not provider.INSTALLER_BIN:
        click.echo("brew not available on this system", err=True)
        sys.exit(1)

    click.echo(f"Installing {name} via brew...", err=True)

    try:
        # Parse overrides if provided (best-effort: invalid JSON only warns)
        overrides_dict = None
        if overrides:
            try:
                overrides_dict = json.loads(overrides)
                click.echo(f"Using custom install overrides: {overrides_dict}", err=True)
            except json.JSONDecodeError:
                click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)

        binary = Binary(name=name, binproviders=[provider], overrides=overrides_dict or {}).install()
    except Exception as e:
        click.echo(f"brew install failed: {e}", err=True)
        sys.exit(1)

    if not binary.abspath:
        click.echo(f"{name} not found after brew install", err=True)
        sys.exit(1)

    # BUGFIX: the required --machine-id CLI value was previously clobbered
    # unconditionally by os.environ.get('MACHINE_ID', '') — often an empty
    # string. Prefer the CLI value and only fall back to the environment.
    machine_id = machine_id or os.environ.get('MACHINE_ID', '')

    # Output Binary JSONL record to stdout
    record = {
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'brew',
        'machine_id': machine_id,
        'binary_id': binary_id,
    }
    print(json.dumps(record))

    # Log human-readable info to stderr
    click.echo(f"Installed {name} at {binary.abspath}", err=True)
    click.echo(f"  version: {binary.version}", err=True)

    sys.exit(0)


if __name__ == '__main__':
    main()
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,157 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"CHROME_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["USE_CHROME"],
|
||||
"description": "Enable Chromium browser integration for archiving"
|
||||
},
|
||||
"CHROME_BINARY": {
|
||||
"type": "string",
|
||||
"default": "chromium",
|
||||
"x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"],
|
||||
"description": "Path to Chromium binary"
|
||||
},
|
||||
"CHROME_NODE_BINARY": {
|
||||
"type": "string",
|
||||
"default": "node",
|
||||
"x-fallback": "NODE_BINARY",
|
||||
"description": "Path to Node.js binary (for Puppeteer)"
|
||||
},
|
||||
"CHROME_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for Chrome operations in seconds"
|
||||
},
|
||||
"CHROME_HEADLESS": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Run Chrome in headless mode"
|
||||
},
|
||||
"CHROME_SANDBOX": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable Chrome sandbox (disable in Docker with --no-sandbox)"
|
||||
},
|
||||
"CHROME_RESOLUTION": {
|
||||
"type": "string",
|
||||
"default": "1440,2000",
|
||||
"pattern": "^\\d+,\\d+$",
|
||||
"x-fallback": "RESOLUTION",
|
||||
"description": "Browser viewport resolution (width,height)"
|
||||
},
|
||||
"CHROME_USER_DATA_DIR": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Path to Chrome user data directory for persistent sessions (derived from ACTIVE_PERSONA if not set)"
|
||||
},
|
||||
"CHROME_USER_AGENT": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "USER_AGENT",
|
||||
"description": "User agent string for Chrome"
|
||||
},
|
||||
"CHROME_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [
|
||||
"--no-first-run",
|
||||
"--no-default-browser-check",
|
||||
"--disable-default-apps",
|
||||
"--disable-sync",
|
||||
"--disable-infobars",
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--disable-component-update",
|
||||
"--disable-domain-reliability",
|
||||
"--disable-breakpad",
|
||||
"--disable-client-side-phishing-detection",
|
||||
"--disable-hang-monitor",
|
||||
"--disable-speech-synthesis-api",
|
||||
"--disable-speech-api",
|
||||
"--disable-print-preview",
|
||||
"--disable-notifications",
|
||||
"--disable-desktop-notifications",
|
||||
"--disable-popup-blocking",
|
||||
"--disable-prompt-on-repost",
|
||||
"--disable-external-intent-requests",
|
||||
"--disable-session-crashed-bubble",
|
||||
"--disable-search-engine-choice-screen",
|
||||
"--disable-datasaver-prompt",
|
||||
"--ash-no-nudges",
|
||||
"--hide-crash-restore-bubble",
|
||||
"--suppress-message-center-popups",
|
||||
"--noerrdialogs",
|
||||
"--no-pings",
|
||||
"--silent-debugger-extension-api",
|
||||
"--deny-permission-prompts",
|
||||
"--safebrowsing-disable-auto-update",
|
||||
"--metrics-recording-only",
|
||||
"--password-store=basic",
|
||||
"--use-mock-keychain",
|
||||
"--disable-cookie-encryption",
|
||||
"--font-render-hinting=none",
|
||||
"--force-color-profile=srgb",
|
||||
"--disable-partial-raster",
|
||||
"--disable-skia-runtime-opts",
|
||||
"--disable-2d-canvas-clip-aa",
|
||||
"--enable-webgl",
|
||||
"--hide-scrollbars",
|
||||
"--export-tagged-pdf",
|
||||
"--generate-pdf-document-outline",
|
||||
"--disable-lazy-loading",
|
||||
"--disable-renderer-backgrounding",
|
||||
"--disable-background-networking",
|
||||
"--disable-background-timer-throttling",
|
||||
"--disable-backgrounding-occluded-windows",
|
||||
"--disable-ipc-flooding-protection",
|
||||
"--disable-extensions-http-throttling",
|
||||
"--disable-field-trial-config",
|
||||
"--disable-back-forward-cache",
|
||||
"--autoplay-policy=no-user-gesture-required",
|
||||
"--disable-gesture-requirement-for-media-playback",
|
||||
"--lang=en-US,en;q=0.9",
|
||||
"--log-level=2",
|
||||
"--enable-logging=stderr"
|
||||
],
|
||||
"x-aliases": ["CHROME_DEFAULT_ARGS"],
|
||||
"description": "Default Chrome command-line arguments (static flags only, dynamic args like --user-data-dir are added at runtime)"
|
||||
},
|
||||
"CHROME_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["CHROME_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to Chrome command (for user customization)"
|
||||
},
|
||||
"CHROME_PAGELOAD_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"minimum": 5,
|
||||
"x-fallback": "CHROME_TIMEOUT",
|
||||
"description": "Timeout for page navigation/load in seconds"
|
||||
},
|
||||
"CHROME_WAIT_FOR": {
|
||||
"type": "string",
|
||||
"default": "networkidle2",
|
||||
"enum": ["domcontentloaded", "load", "networkidle0", "networkidle2"],
|
||||
"description": "Page load completion condition (domcontentloaded, load, networkidle0, networkidle2)"
|
||||
},
|
||||
"CHROME_DELAY_AFTER_LOAD": {
|
||||
"type": "number",
|
||||
"default": 0,
|
||||
"minimum": 0,
|
||||
"description": "Extra delay in seconds after page load completes before archiving (useful for JS-heavy SPAs)"
|
||||
},
|
||||
"CHROME_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"description": "Whether to verify SSL certificates (disable for self-signed certs)"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,254 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Extract cookies from Chrome via CDP and write to Netscape cookies.txt format.
|
||||
*
|
||||
* This script launches Chrome with a given user data directory, connects via CDP,
|
||||
* extracts all cookies, and writes them to a cookies.txt file in Netscape format.
|
||||
*
|
||||
* Usage:
|
||||
* CHROME_USER_DATA_DIR=/path/to/profile COOKIES_OUTPUT_FILE=/path/to/cookies.txt node extract_cookies.js
|
||||
*
|
||||
* Environment variables:
|
||||
* CHROME_USER_DATA_DIR: Path to Chrome user data directory (required)
|
||||
* COOKIES_OUTPUT_FILE: Path to output cookies.txt file (required)
|
||||
* CHROME_HEADLESS: Run in headless mode (default: true)
|
||||
* NODE_MODULES_DIR: Path to node_modules for module resolution
|
||||
*/
|
||||
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) {
|
||||
module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
}
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const {
|
||||
findAnyChromiumBinary,
|
||||
launchChromium,
|
||||
killChrome,
|
||||
getEnv,
|
||||
} = require('./chrome_utils.js');
|
||||
|
||||
/**
|
||||
* Convert a cookie object to Netscape cookies.txt format line.
|
||||
*
|
||||
* Format: domain includeSubdomains path secure expiry name value
|
||||
*
|
||||
* @param {Object} cookie - CDP cookie object
|
||||
* @returns {string} - Netscape format cookie line
|
||||
*/
|
||||
/**
 * Serialize one CDP cookie object as a Netscape cookies.txt line.
 *
 * Fields (tab-separated): domain, includeSubdomains, path, secure, expiry, name, value.
 * Session cookies (no positive expires) get expiry "0".
 * NOTE(review): the hostOnly check assumes the cookie object carries a
 * hostOnly flag; confirm CDP Network.getAllCookies actually provides one —
 * if absent, every bare domain receives the leading dot.
 *
 * @param {Object} cookie - CDP cookie object
 * @returns {string} - Netscape format cookie line
 */
function cookieToNetscape(cookie) {
  // Domain cookies (not host-only) are written with a leading dot.
  const domain = (!cookie.domain.startsWith('.') && !cookie.hostOnly)
    ? '.' + cookie.domain
    : cookie.domain;

  const fields = [
    domain,
    domain.startsWith('.') ? 'TRUE' : 'FALSE',  // include subdomains iff domain cookie
    cookie.path || '/',
    cookie.secure ? 'TRUE' : 'FALSE',
    (cookie.expires && cookie.expires > 0) ? Math.floor(cookie.expires).toString() : '0',
    cookie.name,
    cookie.value,
  ];
  return fields.join('\t');
}
|
||||
|
||||
/**
|
||||
* Write cookies to Netscape cookies.txt format file.
|
||||
*
|
||||
* @param {Array} cookies - Array of CDP cookie objects
|
||||
* @param {string} outputPath - Path to output file
|
||||
*/
|
||||
/**
 * Write an array of CDP cookie objects to a Netscape-format cookies.txt file.
 *
 * The file starts with the standard Netscape header comment block, then one
 * line per cookie (see cookieToNetscape), and ends with a trailing newline.
 *
 * @param {Array} cookies - Array of CDP cookie objects
 * @param {string} outputPath - Path to output file
 */
function writeCookiesFile(cookies, outputPath) {
  const header = [
    '# Netscape HTTP Cookie File',
    '# https://curl.se/docs/http-cookies.html',
    '# This file was generated by ArchiveBox persona cookie extraction',
    '#',
    '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
    '',
  ];
  const body = cookies.map(cookieToNetscape);
  fs.writeFileSync(outputPath, header.concat(body).join('\n') + '\n');
}
|
||||
|
||||
/**
 * Entry point: validate environment, launch Chrome on the given profile,
 * fetch all cookies over CDP, write them as cookies.txt, and clean up.
 *
 * Exits 0 on success; exits 1 on any validation/launch/CDP error after a
 * best-effort kill of the launched Chrome and removal of the temp dir.
 */
async function main() {
  const userDataDir = getEnv('CHROME_USER_DATA_DIR');
  const outputFile = getEnv('COOKIES_OUTPUT_FILE');

  // Both env vars are mandatory inputs — fail fast with a clear message.
  if (!userDataDir) {
    console.error('ERROR: CHROME_USER_DATA_DIR environment variable is required');
    process.exit(1);
  }

  if (!outputFile) {
    console.error('ERROR: COOKIES_OUTPUT_FILE environment variable is required');
    process.exit(1);
  }

  if (!fs.existsSync(userDataDir)) {
    console.error(`ERROR: User data directory does not exist: ${userDataDir}`);
    process.exit(1);
  }

  const binary = findAnyChromiumBinary();
  if (!binary) {
    console.error('ERROR: Chromium-based browser binary not found');
    process.exit(1);
  }

  console.error(`[*] Extracting cookies from: ${userDataDir}`);
  console.error(`[*] Output file: ${outputFile}`);
  console.error(`[*] Using browser: ${binary}`);

  // Create a temporary output directory for Chrome files
  const outputDir = fs.mkdtempSync(path.join(require('os').tmpdir(), 'chrome-cookies-'));

  // Tracked outside the try so the catch block can kill the process on error.
  let chromePid = null;

  try {
    // Launch Chrome with the user data directory
    const result = await launchChromium({
      binary,
      outputDir,
      userDataDir,
      headless: true,
      killZombies: false, // Don't kill other Chrome instances
    });

    if (!result.success) {
      console.error(`ERROR: Failed to launch Chrome: ${result.error}`);
      process.exit(1);
    }

    chromePid = result.pid;
    const cdpUrl = result.cdpUrl;
    const port = result.port;

    console.error(`[*] Chrome launched (PID: ${chromePid})`);
    console.error(`[*] CDP URL: ${cdpUrl}`);

    // Connect to CDP and get cookies
    const http = require('http');

    // Use CDP directly via HTTP to get all cookies.
    // Flow: GET /json/list -> pick a page target -> open its WebSocket ->
    // send Network.getAllCookies -> resolve with the cookies array.
    const getCookies = () => {
      return new Promise((resolve, reject) => {
        const req = http.request(
          {
            hostname: '127.0.0.1',
            port: port,
            path: '/json/list',
            method: 'GET',
          },
          (res) => {
            let data = '';
            res.on('data', (chunk) => (data += chunk));
            res.on('end', () => {
              try {
                const targets = JSON.parse(data);
                // Find a page target (fall back to any target if none is type 'page')
                const pageTarget = targets.find(t => t.type === 'page') || targets[0];
                if (!pageTarget) {
                  reject(new Error('No page target found'));
                  return;
                }

                // Connect via WebSocket and send CDP command
                // NOTE(review): requires the 'ws' package to be resolvable
                // (via NODE_MODULES_DIR) — confirm it ships with the plugin.
                const WebSocket = require('ws');
                const ws = new WebSocket(pageTarget.webSocketDebuggerUrl);

                ws.on('open', () => {
                  ws.send(JSON.stringify({
                    id: 1,
                    method: 'Network.getAllCookies',
                  }));
                });

                ws.on('message', (message) => {
                  const response = JSON.parse(message);
                  // Only react to the reply matching our request id.
                  if (response.id === 1) {
                    ws.close();
                    if (response.result && response.result.cookies) {
                      resolve(response.result.cookies);
                    } else {
                      reject(new Error('Failed to get cookies: ' + JSON.stringify(response)));
                    }
                  }
                });

                ws.on('error', (err) => {
                  reject(err);
                });
              } catch (e) {
                reject(e);
              }
            });
          }
        );

        req.on('error', reject);
        req.end();
      });
    };

    // Wait a moment for the browser to fully initialize
    await new Promise(r => setTimeout(r, 2000));

    console.error('[*] Fetching cookies via CDP...');
    const cookies = await getCookies();

    console.error(`[+] Retrieved ${cookies.length} cookies`);

    // Write cookies to file
    writeCookiesFile(cookies, outputFile);
    console.error(`[+] Wrote cookies to: ${outputFile}`);

    // Clean up (null the pid so the catch block won't double-kill)
    await killChrome(chromePid, outputDir);
    chromePid = null;

    // Remove temp directory
    fs.rmSync(outputDir, { recursive: true, force: true });

    console.error('[+] Cookie extraction complete');
    process.exit(0);

  } catch (error) {
    console.error(`ERROR: ${error.message}`);

    // Clean up on error
    if (chromePid) {
      await killChrome(chromePid, outputDir);
    }

    try {
      fs.rmSync(outputDir, { recursive: true, force: true });
    } catch (e) {}

    process.exit(1);
  }
}

// Top-level safety net: any rejection not handled inside main() still exits 1.
main().catch((e) => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});
|
||||
@@ -1,34 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Emit Chromium Binary dependency for the crawl.
|
||||
|
||||
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||
--load-extension and --disable-extensions-except flags, which are needed for
|
||||
loading unpacked extensions in headless mode.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def main():
    """Emit the Chromium Binary dependency record, unless Chrome is disabled.

    Prints one Binary JSONL record to stdout and exits 0. When
    CHROME_ENABLED is set to a falsy spelling, exits 0 without output.
    """
    # CHROME_ENABLED accepts the usual falsy spellings; anything else = enabled.
    if os.environ.get('CHROME_ENABLED', 'true').lower() in ('false', '0', 'no', 'off'):
        sys.exit(0)

    # Chromium rather than Chrome: Chrome 137+ removed --load-extension support.
    dependency = {
        'type': 'Binary',
        'name': 'chromium',
        'binproviders': 'puppeteer,env',
        'overrides': {
            'puppeteer': ['chromium@latest', '--install-deps'],
        },
    }
    print(json.dumps(dependency))
    sys.exit(0)


if __name__ == '__main__':
    main()
|
||||
@@ -1,427 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Launch a shared Chromium browser session for the entire crawl.
|
||||
*
|
||||
* This runs once per crawl and keeps Chromium alive for all snapshots to share.
|
||||
* Each snapshot creates its own tab via on_Snapshot__10_chrome_tab.bg.js.
|
||||
*
|
||||
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||
* --load-extension and --disable-extensions-except flags.
|
||||
*
|
||||
* Usage: on_Crawl__90_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
|
||||
* Output: Writes to current directory (executor creates chrome/ dir):
|
||||
* - cdp_url.txt: WebSocket URL for CDP connection
|
||||
* - chrome.pid: Chromium process ID (for cleanup)
|
||||
* - port.txt: Debug port number
|
||||
* - extensions.json: Loaded extensions metadata
|
||||
*
|
||||
* Environment variables:
|
||||
* NODE_MODULES_DIR: Path to node_modules directory for module resolution
|
||||
* CHROME_BINARY: Path to Chromium binary (falls back to auto-detection)
|
||||
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
|
||||
* CHROME_HEADLESS: Run in headless mode (default: true)
|
||||
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
|
||||
* CHROME_EXTENSIONS_DIR: Directory containing Chrome extensions
|
||||
*/
|
||||
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) {
|
||||
module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
}
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const http = require('http');
|
||||
const puppeteer = require('puppeteer');
|
||||
const {
|
||||
findChromium,
|
||||
launchChromium,
|
||||
killChrome,
|
||||
getEnv,
|
||||
getEnvBool,
|
||||
getExtensionId,
|
||||
writePidWithMtime,
|
||||
getExtensionsDir,
|
||||
} = require('./chrome_utils.js');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'chrome_launch';
|
||||
const OUTPUT_DIR = '.';
|
||||
|
||||
// Global state for cleanup
|
||||
let chromePid = null;
|
||||
let browserInstance = null;
|
||||
|
||||
/**
 * Parse Netscape cookies.txt contents into CDP-style cookie objects.
 *
 * Lines prefixed "#HttpOnly_" are data lines carrying the httpOnly flag;
 * any other "#" line is a comment. Lines with fewer than 7 tab-separated
 * fields, or a missing name/domain, are counted as skipped.
 *
 * @param {string} contents - Raw cookies.txt text
 * @returns {{cookies: Array, skipped: number}}
 */
function parseCookiesTxt(contents) {
  const cookies = [];
  let skipped = 0;

  for (const rawLine of contents.split(/\r?\n/)) {
    let text = rawLine.trim();
    if (!text) continue;

    // "#HttpOnly_" marks a real cookie line; a plain "#" line is a comment.
    const httpOnly = text.startsWith('#HttpOnly_');
    if (httpOnly) {
      text = text.slice('#HttpOnly_'.length);
    } else if (text.startsWith('#')) {
      continue;
    }

    const fields = text.split('\t');
    if (fields.length < 7) {
      skipped += 1;
      continue;
    }

    const [domainRaw, includeSubdomainsRaw, pathRaw, secureRaw, expiryRaw, name, value] = fields;
    if (!name || !domainRaw) {
      skipped += 1;
      continue;
    }

    // Normalize the leading dot so it agrees with the includeSubdomains flag.
    const includeSubdomains = (includeSubdomainsRaw || '').toUpperCase() === 'TRUE';
    let domain = domainRaw;
    if (includeSubdomains && !domain.startsWith('.')) {
      domain = `.${domain}`;
    } else if (!includeSubdomains && domain.startsWith('.')) {
      domain = domain.slice(1);
    }

    const cookie = {
      name,
      value,
      domain,
      path: pathRaw || '/',
      secure: (secureRaw || '').toUpperCase() === 'TRUE',
      httpOnly,
    };

    // Only set expires for persistent cookies; session cookies omit it.
    const expires = parseInt(expiryRaw, 10);
    if (!isNaN(expires) && expires > 0) {
      cookie.expires = expires;
    }

    cookies.push(cookie);
  }

  return { cookies, skipped };
}
|
||||
|
||||
/**
 * Import cookies from a Netscape cookies.txt file into the running browser
 * via CDP Network.setCookies, in chunks of 200.
 *
 * Best-effort: a missing/unreadable file, an empty cookie list, and
 * per-chunk CDP failures are logged to stderr and never throw.
 *
 * @param {Object} browser - Puppeteer Browser instance
 * @param {string} cookiesFile - Path to cookies.txt (no-op when falsy)
 * @param {string} userDataDir - Profile dir; only used to warn about persistence
 */
async function importCookiesFromFile(browser, cookiesFile, userDataDir) {
  if (!cookiesFile) return;

  if (!fs.existsSync(cookiesFile)) {
    console.error(`[!] Cookies file not found: ${cookiesFile}`);
    return;
  }

  let contents = '';
  try {
    contents = fs.readFileSync(cookiesFile, 'utf-8');
  } catch (e) {
    console.error(`[!] Failed to read COOKIES_TXT_FILE: ${e.message}`);
    return;
  }

  const { cookies, skipped } = parseCookiesTxt(contents);
  if (cookies.length === 0) {
    console.error('[!] No cookies found to import');
    return;
  }

  console.error(`[*] Importing ${cookies.length} cookies from ${cookiesFile}...`);
  if (skipped) {
    console.error(`[*] Skipped ${skipped} malformed cookie line(s)`);
  }
  if (!userDataDir) {
    console.error('[!] CHROME_USER_DATA_DIR not set; cookies will not persist beyond this session');
  }

  // A throwaway page gives us a CDP session to send Network.setCookies on.
  const page = await browser.newPage();
  const client = await page.target().createCDPSession();
  await client.send('Network.enable');

  // Send in chunks so one bad batch doesn't abort the whole import.
  const chunkSize = 200;
  let imported = 0;
  for (let i = 0; i < cookies.length; i += chunkSize) {
    const chunk = cookies.slice(i, i + chunkSize);
    try {
      await client.send('Network.setCookies', { cookies: chunk });
      imported += chunk.length;
    } catch (e) {
      console.error(`[!] Failed to import cookies ${i + 1}-${i + chunk.length}: ${e.message}`);
    }
  }

  await page.close();
  console.error(`[+] Imported ${imported}/${cookies.length} cookies`);
}
|
||||
|
||||
/**
 * Extract the debug port from a CDP WebSocket URL
 * (e.g. "ws://127.0.0.1:9222/devtools/browser/..." -> "9222").
 *
 * @param {string} cdpUrl - CDP WebSocket URL (may be falsy)
 * @returns {string|null} - Port as a string, or null when not found
 */
function getPortFromCdpUrl(cdpUrl) {
  if (!cdpUrl) return null;
  const found = /:(\d+)\/devtools\//.exec(cdpUrl);
  return found ? found[1] : null;
}
|
||||
|
||||
/**
 * Fetch the list of DevTools targets from Chrome's /json/list endpoint.
 *
 * The port is derived from the CDP WebSocket URL; when none can be parsed,
 * resolves to []. Rejects on HTTP errors or unparseable JSON.
 *
 * @param {string} cdpUrl - CDP WebSocket URL (only used to derive the port)
 * @returns {Promise<Array>} - Array of target descriptor objects
 */
async function fetchDevtoolsTargets(cdpUrl) {
  const port = getPortFromCdpUrl(cdpUrl);
  if (!port) return [];

  const urlPath = '/json/list';
  return new Promise((resolve, reject) => {
    const req = http.get(
      { hostname: '127.0.0.1', port, path: urlPath },
      (res) => {
        let data = '';
        res.on('data', (chunk) => (data += chunk));
        res.on('end', () => {
          try {
            const targets = JSON.parse(data);
            // Defensive: /json/list should return an array; coerce otherwise.
            resolve(Array.isArray(targets) ? targets : []);
          } catch (e) {
            reject(e);
          }
        });
      }
    );
    req.on('error', reject);
  });
}
|
||||
|
||||
/**
 * Poll /json/list for chrome-extension:// targets and mark which of the
 * installed extensions actually loaded at runtime.
 *
 * Mutates each entry of installedExtensions: sets ext.loaded to true/false
 * depending on whether its id appears among the live extension targets.
 * Retries the target fetch up to 10 times (500ms apart) while Chrome boots.
 *
 * @param {string} cdpUrl - CDP WebSocket URL of the running browser
 * @param {Array} installedExtensions - extension metadata records (with .id)
 */
async function discoverExtensionTargets(cdpUrl, installedExtensions) {
  // Ids filtered out as Chrome's own bundled component extensions.
  // NOTE(review): assumed to be built-ins — confirm the ids are stable
  // across Chromium versions.
  const builtinIds = [
    'nkeimhogjdpnpccoofpliimaahmaaome',
    'fignfifoniblkonapihmkfakmlgkbkcf',
    'ahfgeienlihckogmohjhadlkjgocpleb',
    'mhjfbmdgcfjbbpaeojofohoefgiehjai',
  ];

  // Retry loop: the target list is empty until the browser finishes booting.
  let targets = [];
  for (let i = 0; i < 10; i += 1) {
    try {
      targets = await fetchDevtoolsTargets(cdpUrl);
      if (targets.length > 0) break;
    } catch (e) {
      // Ignore and retry
    }
    await new Promise(r => setTimeout(r, 500));
  }

  // Keep only chrome-extension:// targets that are not Chrome built-ins.
  const customExtTargets = targets.filter(t => {
    const url = t.url || '';
    if (!url.startsWith('chrome-extension://')) return false;
    const extId = url.split('://')[1].split('/')[0];
    return !builtinIds.includes(extId);
  });

  console.error(`[+] Found ${customExtTargets.length} custom extension target(s) via /json/list`);

  for (const target of customExtTargets) {
    const url = target.url || '';
    const extId = url.split('://')[1].split('/')[0];
    console.error(`[+] Extension target: ${extId} (${target.type || 'unknown'})`);
  }

  // Flag each installed extension as loaded iff its id shows up at runtime.
  const runtimeIds = new Set(customExtTargets.map(t => (t.url || '').split('://')[1].split('/')[0]));
  for (const ext of installedExtensions) {
    if (ext.id) {
      ext.loaded = runtimeIds.has(ext.id);
    }
  }

  if (customExtTargets.length === 0 && installedExtensions.length > 0) {
    console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`);
    console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`);
  }
}
|
||||
|
||||
// Parse command line arguments
|
||||
/**
 * Parse --key=value / --flag command-line arguments into a plain object.
 *
 * Dashes in key names become underscores; bare flags map to true;
 * non "--" tokens are ignored.
 *
 * @returns {Object} - Parsed argument map
 */
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
|
||||
|
||||
// Cleanup handler for SIGTERM
|
||||
/**
 * Shut down the shared Chrome session and exit 0.
 *
 * Tries a graceful puppeteer browser.close() first, then hard-kills the
 * Chrome process via killChrome. Registered below as the SIGTERM/SIGINT
 * handler so orchestrator-initiated termination doesn't leave a zombie.
 */
async function cleanup() {
  console.error('[*] Cleaning up Chrome session...');

  // Try graceful browser close first
  if (browserInstance) {
    try {
      console.error('[*] Closing browser gracefully...');
      await browserInstance.close();
      browserInstance = null;
      console.error('[+] Browser closed gracefully');
    } catch (e) {
      // Graceful close failing is non-fatal; fall through to the hard kill.
      console.error(`[!] Graceful close failed: ${e.message}`);
    }
  }

  // Kill Chrome process
  if (chromePid) {
    await killChrome(chromePid, OUTPUT_DIR);
  }

  process.exit(0);
}

// Register signal handlers so termination signals trigger the cleanup above.
process.on('SIGTERM', cleanup);
process.on('SIGINT', cleanup);
|
||||
|
||||
// Crawl-level Chrome launch hook entry point.
// Finds a Chromium binary, loads any cached extension metadata, launches
// Chromium via launchChromium(), optionally imports cookies over CDP, writes
// extensions.json, then blocks forever so cleanup() can run on SIGTERM.
// Exits 1 on any unrecoverable setup failure.
async function main() {
  const args = parseArgs();
  const crawlId = args.crawl_id;

  try {
    const binary = findChromium();
    if (!binary) {
      // Machine-readable dependency hints consumed by the plugin runner.
      console.error('ERROR: Chromium binary not found');
      console.error('DEPENDENCY_NEEDED=chromium');
      console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
      console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest');
      process.exit(1);
    }

    // Get Chromium version (best-effort, truncated to 64 chars)
    let version = '';
    try {
      const { execSync } = require('child_process');
      version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 })
        .trim()
        .slice(0, 64);
    } catch (e) {}

    console.error(`[*] Using browser: ${binary}`);
    if (version) console.error(`[*] Version: ${version}`);

    // Load installed extensions
    const extensionsDir = getExtensionsDir();
    const userDataDir = getEnv('CHROME_USER_DATA_DIR');
    const cookiesFile = getEnv('COOKIES_TXT_FILE') || getEnv('COOKIES_FILE');

    if (userDataDir) {
      console.error(`[*] Using user data dir: ${userDataDir}`);
    }
    if (cookiesFile) {
      console.error(`[*] Using cookies file: ${cookiesFile}`);
    }

    // Each *.extension.json cache entry points at an unpacked extension dir;
    // entries whose unpacked_path no longer exists are silently skipped.
    const installedExtensions = [];
    const extensionPaths = [];
    if (fs.existsSync(extensionsDir)) {
      const files = fs.readdirSync(extensionsDir);
      for (const file of files) {
        if (file.endsWith('.extension.json')) {
          try {
            const extPath = path.join(extensionsDir, file);
            const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8'));
            if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) {
              installedExtensions.push(extData);
              extensionPaths.push(extData.unpacked_path);
              console.error(`[*] Loading extension: ${extData.name || file}`);
            }
          } catch (e) {
            // Malformed cache JSON is non-fatal; skip that extension only.
            console.warn(`[!] Skipping invalid extension cache: ${file}`);
          }
        }
      }
    }

    if (installedExtensions.length > 0) {
      console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
    }

    // Ensure extension IDs are available without chrome://extensions
    for (const ext of installedExtensions) {
      if (!ext.id && ext.unpacked_path) {
        try {
          ext.id = getExtensionId(ext.unpacked_path);
        } catch (e) {
          console.error(`[!] Failed to compute extension id for ${ext.name}: ${e.message}`);
        }
      }
    }

    // Note: PID file is written by run_hook() with hook-specific name
    // Snapshot.cleanup() kills all *.pid processes when done
    if (!fs.existsSync(OUTPUT_DIR)) {
      fs.mkdirSync(OUTPUT_DIR, { recursive: true });
    }

    // Launch Chromium using consolidated function
    // userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set
    const result = await launchChromium({
      binary,
      outputDir: OUTPUT_DIR,
      userDataDir,
      extensionPaths,
    });

    if (!result.success) {
      console.error(`ERROR: ${result.error}`);
      process.exit(1);
    }

    // Record PID in module state so cleanup() can kill Chrome on SIGTERM.
    chromePid = result.pid;
    const cdpUrl = result.cdpUrl;

    // Discover extension targets at launch (no chrome://extensions).
    // The 2s sleep gives service workers time to register before /json/list.
    if (extensionPaths.length > 0) {
      await new Promise(r => setTimeout(r, 2000));
      console.error('[*] Discovering extension targets via devtools /json/list...');
      await discoverExtensionTargets(cdpUrl, installedExtensions);
    }

    // Only connect to CDP when cookies import is needed to reduce crash risk.
    if (cookiesFile) {
      console.error(`[*] Connecting puppeteer to CDP for cookie import...`);
      const browser = await puppeteer.connect({
        browserWSEndpoint: cdpUrl,
        defaultViewport: null,
      });
      browserInstance = browser;

      // Import cookies into Chrome profile at crawl start
      await importCookiesFromFile(browser, cookiesFile, userDataDir);

      try {
        browser.disconnect();
      } catch (e) {}
      browserInstance = null;
    } else {
      console.error('[*] Skipping puppeteer CDP connection (no cookies to import)');
    }

    // Write extensions metadata with actual IDs
    if (installedExtensions.length > 0) {
      fs.writeFileSync(
        path.join(OUTPUT_DIR, 'extensions.json'),
        JSON.stringify(installedExtensions, null, 2)
      );
    }

    console.error(`[+] Chromium session started for crawl ${crawlId}`);
    console.error(`[+] CDP URL: ${cdpUrl}`);
    console.error(`[+] PID: ${chromePid}`);

    // Stay alive to handle cleanup on SIGTERM
    console.log('[*] Chromium launch hook staying alive to handle cleanup...');
    setInterval(() => {}, 1000000);

  } catch (e) {
    console.error(`ERROR: ${e.name}: ${e.message}`);
    process.exit(1);
  }
}
|
||||
|
||||
// Entry point: any rejection escaping main() is fatal for this hook.
function onFatalLaunchError(e) {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
}
main().catch(onFatalLaunchError);
|
||||
@@ -1,264 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
|
||||
*
|
||||
* Connects to the crawl-level Chrome session (from on_Crawl__90_chrome_launch.bg.js)
|
||||
* and creates a new tab. This hook does NOT launch its own Chrome instance.
|
||||
*
|
||||
* Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> --crawl-id=<uuid>
|
||||
* Output: Creates chrome/ directory under snapshot output dir with:
|
||||
* - cdp_url.txt: WebSocket URL for CDP connection
|
||||
* - chrome.pid: Chrome process ID (from crawl)
|
||||
* - target_id.txt: Target ID of this snapshot's tab
|
||||
* - url.txt: The URL to be navigated to
|
||||
*
|
||||
* Environment variables:
|
||||
* CRAWL_OUTPUT_DIR: Crawl output directory (to find crawl's Chrome session)
|
||||
* CHROME_BINARY: Path to Chromium binary (optional, for version info)
|
||||
*
|
||||
* This is a background hook that stays alive until SIGTERM so the tab
|
||||
* can be closed cleanly at the end of the snapshot run.
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { execSync } = require('child_process');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
|
||||
const puppeteer = require('puppeteer');
|
||||
const { getEnv, getEnvInt } = require('./chrome_utils.js');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'chrome_tab';
|
||||
const OUTPUT_DIR = '.'; // Hook already runs in chrome/ output directory
|
||||
const CHROME_SESSION_DIR = '.';
|
||||
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
|
||||
|
||||
let finalStatus = 'failed';
|
||||
let finalOutput = '';
|
||||
let finalError = '';
|
||||
let cmdVersion = '';
|
||||
let finalized = false;
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
const args = {};
|
||||
process.argv.slice(2).forEach(arg => {
|
||||
if (arg.startsWith('--')) {
|
||||
const [key, ...valueParts] = arg.slice(2).split('=');
|
||||
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
|
||||
}
|
||||
});
|
||||
return args;
|
||||
}
|
||||
|
||||
// Print the final ArchiveResult JSONL record exactly once (idempotent via
// the module-level `finalized` latch). On success the output string comes
// from finalOutput; on failure it falls back to finalError, then finalOutput,
// then the empty string. cmd_version is only included when known.
function emitResult(statusOverride) {
  if (finalized) return;
  finalized = true;

  const status = statusOverride || finalStatus;
  let outputStr;
  if (status === 'succeeded') {
    outputStr = finalOutput;
  } else {
    outputStr = finalError || finalOutput || '';
  }

  const record = { type: 'ArchiveResult', status, output_str: outputStr };
  if (cmdVersion) record.cmd_version = cmdVersion;
  console.log(JSON.stringify(record));
}
|
||||
|
||||
// Cleanup handler for SIGTERM/SIGINT - close this snapshot's tab.
// Reconnects to the crawl Chrome over CDP, closes the page whose target id
// matches target_id.txt (written by createTabInExistingChrome), emits the
// final ArchiveResult record, and exits with the recorded status.
async function cleanup(signal) {
  if (signal) {
    console.error(`\nReceived ${signal}, closing chrome tab...`);
  }
  try {
    const cdpFile = path.join(OUTPUT_DIR, 'cdp_url.txt');
    const targetIdFile = path.join(OUTPUT_DIR, 'target_id.txt');

    if (fs.existsSync(cdpFile) && fs.existsSync(targetIdFile)) {
      const cdpUrl = fs.readFileSync(cdpFile, 'utf8').trim();
      const targetId = fs.readFileSync(targetIdFile, 'utf8').trim();

      const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
      const pages = await browser.pages();
      // NOTE: _targetId is a private puppeteer field — may break across
      // puppeteer versions; verify when upgrading.
      const page = pages.find(p => p.target()._targetId === targetId);

      if (page) {
        await page.close();
      }
      browser.disconnect();
    }
  } catch (e) {
    // Best effort
  }
  emitResult();
  process.exit(finalStatus === 'succeeded' ? 0 : 1);
}
|
||||
|
||||
// Register signal handlers
|
||||
process.on('SIGTERM', () => cleanup('SIGTERM'));
|
||||
process.on('SIGINT', () => cleanup('SIGINT'));
|
||||
|
||||
// Try to find the crawl's Chrome session.
// Reads cdp_url.txt + chrome.pid from <CRAWL_OUTPUT_DIR>/chrome and verifies
// the Chrome process is still alive. Throws CHROME_SESSION_REQUIRED_ERROR on
// any missing/invalid piece. Returns { cdpUrl, pid }.
function getCrawlChromeSession() {
  const fail = () => { throw new Error(CHROME_SESSION_REQUIRED_ERROR); };

  // Use CRAWL_OUTPUT_DIR env var set by get_config() in configset.py
  const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', '');
  if (!crawlOutputDir) fail();

  const crawlChromeDir = path.join(crawlOutputDir, 'chrome');
  const cdpFile = path.join(crawlChromeDir, 'cdp_url.txt');
  const pidFile = path.join(crawlChromeDir, 'chrome.pid');

  if (!fs.existsSync(cdpFile) || !fs.existsSync(pidFile)) fail();

  const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
  const pid = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10);
  if (!cdpUrl || !pid || Number.isNaN(pid)) fail();

  // Verify the process is still running (signal 0 = existence check)
  try {
    process.kill(pid, 0);
  } catch (e) {
    fail();
  }

  return { cdpUrl, pid };
}
|
||||
|
||||
// Poll getCrawlChromeSession() until it succeeds or timeoutMs elapses,
// sleeping intervalMs between attempts. On timeout, re-throws the most
// recent failure (or a generic session-required error if no attempt ran).
async function waitForCrawlChromeSession(timeoutMs, intervalMs = 250) {
  const deadline = Date.now() + timeoutMs;
  let lastError = null;

  while (Date.now() < deadline) {
    try {
      return getCrawlChromeSession();
    } catch (err) {
      lastError = err;
    }
    await new Promise(r => setTimeout(r, intervalMs));
  }

  throw lastError || new Error(CHROME_SESSION_REQUIRED_ERROR);
}
|
||||
|
||||
// Create a new tab in an existing Chrome session.
// Connects puppeteer to the crawl's CDP endpoint, opens a fresh page for this
// snapshot, records session info (cdp_url.txt, chrome.pid, target_id.txt,
// url.txt) into OUTPUT_DIR for downstream hooks, then disconnects — Chrome
// and the tab remain alive after this returns.
async function createTabInExistingChrome(cdpUrl, url, pid) {
  console.log(`[*] Connecting to existing Chrome session: ${cdpUrl}`);

  // Connect Puppeteer to the running Chrome
  const browser = await puppeteer.connect({
    browserWSEndpoint: cdpUrl,
    defaultViewport: null,
  });

  // Create a new tab for this snapshot
  const page = await browser.newPage();

  // Get the page target ID
  // NOTE: _targetId is a private puppeteer field — verify on upgrades.
  const target = page.target();
  const targetId = target._targetId;

  // Write session info (consumed by chrome_wait / chrome_navigate hooks)
  fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), cdpUrl);
  fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(pid));
  fs.writeFileSync(path.join(OUTPUT_DIR, 'target_id.txt'), targetId);
  fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url);

  // Disconnect Puppeteer (Chrome and tab stay alive)
  browser.disconnect();

  return { success: true, output: OUTPUT_DIR, cdpUrl, targetId, pid };
}
|
||||
|
||||
// Snapshot-level tab hook entry point.
// Waits for the crawl's shared Chrome session, opens a tab for this snapshot,
// records the outcome into module-level final* state for emitResult(), then
// blocks forever awaiting SIGTERM (cleanup() closes the tab and exits).
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  const crawlId = args.crawl_id || getEnv('CRAWL_ID', '');

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> [--crawl-id=<uuid>]');
    process.exit(1);
  }

  let status = 'failed';
  let output = '';
  let error = '';
  let version = '';

  try {
    // Get Chrome version (best-effort, truncated to 64 chars)
    try {
      const binary = getEnv('CHROME_BINARY', '').trim();
      if (binary) {
        version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64);
      }
    } catch (e) {
      version = '';
    }

    // Try to use existing crawl Chrome session (wait for readiness);
    // timeout resolution: CHROME_TAB_TIMEOUT > CHROME_TIMEOUT > TIMEOUT > 60s
    const timeoutSeconds = getEnvInt('CHROME_TAB_TIMEOUT', getEnvInt('CHROME_TIMEOUT', getEnvInt('TIMEOUT', 60)));
    const crawlSession = await waitForCrawlChromeSession(timeoutSeconds * 1000);
    console.log(`[*] Found existing Chrome session from crawl ${crawlId}`);
    const result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid);

    if (result.success) {
      status = 'succeeded';
      output = result.output;
      console.log(`[+] Chrome tab ready`);
      console.log(`[+] CDP URL: ${result.cdpUrl}`);
      console.log(`[+] Page target ID: ${result.targetId}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  if (error) {
    console.error(`ERROR: ${error}`);
  }

  // Stash outcome in module state so emitResult()/cleanup() can report it.
  finalStatus = status;
  finalOutput = output || '';
  finalError = error || '';
  cmdVersion = version || '';

  if (status !== 'succeeded') {
    emitResult(status);
    process.exit(1);
  }

  console.log('[*] Chrome tab created, waiting for cleanup signal...');
  await new Promise(() => {}); // Keep alive until SIGTERM
}
|
||||
|
||||
// Top-level guard: any rejection escaping main() is fatal for this hook.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});
|
||||
@@ -1,77 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Wait for Chrome session files to exist (cdp_url.txt + target_id.txt).
|
||||
*
|
||||
* This is a foreground hook that blocks until the Chrome tab is ready,
|
||||
* so downstream hooks can safely connect to CDP.
|
||||
*
|
||||
* Usage: on_Snapshot__11_chrome_wait.js --url=<url> --snapshot-id=<uuid>
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
|
||||
const {
|
||||
getEnvInt,
|
||||
waitForChromeSession,
|
||||
readCdpUrl,
|
||||
readTargetId,
|
||||
} = require('./chrome_utils.js');
|
||||
|
||||
const CHROME_SESSION_DIR = '.';
|
||||
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
|
||||
|
||||
function parseArgs() {
|
||||
const args = {};
|
||||
process.argv.slice(2).forEach(arg => {
|
||||
if (arg.startsWith('--')) {
|
||||
const [key, ...valueParts] = arg.slice(2).split('=');
|
||||
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
|
||||
}
|
||||
});
|
||||
return args;
|
||||
}
|
||||
|
||||
// chrome_wait hook entry point.
// Blocks until the snapshot's Chrome session files (cdp_url.txt and
// target_id.txt) exist, then emits a succeeded/failed ArchiveResult JSONL
// record and exits accordingly so downstream hooks can rely on CDP.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__11_chrome_wait.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  // Timeout resolution: CHROME_TAB_TIMEOUT > CHROME_TIMEOUT > TIMEOUT > 60s
  const timeoutSeconds = getEnvInt('CHROME_TAB_TIMEOUT', getEnvInt('CHROME_TIMEOUT', getEnvInt('TIMEOUT', 60)));
  const timeoutMs = timeoutSeconds * 1000;

  console.error(`[chrome_wait] Waiting for Chrome session (timeout=${timeoutSeconds}s)...`);

  const ready = await waitForChromeSession(CHROME_SESSION_DIR, timeoutMs);
  if (!ready) {
    const error = CHROME_SESSION_REQUIRED_ERROR;
    console.error(`[chrome_wait] ERROR: ${error}`);
    console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error }));
    process.exit(1);
  }

  // Files existed, but still validate their contents are non-empty.
  const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
  const targetId = readTargetId(CHROME_SESSION_DIR);
  if (!cdpUrl || !targetId) {
    const error = CHROME_SESSION_REQUIRED_ERROR;
    console.error(`[chrome_wait] ERROR: ${error}`);
    console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error }));
    process.exit(1);
  }

  console.error(`[chrome_wait] Chrome session ready (cdp_url=${cdpUrl.slice(0, 32)}..., target_id=${targetId}).`);
  console.log(JSON.stringify({ type: 'ArchiveResult', status: 'succeeded', output_str: 'chrome session ready' }));
  process.exit(0);
}
|
||||
|
||||
// Top-level guard: any rejection escaping main() is fatal for this hook.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});
|
||||
@@ -1,225 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Navigate the Chrome browser to the target URL.
|
||||
*
|
||||
* This is a simple hook that ONLY navigates - nothing else.
|
||||
* Pre-load hooks (21-29) should set up their own CDP listeners.
|
||||
* Post-load hooks (31+) can then read from the loaded page.
|
||||
*
|
||||
* Usage: on_Snapshot__30_chrome_navigate.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes page_loaded.txt marker when navigation completes
|
||||
*
|
||||
* Environment variables:
|
||||
* CHROME_PAGELOAD_TIMEOUT: Timeout in seconds (default: 60)
|
||||
* CHROME_DELAY_AFTER_LOAD: Extra delay after load in seconds (default: 0)
|
||||
* CHROME_WAIT_FOR: Wait condition (default: networkidle2)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
const puppeteer = require('puppeteer');
|
||||
|
||||
const PLUGIN_NAME = 'chrome_navigate';
|
||||
const CHROME_SESSION_DIR = '.';
|
||||
const OUTPUT_DIR = '.';
|
||||
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
|
||||
|
||||
function parseArgs() {
|
||||
const args = {};
|
||||
process.argv.slice(2).forEach(arg => {
|
||||
if (arg.startsWith('--')) {
|
||||
const [key, ...valueParts] = arg.slice(2).split('=');
|
||||
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
|
||||
}
|
||||
});
|
||||
return args;
|
||||
}
|
||||
|
||||
function getEnv(name, defaultValue = '') {
|
||||
return (process.env[name] || defaultValue).trim();
|
||||
}
|
||||
|
||||
// Read an integer env var via getEnv(); non-numeric values fall back to
// defaultValue.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
|
||||
|
||||
// Read a float env var via getEnv(); non-numeric values fall back to
// defaultValue.
function getEnvFloat(name, defaultValue = 0) {
  const parsed = Number.parseFloat(getEnv(name, String(defaultValue)));
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
|
||||
|
||||
// Poll every 100ms until both cdp_url.txt and target_id.txt exist in the
// chrome session dir. Resolves true when ready, false once timeoutMs elapses.
async function waitForChromeTabOpen(timeoutMs = 60000) {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
  const deadline = Date.now() + timeoutMs;

  while (Date.now() < deadline) {
    if (fs.existsSync(cdpFile) && fs.existsSync(targetIdFile)) {
      return true;
    }
    await new Promise(r => setTimeout(r, 100));
  }

  return false;
}
|
||||
|
||||
// Read the CDP websocket URL recorded by the chrome_tab hook, or null when
// the file does not exist yet.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  return fs.existsSync(cdpFile) ? fs.readFileSync(cdpFile, 'utf8').trim() : null;
}
|
||||
|
||||
// Read this snapshot's tab target id recorded by the chrome_tab hook, or
// null when the file does not exist yet.
function getPageId() {
  const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
  return fs.existsSync(targetIdFile) ? fs.readFileSync(targetIdFile, 'utf8').trim() : null;
}
|
||||
|
||||
// Resolve CHROME_WAIT_FOR to a valid puppeteer waitUntil condition,
// defaulting to 'networkidle2' for anything unrecognized.
function getWaitCondition() {
  const requested = getEnv('CHROME_WAIT_FOR', 'networkidle2').toLowerCase();
  const validConditions = new Set(['domcontentloaded', 'load', 'networkidle0', 'networkidle2']);
  return validConditions.has(requested) ? requested : 'networkidle2';
}
|
||||
|
||||
function sleep(ms) {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
// Navigate the snapshot's tab to `url` over CDP and persist the outcome.
// Picks the page matching target_id.txt when possible (last page otherwise),
// waits per CHROME_WAIT_FOR / timeout env config, writes navigation.json plus
// the legacy page_loaded.txt / final_url.txt markers, and returns a result
// object: { success, finalUrl?, status?, error?, waitUntil, elapsed }.
// Never throws — all failures are folded into the returned object.
async function navigate(url, cdpUrl) {
  // Timeout resolution: CHROME_PAGELOAD_TIMEOUT > CHROME_TIMEOUT > TIMEOUT > 60s
  const timeout = (getEnvInt('CHROME_PAGELOAD_TIMEOUT') || getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
  const delayAfterLoad = getEnvFloat('CHROME_DELAY_AFTER_LOAD', 0) * 1000;
  const waitUntil = getWaitCondition();
  const targetId = getPageId();

  let browser = null;
  const navStartTime = Date.now();

  try {
    browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });

    const pages = await browser.pages();
    if (pages.length === 0) {
      return { success: false, error: 'No pages found in browser', waitUntil, elapsed: Date.now() - navStartTime };
    }

    // Find page by target ID if available
    // NOTE: _targetId is a private puppeteer field — verify on upgrades.
    let page = null;
    if (targetId) {
      page = pages.find(p => {
        const target = p.target();
        return target && target._targetId === targetId;
      });
    }
    if (!page) {
      // Fall back to the most recently opened page.
      page = pages[pages.length - 1];
    }

    // Navigate
    console.log(`Navigating to ${url} (wait: ${waitUntil}, timeout: ${timeout}ms)`);
    const response = await page.goto(url, { waitUntil, timeout });

    // Optional delay
    if (delayAfterLoad > 0) {
      console.log(`Waiting ${delayAfterLoad}ms after load...`);
      await sleep(delayAfterLoad);
    }

    const finalUrl = page.url();
    // response can be null (e.g. same-URL navigation) — status null then.
    const status = response ? response.status() : null;
    const elapsed = Date.now() - navStartTime;

    // Write navigation state as JSON
    const navigationState = {
      waitUntil,
      elapsed,
      url,
      finalUrl,
      status,
      timestamp: new Date().toISOString()
    };
    fs.writeFileSync(path.join(OUTPUT_DIR, 'navigation.json'), JSON.stringify(navigationState, null, 2));

    // Write marker files for backwards compatibility
    fs.writeFileSync(path.join(OUTPUT_DIR, 'page_loaded.txt'), new Date().toISOString());
    fs.writeFileSync(path.join(OUTPUT_DIR, 'final_url.txt'), finalUrl);

    browser.disconnect();

    return { success: true, finalUrl, status, waitUntil, elapsed };

  } catch (e) {
    if (browser) browser.disconnect();
    const elapsed = Date.now() - navStartTime;
    return { success: false, error: `${e.name}: ${e.message}`, waitUntil, elapsed };
  }
}
|
||||
|
||||
// chrome_navigate hook entry point.
// Waits for the tab session files, navigates via navigate(), records
// navigation.json even on failure, emits an ArchiveResult JSONL record,
// and exits 0/1 by outcome.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__30_chrome_navigate.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';

  // Wait for chrome tab to be open (up to 60s)
  const tabOpen = await waitForChromeTabOpen(60000);
  if (!tabOpen) {
    console.error(`ERROR: ${CHROME_SESSION_REQUIRED_ERROR}`);
    process.exit(1);
  }

  const cdpUrl = getCdpUrl();
  if (!cdpUrl) {
    console.error(`ERROR: ${CHROME_SESSION_REQUIRED_ERROR}`);
    process.exit(1);
  }

  const result = await navigate(url, cdpUrl);

  if (result.success) {
    status = 'succeeded';
    output = 'navigation.json';
    console.log(`Page loaded: ${result.finalUrl} (HTTP ${result.status}) in ${result.elapsed}ms (waitUntil: ${result.waitUntil})`);
  } else {
    error = result.error;
    // Save navigation state even on failure
    const navigationState = {
      waitUntil: result.waitUntil,
      elapsed: result.elapsed,
      url,
      error: result.error,
      timestamp: new Date().toISOString()
    };
    fs.writeFileSync(path.join(OUTPUT_DIR, 'navigation.json'), JSON.stringify(navigationState, null, 2));
  }

  const endTs = new Date();

  if (error) console.error(`ERROR: ${error}`);

  // Output clean JSONL (no RESULT_JSON= prefix)
  console.log(JSON.stringify({
    type: 'ArchiveResult',
    status,
    output_str: output || error || '',
  }));

  process.exit(status === 'succeeded' ? 0 : 1);
}
|
||||
|
||||
// Top-level guard: any rejection escaping main() is fatal for this hook.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});
|
||||
@@ -1 +0,0 @@
|
||||
<span class="abx-output-icon abx-output-icon--chrome" title="Chrome"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="4.5" width="18" height="15" rx="2"/><path d="M3 9h18"/><circle cx="7" cy="7" r="1" fill="currentColor" stroke="none"/><circle cx="11" cy="7" r="1" fill="currentColor" stroke="none"/></svg></span>
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,722 +0,0 @@
|
||||
"""
|
||||
Integration tests for chrome plugin
|
||||
|
||||
Tests verify:
|
||||
1. Chromium install via @puppeteer/browsers
|
||||
2. Verify deps with abx-pkg
|
||||
3. Chrome hooks exist
|
||||
4. Chromium launches at crawl level
|
||||
5. Tab creation at snapshot level
|
||||
6. Tab navigation works
|
||||
7. Tab cleanup on SIGTERM
|
||||
8. Chromium cleanup on crawl end
|
||||
|
||||
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||
--load-extension and --disable-extensions-except flags, which are needed for
|
||||
loading unpacked extensions in headless mode.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
import tempfile
|
||||
import shutil
|
||||
import platform
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
find_chromium_binary,
|
||||
install_chromium_with_hooks,
|
||||
CHROME_PLUGIN_DIR as PLUGIN_DIR,
|
||||
CHROME_LAUNCH_HOOK,
|
||||
CHROME_TAB_HOOK,
|
||||
CHROME_NAVIGATE_HOOK,
|
||||
)
|
||||
|
||||
def _get_cookies_via_cdp(port: int, env: dict) -> list[dict]:
    """Read all cookies from a running Chrome via CDP ``Network.getAllCookies``.

    Spawns a short-lived ``node`` subprocess that queries the devtools
    ``/json/list`` endpoint on ``port``, opens a websocket to the first page
    target, issues ``Network.getAllCookies``, and prints the cookie list as
    JSON on stdout. Requires ``node`` and the ``ws`` package to be resolvable
    in ``env`` (NODE_PATH/NODE_MODULES_DIR) — assumed set up by get_test_env().

    Raises AssertionError when the node helper exits non-zero.
    """
    node_script = r"""
const http = require('http');
const WebSocket = require('ws');
const port = process.env.CDP_PORT;

function getTargets() {
  return new Promise((resolve, reject) => {
    const req = http.get(`http://127.0.0.1:${port}/json/list`, (res) => {
      let data = '';
      res.on('data', (chunk) => (data += chunk));
      res.on('end', () => {
        try {
          resolve(JSON.parse(data));
        } catch (e) {
          reject(e);
        }
      });
    });
    req.on('error', reject);
  });
}

(async () => {
  const targets = await getTargets();
  const pageTarget = targets.find(t => t.type === 'page') || targets[0];
  if (!pageTarget) {
    console.error('No page target found');
    process.exit(2);
  }

  const ws = new WebSocket(pageTarget.webSocketDebuggerUrl);
  const timer = setTimeout(() => {
    console.error('Timeout waiting for cookies');
    process.exit(3);
  }, 10000);

  ws.on('open', () => {
    ws.send(JSON.stringify({ id: 1, method: 'Network.getAllCookies' }));
  });

  ws.on('message', (data) => {
    const msg = JSON.parse(data);
    if (msg.id === 1) {
      clearTimeout(timer);
      ws.close();
      if (!msg.result || !msg.result.cookies) {
        console.error('No cookies in response');
        process.exit(4);
      }
      process.stdout.write(JSON.stringify(msg.result.cookies));
      process.exit(0);
    }
  });

  ws.on('error', (err) => {
    console.error(String(err));
    process.exit(5);
  });
})().catch((err) => {
  console.error(String(err));
  process.exit(1);
});
"""

    result = subprocess.run(
        ['node', '-e', node_script],
        capture_output=True,
        text=True,
        timeout=30,
        env=env | {'CDP_PORT': str(port)},
    )
    assert result.returncode == 0, f"Failed to read cookies via CDP: {result.stderr}\nStdout: {result.stdout}"
    return json.loads(result.stdout or '[]')
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
def ensure_chromium_and_puppeteer_installed(tmp_path_factory):
    """Ensure Chromium and puppeteer are installed before running tests.

    Creates a throwaway DATA_DIR if one isn't configured, installs Chromium
    via the plugin's install hooks, and exports CHROME_BINARY plus node
    resolution vars into os.environ for the other tests in this session.

    Raises RuntimeError when Chromium cannot be installed or located.
    """
    if not os.environ.get('DATA_DIR'):
        test_data_dir = tmp_path_factory.mktemp('chrome_test_data')
        os.environ['DATA_DIR'] = str(test_data_dir)
    env = get_test_env()

    # Let install failures propagate directly instead of re-wrapping the
    # RuntimeError (the old `raise RuntimeError(str(e))` discarded the
    # original traceback for no benefit).
    chromium_binary = install_chromium_with_hooks(env)

    if not chromium_binary:
        raise RuntimeError("Chromium not found after install")

    os.environ['CHROME_BINARY'] = chromium_binary
    # Propagate node module resolution paths so subprocesses can find puppeteer.
    for key in ('NODE_MODULES_DIR', 'NODE_PATH', 'PATH'):
        if env.get(key):
            os.environ[key] = env[key]
|
||||
|
||||
|
||||
def test_hook_scripts_exist():
    """Verify chrome hooks exist."""
    for hook in (CHROME_LAUNCH_HOOK, CHROME_TAB_HOOK, CHROME_NAVIGATE_HOOK):
        assert hook.exists(), f"Hook not found: {hook}"
|
||||
|
||||
|
||||
def test_verify_chromium_available():
    """Verify Chromium is available via CHROME_BINARY env var.

    CHROME_BINARY is normally set by the session fixture; fall back to
    searching the system via find_chromium_binary(). Then sanity-check that
    the binary runs and reports a Chromium/Chrome version string.
    """
    chromium_binary = os.environ.get('CHROME_BINARY') or find_chromium_binary()

    assert chromium_binary, "Chromium binary should be available (set by fixture or found)"
    assert Path(chromium_binary).exists(), f"Chromium binary should exist at {chromium_binary}"

    # Verify it's actually Chromium by checking version
    result = subprocess.run(
        [chromium_binary, '--version'],
        capture_output=True,
        text=True,
        timeout=10
    )
    assert result.returncode == 0, f"Failed to get Chromium version: {result.stderr}"
    assert 'Chromium' in result.stdout or 'Chrome' in result.stdout, f"Unexpected version output: {result.stdout}"
|
||||
|
||||
|
||||
def test_chrome_launch_and_tab_creation():
    """Integration test: Launch Chrome at crawl level and create tab at snapshot level.

    Exercises the two-tier hook model: the crawl-level launch hook starts one
    Chrome and writes cdp_url.txt/chrome.pid/port.txt; the snapshot-level tab
    hook then attaches to that Chrome and writes its own target_id.txt.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()

        # Get test environment with NODE_MODULES_DIR set
        env = get_test_env()
        env['CHROME_HEADLESS'] = 'true'

        # Launch Chrome at crawl level (background process)
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-123'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env
        )

        # Wait for Chrome to launch (check process isn't dead and files exist)
        for i in range(15):  # Wait up to 15 seconds for Chrome to start
            if chrome_launch_process.poll() is not None:
                # Hook died before writing cdp_url.txt — surface its output.
                stdout, stderr = chrome_launch_process.communicate()
                pytest.fail(f"Chrome launch process exited early:\nStdout: {stdout}\nStderr: {stderr}")
            if (chrome_dir / 'cdp_url.txt').exists():
                break
            time.sleep(1)

        # Verify Chrome launch outputs - if it failed, get the error from the process
        if not (chrome_dir / 'cdp_url.txt').exists():
            # Try to get output from the process
            try:
                stdout, stderr = chrome_launch_process.communicate(timeout=1)
            except subprocess.TimeoutExpired:
                # Process still running, try to read available output
                stdout = stderr = "(process still running)"

            # Check what files exist (purely diagnostic — every branch fails the test)
            if chrome_dir.exists():
                files = list(chrome_dir.iterdir())
                # Check if Chrome process is still alive
                if (chrome_dir / 'chrome.pid').exists():
                    chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
                    try:
                        os.kill(chrome_pid, 0)  # signal 0 = existence probe, sends nothing
                        chrome_alive = "yes"
                    except OSError:
                        chrome_alive = "no"
                    pytest.fail(f"cdp_url.txt missing after 15s. Chrome dir files: {files}. Chrome process {chrome_pid} alive: {chrome_alive}\nLaunch stdout: {stdout}\nLaunch stderr: {stderr}")
                else:
                    pytest.fail(f"cdp_url.txt missing. Chrome dir exists with files: {files}\nLaunch stdout: {stdout}\nLaunch stderr: {stderr}")
            else:
                pytest.fail(f"Chrome dir {chrome_dir} doesn't exist\nLaunch stdout: {stdout}\nLaunch stderr: {stderr}")

        assert (chrome_dir / 'cdp_url.txt').exists(), "cdp_url.txt should exist"
        assert (chrome_dir / 'chrome.pid').exists(), "chrome.pid should exist"
        assert (chrome_dir / 'port.txt').exists(), "port.txt should exist"

        cdp_url = (chrome_dir / 'cdp_url.txt').read_text().strip()
        chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())

        assert cdp_url.startswith('ws://'), f"CDP URL should be WebSocket URL: {cdp_url}"
        assert chrome_pid > 0, "Chrome PID should be valid"

        # Verify Chrome process is running
        try:
            os.kill(chrome_pid, 0)
        except OSError:
            pytest.fail(f"Chrome process {chrome_pid} is not running")

        # Create snapshot directory and tab
        snapshot_dir = Path(tmpdir) / 'snapshot1'
        snapshot_dir.mkdir()
        snapshot_chrome_dir = snapshot_dir / 'chrome'
        snapshot_chrome_dir.mkdir()

        # Launch tab at snapshot level; CRAWL_OUTPUT_DIR tells the hook where
        # to find the crawl-level Chrome session files.
        env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
        result = subprocess.run(
            ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-123', '--crawl-id=test-crawl-123'],
            cwd=str(snapshot_chrome_dir),
            capture_output=True,
            text=True,
            timeout=60,
            env=env
        )

        assert result.returncode == 0, f"Tab creation failed: {result.stderr}\nStdout: {result.stdout}"

        # Verify tab creation outputs
        assert (snapshot_chrome_dir / 'cdp_url.txt').exists(), "Snapshot cdp_url.txt should exist"
        assert (snapshot_chrome_dir / 'target_id.txt').exists(), "target_id.txt should exist"
        assert (snapshot_chrome_dir / 'url.txt').exists(), "url.txt should exist"

        target_id = (snapshot_chrome_dir / 'target_id.txt').read_text().strip()
        assert len(target_id) > 0, "Target ID should not be empty"

        # Cleanup: Kill Chrome and launch process (best-effort, failures ignored)
        try:
            chrome_launch_process.send_signal(signal.SIGTERM)
            chrome_launch_process.wait(timeout=5)
        except:
            pass
        try:
            os.kill(chrome_pid, signal.SIGKILL)
        except OSError:
            pass
|
||||
|
||||
|
||||
def test_cookies_imported_on_launch():
    """Integration test: COOKIES_TXT_FILE is imported at crawl start.

    Writes a one-cookie Netscape-format cookies.txt, points the launch hook
    at it via COOKIES_TXT_FILE, then polls the live Chrome session over CDP
    until the cookie appears (the import happens asynchronously after launch).
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()

        # Minimal Netscape cookie file: domain, include-subdomains, path,
        # secure, expiry (max 32-bit epoch), name, value — tab separated.
        cookies_file = Path(tmpdir) / 'cookies.txt'
        cookies_file.write_text(
            '\n'.join([
                '# Netscape HTTP Cookie File',
                '# https://curl.se/docs/http-cookies.html',
                '# This file was generated by a test',
                '',
                'example.com\tTRUE\t/\tFALSE\t2147483647\tabx_test_cookie\thello',
                '',
            ])
        )

        profile_dir = Path(tmpdir) / 'profile'
        env = get_test_env()
        env.update({
            'CHROME_HEADLESS': 'true',
            'CHROME_USER_DATA_DIR': str(profile_dir),
            'COOKIES_TXT_FILE': str(cookies_file),
        })

        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-cookies'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env
        )

        # port.txt appearing signals Chrome is up and listening; wait up to 15s.
        for _ in range(15):
            if (chrome_dir / 'port.txt').exists():
                break
            time.sleep(1)

        assert (chrome_dir / 'port.txt').exists(), "port.txt should exist"
        chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
        port = int((chrome_dir / 'port.txt').read_text().strip())

        # Poll the cookie store via CDP; import may lag the launch slightly.
        cookie_found = False
        for _ in range(15):
            cookies = _get_cookies_via_cdp(port, env)
            cookie_found = any(
                c.get('name') == 'abx_test_cookie' and c.get('value') == 'hello'
                for c in cookies
            )
            if cookie_found:
                break
            time.sleep(1)

        assert cookie_found, "Imported cookie should be present in Chrome session"

        # Cleanup (best-effort)
        try:
            chrome_launch_process.send_signal(signal.SIGTERM)
            chrome_launch_process.wait(timeout=5)
        except:
            pass
        try:
            os.kill(chrome_pid, signal.SIGKILL)
        except OSError:
            pass
|
||||
|
||||
|
||||
def test_chrome_navigation():
    """Integration test: Navigate to a URL.

    Full hook pipeline: launch Chrome (crawl level) -> open tab (snapshot
    level) -> run the navigate hook and verify it records the navigation
    result (navigation.json + page_loaded.txt).
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()

        # Launch Chrome (background process)
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-nav'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=get_test_env() | {'CHROME_HEADLESS': 'true'}
        )

        # Wait for Chrome to launch
        # NOTE(review): fixed sleep instead of polling for chrome.pid — the
        # read below will raise FileNotFoundError if launch takes longer than 3s.
        time.sleep(3)

        chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())

        # Create snapshot and tab
        snapshot_dir = Path(tmpdir) / 'snapshot1'
        snapshot_dir.mkdir()
        snapshot_chrome_dir = snapshot_dir / 'chrome'
        snapshot_chrome_dir.mkdir()

        result = subprocess.run(
            ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123', '--crawl-id=test-crawl-nav'],
            cwd=str(snapshot_chrome_dir),
            capture_output=True,
            text=True,
            timeout=60,
            env=get_test_env() | {'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
        )
        assert result.returncode == 0, f"Tab creation failed: {result.stderr}"

        # Navigate to URL (requires outbound network access to example.com)
        result = subprocess.run(
            ['node', str(CHROME_NAVIGATE_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123'],
            cwd=str(snapshot_chrome_dir),
            capture_output=True,
            text=True,
            timeout=120,
            env=get_test_env() | {'CHROME_PAGELOAD_TIMEOUT': '30', 'CHROME_WAIT_FOR': 'load'}
        )

        assert result.returncode == 0, f"Navigation failed: {result.stderr}\nStdout: {result.stdout}"

        # Verify navigation outputs
        assert (snapshot_chrome_dir / 'navigation.json').exists(), "navigation.json should exist"
        assert (snapshot_chrome_dir / 'page_loaded.txt').exists(), "page_loaded.txt should exist"

        nav_data = json.loads((snapshot_chrome_dir / 'navigation.json').read_text())
        assert nav_data.get('status') in [200, 301, 302], f"Should get valid HTTP status: {nav_data}"
        assert nav_data.get('finalUrl'), "Should have final URL"

        # Cleanup (best-effort)
        try:
            chrome_launch_process.send_signal(signal.SIGTERM)
            chrome_launch_process.wait(timeout=5)
        except:
            pass
        try:
            os.kill(chrome_pid, signal.SIGKILL)
        except OSError:
            pass
|
||||
|
||||
|
||||
def test_tab_cleanup_on_sigterm():
    """Integration test: Tab cleanup when receiving SIGTERM.

    The tab hook must exit cleanly (rc 0) on SIGTERM, closing only its own
    tab — the shared crawl-level Chrome must survive the tab's shutdown.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()

        # Launch Chrome (background process)
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cleanup'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=get_test_env() | {'CHROME_HEADLESS': 'true'}
        )

        # Wait for Chrome to launch (fixed delay; chrome.pid read below
        # fails loudly if launch is slower)
        time.sleep(3)

        chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())

        # Create snapshot and tab - run in background
        snapshot_dir = Path(tmpdir) / 'snapshot1'
        snapshot_dir.mkdir()
        snapshot_chrome_dir = snapshot_dir / 'chrome'
        snapshot_chrome_dir.mkdir()

        tab_process = subprocess.Popen(
            ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-cleanup', '--crawl-id=test-cleanup'],
            cwd=str(snapshot_chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=get_test_env() | {'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
        )

        # Wait for tab to be created
        time.sleep(3)

        # Send SIGTERM to tab process; it should clean up and exit 0
        tab_process.send_signal(signal.SIGTERM)
        stdout, stderr = tab_process.communicate(timeout=10)

        assert tab_process.returncode == 0, f"Tab process should exit cleanly: {stderr}"

        # Chrome should still be running (tab cleanup must not kill the browser)
        try:
            os.kill(chrome_pid, 0)  # signal 0 = existence probe
        except OSError:
            pytest.fail("Chrome should still be running after tab cleanup")

        # Cleanup (best-effort)
        try:
            chrome_launch_process.send_signal(signal.SIGTERM)
            chrome_launch_process.wait(timeout=5)
        except:
            pass
        try:
            os.kill(chrome_pid, signal.SIGKILL)
        except OSError:
            pass
|
||||
|
||||
|
||||
def test_multiple_snapshots_share_chrome():
    """Integration test: Multiple snapshots share one Chrome instance.

    Creates three snapshot-level tabs against one crawl-level Chrome and
    verifies: same PID and CDP URL everywhere, but a unique target_id per tab.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()

        # Launch Chrome at crawl level
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-multi-crawl'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=get_test_env() | {'CHROME_HEADLESS': 'true'}
        )

        # Wait for Chrome to launch (up to 15s, polling for cdp_url.txt)
        for i in range(15):
            if (chrome_dir / 'cdp_url.txt').exists():
                break
            time.sleep(1)

        chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
        crawl_cdp_url = (chrome_dir / 'cdp_url.txt').read_text().strip()

        # Create multiple snapshots that share this Chrome
        snapshot_dirs = []
        target_ids = []

        for snap_num in range(3):
            snapshot_dir = Path(tmpdir) / f'snapshot{snap_num}'
            snapshot_dir.mkdir()
            snapshot_chrome_dir = snapshot_dir / 'chrome'
            snapshot_chrome_dir.mkdir()
            snapshot_dirs.append(snapshot_chrome_dir)

            # Create tab for this snapshot
            result = subprocess.run(
                ['node', str(CHROME_TAB_HOOK), f'--url=https://example.com/{snap_num}', f'--snapshot-id=snap-{snap_num}', '--crawl-id=test-multi-crawl'],
                cwd=str(snapshot_chrome_dir),
                capture_output=True,
                text=True,
                timeout=60,
                env=get_test_env() | {'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
            )

            assert result.returncode == 0, f"Tab {snap_num} creation failed: {result.stderr}"

            # Verify each snapshot has its own target_id but same Chrome PID
            assert (snapshot_chrome_dir / 'target_id.txt').exists()
            assert (snapshot_chrome_dir / 'cdp_url.txt').exists()
            assert (snapshot_chrome_dir / 'chrome.pid').exists()

            target_id = (snapshot_chrome_dir / 'target_id.txt').read_text().strip()
            snapshot_cdp_url = (snapshot_chrome_dir / 'cdp_url.txt').read_text().strip()
            snapshot_pid = int((snapshot_chrome_dir / 'chrome.pid').read_text().strip())

            target_ids.append(target_id)

            # All snapshots should share same Chrome
            assert snapshot_pid == chrome_pid, f"Snapshot {snap_num} should use crawl Chrome PID"
            assert snapshot_cdp_url == crawl_cdp_url, f"Snapshot {snap_num} should use crawl CDP URL"

        # All target IDs should be unique (different tabs)
        assert len(set(target_ids)) == 3, f"All snapshots should have unique tabs: {target_ids}"

        # Chrome should still be running with all 3 tabs
        try:
            os.kill(chrome_pid, 0)  # existence probe only
        except OSError:
            pytest.fail("Chrome should still be running after creating 3 tabs")

        # Cleanup (best-effort)
        try:
            chrome_launch_process.send_signal(signal.SIGTERM)
            chrome_launch_process.wait(timeout=5)
        except:
            pass
        try:
            os.kill(chrome_pid, signal.SIGKILL)
        except OSError:
            pass
|
||||
|
||||
|
||||
def test_chrome_cleanup_on_crawl_end():
    """Integration test: Chrome cleanup at end of crawl.

    SIGTERM-ing the launch hook must take the Chrome browser process
    down with it (the hook owns the browser's lifecycle).
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()

        # Launch Chrome in background
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-end'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=get_test_env() | {'CHROME_HEADLESS': 'true'}
        )

        # Wait for Chrome to launch (fixed delay)
        time.sleep(3)

        # Verify Chrome is running
        assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist"
        chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())

        try:
            os.kill(chrome_pid, 0)  # existence probe
        except OSError:
            pytest.fail("Chrome should be running")

        # Send SIGTERM to chrome launch process; its handler should kill Chrome
        chrome_launch_process.send_signal(signal.SIGTERM)
        stdout, stderr = chrome_launch_process.communicate(timeout=10)

        # Wait for cleanup
        time.sleep(3)

        # Verify Chrome process is killed
        try:
            os.kill(chrome_pid, 0)
            pytest.fail("Chrome should be killed after SIGTERM")
        except OSError:
            # Expected - Chrome should be dead
            pass
|
||||
|
||||
|
||||
def test_zombie_prevention_hook_killed():
    """Integration test: Chrome is killed even if hook process is SIGKILL'd.

    Simulates the worst case: the launch hook dies without running its own
    cleanup (SIGKILL), leaving Chrome orphaned. The test then replays the
    PID-file-based cleanup procedure (SIGTERM -> wait -> SIGKILL, preferring
    process groups) that Crawl.cleanup() uses, and verifies Chrome dies.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()

        # Launch Chrome
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-zombie'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=get_test_env() | {'CHROME_HEADLESS': 'true'}
        )

        # Wait for Chrome to launch (up to 15s, polling for chrome.pid)
        for i in range(15):
            if (chrome_dir / 'chrome.pid').exists():
                break
            time.sleep(1)

        assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist"

        chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
        hook_pid = chrome_launch_process.pid  # Use the Popen process PID instead of hook.pid file

        # Verify both Chrome and hook are running
        try:
            os.kill(chrome_pid, 0)
            os.kill(hook_pid, 0)
        except OSError:
            pytest.fail("Both Chrome and hook should be running")

        # Simulate hook getting SIGKILL'd (can't cleanup)
        os.kill(hook_pid, signal.SIGKILL)
        time.sleep(1)

        # Chrome should still be running (orphaned)
        try:
            os.kill(chrome_pid, 0)
        except OSError:
            pytest.fail("Chrome should still be running after hook SIGKILL")

        # Simulate Crawl.cleanup() using the actual cleanup logic
        def is_process_alive(pid):
            """Check if a process exists."""
            try:
                os.kill(pid, 0)
                return True
            except (OSError, ProcessLookupError):
                return False

        for pid_file in chrome_dir.glob('**/*.pid'):
            try:
                pid = int(pid_file.read_text().strip())

                # Step 1: SIGTERM for graceful shutdown
                try:
                    try:
                        # Prefer killing the whole process group; fall back to
                        # the single PID if killpg fails (e.g. not a group leader).
                        os.killpg(pid, signal.SIGTERM)
                    except (OSError, ProcessLookupError):
                        os.kill(pid, signal.SIGTERM)
                except ProcessLookupError:
                    pid_file.unlink(missing_ok=True)
                    continue

                # Step 2: Wait for graceful shutdown
                time.sleep(2)

                # Step 3: Check if still alive
                if not is_process_alive(pid):
                    pid_file.unlink(missing_ok=True)
                    continue

                # Step 4: Force kill ENTIRE process group with SIGKILL
                try:
                    try:
                        # Always kill entire process group with SIGKILL
                        os.killpg(pid, signal.SIGKILL)
                    except (OSError, ProcessLookupError):
                        os.kill(pid, signal.SIGKILL)
                except ProcessLookupError:
                    pid_file.unlink(missing_ok=True)
                    continue

                # Step 5: Wait and verify death
                time.sleep(1)

                if not is_process_alive(pid):
                    pid_file.unlink(missing_ok=True)

            except (ValueError, OSError):
                # Unreadable/stale pid file — skip it rather than abort cleanup.
                pass

        # Chrome should now be dead
        try:
            os.kill(chrome_pid, 0)
            pytest.fail("Chrome should be killed after cleanup")
        except OSError:
            # Expected - Chrome is dead
            pass
|
||||
|
||||
|
||||
# Allow running this test module directly without invoking the pytest CLI.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
@@ -1,260 +0,0 @@
|
||||
"""
|
||||
Tests for chrome_test_helpers.py functions.
|
||||
|
||||
These tests verify the Python helper functions used across Chrome plugin tests.
|
||||
"""
|
||||
|
||||
import os
|
||||
import pytest
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_machine_type,
|
||||
get_lib_dir,
|
||||
get_node_modules_dir,
|
||||
get_extensions_dir,
|
||||
find_chromium_binary,
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
parse_jsonl_output,
|
||||
)
|
||||
|
||||
|
||||
def test_get_machine_type():
    """get_machine_type() must return an 'arch-os' formatted string."""
    machine_type = get_machine_type()
    assert isinstance(machine_type, str)
    assert '-' in machine_type, "Machine type should be in format: arch-os"
    # Must name a known CPU architecture and a known OS identifier.
    known_arches = ('arm64', 'x86_64')
    known_oses = ('darwin', 'linux', 'win32')
    assert any(arch in machine_type for arch in known_arches), "Should contain valid architecture"
    assert any(os_name in machine_type for os_name in known_oses), "Should contain valid OS"
|
||||
|
||||
|
||||
def test_get_lib_dir_with_env_var():
    """get_lib_dir() must honour an explicit LIB_DIR environment override."""
    with tempfile.TemporaryDirectory() as tmpdir:
        override = Path(tmpdir) / 'custom_lib'
        override.mkdir()

        saved = os.environ.get('LIB_DIR')
        try:
            os.environ['LIB_DIR'] = str(override)
            assert get_lib_dir() == override
        finally:
            # Restore the caller's environment exactly as we found it.
            if saved:
                os.environ['LIB_DIR'] = saved
            else:
                os.environ.pop('LIB_DIR', None)
|
||||
|
||||
|
||||
def test_get_node_modules_dir_with_env_var():
    """get_node_modules_dir() must honour a NODE_MODULES_DIR override."""
    with tempfile.TemporaryDirectory() as tmpdir:
        override = Path(tmpdir) / 'node_modules'
        override.mkdir()

        saved = os.environ.get('NODE_MODULES_DIR')
        try:
            os.environ['NODE_MODULES_DIR'] = str(override)
            assert get_node_modules_dir() == override
        finally:
            # Restore the caller's environment exactly as we found it.
            if saved:
                os.environ['NODE_MODULES_DIR'] = saved
            else:
                os.environ.pop('NODE_MODULES_DIR', None)
|
||||
|
||||
|
||||
def test_get_extensions_dir_default():
    """get_extensions_dir() should produce a personas/chrome_extensions path."""
    ext_dir = get_extensions_dir()
    assert isinstance(ext_dir, str)
    # Path is expected to include both the persona and the extensions segment.
    for segment in ('personas', 'chrome_extensions'):
        assert segment in ext_dir
|
||||
|
||||
|
||||
def test_get_extensions_dir_with_custom_persona():
    """get_extensions_dir() must reflect ACTIVE_PERSONA and DATA_DIR overrides."""
    saved_persona = os.environ.get('ACTIVE_PERSONA')
    saved_data_dir = os.environ.get('DATA_DIR')
    try:
        os.environ['ACTIVE_PERSONA'] = 'TestPersona'
        os.environ['DATA_DIR'] = '/tmp/test'
        ext_dir = get_extensions_dir()
        assert 'TestPersona' in ext_dir
        assert '/tmp/test' in ext_dir
    finally:
        # Put both variables back exactly as they were before the test.
        if saved_persona:
            os.environ['ACTIVE_PERSONA'] = saved_persona
        else:
            os.environ.pop('ACTIVE_PERSONA', None)
        if saved_data_dir:
            os.environ['DATA_DIR'] = saved_data_dir
        else:
            os.environ.pop('DATA_DIR', None)
|
||||
|
||||
|
||||
def test_get_test_env_returns_dict():
    """get_test_env() must include every key the hooks need to resolve paths."""
    env = get_test_env()
    assert isinstance(env, dict)

    # All path-resolution keys must be present.
    for required_key in ('MACHINE_TYPE', 'LIB_DIR', 'NODE_MODULES_DIR',
                         'NODE_PATH', 'NPM_BIN_DIR', 'CHROME_EXTENSIONS_DIR'):
        assert required_key in env

    # NODE_PATH must mirror NODE_MODULES_DIR for Node.js module resolution.
    assert env['NODE_PATH'] == env['NODE_MODULES_DIR']
|
||||
|
||||
|
||||
def test_get_test_env_paths_are_absolute():
    """Every path-like value returned by get_test_env() must be absolute."""
    env = get_test_env()
    for path_key in ('LIB_DIR', 'NODE_MODULES_DIR', 'NODE_PATH'):
        assert Path(env[path_key]).is_absolute()
|
||||
|
||||
|
||||
def test_find_chromium_binary():
    """find_chromium_binary() returns an absolute path string, or nothing."""
    binary = find_chromium_binary()
    if not binary:
        return  # No Chromium installed in this environment; that's acceptable.
    assert isinstance(binary, str)
    # A found binary must be reported as an absolute path.
    assert os.path.isabs(binary)
|
||||
|
||||
|
||||
def test_get_plugin_dir():
    """get_plugin_dir() must resolve this test file to the chrome plugin dir."""
    plugin_dir = get_plugin_dir(__file__)

    assert plugin_dir.exists()
    assert plugin_dir.is_dir()
    # The chrome plugin lives at .../plugins/chrome/
    assert plugin_dir.name == 'chrome'
    assert plugin_dir.parent.name == 'plugins'
|
||||
|
||||
|
||||
def test_get_hook_script_finds_existing_hook():
    """get_hook_script() should locate the crawl-level chrome launch hook."""
    from archivebox.plugins.chrome.tests.chrome_test_helpers import CHROME_PLUGIN_DIR

    hook = get_hook_script(CHROME_PLUGIN_DIR, 'on_Crawl__*_chrome_launch.*')
    if not hook:
        return  # Hook may not exist in all test environments.
    assert hook.exists()
    assert hook.is_file()
    assert 'chrome_launch' in hook.name
|
||||
|
||||
|
||||
def test_get_hook_script_returns_none_for_missing():
    """get_hook_script() must return None when no hook matches the pattern."""
    from archivebox.plugins.chrome.tests.chrome_test_helpers import CHROME_PLUGIN_DIR

    missing = get_hook_script(CHROME_PLUGIN_DIR, 'nonexistent_hook_*_pattern.*')
    assert missing is None
|
||||
|
||||
|
||||
def test_parse_jsonl_output_valid():
    """parse_jsonl_output() returns the FIRST record from valid JSONL."""
    payload = '\n'.join([
        '{"type": "ArchiveResult", "status": "succeeded", "output": "test1"}',
        '{"type": "ArchiveResult", "status": "failed", "error": "test2"}',
        '',
    ])

    record = parse_jsonl_output(payload)
    assert record is not None
    assert record['type'] == 'ArchiveResult'
    # First line wins; the failed record is never returned.
    assert record['status'] == 'succeeded'
    assert record['output'] == 'test1'
|
||||
|
||||
|
||||
def test_parse_jsonl_output_with_non_json_lines():
    """parse_jsonl_output() must skip lines that are not JSON."""
    payload = '\n'.join([
        'Some non-JSON output',
        '{"type": "ArchiveResult", "status": "succeeded"}',
        'More non-JSON',
        '{"type": "ArchiveResult", "status": "failed"}',
        '',
    ])

    record = parse_jsonl_output(payload)
    assert record is not None
    assert record['type'] == 'ArchiveResult'
    # The first parseable JSON record is returned, non-JSON noise ignored.
    assert record['status'] == 'succeeded'
|
||||
|
||||
|
||||
def test_parse_jsonl_output_empty():
    """parse_jsonl_output() must return None for empty input."""
    assert parse_jsonl_output('') is None
|
||||
|
||||
|
||||
def test_parse_jsonl_output_filters_by_type():
    """record_type= must skip records of other types."""
    payload = '\n'.join([
        '{"type": "LogEntry", "data": "log1"}',
        '{"type": "ArchiveResult", "data": "result1"}',
        '{"type": "ArchiveResult", "data": "result2"}',
        '',
    ])

    # Should return the first ArchiveResult, never the LogEntry.
    record = parse_jsonl_output(payload, record_type='ArchiveResult')
    assert record is not None
    assert record['type'] == 'ArchiveResult'
    assert record['data'] == 'result1'  # first matching record wins
|
||||
|
||||
|
||||
def test_parse_jsonl_output_filters_custom_type():
    """record_type= works for any record type, not just ArchiveResult."""
    payload = '\n'.join([
        '{"type": "ArchiveResult", "data": "result1"}',
        '{"type": "LogEntry", "data": "log1"}',
        '{"type": "ArchiveResult", "data": "result2"}',
        '',
    ])

    record = parse_jsonl_output(payload, record_type='LogEntry')
    assert record is not None
    assert record['type'] == 'LogEntry'
    assert record['data'] == 'log1'
|
||||
|
||||
|
||||
def test_machine_type_consistency():
    """get_machine_type() must be deterministic across calls."""
    first = get_machine_type()
    second = get_machine_type()
    assert first == second, "Machine type should be stable across calls"
|
||||
|
||||
|
||||
def test_lib_dir_is_directory():
    """get_lib_dir() should return a Path when DATA_DIR provides the layout."""
    with tempfile.TemporaryDirectory() as tmpdir:
        saved_data_dir = os.environ.get('DATA_DIR')
        try:
            os.environ['DATA_DIR'] = tmpdir
            # Pre-create the lib/<machine_type>/ layout get_lib_dir expects.
            expected_layout = Path(tmpdir) / 'lib' / get_machine_type()
            expected_layout.mkdir(parents=True, exist_ok=True)

            assert isinstance(get_lib_dir(), Path)
        finally:
            # Restore DATA_DIR exactly as the caller had it.
            if saved_data_dir:
                os.environ['DATA_DIR'] = saved_data_dir
            else:
                os.environ.pop('DATA_DIR', None)
|
||||
|
||||
|
||||
# Allow running this test module directly without invoking the pytest CLI.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
@@ -1,21 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required_plugins": ["chrome"],
|
||||
"properties": {
|
||||
"CONSOLELOG_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_CONSOLELOG", "USE_CONSOLELOG"],
|
||||
"description": "Enable console log capture"
|
||||
},
|
||||
"CONSOLELOG_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 30,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for console log capture in seconds"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,201 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Capture console output from a page.
|
||||
*
|
||||
* This hook sets up CDP listeners BEFORE chrome_navigate loads the page,
|
||||
* then waits for navigation to complete. The listeners stay active through
|
||||
* navigation and capture all console output.
|
||||
*
|
||||
* Usage: on_Snapshot__21_consolelog.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes console.jsonl
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Import shared utilities from chrome_utils.js
|
||||
const {
|
||||
getEnvBool,
|
||||
getEnvInt,
|
||||
parseArgs,
|
||||
connectToPage,
|
||||
waitForPageLoaded,
|
||||
} = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Plugin identity and output locations. Paths are relative to the hook's cwd
// (the snapshot's consolelog output directory).
const PLUGIN_NAME = 'consolelog';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'console.jsonl';
// Shared chrome session dir (sibling of this plugin's dir) read by connectToPage.
const CHROME_SESSION_DIR = '../chrome';

// Mutable module state shared between setup, shutdown, and result emission.
// NOTE(review): setupListeners declares its own local `browser`/`page` consts,
// so these module-level handles are only populated if the caller assigns the
// returned values back — confirm against the entrypoint (not visible here).
let browser = null;
let page = null;
let logCount = 0;          // console messages written to console.jsonl
let errorCount = 0;        // page errors written to console.jsonl
let requestFailCount = 0;  // failed network requests written to console.jsonl
let shuttingDown = false;  // latch preventing double ArchiveResult emission
|
||||
|
||||
/**
 * Convert puppeteer console-message arg handles into plain JSON-safe values.
 *
 * For each handle: try jsonValue(), fall back to String(handle), and finally
 * to the literal '[Unserializable]' if even stringification throws.
 */
async function serializeArgs(args) {
    const values = [];
    for (const handle of args) {
        let value;
        try {
            value = await handle.jsonValue();
        } catch (jsonErr) {
            try {
                value = String(handle);
            } catch (strErr) {
                value = '[Unserializable]';
            }
        }
        values.push(value);
    }
    return values;
}
|
||||
|
||||
/**
 * Connect to the shared Chrome session and attach console/error/request
 * listeners that stream every event to console.jsonl as JSONL.
 *
 * Must run BEFORE chrome_navigate loads the page so no output is missed;
 * the listeners survive navigation. Returns { browser, page } for the caller.
 *
 * NOTE(review): the destructured `const { browser, page }` here shadows the
 * module-level `browser`/`page` — confirm the caller assigns the returned
 * handles back, otherwise handleShutdown's browser.disconnect() is a no-op.
 */
async function setupListeners() {
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
    // CONSOLELOG_TIMEOUT is in seconds; CDP connect wants milliseconds.
    const timeout = getEnvInt('CONSOLELOG_TIMEOUT', 30) * 1000;

    fs.writeFileSync(outputPath, ''); // Clear existing

    // Connect to Chrome page using shared utility
    const { browser, page } = await connectToPage({
        chromeSessionDir: CHROME_SESSION_DIR,
        timeoutMs: timeout,
        puppeteer,
    });

    // Set up listeners that write directly to file (append per event so
    // partial output survives a crash or SIGKILL)
    page.on('console', async (msg) => {
        try {
            const logEntry = {
                timestamp: new Date().toISOString(),
                type: msg.type(),
                text: msg.text(),
                args: await serializeArgs(msg.args()),
                location: msg.location(),
            };
            fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
            logCount += 1;
        } catch (e) {
            // Ignore errors
        }
    });

    page.on('pageerror', (error) => {
        try {
            const logEntry = {
                timestamp: new Date().toISOString(),
                type: 'error',
                text: error.message,
                stack: error.stack || '',
            };
            fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
            errorCount += 1;
        } catch (e) {
            // Ignore
        }
    });

    page.on('requestfailed', (request) => {
        try {
            const failure = request.failure();
            const logEntry = {
                timestamp: new Date().toISOString(),
                type: 'request_failed',
                text: `Request failed: ${request.url()}`,
                error: failure ? failure.errorText : 'Unknown error',
                url: request.url(),
            };
            fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
            requestFailCount += 1;
        } catch (e) {
            // Ignore
        }
    });

    return { browser, page };
}
|
||||
|
||||
// Emit the final ArchiveResult JSONL record exactly once (guarded by the
// shuttingDown flag so SIGTERM + SIGINT cannot double-report).
function emitResult(status = 'succeeded') {
    if (shuttingDown) {
        return;
    }
    shuttingDown = true;

    const counts = `${logCount} console, ${errorCount} errors, ${requestFailCount} failed requests`;
    const record = {
        type: 'ArchiveResult',
        status: status,
        output_str: `${OUTPUT_FILE} (${counts})`,
    };
    console.log(JSON.stringify(record));
}
|
||||
|
||||
// Signal handler: whatever was captured so far counts as success for a
// background hook, so report success, detach from the shared browser
// without closing it, and exit cleanly.
async function handleShutdown(signal) {
    console.error(`\nReceived ${signal}, emitting final results...`);
    emitResult('succeeded');
    try {
        if (browser) browser.disconnect();
    } catch (e) {
        // Browser may already be gone; nothing left to clean up.
    }
    process.exit(0);
}
|
||||
|
||||
// Entry point: validate args/config, attach listeners before navigation,
// then stay alive collecting console output until a shutdown signal.
async function main() {
    const { url, snapshot_id: snapshotId } = parseArgs();

    if (!url || !snapshotId) {
        console.error('Usage: on_Snapshot__21_consolelog.js --url=<url> --snapshot-id=<uuid>');
        process.exit(1);
    }

    if (!getEnvBool('CONSOLELOG_ENABLED', true)) {
        console.error('Skipping (CONSOLELOG_ENABLED=False)');
        console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'CONSOLELOG_ENABLED=False'}));
        process.exit(0);
    }

    try {
        // Listeners must be attached BEFORE chrome_navigate triggers the
        // page load, otherwise early console output would be missed.
        const connection = await setupListeners();
        browser = connection.browser;
        page = connection.page;

        process.on('SIGTERM', () => handleShutdown('SIGTERM'));
        process.on('SIGINT', () => handleShutdown('SIGINT'));

        // Wait (non-fatally) for chrome_navigate to report the page loaded.
        try {
            const timeoutMs = getEnvInt('CONSOLELOG_TIMEOUT', 30) * 1000;
            await waitForPageLoaded(CHROME_SESSION_DIR, timeoutMs * 4, 500);
        } catch (e) {
            console.error(`WARN: ${e.message}`);
        }

        // Background hook: block forever; the orchestrator's signal is what
        // ends this process (via handleShutdown above).
        await new Promise(() => {});
    } catch (e) {
        const error = `${e.name}: ${e.message}`;
        console.error(`ERROR: ${error}`);

        console.log(JSON.stringify({
            type: 'ArchiveResult',
            status: 'failed',
            output_str: error,
        }));
        process.exit(1);
    }
}
|
||||
|
||||
// Top-level safety net: report anything main() itself failed to catch.
main().catch((err) => {
    console.error(`Fatal error: ${err.message}`);
    process.exit(1);
});
|
||||
@@ -1 +0,0 @@
|
||||
<span class="abx-output-icon abx-output-icon--consolelog" title="Console Log"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="4.5" width="18" height="15" rx="2"/><path d="M7 12l2 2-2 2"/><path d="M11 16h6"/></svg></span>
|
||||
@@ -1,127 +0,0 @@
|
||||
"""
|
||||
Tests for the consolelog plugin.
|
||||
|
||||
Tests the real consolelog hook with an actual URL to verify
|
||||
console output capture.
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from django.test import TestCase
|
||||
|
||||
# Import chrome test helpers
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
|
||||
from chrome_test_helpers import (
|
||||
chrome_session,
|
||||
CHROME_NAVIGATE_HOOK,
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
)
|
||||
|
||||
|
||||
# Get the path to the consolelog hook
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
CONSOLELOG_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_consolelog.*')
|
||||
|
||||
|
||||
class TestConsolelogPlugin(TestCase):
    """Smoke tests for the consolelog plugin's hook script discovery."""

    def test_consolelog_hook_exists(self):
        """The on_Snapshot consolelog hook must be present in the plugin dir."""
        self.assertIsNotNone(CONSOLELOG_HOOK, "Consolelog hook not found in plugin directory")
        self.assertTrue(CONSOLELOG_HOOK.exists(), f"Hook not found: {CONSOLELOG_HOOK}")
|
||||
|
||||
|
||||
class TestConsolelogWithChrome(TestCase):
    """Integration tests for the consolelog plugin against a live Chrome session."""

    def setUp(self):
        """Create a scratch directory for the Chrome session."""
        self.temp_dir = Path(tempfile.mkdtemp())

    def tearDown(self):
        """Remove the scratch directory."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    @staticmethod
    def _drain(proc):
        """Terminate a background hook (if still running) and return (stdout, stderr)."""
        if proc.poll() is None:
            proc.terminate()
            try:
                return proc.communicate(timeout=5)
            except subprocess.TimeoutExpired:
                proc.kill()
                return proc.communicate()
        return proc.communicate()

    def test_consolelog_captures_output(self):
        """Consolelog hook should capture console output from a page."""
        test_url = 'data:text/html,<script>console.log("archivebox-console-test")</script>'
        snapshot_id = 'test-consolelog-snapshot'

        with chrome_session(
            self.temp_dir,
            crawl_id='test-consolelog-crawl',
            snapshot_id=snapshot_id,
            test_url=test_url,
            navigate=False,
            timeout=30,
        ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
            console_dir = snapshot_chrome_dir.parent / 'consolelog'
            console_dir.mkdir(exist_ok=True)

            # The consolelog hook is a background hook: start it first so its
            # listeners are attached before navigation begins.
            hook_proc = subprocess.Popen(
                ['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                cwd=str(console_dir),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                env=env,
            )

            nav_result = subprocess.run(
                ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                cwd=str(snapshot_chrome_dir),
                capture_output=True,
                text=True,
                timeout=120,
                env=env,
            )
            self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}")

            console_output = console_dir / 'console.jsonl'

            # Give the background hook up to ~10s to flush some output.
            for _ in range(10):
                if console_output.exists() and console_output.stat().st_size > 0:
                    break
                time.sleep(1)

            stdout, stderr = self._drain(hook_proc)

            # At minimum, verify the hook itself did not crash.
            self.assertNotIn('Traceback', stderr)

            # If output was produced, it must be valid JSONL with the expected keys.
            if console_output.exists():
                content = console_output.read_text().strip()
                self.assertTrue(content, "Console output should not be empty")
                for line in content.split('\n'):
                    if not line.strip():
                        continue
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError:
                        continue  # Some lines may be incomplete
                    self.assertIn('timestamp', record)
                    self.assertIn('type', record)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # BUGFIX: pytest was referenced here but never imported in this module,
    # so running the file directly raised NameError. Import it lazily -- it
    # is only needed for direct execution, not for the Django test runner.
    import pytest
    pytest.main([__file__, '-v'])
|
||||
@@ -1,98 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install a binary using a custom bash command.
|
||||
|
||||
This provider runs arbitrary shell commands to install binaries
|
||||
that don't fit into standard package managers.
|
||||
|
||||
Usage: on_Binary__install_using_custom_bash.py --binary-id=<uuid> --machine-id=<uuid> --name=<name> --custom-cmd=<cmd>
|
||||
Output: Binary JSONL record to stdout after installation
|
||||
|
||||
Environment variables:
|
||||
MACHINE_ID: Machine UUID (set by orchestrator)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
@click.command()
@click.option('--binary-id', required=True, help="Binary UUID")
@click.option('--machine-id', required=True, help="Machine UUID")
@click.option('--name', required=True, help="Binary name to install")
@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', required=True, help="Custom bash command to run")
def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_cmd: str):
    """Install binary using custom bash command.

    Runs the user-supplied shell command, verifies the binary is then
    discoverable on $PATH via abx-pkg's EnvProvider, and emits a Binary
    JSONL record to stdout. Human-readable progress goes to stderr.
    """
    # Respect the caller's provider allow-list: exit 0 (skip, not error)
    # when the 'custom' provider is not permitted for this binary.
    if binproviders != '*' and 'custom' not in binproviders.split(','):
        click.echo(f"custom provider not allowed for {name}", err=True)
        sys.exit(0)

    if not custom_cmd:
        click.echo("custom provider requires --custom-cmd", err=True)
        sys.exit(1)

    click.echo(f"Installing {name} via custom command: {custom_cmd}", err=True)

    try:
        # NOTE: shell=True with a user-supplied string is intentional here --
        # running arbitrary install commands is the whole point of this provider.
        result = subprocess.run(
            custom_cmd,
            shell=True,
            timeout=600,  # 10 minute timeout for custom installs
        )
        if result.returncode != 0:
            click.echo(f"Custom install failed (exit={result.returncode})", err=True)
            sys.exit(1)
    except subprocess.TimeoutExpired:
        click.echo("Custom install timed out", err=True)
        sys.exit(1)

    # Use abx-pkg to locate the freshly-installed binary and read its metadata.
    provider = EnvProvider()
    try:
        binary = Binary(name=name, binproviders=[provider]).load()
    except Exception:
        # Some binaries have no working --version; retry with a stub version
        # override so load() can still succeed on the abspath lookup.
        try:
            binary = Binary(
                name=name,
                binproviders=[provider],
                overrides={'env': {'version': '0.0.1'}},
            ).load()
        except Exception as e:
            click.echo(f"{name} not found after custom install: {e}", err=True)
            sys.exit(1)

    if not binary.abspath:
        click.echo(f"{name} not found after custom install", err=True)
        sys.exit(1)

    # BUGFIX: the required --machine-id argument was previously overwritten
    # unconditionally by os.environ.get('MACHINE_ID', ''), discarding the CLI
    # value (and often blanking it). Prefer the explicit CLI argument and only
    # fall back to the environment when it is empty.
    machine_id = machine_id or os.environ.get('MACHINE_ID', '')

    # Output Binary JSONL record to stdout (machine-readable contract).
    record = {
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'custom',
        'machine_id': machine_id,
        'binary_id': binary_id,
    }
    print(json.dumps(record))

    # Log human-readable info to stderr.
    click.echo(f"Installed {name} at {binary.abspath}", err=True)
    click.echo(f"  version: {binary.version}", err=True)

    sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,149 +0,0 @@
|
||||
"""
|
||||
Tests for the custom binary provider plugin.
|
||||
|
||||
Tests the custom bash binary installer with safe commands.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from django.test import TestCase
|
||||
|
||||
|
||||
# Get the path to the custom provider hook
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_custom_install.py'), None)
|
||||
|
||||
|
||||
class TestCustomProviderHook(TestCase):
    """Test the custom binary provider hook end-to-end via subprocess."""

    def setUp(self):
        """Create a throwaway DATA_DIR for the hook."""
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Remove the throwaway DATA_DIR."""
        import shutil
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _run_hook(self, *hook_args):
        """Invoke the install hook as a subprocess with DATA_DIR set; return the result."""
        env = os.environ.copy()
        env['DATA_DIR'] = self.temp_dir
        return subprocess.run(
            [sys.executable, str(INSTALL_HOOK), *hook_args],
            capture_output=True,
            text=True,
            timeout=30,
            env=env,
        )

    def test_hook_script_exists(self):
        """Hook script should exist."""
        self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")

    def test_hook_skips_when_custom_not_allowed(self):
        """Hook should skip when custom is not in the allowed binproviders."""
        result = self._run_hook(
            '--name=echo',
            '--binary-id=test-uuid',
            '--machine-id=test-machine',
            '--binproviders=pip,apt',  # custom not allowed
            '--custom-cmd=echo hello',
        )
        # Should exit cleanly (code 0) when custom not allowed.
        self.assertEqual(result.returncode, 0)
        self.assertIn('custom provider not allowed', result.stderr)

    def test_hook_runs_custom_command_and_finds_binary(self):
        """Hook should run the custom command and find the binary in PATH."""
        # The command itself installs nothing; 'echo' is already on PATH,
        # so this exercises the hook's post-install binary lookup.
        result = self._run_hook(
            '--name=echo',
            '--binary-id=test-uuid',
            '--machine-id=test-machine',
            '--custom-cmd=echo "custom install simulation"',
        )
        self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")

        # Scan stdout for the Binary JSONL record.
        for raw_line in result.stdout.split('\n'):
            raw_line = raw_line.strip()
            if not raw_line.startswith('{'):
                continue
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'Binary' and record.get('name') == 'echo':
                self.assertEqual(record['binprovider'], 'custom')
                self.assertTrue(record['abspath'])
                return

        self.fail("No Binary JSONL record found in output")

    def test_hook_fails_for_missing_binary_after_command(self):
        """Hook should fail when the binary is absent after the custom command."""
        result = self._run_hook(
            '--name=nonexistent_binary_xyz123',
            '--binary-id=test-uuid',
            '--machine-id=test-machine',
            '--custom-cmd=echo "failed install"',  # Doesn't actually install
        )
        # Should fail since binary not found after command.
        self.assertEqual(result.returncode, 1)
        self.assertIn('not found', result.stderr.lower())

    def test_hook_fails_for_failing_command(self):
        """Hook should fail when the custom command exits non-zero."""
        result = self._run_hook(
            '--name=echo',
            '--binary-id=test-uuid',
            '--machine-id=test-machine',
            '--custom-cmd=exit 1',  # Command that fails
        )
        self.assertEqual(result.returncode, 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
@@ -1,21 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required_plugins": ["chrome"],
|
||||
"properties": {
|
||||
"DNS_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_DNS", "USE_DNS"],
|
||||
"description": "Enable DNS traffic recording during page load"
|
||||
},
|
||||
"DNS_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 30,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for DNS recording in seconds"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,265 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Record all DNS traffic (hostname -> IP resolutions) during page load.
|
||||
*
|
||||
* This hook sets up CDP listeners BEFORE chrome_navigate loads the page,
|
||||
* then waits for navigation to complete. The listeners capture all DNS
|
||||
* resolutions by extracting hostname/IP pairs from network responses.
|
||||
*
|
||||
* Usage: on_Snapshot__22_dns.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes dns.jsonl with one line per DNS resolution record
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Import shared utilities from chrome_utils.js
|
||||
const {
|
||||
getEnvBool,
|
||||
getEnvInt,
|
||||
parseArgs,
|
||||
connectToPage,
|
||||
waitForPageLoaded,
|
||||
} = require('../chrome/chrome_utils.js');
|
||||
|
||||
const PLUGIN_NAME = 'dns';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'dns.jsonl';
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
let browser = null;
|
||||
let page = null;
|
||||
let recordCount = 0;
|
||||
let shuttingDown = false;
|
||||
|
||||
// Return the hostname component of a URL string, or null when it is not
// a parseable absolute URL.
function extractHostname(url) {
    try {
        return new URL(url).hostname;
    } catch (parseErr) {
        return null;
    }
}
|
||||
|
||||
// Attach raw CDP network listeners that record hostname -> IP resolutions
// to dns.jsonl. Must run BEFORE chrome_navigate loads the page so no early
// requests are missed. `targetUrl` is accepted for interface parity with
// sibling hooks but every host touched by the page is recorded.
async function setupListener(targetUrl) {
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
    const timeoutMs = getEnvInt('DNS_TIMEOUT', 30) * 1000;

    // Start each run with a fresh output file.
    fs.writeFileSync(outputPath, '');

    // Dedupe: one record per unique hostname->IP pair (or hostname NXDOMAIN).
    const seenResolutions = new Map();
    // requestId -> URL, so loadingFailed events can be tied back to a host.
    const requestUrls = new Map();

    const { browser, page } = await connectToPage({
        chromeSessionDir: CHROME_SESSION_DIR,
        timeoutMs: timeoutMs,
        puppeteer,
    });

    // Raw CDP session: puppeteer's high-level events don't expose the
    // resolved remote IP address, but Network.responseReceived does.
    const client = await page.target().createCDPSession();
    await client.send('Network.enable');

    // Append one JSONL record and bump the counter.
    const writeRecord = (record) => {
        fs.appendFileSync(outputPath, JSON.stringify(record) + '\n');
        recordCount += 1;
    };

    client.on('Network.requestWillBeSent', (params) => {
        requestUrls.set(params.requestId, params.request.url);
    });

    // Successful responses carry the IP the hostname resolved to.
    client.on('Network.responseReceived', (params) => {
        try {
            const { url, remoteIPAddress, remotePort } = params.response;
            if (!url || !remoteIPAddress) return;

            const hostname = extractHostname(url);
            if (!hostname) return;

            // A bare-IP URL involves no DNS resolution; skip it.
            if (hostname === remoteIPAddress) return;

            const resolutionKey = `${hostname}:${remoteIPAddress}`;
            if (seenResolutions.has(resolutionKey)) return;
            seenResolutions.set(resolutionKey, true);

            // IPv6 addresses contain ':'; classify as AAAA vs A accordingly.
            writeRecord({
                ts: new Date().toISOString(),
                hostname: hostname,
                ip: remoteIPAddress,
                port: remotePort || null,
                type: remoteIPAddress.includes(':') ? 'AAAA' : 'A',
                protocol: url.startsWith('https://') ? 'https' : 'http',
                url: url,
                requestId: params.requestId,
            });
        } catch (e) {
            // Never break page load on a recording error.
        }
    });

    // Failed requests still attempted DNS; capture NXDOMAIN-style failures.
    client.on('Network.loadingFailed', (params) => {
        try {
            const url = requestUrls.get(params.requestId);
            if (!url) return;

            const hostname = extractHostname(url);
            if (!hostname) return;

            const errorText = params.errorText || '';
            const isDnsFailure =
                errorText.includes('net::ERR_NAME_NOT_RESOLVED') ||
                errorText.includes('net::ERR_NAME_RESOLUTION_FAILED');
            if (!isDnsFailure) return;

            const resolutionKey = `${hostname}:NXDOMAIN`;
            if (seenResolutions.has(resolutionKey)) return;
            seenResolutions.set(resolutionKey, true);

            writeRecord({
                ts: new Date().toISOString(),
                hostname: hostname,
                ip: null,
                port: null,
                type: 'NXDOMAIN',
                protocol: url.startsWith('https://') ? 'https' : 'http',
                url: url,
                requestId: params.requestId,
                error: errorText,
            });
        } catch (e) {
            // Ignore.
        }
    });

    return { browser, page, client };
}
|
||||
|
||||
// Emit the final ArchiveResult exactly once (guarded so SIGTERM + SIGINT
// cannot double-report).
function emitResult(status = 'succeeded') {
    if (shuttingDown) {
        return;
    }
    shuttingDown = true;

    const payload = {
        type: 'ArchiveResult',
        status: status,
        output_str: `${OUTPUT_FILE} (${recordCount} DNS records)`,
    };
    console.log(JSON.stringify(payload));
}
|
||||
|
||||
// Signal handler: report success (records captured so far are the result),
// detach from the shared browser without closing it, and exit cleanly.
async function handleShutdown(signal) {
    console.error(`\nReceived ${signal}, emitting final results...`);
    emitResult('succeeded');
    try {
        if (browser) browser.disconnect();
    } catch (e) {
        // Browser may already be gone; nothing left to clean up.
    }
    process.exit(0);
}
|
||||
|
||||
// Entry point: validate args/config, attach CDP listeners before navigation,
// then stay alive recording DNS resolutions until a shutdown signal.
async function main() {
    const { url, snapshot_id: snapshotId } = parseArgs();

    if (!url || !snapshotId) {
        console.error('Usage: on_Snapshot__22_dns.js --url=<url> --snapshot-id=<uuid>');
        process.exit(1);
    }

    if (!getEnvBool('DNS_ENABLED', true)) {
        console.error('Skipping (DNS_ENABLED=False)');
        console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'DNS_ENABLED=False'}));
        process.exit(0);
    }

    try {
        // Listeners must be in place BEFORE chrome_navigate triggers the
        // page load, otherwise early resolutions would be missed.
        const connection = await setupListener(url);
        browser = connection.browser;
        page = connection.page;

        process.on('SIGTERM', () => handleShutdown('SIGTERM'));
        process.on('SIGINT', () => handleShutdown('SIGINT'));

        // Wait (non-fatally) for chrome_navigate to finish loading the page.
        try {
            const timeoutMs = getEnvInt('DNS_TIMEOUT', 30) * 1000;
            await waitForPageLoaded(CHROME_SESSION_DIR, timeoutMs * 4, 500);
        } catch (e) {
            console.error(`WARN: ${e.message}`);
        }

        // Background hook: block forever; the orchestrator's signal is what
        // ends this process (via handleShutdown above).
        await new Promise(() => {});
    } catch (e) {
        const error = `${e.name}: ${e.message}`;
        console.error(`ERROR: ${error}`);

        console.log(JSON.stringify({
            type: 'ArchiveResult',
            status: 'failed',
            output_str: error,
        }));
        process.exit(1);
    }
}
|
||||
|
||||
// Top-level safety net: report anything main() itself failed to catch.
main().catch((err) => {
    console.error(`Fatal error: ${err.message}`);
    process.exit(1);
});
|
||||
@@ -1 +0,0 @@
|
||||
<span class="abx-output-icon abx-output-icon--dns" title="DNS"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="6" cy="12" r="2"/><circle cx="18" cy="6" r="2"/><circle cx="18" cy="18" r="2"/><path d="M8 12h6"/><path d="M16 8l-2 2"/><path d="M16 16l-2-2"/></svg></span>
|
||||
@@ -1,126 +0,0 @@
|
||||
"""
|
||||
Tests for the DNS plugin.
|
||||
|
||||
Tests the real DNS hook with an actual URL to verify
|
||||
DNS resolution capture.
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from django.test import TestCase
|
||||
|
||||
# Import chrome test helpers
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
|
||||
from chrome_test_helpers import (
|
||||
chrome_session,
|
||||
CHROME_NAVIGATE_HOOK,
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
)
|
||||
|
||||
|
||||
# Get the path to the DNS hook
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
DNS_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dns.*')
|
||||
|
||||
|
||||
class TestDNSPlugin(TestCase):
    """Smoke tests for the DNS plugin's hook script discovery."""

    def test_dns_hook_exists(self):
        """The on_Snapshot DNS hook must be present in the plugin dir."""
        self.assertIsNotNone(DNS_HOOK, "DNS hook not found in plugin directory")
        self.assertTrue(DNS_HOOK.exists(), f"Hook not found: {DNS_HOOK}")
|
||||
|
||||
|
||||
class TestDNSWithChrome(TestCase):
    """Integration tests for the DNS plugin against a live Chrome session."""

    def setUp(self):
        """Create a scratch directory for the Chrome session."""
        self.temp_dir = Path(tempfile.mkdtemp())

    def tearDown(self):
        """Remove the scratch directory."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_dns_records_captured(self):
        """DNS hook should capture DNS records from a real URL."""
        test_url = 'https://example.com'
        snapshot_id = 'test-dns-snapshot'

        with chrome_session(
            self.temp_dir,
            crawl_id='test-dns-crawl',
            snapshot_id=snapshot_id,
            test_url=test_url,
            navigate=False,
            timeout=30,
        ) as (_process, _pid, snapshot_chrome_dir, env):
            dns_dir = snapshot_chrome_dir.parent / 'dns'
            dns_dir.mkdir(exist_ok=True)

            # Background hook: start it before navigation so its CDP
            # listeners see every request.
            hook_proc = subprocess.Popen(
                ['node', str(DNS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                cwd=str(dns_dir),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                env=env,
            )

            nav_result = subprocess.run(
                ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                cwd=str(snapshot_chrome_dir),
                capture_output=True,
                text=True,
                timeout=120,
                env=env,
            )
            self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}")

            # Poll up to ~30s for the hook to flush at least one record.
            dns_output = dns_dir / 'dns.jsonl'
            for _ in range(30):
                if dns_output.exists() and dns_output.stat().st_size > 0:
                    break
                time.sleep(1)

            # Terminate the background hook and collect its output streams.
            if hook_proc.poll() is None:
                hook_proc.terminate()
                try:
                    stdout, stderr = hook_proc.communicate(timeout=5)
                except subprocess.TimeoutExpired:
                    hook_proc.kill()
                    stdout, stderr = hook_proc.communicate()
            else:
                stdout, stderr = hook_proc.communicate()

            self.assertNotIn('Traceback', stderr)

            self.assertTrue(dns_output.exists(), "dns.jsonl not created")
            content = dns_output.read_text().strip()
            self.assertTrue(content, "DNS output should not be empty")

            # Every non-blank line should parse as one JSON record.
            records = []
            for raw_line in content.split('\n'):
                raw_line = raw_line.strip()
                if not raw_line:
                    continue
                try:
                    records.append(json.loads(raw_line))
                except json.JSONDecodeError:
                    pass

            self.assertTrue(records, "No DNS records parsed")
            has_ip_record = any(r.get('hostname') and r.get('ip') for r in records)
            self.assertTrue(has_ip_record, f"No DNS record with hostname + ip: {records}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # BUGFIX: pytest was referenced here but never imported in this module,
    # so running the file directly raised NameError. Import it lazily -- it
    # is only needed for direct execution, not for the Django test runner.
    import pytest
    pytest.main([__file__, '-v'])
|
||||
@@ -1,21 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required_plugins": ["chrome"],
|
||||
"properties": {
|
||||
"DOM_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_DOM", "USE_DOM"],
|
||||
"description": "Enable DOM capture"
|
||||
},
|
||||
"DOM_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for DOM capture in seconds"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,184 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Dump the DOM of a URL using Chrome/Puppeteer.
|
||||
*
|
||||
* Requires a Chrome session (from chrome plugin) and connects to it via CDP.
|
||||
*
|
||||
* Usage: on_Snapshot__53_dom.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes dom/output.html
|
||||
*
|
||||
* Environment variables:
|
||||
* DOM_ENABLED: Enable DOM extraction (default: true)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
|
||||
const {
|
||||
getEnvBool,
|
||||
parseArgs,
|
||||
readCdpUrl,
|
||||
} = require('../chrome/chrome_utils.js');
|
||||
|
||||
// Check if DOM is enabled BEFORE requiring puppeteer
|
||||
// Config gate: bail out (exit 0, deliberately emitting NO JSONL -- this is
// a temporary/config skip, not a result) before paying the cost of loading
// puppeteer when DOM capture is disabled.
if (!getEnvBool('DOM_ENABLED', true)) {
    console.error('Skipping DOM (DOM_ENABLED=False)');
    process.exit(0);
}
|
||||
|
||||
// Now safe to require puppeteer
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const PLUGIN_NAME = 'dom';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'output.html';
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
// Check if staticfile extractor already downloaded this URL
|
||||
const STATICFILE_DIR = '../staticfile';
|
||||
function hasStaticFileOutput() {
|
||||
if (!fs.existsSync(STATICFILE_DIR)) return false;
|
||||
const stdoutPath = path.join(STATICFILE_DIR, 'stdout.log');
|
||||
if (!fs.existsSync(stdoutPath)) return false;
|
||||
const stdout = fs.readFileSync(stdoutPath, 'utf8');
|
||||
for (const line of stdout.split('\n')) {
|
||||
const trimmed = line.trim();
|
||||
if (!trimmed.startsWith('{')) continue;
|
||||
try {
|
||||
const record = JSON.parse(trimmed);
|
||||
if (record.type === 'ArchiveResult' && record.status === 'succeeded') {
|
||||
return true;
|
||||
}
|
||||
} catch (e) {}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Wait for chrome tab to be fully loaded
|
||||
async function waitForChromeTabLoaded(timeoutMs = 60000) {
|
||||
const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
|
||||
const startTime = Date.now();
|
||||
|
||||
while (Date.now() - startTime < timeoutMs) {
|
||||
if (fs.existsSync(navigationFile)) {
|
||||
return true;
|
||||
}
|
||||
// Wait 100ms before checking again
|
||||
await new Promise(resolve => setTimeout(resolve, 100));
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
async function dumpDom(url) {
|
||||
// Output directory is current directory (hook already runs in output dir)
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
let browser = null;
|
||||
let page = null;
|
||||
|
||||
try {
|
||||
// Connect to existing Chrome session (required)
|
||||
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
|
||||
if (!cdpUrl) {
|
||||
return { success: false, error: 'No Chrome session found (chrome plugin must run first)' };
|
||||
}
|
||||
|
||||
browser = await puppeteer.connect({
|
||||
browserWSEndpoint: cdpUrl,
|
||||
defaultViewport: null,
|
||||
});
|
||||
|
||||
// Get existing pages or create new one
|
||||
const pages = await browser.pages();
|
||||
page = pages.find(p => p.url().startsWith('http')) || pages[0];
|
||||
|
||||
if (!page) {
|
||||
page = await browser.newPage();
|
||||
}
|
||||
|
||||
// Get the full DOM content
|
||||
const domContent = await page.content();
|
||||
|
||||
if (domContent && domContent.length > 100) {
|
||||
fs.writeFileSync(outputPath, domContent, 'utf8');
|
||||
return { success: true, output: outputPath };
|
||||
} else {
|
||||
return { success: false, error: 'DOM content too short or empty' };
|
||||
}
|
||||
|
||||
} catch (e) {
|
||||
return { success: false, error: `${e.name}: ${e.message}` };
|
||||
} finally {
|
||||
if (browser) {
|
||||
browser.disconnect();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const args = parseArgs();
|
||||
const url = args.url;
|
||||
const snapshotId = args.snapshot_id;
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Snapshot__53_dom.js --url=<url> --snapshot-id=<uuid>');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
try {
|
||||
// Check if staticfile extractor already handled this (permanent skip)
|
||||
if (hasStaticFileOutput()) {
|
||||
console.error(`Skipping DOM - staticfile extractor already downloaded this`);
|
||||
// Permanent skip - emit ArchiveResult with status='skipped'
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'skipped',
|
||||
output_str: 'staticfile already handled',
|
||||
}));
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
|
||||
if (!cdpUrl) {
|
||||
throw new Error('No Chrome session found (chrome plugin must run first)');
|
||||
}
|
||||
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
}
|
||||
|
||||
const result = await dumpDom(url);
|
||||
|
||||
if (result.success) {
|
||||
// Success - emit ArchiveResult
|
||||
const size = fs.statSync(result.output).size;
|
||||
console.error(`DOM saved (${size} bytes)`);
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded',
|
||||
output_str: result.output,
|
||||
}));
|
||||
process.exit(0);
|
||||
} else {
|
||||
// Transient error - emit NO JSONL
|
||||
console.error(`ERROR: ${result.error}`);
|
||||
process.exit(1);
|
||||
}
|
||||
} catch (e) {
|
||||
// Transient error - emit NO JSONL
|
||||
console.error(`ERROR: ${e.name}: ${e.message}`);
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
main().catch(e => {
|
||||
console.error(`Fatal error: ${e.message}`);
|
||||
process.exit(1);
|
||||
});
|
||||
@@ -1,8 +0,0 @@
|
||||
<!-- DOM thumbnail - scaled down iframe preview of captured DOM HTML -->
|
||||
<div class="extractor-thumbnail dom-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
|
||||
<iframe src="{{ output_path }}"
|
||||
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
|
||||
loading="lazy"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
</div>
|
||||
@@ -1 +0,0 @@
|
||||
<span class="abx-output-icon abx-output-icon--dom" title="DOM"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M8 9l-3 3 3 3"/><path d="M16 9l3 3-3 3"/><path d="M10 20l4-16"/></svg></span>
|
||||
@@ -1,185 +0,0 @@
|
||||
"""
|
||||
Integration tests for dom plugin
|
||||
|
||||
Tests verify:
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via chrome validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
4. DOM extraction works on https://example.com
|
||||
5. JSONL output is correct
|
||||
6. Filesystem output contains actual page content
|
||||
7. Config options work
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
run_hook_and_parse,
|
||||
LIB_DIR,
|
||||
NODE_MODULES_DIR,
|
||||
PLUGINS_ROOT,
|
||||
chrome_session,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
DOM_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dom.*')
|
||||
NPM_PROVIDER_HOOK = get_hook_script(PLUGINS_ROOT / 'npm', 'on_Binary__install_using_npm_provider.py')
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
assert DOM_HOOK.exists(), f"Hook not found: {DOM_HOOK}"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify dependencies are available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, EnvProvider, BinProviderOverrides
|
||||
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# Verify node is available
|
||||
node_binary = Binary(name='node', binproviders=[EnvProvider()])
|
||||
node_loaded = node_binary.load()
|
||||
assert node_loaded and node_loaded.abspath, "Node.js required for dom plugin"
|
||||
|
||||
|
||||
def test_extracts_dom_from_example_com():
|
||||
"""Test full workflow: extract DOM from real example.com via hook."""
|
||||
# Prerequisites checked by earlier test
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
with chrome_session(tmpdir, test_url=TEST_URL) as (_process, _pid, snapshot_chrome_dir, env):
|
||||
dom_dir = snapshot_chrome_dir.parent / 'dom'
|
||||
dom_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Run DOM extraction hook
|
||||
result = subprocess.run(
|
||||
['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
|
||||
cwd=dom_dir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
|
||||
|
||||
# Parse clean JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json, "Should have ArchiveResult JSONL output"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
|
||||
# Verify filesystem output (hook writes directly to working dir)
|
||||
dom_file = dom_dir / 'output.html'
|
||||
assert dom_file.exists(), f"output.html not created. Files: {list(tmpdir.iterdir())}"
|
||||
|
||||
# Verify HTML content contains REAL example.com text
|
||||
html_content = dom_file.read_text(errors='ignore')
|
||||
assert len(html_content) > 200, f"HTML content too short: {len(html_content)} bytes"
|
||||
assert '<html' in html_content.lower(), "Missing <html> tag"
|
||||
assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML"
|
||||
assert ('this domain' in html_content.lower() or
|
||||
'illustrative examples' in html_content.lower()), \
|
||||
"Missing example.com description text"
|
||||
|
||||
|
||||
def test_config_save_dom_false_skips():
|
||||
"""Test that DOM_ENABLED=False exits without emitting JSONL."""
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
env = os.environ.copy()
|
||||
env['DOM_ENABLED'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
|
||||
|
||||
# Feature disabled - temporary failure, should NOT emit JSONL
|
||||
assert 'Skipping DOM' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
|
||||
|
||||
# Should NOT emit any JSONL
|
||||
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
|
||||
assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
|
||||
|
||||
|
||||
def test_staticfile_present_skips():
|
||||
"""Test that dom skips when staticfile already downloaded."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Create directory structure like real ArchiveBox:
|
||||
# tmpdir/
|
||||
# staticfile/ <- staticfile extractor output
|
||||
# dom/ <- dom extractor runs here, looks for ../staticfile
|
||||
staticfile_dir = tmpdir / 'staticfile'
|
||||
staticfile_dir.mkdir()
|
||||
(staticfile_dir / 'stdout.log').write_text('{"type":"ArchiveResult","status":"succeeded","output_str":"index.html"}\n')
|
||||
|
||||
dom_dir = tmpdir / 'dom'
|
||||
dom_dir.mkdir()
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=teststatic'],
|
||||
cwd=dom_dir, # Run from dom subdirectory
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30
|
||||
,
|
||||
env=get_test_env())
|
||||
|
||||
assert result.returncode == 0, "Should exit 0 when permanently skipping"
|
||||
|
||||
# Permanent skip - should emit ArchiveResult with status='skipped'
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json, "Should emit ArchiveResult JSONL for permanent skip"
|
||||
assert result_json['status'] == 'skipped', f"Should have status='skipped': {result_json}"
|
||||
assert 'staticfile' in result_json.get('output_str', '').lower(), "Should mention staticfile in output_str"
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
@@ -1,72 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Check if a binary is already available in the system PATH.
|
||||
|
||||
This is the simplest "provider" - it doesn't install anything,
|
||||
it just discovers binaries that are already installed.
|
||||
|
||||
Usage: on_Binary__install_using_env_provider.py --binary-id=<uuid> --machine-id=<uuid> --name=<name>
|
||||
Output: Binary JSONL record to stdout if binary found in PATH
|
||||
|
||||
Environment variables:
|
||||
MACHINE_ID: Machine UUID (set by orchestrator)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--machine-id', required=True, help="Machine UUID")
|
||||
@click.option('--binary-id', required=True, help="Dependency UUID")
|
||||
@click.option('--name', required=True, help="Binary name to find")
|
||||
@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)")
|
||||
@click.option('--overrides', default=None, help="JSON-encoded overrides dict (unused)")
|
||||
def main(binary_id: str, machine_id: str, name: str, binproviders: str, overrides: str | None):
|
||||
"""Check if binary is available in PATH and record it."""
|
||||
|
||||
# Check if env provider is allowed
|
||||
if binproviders != '*' and 'env' not in binproviders.split(','):
|
||||
click.echo(f"env provider not allowed for {name}", err=True)
|
||||
sys.exit(0) # Not an error, just skip
|
||||
|
||||
# Use abx-pkg EnvProvider to find binary
|
||||
provider = EnvProvider()
|
||||
try:
|
||||
binary = Binary(name=name, binproviders=[provider]).load()
|
||||
except Exception as e:
|
||||
click.echo(f"{name} not found in PATH: {e}", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
if not binary.abspath:
|
||||
click.echo(f"{name} not found in PATH", err=True)
|
||||
sys.exit(1)
|
||||
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
# Output Binary JSONL record to stdout
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'abspath': str(binary.abspath),
|
||||
'version': str(binary.version) if binary.version else '',
|
||||
'sha256': binary.sha256 or '',
|
||||
'binprovider': 'env',
|
||||
'machine_id': machine_id,
|
||||
'binary_id': binary_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
# Log human-readable info to stderr
|
||||
click.echo(f"Found {name} at {binary.abspath}", err=True)
|
||||
click.echo(f" version: {binary.version}", err=True)
|
||||
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
159
archivebox/plugins/env/tests/test_env_provider.py
vendored
159
archivebox/plugins/env/tests/test_env_provider.py
vendored
@@ -1,159 +0,0 @@
|
||||
"""
|
||||
Tests for the env binary provider plugin.
|
||||
|
||||
Tests the real env provider hook with actual system binaries.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from django.test import TestCase
|
||||
|
||||
|
||||
# Get the path to the env provider hook
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_env_install.py'), None)
|
||||
|
||||
|
||||
class TestEnvProviderHook(TestCase):
|
||||
"""Test the env binary provider hook."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test environment."""
|
||||
self.temp_dir = tempfile.mkdtemp()
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up."""
|
||||
import shutil
|
||||
shutil.rmtree(self.temp_dir, ignore_errors=True)
|
||||
|
||||
def test_hook_script_exists(self):
|
||||
"""Hook script should exist."""
|
||||
self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
|
||||
|
||||
def test_hook_finds_python(self):
|
||||
"""Hook should find python3 binary in PATH."""
|
||||
env = os.environ.copy()
|
||||
env['DATA_DIR'] = self.temp_dir
|
||||
|
||||
result = subprocess.run(
|
||||
[
|
||||
sys.executable, str(INSTALL_HOOK),
|
||||
'--name=python3',
|
||||
'--binary-id=test-uuid',
|
||||
'--machine-id=test-machine',
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Should succeed and output JSONL
|
||||
self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
|
||||
|
||||
# Parse JSONL output
|
||||
for line in result.stdout.split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary' and record.get('name') == 'python3':
|
||||
self.assertEqual(record['binprovider'], 'env')
|
||||
self.assertTrue(record['abspath'])
|
||||
self.assertTrue(Path(record['abspath']).exists())
|
||||
return
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
self.fail("No Binary JSONL record found in output")
|
||||
|
||||
def test_hook_finds_bash(self):
|
||||
"""Hook should find bash binary in PATH."""
|
||||
env = os.environ.copy()
|
||||
env['DATA_DIR'] = self.temp_dir
|
||||
|
||||
result = subprocess.run(
|
||||
[
|
||||
sys.executable, str(INSTALL_HOOK),
|
||||
'--name=bash',
|
||||
'--binary-id=test-uuid',
|
||||
'--machine-id=test-machine',
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Should succeed and output JSONL
|
||||
self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
|
||||
|
||||
# Parse JSONL output
|
||||
for line in result.stdout.split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary' and record.get('name') == 'bash':
|
||||
self.assertEqual(record['binprovider'], 'env')
|
||||
self.assertTrue(record['abspath'])
|
||||
return
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
self.fail("No Binary JSONL record found in output")
|
||||
|
||||
def test_hook_fails_for_missing_binary(self):
|
||||
"""Hook should fail for binary not in PATH."""
|
||||
env = os.environ.copy()
|
||||
env['DATA_DIR'] = self.temp_dir
|
||||
|
||||
result = subprocess.run(
|
||||
[
|
||||
sys.executable, str(INSTALL_HOOK),
|
||||
'--name=nonexistent_binary_xyz123',
|
||||
'--binary-id=test-uuid',
|
||||
'--machine-id=test-machine',
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Should fail with exit code 1
|
||||
self.assertEqual(result.returncode, 1)
|
||||
self.assertIn('not found', result.stderr.lower())
|
||||
|
||||
def test_hook_skips_when_env_not_allowed(self):
|
||||
"""Hook should skip when env not in allowed binproviders."""
|
||||
env = os.environ.copy()
|
||||
env['DATA_DIR'] = self.temp_dir
|
||||
|
||||
result = subprocess.run(
|
||||
[
|
||||
sys.executable, str(INSTALL_HOOK),
|
||||
'--name=python3',
|
||||
'--binary-id=test-uuid',
|
||||
'--machine-id=test-machine',
|
||||
'--binproviders=pip,apt', # env not allowed
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Should exit cleanly (code 0) when env not allowed
|
||||
self.assertEqual(result.returncode, 0)
|
||||
self.assertIn('env provider not allowed', result.stderr)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
@@ -1,26 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"FAVICON_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_FAVICON", "USE_FAVICON"],
|
||||
"description": "Enable favicon downloading"
|
||||
},
|
||||
"FAVICON_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 30,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for favicon fetch in seconds"
|
||||
},
|
||||
"FAVICON_USER_AGENT": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "USER_AGENT",
|
||||
"description": "User agent string"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,153 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract favicon from a URL.
|
||||
|
||||
Usage: on_Snapshot__favicon.bg.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Writes favicon.ico to $PWD
|
||||
|
||||
Environment variables:
|
||||
FAVICON_TIMEOUT: Timeout in seconds (default: 30)
|
||||
USER_AGENT: User agent string
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if FAVICON_* not set:
|
||||
TIMEOUT: Fallback timeout
|
||||
|
||||
Note: This extractor uses the 'requests' library which is bundled with ArchiveBox.
|
||||
It can run standalone if requests is installed: pip install requests
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
PLUGIN_NAME = 'favicon'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'favicon.ico'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
|
||||
try:
|
||||
return int(get_env(name, str(default)))
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
|
||||
def get_favicon(url: str) -> tuple[bool, str | None, str]:
|
||||
"""
|
||||
Fetch favicon from URL.
|
||||
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
try:
|
||||
import requests
|
||||
except ImportError:
|
||||
return False, None, 'requests library not installed'
|
||||
|
||||
timeout = get_env_int('FAVICON_TIMEOUT') or get_env_int('TIMEOUT', 30)
|
||||
user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
|
||||
headers = {'User-Agent': user_agent}
|
||||
|
||||
# Build list of possible favicon URLs
|
||||
parsed = urlparse(url)
|
||||
base_url = f"{parsed.scheme}://{parsed.netloc}"
|
||||
|
||||
favicon_urls = [
|
||||
urljoin(base_url, '/favicon.ico'),
|
||||
urljoin(base_url, '/favicon.png'),
|
||||
urljoin(base_url, '/apple-touch-icon.png'),
|
||||
]
|
||||
|
||||
# Try to extract favicon URL from HTML link tags
|
||||
try:
|
||||
response = requests.get(url, timeout=timeout, headers=headers)
|
||||
if response.ok:
|
||||
# Look for <link rel="icon" href="...">
|
||||
for match in re.finditer(
|
||||
r'<link[^>]+rel=["\'](?:shortcut )?icon["\'][^>]+href=["\']([^"\']+)["\']',
|
||||
response.text,
|
||||
re.I
|
||||
):
|
||||
favicon_urls.insert(0, urljoin(url, match.group(1)))
|
||||
|
||||
# Also check reverse order: href before rel
|
||||
for match in re.finditer(
|
||||
r'<link[^>]+href=["\']([^"\']+)["\'][^>]+rel=["\'](?:shortcut )?icon["\']',
|
||||
response.text,
|
||||
re.I
|
||||
):
|
||||
favicon_urls.insert(0, urljoin(url, match.group(1)))
|
||||
except Exception:
|
||||
pass # Continue with default favicon URLs
|
||||
|
||||
# Try each URL until we find one that works
|
||||
for favicon_url in favicon_urls:
|
||||
try:
|
||||
response = requests.get(favicon_url, timeout=15, headers=headers)
|
||||
if response.ok and len(response.content) > 0:
|
||||
Path(OUTPUT_FILE).write_bytes(response.content)
|
||||
return True, OUTPUT_FILE, ''
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# Try Google's favicon service as fallback
|
||||
try:
|
||||
google_url = f'https://www.google.com/s2/favicons?domain={parsed.netloc}'
|
||||
response = requests.get(google_url, timeout=15, headers=headers)
|
||||
if response.ok and len(response.content) > 0:
|
||||
Path(OUTPUT_FILE).write_bytes(response.content)
|
||||
return True, OUTPUT_FILE, ''
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return False, None, 'No favicon found'
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='URL to extract favicon from')
|
||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||
def main(url: str, snapshot_id: str):
|
||||
"""Extract favicon from a URL."""
|
||||
|
||||
output = None
|
||||
status = 'failed'
|
||||
error = ''
|
||||
|
||||
try:
|
||||
# Run extraction
|
||||
success, output, error = get_favicon(url)
|
||||
if success:
|
||||
status = 'succeeded'
|
||||
else:
|
||||
status = 'failed'
|
||||
|
||||
except Exception as e:
|
||||
error = f'{type(e).__name__}: {e}'
|
||||
status = 'failed'
|
||||
|
||||
if error:
|
||||
print(f'ERROR: {error}', file=sys.stderr)
|
||||
|
||||
# Output clean JSONL (no RESULT_JSON= prefix)
|
||||
result = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': status,
|
||||
'output_str': output or error or '',
|
||||
}
|
||||
print(json.dumps(result))
|
||||
|
||||
sys.exit(0 if status == 'succeeded' else 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,9 +0,0 @@
|
||||
<!-- Favicon thumbnail - small favicon preview -->
|
||||
<div class="extractor-thumbnail favicon-thumbnail" style="width: 100%; height: 100px; display: flex; align-items: center; justify-content: center; background: #fff;">
|
||||
{% if output_path %}
|
||||
<img src="{{ output_path }}"
|
||||
alt="Favicon"
|
||||
style="width: 30px; height: 30px; max-width: 30px; max-height: 30px; object-fit: contain;"
|
||||
loading="lazy">
|
||||
{% endif %}
|
||||
</div>
|
||||
@@ -1 +0,0 @@
|
||||
<span class="abx-output-icon abx-output-icon--favicon" title="Favicon"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M12 3l2.5 5.5 6 .5-4.5 3.8 1.5 5.7L12 15.5 6.5 18.5 8 12.8 3.5 9l6-.5z"/></svg></span>
|
||||
@@ -1,293 +0,0 @@
|
||||
"""
|
||||
Integration tests for favicon plugin
|
||||
|
||||
Tests verify:
|
||||
1. Plugin script exists
|
||||
2. requests library is available
|
||||
3. Favicon extraction works for real example.com
|
||||
4. Output file is actual image data
|
||||
5. Tries multiple favicon URLs
|
||||
6. Falls back to Google's favicon service
|
||||
7. Config options work (TIMEOUT, USER_AGENT)
|
||||
8. Handles failures gracefully
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_plugin_dir,
|
||||
get_hook_script,
|
||||
parse_jsonl_output,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
FAVICON_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_favicon.*')
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify hook script exists."""
|
||||
assert FAVICON_HOOK.exists(), f"Hook script not found: {FAVICON_HOOK}"
|
||||
|
||||
|
||||
def test_requests_library_available():
|
||||
"""Test that requests library is available."""
|
||||
result = subprocess.run(
|
||||
[sys.executable, '-c', 'import requests; print(requests.__version__)'],
|
||||
capture_output=True,
|
||||
text=True
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
pass
|
||||
|
||||
assert len(result.stdout.strip()) > 0, "Should report requests version"
|
||||
|
||||
|
||||
def test_extracts_favicon_from_example_com():
|
||||
"""Test full workflow: extract favicon from real example.com.
|
||||
|
||||
Note: example.com doesn't have a favicon and Google's service may also fail,
|
||||
so we test that the extraction completes and reports appropriate status.
|
||||
"""
|
||||
|
||||
# Check requests is available
|
||||
check_result = subprocess.run(
|
||||
[sys.executable, '-c', 'import requests'],
|
||||
capture_output=True
|
||||
)
|
||||
if check_result.returncode != 0:
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Run favicon extraction
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
# May succeed (if Google service works) or fail (if no favicon)
|
||||
assert result.returncode in (0, 1), "Should complete extraction attempt"
|
||||
|
||||
# Parse clean JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json, "Should have ArchiveResult JSONL output"
|
||||
|
||||
# If it succeeded, verify the favicon file
|
||||
if result_json['status'] == 'succeeded':
|
||||
favicon_file = tmpdir / 'favicon.ico'
|
||||
assert favicon_file.exists(), "favicon.ico not created"
|
||||
|
||||
# Verify file is not empty and contains actual image data
|
||||
file_size = favicon_file.stat().st_size
|
||||
assert file_size > 0, "Favicon file should not be empty"
|
||||
assert file_size < 1024 * 1024, f"Favicon file suspiciously large: {file_size} bytes"
|
||||
|
||||
# Check for common image magic bytes
|
||||
favicon_data = favicon_file.read_bytes()
|
||||
# ICO, PNG, GIF, JPEG, or WebP
|
||||
is_image = (
|
||||
favicon_data[:4] == b'\x00\x00\x01\x00' or # ICO
|
||||
favicon_data[:8] == b'\x89PNG\r\n\x1a\n' or # PNG
|
||||
favicon_data[:3] == b'GIF' or # GIF
|
||||
favicon_data[:2] == b'\xff\xd8' or # JPEG
|
||||
favicon_data[8:12] == b'WEBP' # WebP
|
||||
)
|
||||
assert is_image, "Favicon file should be a valid image format"
|
||||
else:
|
||||
# Failed as expected
|
||||
assert result_json['status'] == 'failed', f"Should report failure: {result_json}"
|
||||
|
||||
|
||||
def test_config_timeout_honored():
|
||||
"""Test that TIMEOUT config is respected."""
|
||||
|
||||
check_result = subprocess.run(
|
||||
[sys.executable, '-c', 'import requests'],
|
||||
capture_output=True
|
||||
)
|
||||
if check_result.returncode != 0:
|
||||
pass
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Set very short timeout (but example.com should still succeed)
|
||||
import os
|
||||
env = os.environ.copy()
|
||||
env['TIMEOUT'] = '5'
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
# Should complete (success or fail, but not hang)
|
||||
assert result.returncode in (0, 1), "Should complete without hanging"
|
||||
|
||||
|
||||
def test_config_user_agent():
    """Test that USER_AGENT config is passed through to the favicon hook."""
    # Skip explicitly when requests is unavailable (was a silent `pass`).
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True
    )
    if check_result.returncode != 0:
        pytest.skip('requests library not installed')

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set custom user agent
        import os
        env = os.environ.copy()
        env['USER_AGENT'] = 'TestBot/1.0'

        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Should succeed (example.com doesn't block)
        if result.returncode == 0:
            # Parse clean JSONL output; skip non-JSON lines up front instead of
            # relying on JSONDecodeError for every line of noise.
            result_json = None
            for line in result.stdout.strip().split('\n'):
                line = line.strip()
                if not line.startswith('{'):
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if record.get('type') == 'ArchiveResult':
                    result_json = record
                    break

            if result_json:
                assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
|
||||
|
||||
def test_handles_https_urls():
    """Test that HTTPS URLs work correctly."""
    # Skip explicitly when requests is unavailable (was a silent `pass`).
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True
    )
    if check_result.returncode != 0:
        pytest.skip('requests library not installed')

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.org', '--snapshot-id', 'testhttps'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )

        if result.returncode == 0:
            favicon_file = tmpdir / 'favicon.ico'
            if favicon_file.exists():
                # Downloaded favicon must not be an empty file
                assert favicon_file.stat().st_size > 0
|
||||
|
||||
|
||||
def test_handles_missing_favicon_gracefully():
    """Test that favicon plugin handles sites without favicons gracefully.

    Note: The plugin falls back to Google's favicon service, which generates
    a generic icon even if the site doesn't have one, so extraction usually succeeds.
    """
    # Skip explicitly when requests is unavailable (was a silent `pass`).
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True
    )
    if check_result.returncode != 0:
        pytest.skip('requests library not installed')

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Try a URL that likely doesn't have a favicon
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.com/nonexistent', '--snapshot-id', 'test404'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )

        # May succeed (Google fallback) or fail gracefully
        assert result.returncode in (0, 1), "Should complete (may succeed or fail)"

        if result.returncode != 0:
            combined = result.stdout + result.stderr
            assert 'No favicon found' in combined or 'ERROR=' in combined
|
||||
|
||||
|
||||
def test_reports_missing_requests_library():
    """Test that script reports error when requests library is missing."""
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Run with -S and a bogus PYTHONPATH to simulate missing requests
        import os
        env = os.environ.copy()
        env['PYTHONPATH'] = '/nonexistent'

        result = subprocess.run(
            [sys.executable, '-S', str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=60,  # was missing: an interpreter hang would stall the whole suite
        )

        # Should fail and report missing requests
        if result.returncode != 0:
            combined = result.stdout + result.stderr
            # May report missing requests or other import errors
            assert 'requests' in combined.lower() or 'import' in combined.lower() or 'ERROR=' in combined
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this test module directly without invoking pytest manually.
    pytest.main(['-v', __file__])
|
||||
@@ -1,51 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"FORUMDL_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_FORUMDL", "USE_FORUMDL"],
|
||||
"description": "Enable forum downloading with forum-dl"
|
||||
},
|
||||
"FORUMDL_BINARY": {
|
||||
"type": "string",
|
||||
"default": "forum-dl",
|
||||
"description": "Path to forum-dl binary"
|
||||
},
|
||||
"FORUMDL_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 3600,
|
||||
"minimum": 30,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for forum downloads in seconds"
|
||||
},
|
||||
"FORUMDL_OUTPUT_FORMAT": {
|
||||
"type": "string",
|
||||
"default": "jsonl",
|
||||
"enum": ["jsonl", "warc", "mbox", "maildir", "mh", "mmdf", "babyl"],
|
||||
"description": "Output format for forum downloads"
|
||||
},
|
||||
"FORUMDL_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"description": "Whether to verify SSL certificates"
|
||||
},
|
||||
"FORUMDL_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["FORUMDL_DEFAULT_ARGS"],
|
||||
"description": "Default forum-dl arguments"
|
||||
},
|
||||
"FORUMDL_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["FORUMDL_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to forum-dl command"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,31 +0,0 @@
|
||||
#!/usr/bin/env python3
"""
Wrapper for forum-dl that applies Pydantic v2 compatibility patches.

forum-dl 0.3.0 calls the deprecated ``json(models_as_dict=False)`` API, which
was removed in Pydantic v2.  This wrapper monkey-patches ``JsonlWriter`` to
serialize entries with ``model_dump_json()`` before handing control to
forum-dl's normal entry point.
"""

import sys

# The patch must be applied BEFORE forum_dl runs any serialization code.
try:
    from forum_dl.writers.jsonl import JsonlWriter
    from pydantic import BaseModel

    if hasattr(BaseModel, 'model_dump_json'):  # running under Pydantic v2
        def _patched_serialize_entry(self, entry):
            """Serialize with Pydantic v2's model_dump_json()."""
            return entry.model_dump_json()

        JsonlWriter._serialize_entry = _patched_serialize_entry
except (ImportError, AttributeError):
    # forum-dl not installed, or already compatible - nothing to patch
    pass

# Delegate to forum-dl's real CLI entry point.
from forum_dl import main

if __name__ == '__main__':
    sys.exit(main())
|
||||
@@ -1,81 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Emit forum-dl Binary dependency for the crawl.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable, stripped of surrounding whitespace."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean environment variable; fall back to *default*."""
    value = get_env(name, '').lower()
    if value in ('true', '1', 'yes', 'on'):
        return True
    if value in ('false', '0', 'no', 'off'):
        return False
    # Unrecognized / unset values fall back to the caller's default.
    return default
|
||||
|
||||
|
||||
def output_binary(name: str, binproviders: str, overrides: dict | None = None):
|
||||
"""Output Binary JSONL record for a dependency."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'binproviders': binproviders,
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
if overrides:
|
||||
record['overrides'] = overrides
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def main():
    """Emit the forum-dl Binary dependency record, unless disabled."""
    if not get_env_bool('FORUMDL_ENABLED', True):
        sys.exit(0)

    # forum-dl's dependency closure is pinned explicitly so the pip provider
    # can install with --no-deps (avoids cchardet's broken native build).
    pip_packages = [
        '--no-deps',
        '--prefer-binary',
        'forum-dl',
        'chardet==5.2.0',
        'pydantic',
        'pydantic-core',
        'typing-extensions',
        'annotated-types',
        'typing-inspection',
        'beautifulsoup4',
        'soupsieve',
        'lxml',
        'requests',
        'urllib3',
        'certifi',
        'idna',
        'charset-normalizer',
        'tenacity',
        'python-dateutil',
        'six',
        'html2text',
        'warcio',
    ]
    output_binary(
        name='forum-dl',
        binproviders='pip,env',
        overrides={'pip': {'packages': pip_packages}},
    )

    sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,266 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Download forum content from a URL using forum-dl.
|
||||
|
||||
Usage: on_Snapshot__04_forumdl.bg.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Downloads forum content to $PWD/
|
||||
|
||||
Environment variables:
|
||||
FORUMDL_ENABLED: Enable forum downloading (default: True)
|
||||
FORUMDL_BINARY: Path to forum-dl binary (default: forum-dl)
|
||||
FORUMDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
|
||||
FORUMDL_OUTPUT_FORMAT: Output format (default: jsonl)
|
||||
FORUMDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
|
||||
FORUMDL_ARGS: Default forum-dl arguments (JSON array)
|
||||
FORUMDL_ARGS_EXTRA: Extra arguments to append (JSON array)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Monkey patch forum-dl for Pydantic v2 compatibility.
# forum-dl 0.3.0 uses the deprecated json(models_as_dict=False) API, which was
# removed in Pydantic v2, so swap in model_dump_json() when running under v2.
try:
    from forum_dl.writers.jsonl import JsonlWriter
    from pydantic import BaseModel

    # Pydantic v2 exposes model_dump_json; v1 does not.
    if hasattr(BaseModel, 'model_dump_json'):
        # Note: the original unused `original_serialize` backup binding
        # was removed - nothing ever restored or referenced it.
        def _patched_serialize_entry(self, entry):
            """Serialize a forum entry with Pydantic v2's model_dump_json()."""
            return entry.model_dump_json()

        JsonlWriter._serialize_entry = _patched_serialize_entry
except (ImportError, AttributeError):
    # forum-dl not installed or already compatible - nothing to patch
    pass
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
PLUGIN_NAME = 'forumdl'
|
||||
BIN_NAME = 'forum-dl'
|
||||
BIN_PROVIDERS = 'pip,env'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable, stripped of surrounding whitespace."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var ('true/1/yes/on' vs 'false/0/no/off')."""
    value = get_env(name, '').lower()
    if value in ('true', '1', 'yes', 'on'):
        return True
    if value in ('false', '0', 'no', 'off'):
        return False
    return default


def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer env var, falling back to *default* on bad input."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
|
||||
|
||||
|
||||
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Parse a JSON array env var into a list of strings.

    Returns *default* (or []) when the variable is unset, is not valid
    JSON, or parses to something other than a JSON array.
    """
    fallback = default if default is not None else []
    raw = get_env(name, '')
    if not raw:
        return fallback
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return fallback
    if not isinstance(parsed, list):
        return fallback
    # Coerce every element to str so downstream subprocess args are uniform
    return [str(item) for item in parsed]
|
||||
|
||||
|
||||
def get_binary_shebang(binary_path: str) -> str | None:
|
||||
"""Return interpreter from shebang line if present (e.g., /path/to/python)."""
|
||||
try:
|
||||
with open(binary_path, 'r', encoding='utf-8') as f:
|
||||
first_line = f.readline().strip()
|
||||
if first_line.startswith('#!'):
|
||||
return first_line[2:].strip().split(' ')[0]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def resolve_binary_path(binary: str) -> str | None:
|
||||
"""Resolve binary to an absolute path if possible."""
|
||||
if not binary:
|
||||
return None
|
||||
if Path(binary).is_file():
|
||||
return binary
|
||||
return shutil.which(binary)
|
||||
|
||||
|
||||
|
||||
def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Download forum content for *url* using forum-dl.

    Returns: (success, output_path, error_message).  "Success with no
    output" (True, None, '') means the URL simply had no forum content -
    that is not treated as an error.
    """
    # Get config from env (with FORUMDL_ prefix, x-fallback handled by config loader)
    timeout = get_env_int('FORUMDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
    # Plugin-specific SSL setting wins only when explicitly set; otherwise
    # fall back to the global CHECK_SSL_VALIDITY.
    check_ssl = get_env_bool('FORUMDL_CHECK_SSL_VALIDITY', True) if get_env('FORUMDL_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
    forumdl_args = get_env_array('FORUMDL_ARGS', [])
    forumdl_args_extra = get_env_array('FORUMDL_ARGS_EXTRA', [])
    output_format = get_env('FORUMDL_OUTPUT_FORMAT', 'jsonl')

    # Output directory is current directory (hook already runs in output dir)
    output_dir = Path(OUTPUT_DIR)

    # Build output filename based on format
    if output_format == 'warc':
        output_file = output_dir / 'forum.warc.gz'
    elif output_format == 'jsonl':
        output_file = output_dir / 'forum.jsonl'
    elif output_format == 'maildir':
        output_file = output_dir / 'forum'  # maildir is a directory
    elif output_format in ('mbox', 'mh', 'mmdf', 'babyl'):
        output_file = output_dir / f'forum.{output_format}'
    else:
        # NOTE(review): this branch is identical to the one above - kept for
        # readability of the known-format list, but any format falls through here.
        output_file = output_dir / f'forum.{output_format}'

    # Use our Pydantic v2 compatible wrapper if available, otherwise fall back to binary
    wrapper_path = Path(__file__).parent / 'forum-dl-wrapper.py'
    resolved_binary = resolve_binary_path(binary) or binary
    if wrapper_path.exists():
        # Run the wrapper under the same interpreter the forum-dl entry-point
        # script was installed for (from its shebang), defaulting to ours.
        forumdl_python = get_binary_shebang(resolved_binary) or sys.executable
        cmd = [forumdl_python, str(wrapper_path), *forumdl_args, '-f', output_format, '-o', str(output_file)]
    else:
        cmd = [resolved_binary, *forumdl_args, '-f', output_format, '-o', str(output_file)]

    if not check_ssl:
        cmd.append('--no-check-certificate')

    if forumdl_args_extra:
        cmd.extend(forumdl_args_extra)

    cmd.append(url)

    try:
        print(f'[forumdl] Starting download (timeout={timeout}s)', file=sys.stderr)
        output_lines: list[str] = []
        # stderr is merged into stdout so the reader thread sees one stream;
        # bufsize=1 gives line buffering in text mode.
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
        )

        def _read_output() -> None:
            # Tee child output: collect for error classification below and
            # mirror to our stderr for live progress.
            if not process.stdout:
                return
            for line in process.stdout:
                output_lines.append(line)
                sys.stderr.write(line)

        # Daemon thread so a stuck pipe can never block interpreter exit.
        reader = threading.Thread(target=_read_output, daemon=True)
        reader.start()

        try:
            process.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            process.kill()
            reader.join(timeout=1)
            return False, None, f'Timed out after {timeout} seconds'

        reader.join(timeout=1)
        combined_output = ''.join(output_lines)

        # Check if output file was created
        if output_file.exists() and output_file.stat().st_size > 0:
            return True, str(output_file), ''
        else:
            stderr = combined_output

            # These are NOT errors - page simply has no downloadable forum content
            stderr_lower = stderr.lower()
            if 'unsupported url' in stderr_lower:
                return True, None, ''  # Not a forum site - success, no output
            if 'no content' in stderr_lower:
                return True, None, ''  # No forum found - success, no output
            if 'extractornotfounderror' in stderr_lower:
                return True, None, ''  # No forum extractor for this URL - success, no output
            if process.returncode == 0:
                return True, None, ''  # forum-dl exited cleanly, just no forum - success

            # These ARE errors - something went wrong
            if '404' in stderr:
                return False, None, '404 Not Found'
            if '403' in stderr:
                return False, None, '403 Forbidden'
            if 'unable to extract' in stderr_lower:
                return False, None, 'Unable to extract forum info'

            return False, None, f'forum-dl error: {stderr}'

    except subprocess.TimeoutExpired:
        # Defensive: the inner wait() already handles its own timeout, so this
        # outer handler is effectively unreachable - kept as a safety net.
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL to download forum from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Download forum content from a URL using forum-dl.

    Emits an ArchiveResult JSONL record on stdout on success; on any
    transient failure it emits NOTHING and exits non-zero so the
    orchestrator can retry.
    """
    try:
        # Feature toggle: disabled counts as a temporary skip - exit 0
        # with no JSONL so no ArchiveResult is recorded.
        if not get_env_bool('FORUMDL_ENABLED', True):
            print('Skipping forum-dl (FORUMDL_ENABLED=False)', file=sys.stderr)
            sys.exit(0)

        binary = get_env('FORUMDL_BINARY', 'forum-dl')
        success, output, error = save_forum(url, binary)

        if success:
            print(json.dumps({
                'type': 'ArchiveResult',
                'status': 'succeeded',
                'output_str': output or '',
            }))
            sys.exit(0)

        # Transient error - log to stderr only, emit NO JSONL
        print(f'ERROR: {error}', file=sys.stderr)
        sys.exit(1)

    except Exception as e:
        # Unexpected crash - also treated as transient, NO JSONL
        print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,7 +0,0 @@
|
||||
<!-- Forum thumbnail - shows icon placeholder -->
|
||||
<div class="extractor-thumbnail forumdl-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
|
||||
<div style="display: flex; flex-direction: column; align-items: center; color: #888; font-size: 12px;">
|
||||
<span style="font-size: 32px;">💬</span>
|
||||
<span>Forum</span>
|
||||
</div>
|
||||
</div>
|
||||
@@ -1,147 +0,0 @@
|
||||
<!-- Fullscreen forum view - renders JSONL forum posts -->
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Forum Thread</title>
|
||||
<style>
|
||||
body {
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
background: #0d1117;
|
||||
color: #c9d1d9;
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif;
|
||||
line-height: 1.6;
|
||||
}
|
||||
.header {
|
||||
max-width: 1000px;
|
||||
margin: 0 auto 30px;
|
||||
text-align: center;
|
||||
padding: 20px;
|
||||
border-bottom: 1px solid #30363d;
|
||||
}
|
||||
.icon {
|
||||
font-size: 48px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
h1 {
|
||||
margin: 0;
|
||||
font-size: 28px;
|
||||
color: #f0f6fc;
|
||||
}
|
||||
.container {
|
||||
max-width: 1000px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
.post {
|
||||
background: #161b22;
|
||||
border: 1px solid #30363d;
|
||||
border-radius: 6px;
|
||||
margin-bottom: 16px;
|
||||
padding: 16px;
|
||||
transition: border-color 0.2s;
|
||||
}
|
||||
.post:hover {
|
||||
border-color: #58a6ff;
|
||||
}
|
||||
.post-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
margin-bottom: 12px;
|
||||
padding-bottom: 12px;
|
||||
border-bottom: 1px solid #21262d;
|
||||
}
|
||||
.post-author {
|
||||
font-weight: 600;
|
||||
color: #58a6ff;
|
||||
font-size: 14px;
|
||||
}
|
||||
.post-date {
|
||||
color: #8b949e;
|
||||
font-size: 12px;
|
||||
}
|
||||
.post-title {
|
||||
margin: 0 0 12px 0;
|
||||
font-size: 18px;
|
||||
font-weight: 600;
|
||||
color: #f0f6fc;
|
||||
}
|
||||
.post-content {
|
||||
color: #c9d1d9;
|
||||
word-wrap: break-word;
|
||||
}
|
||||
.post-content img {
|
||||
max-width: 100%;
|
||||
height: auto;
|
||||
border-radius: 4px;
|
||||
}
|
||||
.post-content a {
|
||||
color: #58a6ff;
|
||||
text-decoration: none;
|
||||
}
|
||||
.post-content a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
.loading {
|
||||
text-align: center;
|
||||
padding: 40px;
|
||||
color: #8b949e;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="header">
|
||||
<div class="icon">💬</div>
|
||||
<h1>Forum Thread</h1>
|
||||
</div>
|
||||
<div class="container">
|
||||
<div id="forum-posts" class="loading">Loading posts...</div>
|
||||
</div>
|
||||
<script>
|
||||
(async function() {
|
||||
try {
|
||||
const response = await fetch('{{ output_path }}');
|
||||
const text = await response.text();
|
||||
const posts = text.trim().split('\n').filter(line => line).map(line => JSON.parse(line));
|
||||
const container = document.getElementById('forum-posts');
|
||||
container.innerHTML = '';
|
||||
container.className = '';
|
||||
|
||||
posts.forEach(post => {
|
||||
const postDiv = document.createElement('div');
|
||||
postDiv.className = 'post';
|
||||
|
||||
const author = post.author || 'Anonymous';
|
||||
const date = post.date ? new Date(post.date).toLocaleString() : '';
|
||||
const title = post.title || '';
|
||||
                const content = post.content || post.body || '';  // NOTE(review): rendered as raw HTML below (posts contain markup) - trusts archived content, confirm sanitization upstream
|
||||
|
||||
postDiv.innerHTML = `
|
||||
<div class="post-header">
|
||||
<span class="post-author">${escapeHtml(author)}</span>
|
||||
<span class="post-date">${escapeHtml(date)}</span>
|
||||
</div>
|
||||
${title ? `<h2 class="post-title">${escapeHtml(title)}</h2>` : ''}
|
||||
<div class="post-content">${content}</div>
|
||||
`;
|
||||
container.appendChild(postDiv);
|
||||
});
|
||||
|
||||
if (posts.length === 0) {
|
||||
container.innerHTML = '<div class="loading">No posts found</div>';
|
||||
}
|
||||
} catch(e) {
|
||||
document.getElementById('forum-posts').innerHTML = '<div class="loading">Error loading posts: ' + e.message + '</div>';
|
||||
}
|
||||
})();
|
||||
|
||||
function escapeHtml(text) {
|
||||
const div = document.createElement('div');
|
||||
div.textContent = text;
|
||||
return div.innerHTML;
|
||||
}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -1 +0,0 @@
|
||||
<span class="abx-output-icon abx-output-icon--forumdl" title="Forum"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M4 5h16v10H7l-3 3V5z"/></svg></span>
|
||||
@@ -1,317 +0,0 @@
|
||||
"""
|
||||
Integration tests for forumdl plugin
|
||||
|
||||
Tests verify:
1. Hook script exists
|
||||
2. Dependencies installed via validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
4. Forum extraction works on forum URLs
|
||||
5. JSONL output is correct
|
||||
6. Config options work
|
||||
7. Handles non-forum URLs gracefully
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
FORUMDL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_forumdl.*'), None)
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Module-level cache for binary path
|
||||
_forumdl_binary_path = None
|
||||
_forumdl_lib_root = None
|
||||
|
||||
def get_forumdl_binary_path():
    """Get the installed forum-dl binary path from cache or by running installation.

    Resolution order:
      1. module-level cache (_forumdl_binary_path)
      2. abx-pkg lookup via the pip/env providers
      3. actually installing via the real pip install hook, seeded with the
         overrides the crawl hook emits
    Returns the absolute path as a str, or None if every strategy failed.
    """
    global _forumdl_binary_path
    if _forumdl_binary_path:
        return _forumdl_binary_path

    # Try to find forum-dl binary using abx-pkg
    from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides

    try:
        binary = Binary(
            name='forum-dl',
            binproviders=[PipProvider(), EnvProvider()]
        ).load()

        if binary and binary.abspath:
            _forumdl_binary_path = str(binary.abspath)
            return _forumdl_binary_path
    except Exception:
        # Lookup failure just means "not installed yet" - fall through to install
        pass

    # If not found, try to install via pip using the crawl hook overrides
    pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__11_pip_install.py'
    crawl_hook = PLUGIN_DIR / 'on_Crawl__25_forumdl_install.py'
    if pip_hook.exists():
        binary_id = str(uuid.uuid4())
        machine_id = str(uuid.uuid4())
        overrides = None

        # Run the crawl hook to learn the exact pip package set (overrides)
        # it declares for the forum-dl Binary record.
        if crawl_hook.exists():
            crawl_result = subprocess.run(
                [sys.executable, str(crawl_hook)],
                capture_output=True,
                text=True,
                timeout=30,
            )
            for crawl_line in crawl_result.stdout.strip().split('\n'):
                if crawl_line.strip().startswith('{'):
                    try:
                        crawl_record = json.loads(crawl_line)
                        if crawl_record.get('type') == 'Binary' and crawl_record.get('name') == 'forum-dl':
                            overrides = crawl_record.get('overrides')
                            break
                    except json.JSONDecodeError:
                        continue

        # Create a persistent temp LIB_DIR for the pip provider
        # (persistent across tests in this process via _forumdl_lib_root)
        import platform
        global _forumdl_lib_root
        if not _forumdl_lib_root:
            _forumdl_lib_root = tempfile.mkdtemp(prefix='forumdl-lib-')
        # Normalize the arch/OS pair into the lib-dir naming scheme the
        # pip provider expects, e.g. 'arm64-darwin' or 'x86_64-linux'.
        machine = platform.machine().lower()
        system = platform.system().lower()
        if machine in ('arm64', 'aarch64'):
            machine = 'arm64'
        elif machine in ('x86_64', 'amd64'):
            machine = 'x86_64'
        machine_type = f"{machine}-{system}"
        lib_dir = Path(_forumdl_lib_root) / 'lib' / machine_type
        lib_dir.mkdir(parents=True, exist_ok=True)
        env = os.environ.copy()
        env['LIB_DIR'] = str(lib_dir)
        env['DATA_DIR'] = str(Path(_forumdl_lib_root) / 'data')

        cmd = [
            sys.executable, str(pip_hook),
            '--binary-id', binary_id,
            '--machine-id', machine_id,
            '--name', 'forum-dl'
        ]
        if overrides:
            cmd.append(f'--overrides={json.dumps(overrides)}')

        install_result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=300,
            env=env,
        )

        # Parse Binary from pip installation
        for install_line in install_result.stdout.strip().split('\n'):
            if install_line.strip():
                try:
                    install_record = json.loads(install_line)
                    if install_record.get('type') == 'Binary' and install_record.get('name') == 'forum-dl':
                        _forumdl_binary_path = install_record.get('abspath')
                        return _forumdl_binary_path
                except json.JSONDecodeError:
                    pass

    return None
|
||||
|
||||
|
||||
def test_hook_script_exists():
    """Verify the on_Snapshot forumdl hook exists in the plugin directory."""
    # FORUMDL_HOOK comes from next(glob, None), so it can legitimately be
    # None - assert that explicitly instead of crashing with AttributeError.
    assert FORUMDL_HOOK is not None, f"No on_Snapshot__*_forumdl.* hook found in {PLUGIN_DIR}"
    assert FORUMDL_HOOK.exists(), f"Hook not found: {FORUMDL_HOOK}"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
    """Verify forum-dl is installed by calling the REAL installation hooks."""
    binary_path = get_forumdl_binary_path()
    if not binary_path:
        # pytest.fail is not stripped under `python -O`, unlike `assert False`
        pytest.fail(
            "forum-dl installation failed. Install hook should install forum-dl automatically. "
            "Note: forum-dl has a dependency on cchardet which may not compile on Python 3.14+ "
            "due to removed longintrepr.h header."
        )
    assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}"
|
||||
|
||||
|
||||
def test_handles_non_forum_url():
    """Test that forum-dl extractor handles non-forum URLs gracefully via hook."""
    import os

    binary_path = get_forumdl_binary_path()
    if not binary_path:
        # Without the binary the hook cannot run; previously this was a silent
        # `pass` followed by Path(None), which raised TypeError.
        pytest.skip('forum-dl binary not available')
    assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        env = os.environ.copy()
        env['FORUMDL_BINARY'] = binary_path

        # Run forum-dl extraction hook on non-forum URL
        result = subprocess.run(
            [sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Should exit 0 even for non-forum URL (graceful handling)
        assert result.returncode == 0, f"Should handle non-forum URL gracefully: {result.stderr}"

        # Parse clean JSONL output; filter non-JSON noise up front
        result_json = None
        for line in result.stdout.strip().split('\n'):
            line = line.strip()
            if not line.startswith('{'):
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                result_json = record
                break

        assert result_json, "Should have ArchiveResult JSONL output"
        assert result_json['status'] == 'succeeded', f"Should succeed even for non-forum URL: {result_json}"
|
||||
|
||||
|
||||
def test_config_save_forumdl_false_skips():
    """FORUMDL_ENABLED=False must exit 0 without emitting any JSONL."""
    import os

    with tempfile.TemporaryDirectory() as tmpdir:
        env = dict(os.environ, FORUMDL_ENABLED='False')

        result = subprocess.run(
            [sys.executable, str(FORUMDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )

        assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"

        # A disabled feature is a temporary skip: reason goes to stderr only...
        assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"

        # ...and stdout must carry no JSONL records at all
        jsonl_lines = [ln for ln in result.stdout.strip().split('\n') if ln.strip().startswith('{')]
        assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
|
||||
|
||||
|
||||
def test_config_timeout():
    """Test that FORUMDL_TIMEOUT config is respected."""
    import os

    binary_path = get_forumdl_binary_path()
    if not binary_path:
        # Previously a silent `pass` fell through to Path(None) - skip instead.
        pytest.skip('forum-dl binary not available')
    assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"

    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env['FORUMDL_BINARY'] = binary_path
        env['FORUMDL_TIMEOUT'] = '5'

        start_time = time.time()
        result = subprocess.run(
            [sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=10  # Should complete in 5s, use 10s as safety margin
        )
        elapsed_time = time.time() - start_time

        assert result.returncode == 0, f"Should complete without hanging: {result.stderr}"
        # Allow 1 second overhead for subprocess startup and Python interpreter
        assert elapsed_time <= 6.0, f"Should complete within 6 seconds (5s timeout + 1s overhead), took {elapsed_time:.2f}s"
|
||||
|
||||
|
||||
def test_real_forum_url():
|
||||
"""Test that forum-dl extracts content from a real HackerNews thread with jsonl output.
|
||||
|
||||
Uses our Pydantic v2 compatible wrapper to fix forum-dl 0.3.0's incompatibility.
|
||||
"""
|
||||
import os
|
||||
|
||||
binary_path = get_forumdl_binary_path()
|
||||
assert binary_path, "forum-dl binary not available"
|
||||
assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Use HackerNews - one of the most reliable forum-dl extractors
|
||||
forum_url = 'https://news.ycombinator.com/item?id=1'
|
||||
|
||||
env = os.environ.copy()
|
||||
env['FORUMDL_BINARY'] = binary_path
|
||||
env['FORUMDL_TIMEOUT'] = '60'
|
||||
env['FORUMDL_OUTPUT_FORMAT'] = 'jsonl' # Use jsonl format
|
||||
# HTML output could be added via: env['FORUMDL_ARGS_EXTRA'] = json.dumps(['--files-output', './files'])
|
||||
|
||||
start_time = time.time()
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(FORUMDL_HOOK), '--url', forum_url, '--snapshot-id', 'testforum'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=90
|
||||
)
|
||||
elapsed_time = time.time() - start_time
|
||||
|
||||
# Should succeed with our Pydantic v2 wrapper
|
||||
assert result.returncode == 0, f"Should extract forum successfully: {result.stderr}"
|
||||
|
||||
# Parse JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
|
||||
# Check that forum files were downloaded
|
||||
output_files = list(tmpdir.glob('**/*'))
|
||||
forum_files = [f for f in output_files if f.is_file()]
|
||||
|
||||
assert len(forum_files) > 0, f"Should have downloaded at least one forum file. Files: {output_files}"
|
||||
|
||||
# Verify the JSONL file has content
|
||||
jsonl_file = tmpdir / 'forum.jsonl'
|
||||
assert jsonl_file.exists(), "Should have created forum.jsonl"
|
||||
assert jsonl_file.stat().st_size > 0, "forum.jsonl should not be empty"
|
||||
|
||||
print(f"Successfully extracted {len(forum_files)} file(s) in {elapsed_time:.2f}s")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
@@ -1,54 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"GALLERYDL_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_GALLERYDL", "USE_GALLERYDL"],
|
||||
"description": "Enable gallery downloading with gallery-dl"
|
||||
},
|
||||
"GALLERYDL_BINARY": {
|
||||
"type": "string",
|
||||
"default": "gallery-dl",
|
||||
"description": "Path to gallery-dl binary"
|
||||
},
|
||||
"GALLERYDL_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 3600,
|
||||
"minimum": 30,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for gallery downloads in seconds"
|
||||
},
|
||||
"GALLERYDL_COOKIES_FILE": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "COOKIES_FILE",
|
||||
"description": "Path to cookies file"
|
||||
},
|
||||
"GALLERYDL_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"description": "Whether to verify SSL certificates"
|
||||
},
|
||||
"GALLERYDL_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [
|
||||
"--write-metadata",
|
||||
"--write-info-json"
|
||||
],
|
||||
"x-aliases": ["GALLERYDL_DEFAULT_ARGS"],
|
||||
"description": "Default gallery-dl arguments"
|
||||
},
|
||||
"GALLERYDL_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["GALLERYDL_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to gallery-dl command"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,48 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Emit gallery-dl Binary dependency for the crawl.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
|
||||
val = get_env(name, '').lower()
|
||||
if val in ('true', '1', 'yes', 'on'):
|
||||
return True
|
||||
if val in ('false', '0', 'no', 'off'):
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def output_binary(name: str, binproviders: str):
|
||||
"""Output Binary JSONL record for a dependency."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'binproviders': binproviders,
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def main():
|
||||
gallerydl_enabled = get_env_bool('GALLERYDL_ENABLED', True)
|
||||
|
||||
if not gallerydl_enabled:
|
||||
sys.exit(0)
|
||||
|
||||
output_binary(name='gallery-dl', binproviders='pip,brew,apt,env')
|
||||
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,261 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Download image galleries from a URL using gallery-dl.
|
||||
|
||||
Usage: on_Snapshot__03_gallerydl.bg.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Downloads gallery images to $PWD/gallerydl/
|
||||
|
||||
Environment variables:
|
||||
GALLERYDL_ENABLED: Enable gallery-dl gallery extraction (default: True)
|
||||
GALLERYDL_BINARY: Path to gallery-dl binary (default: gallery-dl)
|
||||
GALLERYDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
|
||||
GALLERYDL_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
|
||||
GALLERYDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
|
||||
GALLERYDL_ARGS: Default gallery-dl arguments (JSON array)
|
||||
GALLERYDL_ARGS_EXTRA: Extra arguments to append (JSON array)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
PLUGIN_NAME = 'gallerydl'
|
||||
BIN_NAME = 'gallery-dl'
|
||||
BIN_PROVIDERS = 'pip,env'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
|
||||
val = get_env(name, '').lower()
|
||||
if val in ('true', '1', 'yes', 'on'):
|
||||
return True
|
||||
if val in ('false', '0', 'no', 'off'):
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
|
||||
try:
|
||||
return int(get_env(name, str(default)))
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
|
||||
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
|
||||
"""Parse a JSON array from environment variable."""
|
||||
val = get_env(name, '')
|
||||
if not val:
|
||||
return default if default is not None else []
|
||||
try:
|
||||
result = json.loads(val)
|
||||
if isinstance(result, list):
|
||||
return [str(item) for item in result]
|
||||
return default if default is not None else []
|
||||
except json.JSONDecodeError:
|
||||
return default if default is not None else []
|
||||
|
||||
|
||||
STATICFILE_DIR = '../staticfile'
|
||||
|
||||
def has_staticfile_output() -> bool:
|
||||
"""Check if staticfile extractor already downloaded this URL."""
|
||||
staticfile_dir = Path(STATICFILE_DIR)
|
||||
if not staticfile_dir.exists():
|
||||
return False
|
||||
stdout_log = staticfile_dir / 'stdout.log'
|
||||
if not stdout_log.exists():
|
||||
return False
|
||||
for line in stdout_log.read_text(errors='ignore').splitlines():
|
||||
line = line.strip()
|
||||
if not line.startswith('{'):
|
||||
continue
|
||||
try:
|
||||
record = json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded':
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
"""
|
||||
Download gallery using gallery-dl.
|
||||
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
# Get config from env (with GALLERYDL_ prefix, x-fallback handled by config loader)
|
||||
timeout = get_env_int('GALLERYDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
|
||||
check_ssl = get_env_bool('GALLERYDL_CHECK_SSL_VALIDITY', True) if get_env('GALLERYDL_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
|
||||
gallerydl_args = get_env_array('GALLERYDL_ARGS', [])
|
||||
gallerydl_args_extra = get_env_array('GALLERYDL_ARGS_EXTRA', [])
|
||||
cookies_file = get_env('GALLERYDL_COOKIES_FILE') or get_env('COOKIES_FILE', '')
|
||||
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
|
||||
# Build command
|
||||
# Use -D for exact directory (flat structure) instead of -d (nested structure)
|
||||
cmd = [
|
||||
binary,
|
||||
*gallerydl_args,
|
||||
'-D', str(output_dir),
|
||||
]
|
||||
|
||||
if not check_ssl:
|
||||
cmd.append('--no-check-certificate')
|
||||
|
||||
if cookies_file and Path(cookies_file).exists():
|
||||
cmd.extend(['-C', cookies_file])
|
||||
|
||||
if gallerydl_args_extra:
|
||||
cmd.extend(gallerydl_args_extra)
|
||||
|
||||
cmd.append(url)
|
||||
|
||||
try:
|
||||
print(f'[gallerydl] Starting download (timeout={timeout}s)', file=sys.stderr)
|
||||
output_lines: list[str] = []
|
||||
process = subprocess.Popen(
|
||||
cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
text=True,
|
||||
bufsize=1,
|
||||
)
|
||||
|
||||
def _read_output() -> None:
|
||||
if not process.stdout:
|
||||
return
|
||||
for line in process.stdout:
|
||||
output_lines.append(line)
|
||||
sys.stderr.write(line)
|
||||
|
||||
reader = threading.Thread(target=_read_output, daemon=True)
|
||||
reader.start()
|
||||
|
||||
try:
|
||||
process.wait(timeout=timeout)
|
||||
except subprocess.TimeoutExpired:
|
||||
process.kill()
|
||||
reader.join(timeout=1)
|
||||
return False, None, f'Timed out after {timeout} seconds'
|
||||
|
||||
reader.join(timeout=1)
|
||||
combined_output = ''.join(output_lines)
|
||||
|
||||
# Check if any gallery files were downloaded (search recursively)
|
||||
gallery_extensions = (
|
||||
'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg',
|
||||
'.mp4', '.webm', '.mkv', '.avi', '.mov', '.flv',
|
||||
'.json', '.txt', '.zip',
|
||||
)
|
||||
|
||||
downloaded_files = [
|
||||
f for f in output_dir.rglob('*')
|
||||
if f.is_file() and f.suffix.lower() in gallery_extensions
|
||||
]
|
||||
|
||||
if downloaded_files:
|
||||
# Return first image file, or first file if no images
|
||||
image_files = [
|
||||
f for f in downloaded_files
|
||||
if f.suffix.lower() in ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')
|
||||
]
|
||||
output = str(image_files[0]) if image_files else str(downloaded_files[0])
|
||||
return True, output, ''
|
||||
else:
|
||||
stderr = combined_output
|
||||
|
||||
# These are NOT errors - page simply has no downloadable gallery
|
||||
# Return success with no output (legitimate "nothing to download")
|
||||
stderr_lower = stderr.lower()
|
||||
if 'unsupported url' in stderr_lower:
|
||||
return True, None, '' # Not a gallery site - success, no output
|
||||
if 'no results' in stderr_lower:
|
||||
return True, None, '' # No gallery found - success, no output
|
||||
if process.returncode == 0:
|
||||
return True, None, '' # gallery-dl exited cleanly, just no gallery - success
|
||||
|
||||
# These ARE errors - something went wrong
|
||||
if '404' in stderr:
|
||||
return False, None, '404 Not Found'
|
||||
if '403' in stderr:
|
||||
return False, None, '403 Forbidden'
|
||||
if 'unable to extract' in stderr_lower:
|
||||
return False, None, 'Unable to extract gallery info'
|
||||
|
||||
return False, None, f'gallery-dl error: {stderr}'
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
return False, None, f'Timed out after {timeout} seconds'
|
||||
except Exception as e:
|
||||
return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='URL to download gallery from')
|
||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||
def main(url: str, snapshot_id: str):
|
||||
"""Download image gallery from a URL using gallery-dl."""
|
||||
|
||||
output = None
|
||||
status = 'failed'
|
||||
error = ''
|
||||
|
||||
try:
|
||||
# Check if gallery-dl is enabled
|
||||
if not get_env_bool('GALLERYDL_ENABLED', True):
|
||||
print('Skipping gallery-dl (GALLERYDL_ENABLED=False)', file=sys.stderr)
|
||||
# Temporary failure (config disabled) - NO JSONL emission
|
||||
sys.exit(0)
|
||||
|
||||
# Check if staticfile extractor already handled this (permanent skip)
|
||||
if has_staticfile_output():
|
||||
print(f'Skipping gallery-dl - staticfile extractor already downloaded this', file=sys.stderr)
|
||||
print(json.dumps({
|
||||
'type': 'ArchiveResult',
|
||||
'status': 'skipped',
|
||||
'output_str': 'staticfile already handled',
|
||||
}))
|
||||
sys.exit(0)
|
||||
|
||||
# Get binary from environment
|
||||
binary = get_env('GALLERYDL_BINARY', 'gallery-dl')
|
||||
|
||||
# Run extraction
|
||||
success, output, error = save_gallery(url, binary)
|
||||
|
||||
if success:
|
||||
# Success - emit ArchiveResult
|
||||
result = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': 'succeeded',
|
||||
'output_str': output or ''
|
||||
}
|
||||
print(json.dumps(result))
|
||||
sys.exit(0)
|
||||
else:
|
||||
# Transient error - emit NO JSONL
|
||||
print(f'ERROR: {error}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
# Transient error - emit NO JSONL
|
||||
print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,11 +0,0 @@
|
||||
<!-- Gallery thumbnail - shows first image or placeholder -->
|
||||
<div class="extractor-thumbnail gallerydl-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
|
||||
<img src="{{ output_path }}"
|
||||
style="width: 100%; height: 100px; object-fit: contain;"
|
||||
alt="Gallery thumbnail"
|
||||
onerror="this.style.display='none'; this.nextElementSibling.style.display='flex';">
|
||||
<div style="display: none; flex-direction: column; align-items: center; color: #888; font-size: 12px;">
|
||||
<span style="font-size: 32px;">🖼️</span>
|
||||
<span>Gallery</span>
|
||||
</div>
|
||||
</div>
|
||||
@@ -1,28 +0,0 @@
|
||||
<!-- Fullscreen gallery view - shows image in full size -->
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Gallery</title>
|
||||
<style>
|
||||
body {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
background: #000;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
min-height: 100vh;
|
||||
}
|
||||
img {
|
||||
max-width: 100%;
|
||||
max-height: 100vh;
|
||||
object-fit: contain;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<img src="{{ output_path }}" alt="Gallery image">
|
||||
</body>
|
||||
</html>
|
||||
@@ -1 +0,0 @@
|
||||
<span class="abx-output-icon abx-output-icon--gallerydl" title="Gallery"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="5" width="18" height="14" rx="2"/><circle cx="8" cy="10" r="1.5" fill="currentColor" stroke="none"/><path d="M21 17l-5-5-5 5"/></svg></span>
|
||||
@@ -1,190 +0,0 @@
|
||||
"""
|
||||
Integration tests for gallerydl plugin
|
||||
|
||||
Tests verify:
|
||||
pass
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
4. Gallery extraction works on gallery URLs
|
||||
5. JSONL output is correct
|
||||
6. Config options work
|
||||
7. Handles non-gallery URLs gracefully
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
GALLERYDL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_gallerydl.*'), None)
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
assert GALLERYDL_HOOK.exists(), f"Hook not found: {GALLERYDL_HOOK}"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify gallery-dl is available via abx-pkg."""
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
missing_binaries = []
|
||||
|
||||
# Verify gallery-dl is available
|
||||
gallerydl_binary = Binary(name='gallery-dl', binproviders=[PipProvider(), EnvProvider()])
|
||||
gallerydl_loaded = gallerydl_binary.load()
|
||||
if not (gallerydl_loaded and gallerydl_loaded.abspath):
|
||||
missing_binaries.append('gallery-dl')
|
||||
|
||||
if missing_binaries:
|
||||
pass
|
||||
|
||||
|
||||
def test_handles_non_gallery_url():
|
||||
"""Test that gallery-dl extractor handles non-gallery URLs gracefully via hook."""
|
||||
# Prerequisites checked by earlier test
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Run gallery-dl extraction hook on non-gallery URL
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(GALLERYDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
# Should exit 0 even for non-gallery URL
|
||||
assert result.returncode == 0, f"Should handle non-gallery URL gracefully: {result.stderr}"
|
||||
|
||||
# Parse clean JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json, "Should have ArchiveResult JSONL output"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
|
||||
|
||||
def test_config_save_gallery_dl_false_skips():
|
||||
"""Test that GALLERYDL_ENABLED=False exits without emitting JSONL."""
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = os.environ.copy()
|
||||
env['GALLERYDL_ENABLED'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(GALLERYDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
|
||||
|
||||
# Feature disabled - temporary failure, should NOT emit JSONL
|
||||
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
|
||||
|
||||
# Should NOT emit any JSONL
|
||||
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
|
||||
assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
|
||||
|
||||
|
||||
def test_config_timeout():
|
||||
"""Test that GALLERY_DL_TIMEOUT config is respected."""
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = os.environ.copy()
|
||||
env['GALLERY_DL_TIMEOUT'] = '5'
|
||||
|
||||
start_time = time.time()
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(GALLERYDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=10 # Should complete in 5s, use 10s as safety margin
|
||||
)
|
||||
elapsed_time = time.time() - start_time
|
||||
|
||||
assert result.returncode == 0, f"Should complete without hanging: {result.stderr}"
|
||||
# Allow 1 second overhead for subprocess startup and Python interpreter
|
||||
assert elapsed_time <= 6.0, f"Should complete within 6 seconds (5s timeout + 1s overhead), took {elapsed_time:.2f}s"
|
||||
|
||||
|
||||
def test_real_gallery_url():
|
||||
"""Test that gallery-dl can extract images from a real Flickr gallery URL."""
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Use a real Flickr photo page
|
||||
gallery_url = 'https://www.flickr.com/photos/gregorydolivet/55002388567/in/explore-2025-12-25/'
|
||||
|
||||
env = os.environ.copy()
|
||||
env['GALLERY_DL_TIMEOUT'] = '60' # Give it time to download
|
||||
|
||||
start_time = time.time()
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(GALLERYDL_HOOK), '--url', gallery_url, '--snapshot-id', 'testflickr'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=90
|
||||
)
|
||||
elapsed_time = time.time() - start_time
|
||||
|
||||
# Should succeed
|
||||
assert result.returncode == 0, f"Should extract gallery successfully: {result.stderr}"
|
||||
|
||||
# Parse JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
|
||||
# Check that some files were downloaded
|
||||
output_files = list(tmpdir.glob('**/*'))
|
||||
image_files = [f for f in output_files if f.is_file() and f.suffix.lower() in ('.jpg', '.jpeg', '.png', '.gif', '.webp')]
|
||||
|
||||
assert len(image_files) > 0, f"Should have downloaded at least one image. Files: {output_files}"
|
||||
|
||||
print(f"Successfully extracted {len(image_files)} image(s) in {elapsed_time:.2f}s")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
@@ -1,44 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"GIT_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_GIT", "USE_GIT"],
|
||||
"description": "Enable git repository cloning"
|
||||
},
|
||||
"GIT_BINARY": {
|
||||
"type": "string",
|
||||
"default": "git",
|
||||
"description": "Path to git binary"
|
||||
},
|
||||
"GIT_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 120,
|
||||
"minimum": 10,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for git operations in seconds"
|
||||
},
|
||||
"GIT_DOMAINS": {
|
||||
"type": "string",
|
||||
"default": "github.com,gitlab.com,bitbucket.org,gist.github.com,codeberg.org,gitea.com,git.sr.ht",
|
||||
"description": "Comma-separated list of domains to treat as git repositories"
|
||||
},
|
||||
"GIT_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": ["clone", "--depth=1", "--recursive"],
|
||||
"x-aliases": ["GIT_DEFAULT_ARGS"],
|
||||
"description": "Default git arguments"
|
||||
},
|
||||
"GIT_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["GIT_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to git command"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,48 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Emit git Binary dependency for the crawl.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
|
||||
val = get_env(name, '').lower()
|
||||
if val in ('true', '1', 'yes', 'on'):
|
||||
return True
|
||||
if val in ('false', '0', 'no', 'off'):
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def output_binary(name: str, binproviders: str):
|
||||
"""Output Binary JSONL record for a dependency."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'binproviders': binproviders,
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def main():
|
||||
git_enabled = get_env_bool('GIT_ENABLED', True)
|
||||
|
||||
if not git_enabled:
|
||||
sys.exit(0)
|
||||
|
||||
output_binary(name='git', binproviders='apt,brew,env')
|
||||
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,145 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Clone a git repository from a URL.
|
||||
|
||||
Usage: on_Snapshot__05_git.bg.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Clones repository to $PWD/repo
|
||||
|
||||
Environment variables:
|
||||
GIT_BINARY: Path to git binary
|
||||
GIT_TIMEOUT: Timeout in seconds (default: 120)
|
||||
GIT_ARGS: Default git arguments (JSON array, default: ["clone", "--depth=1", "--recursive"])
|
||||
GIT_ARGS_EXTRA: Extra arguments to append (JSON array, default: [])
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if GIT_* not set:
|
||||
TIMEOUT: Fallback timeout
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
PLUGIN_NAME = 'git'
|
||||
BIN_NAME = 'git'
|
||||
BIN_PROVIDERS = 'apt,brew,env'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
|
||||
try:
|
||||
return int(get_env(name, str(default)))
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
|
||||
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
|
||||
"""Parse a JSON array from environment variable."""
|
||||
val = get_env(name, '')
|
||||
if not val:
|
||||
return default if default is not None else []
|
||||
try:
|
||||
result = json.loads(val)
|
||||
if isinstance(result, list):
|
||||
return [str(item) for item in result]
|
||||
return default if default is not None else []
|
||||
except json.JSONDecodeError:
|
||||
return default if default is not None else []
|
||||
|
||||
|
||||
def is_git_url(url: str) -> bool:
|
||||
"""Check if URL looks like a git repository."""
|
||||
git_patterns = [
|
||||
'.git',
|
||||
'github.com',
|
||||
'gitlab.com',
|
||||
'bitbucket.org',
|
||||
'git://',
|
||||
'ssh://git@',
|
||||
]
|
||||
return any(p in url.lower() for p in git_patterns)
|
||||
|
||||
|
||||
def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
"""
|
||||
Clone git repository.
|
||||
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
timeout = get_env_int('GIT_TIMEOUT') or get_env_int('TIMEOUT', 120)
|
||||
git_args = get_env_array('GIT_ARGS', ["clone", "--depth=1", "--recursive"])
|
||||
git_args_extra = get_env_array('GIT_ARGS_EXTRA', [])
|
||||
|
||||
cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR]
|
||||
|
||||
try:
|
||||
result = subprocess.run(cmd, timeout=timeout)
|
||||
|
||||
if result.returncode == 0 and Path(OUTPUT_DIR).is_dir():
|
||||
return True, OUTPUT_DIR, ''
|
||||
else:
|
||||
return False, None, f'git clone failed (exit={result.returncode})'
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
return False, None, f'Timed out after {timeout} seconds'
|
||||
except Exception as e:
|
||||
return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='Git repository URL')
|
||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||
def main(url: str, snapshot_id: str):
|
||||
"""Clone a git repository from a URL."""
|
||||
|
||||
output = None
|
||||
status = 'failed'
|
||||
error = ''
|
||||
|
||||
try:
|
||||
# Check if URL looks like a git repo
|
||||
if not is_git_url(url):
|
||||
print(f'Skipping git clone for non-git URL: {url}', file=sys.stderr)
|
||||
print(json.dumps({
|
||||
'type': 'ArchiveResult',
|
||||
'status': 'skipped',
|
||||
'output_str': 'Not a git URL',
|
||||
}))
|
||||
sys.exit(0)
|
||||
|
||||
# Get binary from environment
|
||||
binary = get_env('GIT_BINARY', 'git')
|
||||
|
||||
# Run extraction
|
||||
success, output, error = clone_git(url, binary)
|
||||
status = 'succeeded' if success else 'failed'
|
||||
|
||||
except Exception as e:
|
||||
error = f'{type(e).__name__}: {e}'
|
||||
status = 'failed'
|
||||
|
||||
if error:
|
||||
print(f'ERROR: {error}', file=sys.stderr)
|
||||
|
||||
# Output clean JSONL (no RESULT_JSON= prefix)
|
||||
result = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': status,
|
||||
'output_str': output or error or '',
|
||||
}
|
||||
print(json.dumps(result))
|
||||
|
||||
sys.exit(0 if status == 'succeeded' else 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,5 +0,0 @@
|
||||
<!-- Git thumbnail - shows git repository icon and info -->
|
||||
<div class="extractor-thumbnail git-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #f6f8fa; display: flex; flex-direction: column; align-items: center; justify-content: center; padding: 10px;">
|
||||
<span style="font-size: 32px;">📂</span>
|
||||
<span style="font-size: 11px; color: #586069; margin-top: 4px;">Git Repository</span>
|
||||
</div>
|
||||
@@ -1 +0,0 @@
|
||||
<span class="abx-output-icon abx-output-icon--git" title="Git"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="6" cy="6" r="2"/><circle cx="6" cy="18" r="2"/><circle cx="18" cy="12" r="2"/><path d="M8 6h5a3 3 0 0 1 3 3v1"/><path d="M8 18h5a3 3 0 0 0 3-3v-1"/></svg></span>
|
||||
@@ -1,130 +0,0 @@
|
||||
"""
|
||||
Integration tests for git plugin
|
||||
|
||||
Tests verify:
|
||||
pass
|
||||
1. Validate hook checks for git binary
|
||||
2. Verify deps with abx-pkg
|
||||
3. Standalone git extractor execution
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
GIT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_git.*'), None)
|
||||
TEST_URL = 'https://github.com/ArchiveBox/abx-pkg.git'
|
||||
|
||||
def test_hook_script_exists():
|
||||
assert GIT_HOOK.exists()
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
    """Verify git is available via abx-pkg."""
    # Removed unused BinProviderOverrides from this import.
    from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider

    git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
    git_loaded = git_binary.load()

    assert git_loaded and git_loaded.abspath, "git is required for git plugin tests"
|
||||
|
||||
def test_reports_missing_git():
    """With git absent from PATH, the hook should surface a dependency error."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Point PATH at a nonexistent dir so the hook cannot find git.
        hook_env = {'PATH': '/nonexistent'}
        cmd = [sys.executable, str(GIT_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123']
        result = subprocess.run(cmd, cwd=tmpdir, capture_output=True, text=True, env=hook_env)

        # Only verify messaging when the hook actually failed (same guard as before).
        if result.returncode == 0:
            return
        combined = result.stdout + result.stderr
        mentions_dep = 'DEPENDENCY_NEEDED' in combined or 'ERROR=' in combined
        assert mentions_dep or 'git' in combined.lower()
|
||||
|
||||
def test_handles_non_git_url():
    """The hook should fail or skip (not crash) when given a non-git URL."""
    assert shutil.which('git'), "git binary not available"

    with tempfile.TemporaryDirectory() as tmpdir:
        result = subprocess.run(
            [sys.executable, str(GIT_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'],
            cwd=tmpdir, capture_output=True, text=True, timeout=30
        )
        # Should fail or skip for non-git URL
        assert result.returncode in (0, 1)

        # Parse clean JSONL output (removed a dead `pass` statement that sat
        # between the `if` and the `try` here).
        result_json = None
        for line in result.stdout.strip().split('\n'):
            line = line.strip()
            if line.startswith('{'):
                try:
                    record = json.loads(line)
                    if record.get('type') == 'ArchiveResult':
                        result_json = record
                        break
                except json.JSONDecodeError:
                    pass

        if result_json:
            # Should report failure or skip for non-git URL
            assert result_json['status'] in ['failed', 'skipped'], f"Should fail or skip: {result_json}"
|
||||
|
||||
|
||||
def test_real_git_repo():
    """Test that git can clone a real GitHub repository.

    NOTE(review): network-dependent test — requires outbound HTTPS to
    github.com; the 180s subprocess timeout bounds total runtime.
    """
    import os

    assert shutil.which('git'), "git binary not available"

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Use a real but small GitHub repository
        git_url = 'https://github.com/ArchiveBox/abx-pkg'

        env = os.environ.copy()
        env['GIT_TIMEOUT'] = '120'  # Give it time to clone

        start_time = time.time()
        result = subprocess.run(
            [sys.executable, str(GIT_HOOK), '--url', git_url, '--snapshot-id', 'testgit'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=180
        )
        elapsed_time = time.time() - start_time

        # Should succeed
        assert result.returncode == 0, f"Should clone repository successfully: {result.stderr}"

        # Parse JSONL output: first line starting with '{' whose record type
        # is ArchiveResult wins; malformed JSON lines are ignored.
        result_json = None
        for line in result.stdout.strip().split('\n'):
            line = line.strip()
            if line.startswith('{'):
                try:
                    record = json.loads(line)
                    if record.get('type') == 'ArchiveResult':
                        result_json = record
                        break
                except json.JSONDecodeError:
                    pass

        assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}"
        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

        # Check that the git repo was cloned (a .git dir anywhere under tmpdir)
        git_dirs = list(tmpdir.glob('**/.git'))
        assert len(git_dirs) > 0, f"Should have cloned a git repository. Contents: {list(tmpdir.rglob('*'))}"

        print(f"Successfully cloned repository in {elapsed_time:.2f}s")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
@@ -1,20 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"HASHES_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_HASHES", "USE_HASHES"],
|
||||
"description": "Enable merkle tree hash generation"
|
||||
},
|
||||
"HASHES_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 30,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for merkle tree generation in seconds"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,185 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create a hashed Merkle tree of all archived outputs.
|
||||
|
||||
This plugin runs after all extractors complete (priority 93) and generates
|
||||
a cryptographic Merkle hash tree of all files in the snapshot directory.
|
||||
|
||||
Output: hashes.json containing root_hash, tree structure, file list, metadata
|
||||
|
||||
Usage: on_Snapshot__93_hashes.py --url=<url> --snapshot-id=<uuid>
|
||||
|
||||
Environment variables:
|
||||
SAVE_HASHES: Enable hash merkle tree generation (default: true)
|
||||
DATA_DIR: ArchiveBox data directory
|
||||
ARCHIVE_DIR: Archive output directory
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Optional, Tuple, Any
|
||||
|
||||
import click
|
||||
|
||||
|
||||
def sha256_file(filepath: Path) -> str:
    """Return the SHA256 hex digest of a file, or 64 zeros if unreadable."""
    digest = hashlib.sha256()
    try:
        with open(filepath, 'rb') as src:
            for block in iter(lambda: src.read(65536), b''):
                digest.update(block)
    except (OSError, PermissionError):
        # Unreadable files get a sentinel hash instead of aborting the walk.
        return '0' * 64
    return digest.hexdigest()
|
||||
|
||||
|
||||
def sha256_data(data: bytes) -> str:
    """Return the SHA256 hex digest of a bytes payload."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
    """Recursively collect (relative_path, sha256, size) for every regular file.

    Symlinks are skipped, and directories named in ``exclude_dirs`` (default:
    hashes/.git/__pycache__) are pruned from the walk. Results are sorted by
    relative path so the resulting Merkle tree is deterministic.
    """
    exclude_dirs = exclude_dirs or ['hashes', '.git', '__pycache__']
    files: List[Tuple[Path, str, int]] = []

    for root, dirs, filenames in os.walk(snapshot_dir):
        # Prune excluded directories in-place so os.walk doesn't descend into them.
        dirs[:] = [d for d in dirs if d not in exclude_dirs]

        for filename in filenames:
            filepath = Path(root) / filename
            rel_path = filepath.relative_to(snapshot_dir)

            if filepath.is_symlink():
                continue

            file_hash = sha256_file(filepath)
            # EAFP: stat directly instead of exists()-then-stat(), which could
            # raise if the file disappears between the two calls (TOCTOU race).
            try:
                file_size = filepath.stat().st_size
            except OSError:
                file_size = 0
            files.append((rel_path, file_hash, file_size))

    files.sort(key=lambda entry: str(entry[0]))
    return files
|
||||
|
||||
|
||||
def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
    """Fold leaf hashes pairwise into a Merkle tree.

    Returns (root_hash, tree_levels) where tree_levels[0] is the leaf level
    and the last level holds only the root. An odd trailing node is paired
    with itself; an empty input yields sha256('') as the root.
    """
    if not file_hashes:
        return sha256_data(b''), [[]]

    levels: List[List[str]] = [file_hashes.copy()]

    while len(levels[-1]) > 1:
        current = levels[-1]
        parents = []
        for idx in range(0, len(current), 2):
            left = current[idx]
            right = current[idx + 1] if idx + 1 < len(current) else left
            parents.append(sha256_data((left + right).encode('utf-8')))
        levels.append(parents)

    return levels[-1][0], levels
|
||||
|
||||
|
||||
def create_hashes(snapshot_dir: Path) -> Dict[str, Any]:
    """Hash every file under snapshot_dir and assemble the Merkle tree record."""
    entries = collect_files(snapshot_dir)

    # Single pass over the collected entries: leaf hashes for the tree,
    # running size total, and the serializable per-file listing.
    leaf_hashes = []
    file_list = []
    total_size = 0
    for rel_path, digest, size in entries:
        leaf_hashes.append(digest)
        total_size += size
        file_list.append({'path': str(rel_path), 'hash': digest, 'size': size})

    root_hash, tree_levels = build_merkle_tree(leaf_hashes)

    return {
        'root_hash': root_hash,
        'tree_levels': tree_levels,
        'files': file_list,
        'metadata': {
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'file_count': len(entries),
            'total_size': total_size,
            'tree_depth': len(tree_levels),
        },
    }
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL being archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Generate Merkle tree of all archived outputs.

    Runs from the extractor's output dir (<snapshot>/hashes/), writes
    hashes.json there, and prints a JSON result record for the hook runner.
    Exits 0 on success or skip, 1 on failure.
    """
    status = 'failed'
    output = None
    error = ''
    root_hash = None
    file_count = 0

    try:
        # Check if enabled (accepts true/1/yes/on, case-insensitive)
        save_hashes = os.getenv('HASHES_ENABLED', 'true').lower() in ('true', '1', 'yes', 'on')

        if not save_hashes:
            status = 'skipped'
            click.echo(json.dumps({'status': status, 'output': 'HASHES_ENABLED=false'}))
            # SystemExit is not an Exception subclass, so the handler below
            # does not swallow this early exit.
            sys.exit(0)

        # Working directory is the extractor output dir (e.g., <snapshot>/hashes/)
        # Parent is the snapshot directory
        output_dir = Path.cwd()
        snapshot_dir = output_dir.parent

        if not snapshot_dir.exists():
            raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')

        # Ensure output directory exists
        output_dir.mkdir(exist_ok=True)
        output_path = output_dir / 'hashes.json'

        # Generate Merkle tree
        merkle_data = create_hashes(snapshot_dir)

        # Write output
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(merkle_data, f, indent=2)

        status = 'succeeded'
        output = 'hashes.json'
        root_hash = merkle_data['root_hash']
        file_count = merkle_data['metadata']['file_count']

    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
        click.echo(f'Error: {error}', err=True)

    # Print JSON result for hook runner
    result = {
        'status': status,
        'output': output,
        'error': error or None,
        'root_hash': root_hash,
        'file_count': file_count,
    }
    click.echo(json.dumps(result))

    sys.exit(0 if status in ('succeeded', 'skipped') else 1)


if __name__ == '__main__':
    main()
|
||||
@@ -1 +0,0 @@
|
||||
<span class="abx-output-icon abx-output-icon--hashes" title="Authenticity Hashes"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="5" r="2"/><circle cx="6" cy="18" r="2"/><circle cx="18" cy="18" r="2"/><path d="M12 7v6"/><path d="M12 13l-4 3"/><path d="M12 13l4 3"/></svg></span>
|
||||
@@ -1,157 +0,0 @@
|
||||
"""
|
||||
Tests for the hashes plugin.
|
||||
|
||||
Tests the real merkle tree generation with actual files.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from django.test import TestCase
|
||||
|
||||
|
||||
# Get the path to the hashes hook
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
HASHES_HOOK = PLUGIN_DIR / 'on_Snapshot__93_hashes.py'
|
||||
|
||||
|
||||
class TestHashesPlugin(TestCase):
    """Test the hashes plugin.

    All three behavioral tests previously duplicated the same subprocess
    invocation and snapshot-dir scaffolding; that is now factored into
    the _run_hook / _make_snapshot_dirs helpers.
    """

    @staticmethod
    def _make_snapshot_dirs(temp_dir):
        """Create <temp_dir>/snapshot and its hashes/ output dir; return both."""
        snapshot_dir = Path(temp_dir) / 'snapshot'
        snapshot_dir.mkdir()
        output_dir = snapshot_dir / 'hashes'
        output_dir.mkdir()
        return snapshot_dir, output_dir

    @staticmethod
    def _run_hook(output_dir, enabled):
        """Run the hashes hook from output_dir with HASHES_ENABLED=enabled."""
        env = os.environ.copy()
        env['HASHES_ENABLED'] = enabled
        return subprocess.run(
            [
                sys.executable, str(HASHES_HOOK),
                '--url=https://example.com',
                '--snapshot-id=test-snapshot',
            ],
            capture_output=True,
            text=True,
            cwd=str(output_dir),  # Hook expects to run from its output dir
            env=env,
            timeout=30
        )

    def test_hashes_hook_exists(self):
        """Hashes hook script should exist."""
        self.assertTrue(HASHES_HOOK.exists(), f"Hook not found: {HASHES_HOOK}")

    def test_hashes_generates_tree_for_files(self):
        """Hashes hook should generate merkle tree for files in snapshot directory."""
        with tempfile.TemporaryDirectory() as temp_dir:
            snapshot_dir, output_dir = self._make_snapshot_dirs(temp_dir)

            # Create some test files
            (snapshot_dir / 'index.html').write_text('<html><body>Test</body></html>')
            (snapshot_dir / 'screenshot.png').write_bytes(b'\x89PNG\r\n\x1a\n' + b'\x00' * 100)

            subdir = snapshot_dir / 'media'
            subdir.mkdir()
            (subdir / 'video.mp4').write_bytes(b'\x00\x00\x00\x18ftypmp42')

            result = self._run_hook(output_dir, 'true')
            self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")

            # Check output file exists
            output_file = output_dir / 'hashes.json'
            self.assertTrue(output_file.exists(), "hashes.json not created")

            # Parse and verify output
            with open(output_file) as f:
                data = json.load(f)

            self.assertIn('root_hash', data)
            self.assertIn('files', data)
            self.assertIn('metadata', data)

            # Should have indexed our test files (renamed loop var: previously
            # shadowed the closed file handle `f`)
            file_paths = [entry['path'] for entry in data['files']]
            self.assertIn('index.html', file_paths)
            self.assertIn('screenshot.png', file_paths)

            # Verify metadata
            self.assertGreater(data['metadata']['file_count'], 0)
            self.assertGreater(data['metadata']['total_size'], 0)

    def test_hashes_skips_when_disabled(self):
        """Hashes hook should skip when HASHES_ENABLED=false."""
        with tempfile.TemporaryDirectory() as temp_dir:
            _snapshot_dir, output_dir = self._make_snapshot_dirs(temp_dir)

            result = self._run_hook(output_dir, 'false')

            # Should succeed (exit 0) but skip
            self.assertEqual(result.returncode, 0)
            self.assertIn('skipped', result.stdout)

    def test_hashes_handles_empty_directory(self):
        """Hashes hook should handle empty snapshot directory."""
        with tempfile.TemporaryDirectory() as temp_dir:
            _snapshot_dir, output_dir = self._make_snapshot_dirs(temp_dir)

            result = self._run_hook(output_dir, 'true')

            # Should succeed even with empty directory
            self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")

            # Check output file exists
            output_file = output_dir / 'hashes.json'
            self.assertTrue(output_file.exists())

            with open(output_file) as f:
                data = json.load(f)

            # Should have empty file list
            self.assertEqual(data['metadata']['file_count'], 0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
@@ -1,21 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required_plugins": ["chrome"],
|
||||
"properties": {
|
||||
"HEADERS_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_HEADERS", "USE_HEADERS"],
|
||||
"description": "Enable HTTP headers capture"
|
||||
},
|
||||
"HEADERS_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 30,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for headers capture in seconds"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,247 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Capture original request + response headers for the main navigation.
|
||||
*
|
||||
* This hook sets up CDP listeners BEFORE chrome_navigate loads the page,
|
||||
* then waits for navigation to complete. It records the first top-level
|
||||
* request headers and the corresponding response headers (with :status).
|
||||
*
|
||||
* Usage: on_Snapshot__27_headers.bg.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes headers.json
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Import shared utilities from chrome_utils.js
|
||||
const {
|
||||
getEnvBool,
|
||||
getEnvInt,
|
||||
parseArgs,
|
||||
connectToPage,
|
||||
waitForPageLoaded,
|
||||
} = require('../chrome/chrome_utils.js');
|
||||
|
||||
const PLUGIN_NAME = 'headers';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'headers.json';
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
|
||||
|
||||
let browser = null;
|
||||
let page = null;
|
||||
let client = null;
|
||||
let shuttingDown = false;
|
||||
let headersWritten = false;
|
||||
|
||||
let requestId = null;
|
||||
let requestUrl = null;
|
||||
let requestHeaders = null;
|
||||
let responseHeaders = null;
|
||||
let responseStatus = null;
|
||||
let responseStatusText = null;
|
||||
let responseUrl = null;
|
||||
let originalUrl = null;
|
||||
|
||||
// Prefer the final URL recorded by the chrome plugin (final_url.txt);
// fall back to the live page URL, or null when no page is connected.
function getFinalUrl() {
  const recorded = path.join(CHROME_SESSION_DIR, 'final_url.txt');
  if (!fs.existsSync(recorded)) {
    return page ? page.url() : null;
  }
  return fs.readFileSync(recorded, 'utf8').trim();
}
|
||||
|
||||
/**
 * Assemble and write headers.json exactly once.
 *
 * No-op until response headers have been captured. Adds an HTTP/2-style
 * `:status` pseudo-header when the numeric status is known and not already
 * present, and duplicates the response headers under the legacy `headers`
 * key for backwards compatibility.
 */
function writeHeadersFile() {
  if (headersWritten) return;
  if (!responseHeaders) return;

  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  const responseHeadersWithStatus = {
    ...(responseHeaders || {}),
  };

  // Inject :status only if the CDP headers didn't already carry one.
  if (responseStatus !== null && responseStatus !== undefined &&
      responseHeadersWithStatus[':status'] === undefined) {
    responseHeadersWithStatus[':status'] = String(responseStatus);
  }

  const record = {
    url: requestUrl || originalUrl,
    final_url: getFinalUrl(),
    status: responseStatus !== undefined ? responseStatus : null,
    request_headers: requestHeaders || {},
    response_headers: responseHeadersWithStatus,
    headers: responseHeadersWithStatus, // backwards compatibility
  };

  if (responseStatusText) {
    record.statusText = responseStatusText;
  }
  if (responseUrl) {
    record.response_url = responseUrl;
  }

  fs.writeFileSync(outputPath, JSON.stringify(record, null, 2));
  headersWritten = true;
}
|
||||
|
||||
/**
 * Connect to the already-running Chrome session (started by the chrome
 * plugin) and register CDP Network listeners capturing the first top-level
 * Document request and its response headers.
 *
 * Throws CHROME_SESSION_REQUIRED_ERROR when the session files are missing
 * or the recorded Chrome pid is no longer alive.
 *
 * @param {string} url - target URL (unused here; navigation happens in the
 *   separate chrome_navigate hook)
 * @returns {Promise<{browser, page}>} the puppeteer connection
 */
async function setupListener(url) {
  const timeout = getEnvInt('HEADERS_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
  const pidFile = path.join(CHROME_SESSION_DIR, 'chrome.pid');

  if (!fs.existsSync(cdpFile) || !fs.existsSync(targetIdFile) || !fs.existsSync(pidFile)) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }
  // Verify the recorded Chrome process is still alive (signal 0 = liveness probe).
  try {
    const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
    if (!pid || Number.isNaN(pid)) throw new Error('Invalid pid');
    process.kill(pid, 0);
  } catch (e) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }

  // NOTE(review): this destructured browser/page shadows the module-level
  // lets; main() assigns the globals from the returned connection.
  const { browser, page } = await connectToPage({
    chromeSessionDir: CHROME_SESSION_DIR,
    timeoutMs: timeout,
    puppeteer,
  });

  client = await page.target().createCDPSession();
  await client.send('Network.enable');

  client.on('Network.requestWillBeSent', (params) => {
    try {
      // A redirect delivers the previous response's headers inside
      // redirectResponse; record them if nothing was captured yet.
      if (requestId && !responseHeaders && params.redirectResponse && params.requestId === requestId) {
        responseHeaders = params.redirectResponse.headers || {};
        responseStatus = params.redirectResponse.status || null;
        responseStatusText = params.redirectResponse.statusText || null;
        responseUrl = params.redirectResponse.url || null;
        writeHeadersFile();
      }

      // Only capture the FIRST top-level http(s) Document request.
      if (requestId) return;
      if (params.type && params.type !== 'Document') return;
      if (!params.request || !params.request.url) return;
      if (!params.request.url.startsWith('http')) return;

      requestId = params.requestId;
      requestUrl = params.request.url;
      requestHeaders = params.request.headers || {};
    } catch (e) {
      // Ignore errors
    }
  });

  client.on('Network.responseReceived', (params) => {
    try {
      // Only the response matching the captured request, and only once.
      if (!requestId || params.requestId !== requestId || responseHeaders) return;
      const response = params.response || {};
      responseHeaders = response.headers || {};
      responseStatus = response.status || null;
      responseStatusText = response.statusText || null;
      responseUrl = response.url || null;
      writeHeadersFile();
    } catch (e) {
      // Ignore errors
    }
  });

  return { browser, page };
}
|
||||
|
||||
// Emit the final ArchiveResult record exactly once (guarded by shuttingDown).
function emitResult(status = 'succeeded', outputStr = OUTPUT_FILE) {
  if (shuttingDown) {
    return;
  }
  shuttingDown = true;

  const record = { type: 'ArchiveResult', status, output_str: outputStr };
  console.log(JSON.stringify(record));
}
|
||||
|
||||
// Signal handler: flush headers.json if possible, emit the result record,
// disconnect from the browser, and exit 0/1 depending on whether headers landed.
async function handleShutdown(signal) {
  console.error(`\nReceived ${signal}, emitting final results...`);

  if (!headersWritten) writeHeadersFile();

  const ok = headersWritten;
  emitResult(ok ? 'succeeded' : 'failed', ok ? OUTPUT_FILE : 'No headers captured');

  if (browser) {
    try { browser.disconnect(); } catch (e) {}
  }
  process.exit(ok ? 0 : 1);
}
|
||||
|
||||
/**
 * Entry point: parse args, bail out early when disabled, attach CDP
 * listeners BEFORE navigation, then idle until SIGTERM/SIGINT — the
 * signal handler is responsible for emitting the final ArchiveResult.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__27_headers.bg.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  originalUrl = url;

  if (!getEnvBool('HEADERS_ENABLED', true)) {
    console.error('Skipping (HEADERS_ENABLED=False)');
    console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'HEADERS_ENABLED=False'}));
    process.exit(0);
  }

  try {
    // Set up listeners BEFORE navigation
    const connection = await setupListener(url);
    browser = connection.browser;
    page = connection.page;

    // Register signal handlers for graceful shutdown
    process.on('SIGTERM', () => handleShutdown('SIGTERM'));
    process.on('SIGINT', () => handleShutdown('SIGINT'));

    // Wait for chrome_navigate to complete (non-fatal); waits up to 4x the
    // configured timeout, polling every 200ms.
    try {
      const timeout = getEnvInt('HEADERS_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
      await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 200);
    } catch (e) {
      console.error(`WARN: ${e.message}`);
    }

    // Keep alive until SIGTERM (never-resolving promise)
    await new Promise(() => {});
    return;

  } catch (e) {
    const errorMessage = (e && e.message)
      ? `${e.name || 'Error'}: ${e.message}`
      : String(e || 'Unknown error');
    console.error(`ERROR: ${errorMessage}`);

    console.log(JSON.stringify({
      type: 'ArchiveResult',
      status: 'failed',
      output_str: errorMessage,
    }));
    process.exit(1);
  }
}
|
||||
|
||||
main().catch(e => {
|
||||
console.error(`Fatal error: ${e.message}`);
|
||||
process.exit(1);
|
||||
});
|
||||
@@ -1 +0,0 @@
|
||||
<span class="abx-output-icon abx-output-icon--headers" title="Headers"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="4" cy="7" r="1" fill="currentColor" stroke="none"/><circle cx="4" cy="12" r="1" fill="currentColor" stroke="none"/><circle cx="4" cy="17" r="1" fill="currentColor" stroke="none"/><path d="M7 7h13"/><path d="M7 12h13"/><path d="M7 17h13"/></svg></span>
|
||||
@@ -1,409 +0,0 @@
|
||||
"""
|
||||
Integration tests for headers plugin
|
||||
|
||||
Tests verify:
1. Plugin script exists and is executable
|
||||
2. Node.js is available
|
||||
3. Headers extraction works for real example.com
|
||||
4. Output JSON contains actual HTTP headers
|
||||
5. Config options work (TIMEOUT, USER_AGENT)
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
CHROME_NAVIGATE_HOOK,
|
||||
get_test_env,
|
||||
chrome_session,
|
||||
)
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
HEADERS_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_headers.*'), None)
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
def normalize_root_url(url: str) -> str:
    """Strip any trailing slashes so root URLs compare equal."""
    while url.endswith('/'):
        url = url[:-1]
    return url
|
||||
|
||||
def run_headers_capture(headers_dir, snapshot_chrome_dir, env, url, snapshot_id):
    """Run the background headers hook alongside chrome_navigate.

    Starts the headers hook first (it must attach its CDP listeners before
    navigation), runs the navigation hook to completion, polls up to ~60s
    for headers.json to appear, then terminates the background hook.

    Returns:
        (hook_returncode, hook_stdout, hook_stderr, nav_result, headers_file)
    """
    # Background hook: must be listening BEFORE the page is navigated.
    hook_proc = subprocess.Popen(
        ['node', str(HEADERS_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
        cwd=headers_dir,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env,
    )

    # Foreground navigation via the chrome plugin's navigate hook.
    nav_result = subprocess.run(
        ['node', str(CHROME_NAVIGATE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
        cwd=snapshot_chrome_dir,
        capture_output=True,
        text=True,
        timeout=120,
        env=env,
    )

    # Poll for the output file — the hook writes it asynchronously.
    headers_file = headers_dir / 'headers.json'
    for _ in range(60):
        if headers_file.exists() and headers_file.stat().st_size > 0:
            break
        time.sleep(1)

    # The hook idles until signalled; terminate it and collect its output
    # (SIGKILL fallback if it doesn't exit within 5s).
    if hook_proc.poll() is None:
        hook_proc.terminate()
        try:
            stdout, stderr = hook_proc.communicate(timeout=5)
        except subprocess.TimeoutExpired:
            hook_proc.kill()
            stdout, stderr = hook_proc.communicate()
    else:
        stdout, stderr = hook_proc.communicate()

    return hook_proc.returncode, stdout, stderr, nav_result, headers_file
|
||||
|
||||
|
||||
def test_hook_script_exists():
    """Verify hook script exists."""
    # HEADERS_HOOK is next(glob, None) and may be None; guard so the failure
    # message is clear instead of an AttributeError on None.
    assert HEADERS_HOOK is not None, "No on_Snapshot__*_headers.* hook found in plugin dir"
    assert HEADERS_HOOK.exists(), f"Hook script not found: {HEADERS_HOOK}"
|
||||
|
||||
|
||||
def test_node_is_available():
    """Test that Node.js is available on the system."""
    result = subprocess.run(
        ['which', 'node'],
        capture_output=True,
        text=True
    )

    # Previously a bare `pass` fell through to an assert on the empty string;
    # skip explicitly when node is not on PATH.
    if result.returncode != 0:
        pytest.skip('node binary not found on PATH')

    binary_path = result.stdout.strip()
    assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"

    # Test that node is executable and get version
    result = subprocess.run(
        ['node', '--version'],
        capture_output=True,
        text=True,
        timeout=10,
        env=get_test_env(),
    )
    assert result.returncode == 0, f"node not executable: {result.stderr}"
    assert result.stdout.startswith('v'), f"Unexpected node version format: {result.stdout}"
|
||||
|
||||
|
||||
def test_extracts_headers_from_example_com():
    """Test full workflow: extract headers from real example.com."""

    # Skip explicitly when node is unavailable (was a silent `pass` that let
    # the test continue and fail confusingly later).
    if not shutil.which('node'):
        pytest.skip('node binary not available')

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            headers_dir = snapshot_chrome_dir.parent / 'headers'
            headers_dir.mkdir(exist_ok=True)

            result = run_headers_capture(
                headers_dir,
                snapshot_chrome_dir,
                env,
                TEST_URL,
                'test789',
            )

            hook_code, stdout, stderr, nav_result, headers_file = result
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
            assert hook_code == 0, f"Extraction failed: {stderr}"

            # Parse clean JSONL output (removed a dead `pass` statement that
            # sat between the `if` and the `try` here)
            result_json = None
            for line in stdout.strip().split('\n'):
                line = line.strip()
                if line.startswith('{'):
                    try:
                        record = json.loads(line)
                        if record.get('type') == 'ArchiveResult':
                            result_json = record
                            break
                    except json.JSONDecodeError:
                        pass

            assert result_json, "Should have ArchiveResult JSONL output"
            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

            # Verify output file exists (hook writes to current directory)
            assert headers_file.exists(), "headers.json not created"

            # Verify headers JSON contains REAL example.com response
            headers_data = json.loads(headers_file.read_text())

            assert 'url' in headers_data, "Should have url field"
            assert normalize_root_url(headers_data['url']) == normalize_root_url(TEST_URL), f"URL should be {TEST_URL}"

            assert 'status' in headers_data, "Should have status field"
            assert headers_data['status'] in [200, 301, 302], \
                f"Should have valid HTTP status, got {headers_data['status']}"

            assert 'request_headers' in headers_data, "Should have request_headers field"
            assert isinstance(headers_data['request_headers'], dict), "Request headers should be a dict"

            assert 'response_headers' in headers_data, "Should have response_headers field"
            assert isinstance(headers_data['response_headers'], dict), "Response headers should be a dict"
            assert len(headers_data['response_headers']) > 0, "Response headers dict should not be empty"

            assert 'headers' in headers_data, "Should have headers field"
            assert isinstance(headers_data['headers'], dict), "Headers should be a dict"

            # Verify common HTTP headers are present
            headers_lower = {k.lower(): v for k, v in headers_data['response_headers'].items()}
            assert 'content-type' in headers_lower or 'content-length' in headers_lower, \
                "Should have at least one common HTTP header"

            assert headers_data['response_headers'].get(':status') == str(headers_data['status']), \
                "Response headers should include :status pseudo header"
|
||||
|
||||
|
||||
def test_headers_output_structure():
    """Test that headers plugin produces correctly structured output.

    Runs a real Chrome session against TEST_URL and checks every field
    of the headers.json output for type correctness.
    """
    # Fix: the guard used to be a no-op `pass`; actually skip when node is absent.
    if not shutil.which('node'):
        pytest.skip('node is required for the headers plugin')

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            headers_dir = snapshot_chrome_dir.parent / 'headers'
            headers_dir.mkdir(exist_ok=True)

            result = run_headers_capture(
                headers_dir,
                snapshot_chrome_dir,
                env,
                TEST_URL,
                'testformat',
            )

            hook_code, stdout, stderr, nav_result, headers_file = result
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
            assert hook_code == 0, f"Extraction failed: {stderr}"

            # Parse clean JSONL output (fix: removed dead `pass` before the try block)
            result_json = None
            for line in stdout.strip().split('\n'):
                line = line.strip()
                if not line.startswith('{'):
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if record.get('type') == 'ArchiveResult':
                    result_json = record
                    break

            assert result_json, "Should have ArchiveResult JSONL output"
            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

            # Verify output structure
            assert headers_file.exists(), "Output headers.json not created"

            output_data = json.loads(headers_file.read_text())

            # Verify all required fields are present
            assert 'url' in output_data, "Output should have url field"
            assert 'status' in output_data, "Output should have status field"
            assert 'request_headers' in output_data, "Output should have request_headers field"
            assert 'response_headers' in output_data, "Output should have response_headers field"
            assert 'headers' in output_data, "Output should have headers field"

            # Verify data types
            assert isinstance(output_data['status'], int), "Status should be integer"
            assert isinstance(output_data['request_headers'], dict), "Request headers should be dict"
            assert isinstance(output_data['response_headers'], dict), "Response headers should be dict"
            assert isinstance(output_data['headers'], dict), "Headers should be dict"

            # Verify example.com returns expected headers
            assert normalize_root_url(output_data['url']) == normalize_root_url(TEST_URL)
            assert output_data['status'] in [200, 301, 302]
|
||||
|
||||
|
||||
def test_fails_without_chrome_session():
    """Test that headers plugin fails when chrome session is missing."""
    # Fix: the guard used to be a no-op `pass`; actually skip when node is absent.
    if not shutil.which('node'):
        pytest.skip('node is required for the headers plugin')

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Run headers extraction with no ../chrome session directory present
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60,
            env=get_test_env(),
        )

        assert result.returncode != 0, "Should fail without chrome session"
        assert 'No Chrome session found (chrome plugin must run first)' in (result.stdout + result.stderr)
|
||||
|
||||
|
||||
def test_config_timeout_honored():
    """Test that TIMEOUT config is respected."""
    # Fix: the guard used to be a no-op `pass`; actually skip when node is absent.
    if not shutil.which('node'):
        pytest.skip('node is required for the headers plugin')

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set very short timeout (but example.com should still succeed)
        import os
        env_override = os.environ.copy()
        env_override['TIMEOUT'] = '5'

        with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            headers_dir = snapshot_chrome_dir.parent / 'headers'
            headers_dir.mkdir(exist_ok=True)
            env.update(env_override)

            result = run_headers_capture(
                headers_dir,
                snapshot_chrome_dir,
                env,
                TEST_URL,
                'testtimeout',
            )

            # Should complete (success or fail, but not hang)
            hook_code, _stdout, _stderr, nav_result, _headers_file = result
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
            assert hook_code in (0, 1), "Should complete without hanging"
|
||||
|
||||
|
||||
def test_config_user_agent():
    """Test that USER_AGENT config is used."""
    # Fix: the guard used to be a no-op `pass`; actually skip when node is absent.
    if not shutil.which('node'):
        pytest.skip('node is required for the headers plugin')

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set custom user agent
        import os
        env_override = os.environ.copy()
        env_override['USER_AGENT'] = 'TestBot/1.0'

        with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            headers_dir = snapshot_chrome_dir.parent / 'headers'
            headers_dir.mkdir(exist_ok=True)
            env.update(env_override)

            result = run_headers_capture(
                headers_dir,
                snapshot_chrome_dir,
                env,
                TEST_URL,
                'testua',
            )

            # Should succeed (example.com doesn't block)
            hook_code, stdout, _stderr, nav_result, _headers_file = result
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
            if hook_code == 0:
                # Parse clean JSONL output (fix: removed dead `pass` before the try block)
                result_json = None
                for line in stdout.strip().split('\n'):
                    line = line.strip()
                    if not line.startswith('{'):
                        continue
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    if record.get('type') == 'ArchiveResult':
                        result_json = record
                        break

                assert result_json, "Should have ArchiveResult JSONL output"
                assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
|
||||
|
||||
def test_handles_https_urls():
    """Test that HTTPS URLs work correctly."""
    # Fix: the guard used to be a no-op `pass`; actually skip when node is absent.
    if not shutil.which('node'):
        pytest.skip('node is required for the headers plugin')

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        with chrome_session(tmpdir, test_url='https://example.org', navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            headers_dir = snapshot_chrome_dir.parent / 'headers'
            headers_dir.mkdir(exist_ok=True)
            result = run_headers_capture(
                headers_dir,
                snapshot_chrome_dir,
                env,
                'https://example.org',
                'testhttps',
            )

            hook_code, _stdout, _stderr, nav_result, headers_file = result
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
            # Only inspect the output when the hook actually produced it
            if hook_code == 0 and headers_file.exists():
                output_data = json.loads(headers_file.read_text())
                assert normalize_root_url(output_data['url']) == normalize_root_url('https://example.org')
                assert output_data['status'] in [200, 301, 302]
|
||||
|
||||
|
||||
def test_handles_404_gracefully():
    """Test that headers plugin handles 404s gracefully."""
    # Fix: the guard used to be a no-op `pass`; actually skip when node is absent.
    if not shutil.which('node'):
        pytest.skip('node is required for the headers plugin')

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        with chrome_session(tmpdir, test_url='https://example.com/nonexistent-page-404', navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            headers_dir = snapshot_chrome_dir.parent / 'headers'
            headers_dir.mkdir(exist_ok=True)
            result = run_headers_capture(
                headers_dir,
                snapshot_chrome_dir,
                env,
                'https://example.com/nonexistent-page-404',
                'test404',
            )

            # May succeed or fail depending on server behavior
            # If it succeeds, verify 404 status is captured
            hook_code, _stdout, _stderr, nav_result, headers_file = result
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
            if hook_code == 0 and headers_file.exists():
                output_data = json.loads(headers_file.read_text())
                assert output_data['status'] == 404, "Should capture 404 status"
|
||||
|
||||
|
||||
# Allow running this test module directly (outside of a pytest invocation).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
@@ -1,20 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"HTMLTOTEXT_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_HTMLTOTEXT", "USE_HTMLTOTEXT"],
|
||||
"description": "Enable HTML to text conversion"
|
||||
},
|
||||
"HTMLTOTEXT_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 30,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for HTML to text conversion in seconds"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,161 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert HTML to plain text for search indexing.
|
||||
|
||||
This extractor reads HTML from other extractors (wget, singlefile, dom)
|
||||
and converts it to plain text for full-text search.
|
||||
|
||||
Usage: on_Snapshot__htmltotext.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Writes htmltotext.txt to $PWD
|
||||
|
||||
Environment variables:
|
||||
TIMEOUT: Timeout in seconds (not used, but kept for consistency)
|
||||
|
||||
Note: This extractor does not require any external binaries.
|
||||
It uses Python's built-in html.parser module.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
PLUGIN_NAME = 'htmltotext'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'htmltotext.txt'
|
||||
|
||||
|
||||
class HTMLTextExtractor(HTMLParser):
    """Extract text content from HTML, ignoring scripts/styles.

    Collects stripped character data from every tag except the ones in
    ``skip_tags``; call :meth:`get_text` for the space-joined result.
    """

    def __init__(self):
        super().__init__()
        # Accumulated text fragments, joined later by get_text().
        self.result = []
        # Character data inside these tags is dropped entirely.
        self.skip_tags = {'script', 'style', 'head', 'meta', 'link', 'noscript'}
        # Most recently opened tag name (reset to None by any end tag).
        self.current_tag = None

    def handle_starttag(self, tag, attrs):
        self.current_tag = tag.lower()

    def handle_endtag(self, tag):
        self.current_tag = None

    def handle_data(self, data):
        # Skip data while inside a non-content tag (script/style content is
        # delivered as raw CDATA by HTMLParser, so it lands here too).
        if self.current_tag in self.skip_tags:
            return
        stripped = data.strip()
        if stripped:
            self.result.append(stripped)

    def get_text(self) -> str:
        """Return all collected text fragments joined by single spaces."""
        return ' '.join(self.result)
|
||||
|
||||
|
||||
def html_to_text(html: str) -> str:
    """Convert HTML to plain text.

    Uses HTMLTextExtractor; if the parser raises for malformed markup,
    falls back to stripping tags with regular expressions.
    """
    extractor = HTMLTextExtractor()
    try:
        extractor.feed(html)
        return extractor.get_text()
    except Exception:
        # Fallback: remove script/style blocks, then all remaining tags,
        # then collapse whitespace runs to single spaces.
        stripped = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
        stripped = re.sub(r'<style[^>]*>.*?</style>', '', stripped, flags=re.DOTALL | re.IGNORECASE)
        stripped = re.sub(r'<[^>]+>', ' ', stripped)
        stripped = re.sub(r'\s+', ' ', stripped)
        return stripped.strip()
|
||||
|
||||
|
||||
def find_html_source() -> str | None:
|
||||
"""Find HTML content from other extractors in the snapshot directory."""
|
||||
# Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
|
||||
search_patterns = [
|
||||
'singlefile/singlefile.html',
|
||||
'*_singlefile/singlefile.html',
|
||||
'singlefile/*.html',
|
||||
'*_singlefile/*.html',
|
||||
'dom/output.html',
|
||||
'*_dom/output.html',
|
||||
'dom/*.html',
|
||||
'*_dom/*.html',
|
||||
'wget/**/*.html',
|
||||
'*_wget/**/*.html',
|
||||
'wget/**/*.htm',
|
||||
'*_wget/**/*.htm',
|
||||
]
|
||||
|
||||
for base in (Path.cwd(), Path.cwd().parent):
|
||||
for pattern in search_patterns:
|
||||
matches = list(base.glob(pattern))
|
||||
for match in matches:
|
||||
if match.is_file() and match.stat().st_size > 0:
|
||||
try:
|
||||
return match.read_text(errors='ignore')
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
    """
    Extract plain text from HTML sources.

    Returns: (success, output_path, error_message)
    """
    # Locate HTML produced by another extractor (singlefile/dom/wget).
    html_content = find_html_source()
    if not html_content:
        return False, None, 'No HTML source found (run singlefile, dom, or wget first)'

    # Convert the HTML to plain text and require a minimally useful result.
    text = html_to_text(html_content)
    if not text or len(text) < 10:
        return False, None, 'No meaningful text extracted from HTML'

    # Hook already runs inside the output directory, so write relative to it.
    destination = Path(OUTPUT_DIR) / OUTPUT_FILE
    destination.write_text(text, encoding='utf-8')

    return True, str(destination), ''
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL that was archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Convert HTML to plain text for search indexing."""
    try:
        success, output, error = extract_htmltotext(url)
    except Exception as e:
        # Transient error - emit NO JSONL
        print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
        sys.exit(1)

    if not success:
        # Transient error - emit NO JSONL
        print(f'ERROR: {error}', file=sys.stderr)
        sys.exit(1)

    # Success - emit ArchiveResult as a single JSONL record on stdout
    print(json.dumps({
        'type': 'ArchiveResult',
        'status': 'succeeded',
        'output_str': output or '',
    }))
    sys.exit(0)
|
||||
|
||||
|
||||
# CLI entry point when the hook script is executed directly.
if __name__ == '__main__':
    main()
|
||||
@@ -1 +0,0 @@
|
||||
<span class="abx-output-icon abx-output-icon--htmltotext" title="HTML to Text"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M4 7h16"/><path d="M4 12h12"/><path d="M4 17h14"/></svg></span>
|
||||
@@ -1,84 +0,0 @@
|
||||
"""
|
||||
Integration tests for htmltotext plugin
|
||||
|
||||
Tests verify standalone htmltotext extractor execution.
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
HTMLTOTEXT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_htmltotext.*'), None)
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
def test_hook_script_exists():
    """The on_Snapshot htmltotext hook script must exist in the plugin dir."""
    # Fix: HTMLTOTEXT_HOOK comes from next(..., None); if the glob matched
    # nothing it is None and `.exists()` would raise AttributeError instead
    # of failing with a useful message.
    assert HTMLTOTEXT_HOOK is not None, "on_Snapshot__*_htmltotext.* hook script not found"
    assert HTMLTOTEXT_HOOK.exists()
|
||||
|
||||
def test_extracts_text_from_html():
    """Running the hook against a prepared singlefile output yields text."""
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Create HTML source where the extractor expects to find it
        (tmpdir / 'singlefile').mkdir()
        (tmpdir / 'singlefile' / 'singlefile.html').write_text('<html><body><h1>Example Domain</h1><p>This domain is for examples.</p></body></html>')

        proc = subprocess.run(
            [sys.executable, str(HTMLTOTEXT_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir, capture_output=True, text=True, timeout=30
        )
        assert proc.returncode == 0, f"Extraction failed: {proc.stderr}"

        # Parse clean JSONL output
        result_json = None
        for raw_line in proc.stdout.strip().split('\n'):
            raw_line = raw_line.strip()
            if not raw_line.startswith('{'):
                continue
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                result_json = record
                break

        assert result_json, "Should have ArchiveResult JSONL output"
        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

        # Verify output file (hook writes to current directory)
        output_file = tmpdir / 'htmltotext.txt'
        assert output_file.exists(), f"htmltotext.txt not created. Files: {list(tmpdir.iterdir())}"
        content = output_file.read_text()
        assert len(content) > 0, "Content should not be empty"
        assert 'Example Domain' in content, "Should contain text from HTML"
|
||||
|
||||
def test_fails_gracefully_without_html():
    """Without any HTML source the hook must fail or skip, never succeed."""
    with tempfile.TemporaryDirectory() as tmpdir:
        proc = subprocess.run(
            [sys.executable, str(HTMLTOTEXT_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=tmpdir, capture_output=True, text=True, timeout=30
        )

        # Should exit with non-zero or emit failure JSONL
        # Parse clean JSONL output
        result_json = None
        for raw_line in proc.stdout.strip().split('\n'):
            raw_line = raw_line.strip()
            if not raw_line.startswith('{'):
                continue
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                result_json = record
                break

        if result_json:
            # Should report failure or skip since no HTML source
            assert result_json['status'] in ['failed', 'skipped'], f"Should fail or skip without HTML: {result_json}"
|
||||
|
||||
# Allow running this test module directly (outside of a pytest invocation).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
@@ -1,51 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required_plugins": ["chrome"],
|
||||
"properties": {
|
||||
"INFINISCROLL_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_INFINISCROLL", "USE_INFINISCROLL"],
|
||||
"description": "Enable infinite scroll page expansion"
|
||||
},
|
||||
"INFINISCROLL_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 120,
|
||||
"minimum": 10,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Maximum timeout for scrolling in seconds"
|
||||
},
|
||||
"INFINISCROLL_SCROLL_DELAY": {
|
||||
"type": "integer",
|
||||
"default": 2000,
|
||||
"minimum": 500,
|
||||
"description": "Delay between scrolls in milliseconds"
|
||||
},
|
||||
"INFINISCROLL_SCROLL_DISTANCE": {
|
||||
"type": "integer",
|
||||
"default": 1600,
|
||||
"minimum": 100,
|
||||
"description": "Distance to scroll per step in pixels"
|
||||
},
|
||||
"INFINISCROLL_SCROLL_LIMIT": {
|
||||
"type": "integer",
|
||||
"default": 10,
|
||||
"minimum": 1,
|
||||
"maximum": 100,
|
||||
"description": "Maximum number of scroll steps"
|
||||
},
|
||||
"INFINISCROLL_MIN_HEIGHT": {
|
||||
"type": "integer",
|
||||
"default": 16000,
|
||||
"minimum": 1000,
|
||||
"description": "Minimum page height to scroll to in pixels"
|
||||
},
|
||||
"INFINISCROLL_EXPAND_DETAILS": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Expand <details> elements and click 'load more' buttons for comments"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,427 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Scroll the page down to trigger infinite scroll / lazy loading.
|
||||
*
|
||||
* Scrolls down 1 page at a time, up to INFINISCROLL_SCROLL_LIMIT times,
|
||||
* ensuring at least INFINISCROLL_MIN_HEIGHT (default 16,000px) is reached.
|
||||
* Stops early if no new content loads after a scroll.
|
||||
*
|
||||
* Optionally expands <details> elements and clicks "load more" buttons.
|
||||
*
|
||||
* Usage: on_Snapshot__45_infiniscroll.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: JSONL with scroll stats (no files created)
|
||||
*
|
||||
* Environment variables:
|
||||
* INFINISCROLL_ENABLED: Enable/disable (default: true)
|
||||
* INFINISCROLL_TIMEOUT: Max timeout in seconds (default: 120)
|
||||
* INFINISCROLL_SCROLL_DELAY: Delay between scrolls in ms (default: 2000)
|
||||
* INFINISCROLL_SCROLL_DISTANCE: Pixels per scroll (default: 1600)
|
||||
* INFINISCROLL_SCROLL_LIMIT: Max scroll iterations (default: 10)
|
||||
* INFINISCROLL_MIN_HEIGHT: Min page height to reach in px (default: 16000)
|
||||
* INFINISCROLL_EXPAND_DETAILS: Expand <details> and comments (default: true)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);

const {
    getEnv,
    getEnvBool,
    getEnvInt,
} = require('../chrome/chrome_utils.js');

// Check if infiniscroll is enabled BEFORE requiring puppeteer
// (exit 0 so the orchestrator treats a disabled plugin as a clean skip,
// and puppeteer-core is never loaded when it is not needed)
if (!getEnvBool('INFINISCROLL_ENABLED', true)) {
    console.error('Skipping infiniscroll (INFINISCROLL_ENABLED=False)');
    process.exit(0);
}

const puppeteer = require('puppeteer-core');

const PLUGIN_NAME = 'infiniscroll';
// Chrome session state (cdp_url.txt, target_id.txt, navigation.json) is
// shared via files written by the chrome plugin into this sibling directory.
const CHROME_SESSION_DIR = '../chrome';
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
|
||||
|
||||
/**
 * Parse "--foo-bar=baz" style CLI flags into { foo_bar: 'baz' }.
 * Flags without a value become `true`.
 */
function parseArgs() {
    const parsed = {};
    for (const token of process.argv.slice(2)) {
        if (!token.startsWith('--')) continue;
        const [flag, ...rest] = token.slice(2).split('=');
        parsed[flag.replace(/-/g, '_')] = rest.join('=') || true;
    }
    return parsed;
}
|
||||
|
||||
/** Read the Chrome DevTools Protocol endpoint written by the chrome plugin, or null. */
function getCdpUrl() {
    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
    if (!fs.existsSync(cdpFile)) return null;
    return fs.readFileSync(cdpFile, 'utf8').trim();
}
|
||||
|
||||
/** Read the CDP target id of the tab opened by the chrome plugin, or null. */
function getPageId() {
    const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
    if (!fs.existsSync(targetIdFile)) return null;
    return fs.readFileSync(targetIdFile, 'utf8').trim();
}
|
||||
|
||||
/**
 * Poll for navigation.json (written once chrome_navigate finishes loading
 * the page) every 100ms until it appears or timeoutMs elapses.
 * Returns true when the file exists, false on timeout.
 */
async function waitForChromeTabLoaded(timeoutMs = 60000) {
    const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
    const deadline = Date.now() + timeoutMs;

    while (Date.now() < deadline) {
        if (fs.existsSync(navigationFile)) return true;
        await new Promise(resolve => setTimeout(resolve, 100));
    }
    return false;
}
|
||||
|
||||
/** Promise-based delay: resolves after `ms` milliseconds. */
function sleep(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
|
||||
|
||||
/**
 * Expand <details> elements and click "load more" buttons for comments.
 * Based on archivebox.ts expandComments function.
 *
 * @param {object} page - puppeteer Page connected to the chrome session tab
 * @param {object} [options]
 * @param {number} [options.timeout=30000] - max ms to spend clicking "load more"
 * @param {number} [options.limit=500]    - max number of "load more" clicks
 * @param {number} [options.delay=500]    - ms to wait after each click
 * @returns {{detailsExpanded: number, commentsExpanded: number, total: number}}
 */
async function expandDetails(page, options = {}) {
    const {
        timeout = 30000,
        limit = 500,
        delay = 500,
    } = options;

    const startTime = Date.now();

    // First, expand all <details> elements
    const detailsExpanded = await page.evaluate(() => {
        let count = 0;
        // Generic <details> elements
        // NOTE(review): this first selector already opens every closed <details>,
        // so the site-specific selectors below should match nothing afterwards --
        // they look redundant; confirm before removing.
        document.querySelectorAll('details:not([open])').forEach(el => {
            el.open = true;
            count++;
        });
        // Github README details sections
        document.querySelectorAll('article details:not([open])').forEach(el => {
            el.open = true;
            count++;
        });
        // Github issue discussion hidden comments
        document.querySelectorAll('div.js-discussion details:not(.details-overlay):not([open])').forEach(el => {
            el.open = true;
            count++;
        });
        // HedgeDoc/Markdown details sections
        document.querySelectorAll('.markdown-body details:not([open])').forEach(el => {
            el.open = true;
            count++;
        });
        return count;
    });

    if (detailsExpanded > 0) {
        console.error(`Expanded ${detailsExpanded} <details> elements`);
    }

    // Then click "load more" buttons for comments.
    // Everything inside this evaluate runs in the page context, so helpers
    // must be defined inline and config values passed in as the argument.
    const numExpanded = await page.evaluate(async ({ timeout, limit, delay }) => {
        // Helper to find elements by XPath
        function getElementsByXPath(xpath) {
            const results = [];
            const xpathResult = document.evaluate(
                xpath,
                document,
                null,
                XPathResult.ORDERED_NODE_ITERATOR_TYPE,
                null
            );
            let node;
            while ((node = xpathResult.iterateNext()) != null) {
                results.push(node);
            }
            return results;
        }

        const wait = (ms) => new Promise(res => setTimeout(res, ms));

        // Find all "load more" type buttons/links
        const getLoadMoreLinks = () => [
            // Reddit (new)
            ...document.querySelectorAll('faceplate-partial[loading=action]'),
            // Reddit (old) - show more replies
            ...document.querySelectorAll('a[onclick^="return morechildren"]'),
            // Reddit (old) - show hidden replies
            ...document.querySelectorAll('a[onclick^="return togglecomment"]'),
            // Twitter/X - show more replies
            ...getElementsByXPath("//*[text()='Show more replies']"),
            ...getElementsByXPath("//*[text()='Show replies']"),
            // Generic "load more" / "show more" buttons
            ...getElementsByXPath("//*[contains(text(),'Load more')]"),
            ...getElementsByXPath("//*[contains(text(),'Show more')]"),
            // Hacker News
            ...document.querySelectorAll('a.morelink'),
        ];

        let expanded = 0;
        let loadMoreLinks = getLoadMoreLinks();
        const startTime = Date.now();

        // Keep clicking until no "load more" links remain, a limit is hit,
        // or the time budget runs out.
        while (loadMoreLinks.length > 0) {
            for (const link of loadMoreLinks) {
                // Skip certain elements
                if (link.slot === 'children') continue;

                try {
                    link.scrollIntoView({ behavior: 'smooth' });
                    link.click();
                    expanded++;
                    await wait(delay);
                } catch (e) {
                    // Ignore click errors
                }

                // Check limits
                if (expanded >= limit) return expanded;
                if (Date.now() - startTime >= timeout) return expanded;
            }

            // Check for new load more links after clicking
            await wait(delay);
            loadMoreLinks = getLoadMoreLinks();
        }

        return expanded;
    }, { timeout, limit, delay });

    if (numExpanded > 0) {
        console.error(`Clicked ${numExpanded} "load more" buttons`);
    }

    return {
        detailsExpanded,
        commentsExpanded: numExpanded,
        total: detailsExpanded + numExpanded,
    };
}
|
||||
|
||||
/**
 * Scroll the page down step by step to trigger lazy loading / infinite scroll.
 *
 * @param {object} page - puppeteer Page connected to the chrome session tab
 * @param {object} [options]
 * @param {number} [options.timeout=120000]      - overall ms budget
 * @param {number} [options.scrollDelay=2000]    - ms to wait after each scroll
 * @param {number} [options.scrollDistance=1600] - px scrolled per step
 * @param {number} [options.scrollLimit=10]      - max number of scroll steps
 * @param {number} [options.minHeight=16000]     - px height target before stopping early
 * @returns {{scrollCount: number, finalHeight: number, startingHeight: number, elapsedMs: number}}
 */
async function scrollDown(page, options = {}) {
    const {
        timeout = 120000,
        scrollDelay = 2000,
        scrollDistance = 1600,
        scrollLimit = 10,
        minHeight = 16000,
    } = options;

    const startTime = Date.now();

    // Get page height using multiple methods (some pages use different scroll containers)
    const getPageHeight = () => page.evaluate(() => {
        return Math.max(
            document.body.scrollHeight || 0,
            document.body.offsetHeight || 0,
            document.documentElement.scrollHeight || 0,
            document.documentElement.offsetHeight || 0
        );
    });

    const startingHeight = await getPageHeight();
    let lastHeight = startingHeight;
    let scrollCount = 0;
    let scrollPosition = 0;

    console.error(`Initial page height: ${startingHeight}px`);

    // Scroll to top first
    await page.evaluate(() => {
        window.scrollTo({ top: 0, left: 0, behavior: 'smooth' });
    });
    await sleep(500);

    while (scrollCount < scrollLimit) {
        // Check timeout
        const elapsed = Date.now() - startTime;
        if (elapsed >= timeout) {
            console.error(`Timeout reached after ${scrollCount} scrolls`);
            break;
        }

        // Absolute y-offset for this step (step index * step size).
        scrollPosition = (scrollCount + 1) * scrollDistance;
        console.error(`Scrolling down ${scrollCount + 1}x ${scrollDistance}px... (${scrollPosition}/${lastHeight})`);

        await page.evaluate((yOffset) => {
            window.scrollTo({ top: yOffset, left: 0, behavior: 'smooth' });
        }, scrollPosition);

        scrollCount++;
        await sleep(scrollDelay);

        // Check if new content was added (infinite scroll detection)
        const newHeight = await getPageHeight();
        const addedPx = newHeight - lastHeight;

        if (addedPx > 0) {
            console.error(`Detected infini-scrolling: ${lastHeight}+${addedPx} => ${newHeight}`);
        } else if (scrollPosition >= newHeight + scrollDistance) {
            // Reached the bottom (only stop after a few steps so short pages
            // still get a chance to lazy-load content)
            if (scrollCount > 2) {
                console.error(`Reached bottom of page at ${newHeight}px`);
                break;
            }
        }

        lastHeight = newHeight;

        // Check if we've reached minimum height and can stop
        if (lastHeight >= minHeight && scrollPosition >= lastHeight) {
            console.error(`Reached minimum height target (${minHeight}px)`);
            break;
        }
    }

    // Scroll to absolute bottom
    if (scrollPosition < lastHeight) {
        await page.evaluate(() => {
            window.scrollTo({ top: document.documentElement.scrollHeight, left: 0, behavior: 'smooth' });
        });
        await sleep(scrollDelay);
    }

    // Scroll back to top
    console.error(`Reached bottom of page at ${lastHeight}px, scrolling back to top...`);
    await page.evaluate(() => {
        window.scrollTo({ top: 0, left: 0, behavior: 'smooth' });
    });
    await sleep(scrollDelay);

    const totalElapsed = Date.now() - startTime;

    return {
        scrollCount,
        finalHeight: lastHeight,
        startingHeight,
        elapsedMs: totalElapsed,
    };
}
|
||||
|
||||
/**
 * Entry point: connect to the shared Chrome session, expand collapsible
 * content, run the infinite-scroll pass, and emit an ArchiveResult JSONL
 * record on stdout. Exits 1 on any error (no JSONL is emitted then).
 */
async function main() {
    const args = parseArgs();
    const url = args.url;
    const snapshotId = args.snapshot_id;

    if (!url || !snapshotId) {
        console.error('Usage: on_Snapshot__45_infiniscroll.js --url=<url> --snapshot-id=<uuid>');
        process.exit(1);
    }

    // Read scroll configuration from the environment (seconds -> ms for timeout).
    const timeout = getEnvInt('INFINISCROLL_TIMEOUT', 120) * 1000;
    const scrollDelay = getEnvInt('INFINISCROLL_SCROLL_DELAY', 2000);
    const scrollDistance = getEnvInt('INFINISCROLL_SCROLL_DISTANCE', 1600);
    const scrollLimit = getEnvInt('INFINISCROLL_SCROLL_LIMIT', 10);
    const minHeight = getEnvInt('INFINISCROLL_MIN_HEIGHT', 16000);
    const expandDetailsEnabled = getEnvBool('INFINISCROLL_EXPAND_DETAILS', true);

    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
        console.error(CHROME_SESSION_REQUIRED_ERROR);
        process.exit(1);
    }

    // Wait for page to be loaded
    const pageLoaded = await waitForChromeTabLoaded(60000);
    if (!pageLoaded) {
        console.error('ERROR: Page not loaded after 60s (chrome_navigate must complete first)');
        process.exit(1);
    }

    let browser = null;
    try {
        browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });

        const pages = await browser.pages();
        if (pages.length === 0) {
            throw new Error('No pages found in browser');
        }

        // Find the right page by target ID (falls back to the last open tab)
        const targetId = getPageId();
        let page = null;
        if (targetId) {
            page = pages.find(p => {
                const target = p.target();
                // NOTE(review): _targetId is a private puppeteer field -- confirm
                // it is still present when upgrading puppeteer-core.
                return target && target._targetId === targetId;
            });
        }
        if (!page) {
            page = pages[pages.length - 1];
        }

        console.error(`Starting infinite scroll on ${url}`);

        // Expand <details> and comments before scrolling (if enabled)
        let expandResult = { total: 0, detailsExpanded: 0, commentsExpanded: 0 };
        if (expandDetailsEnabled) {
            console.error('Expanding <details> and comments...');
            expandResult = await expandDetails(page, {
                timeout: Math.min(timeout / 4, 30000),
                limit: 500,
                delay: scrollDelay / 4,
            });
        }

        const result = await scrollDown(page, {
            timeout,
            scrollDelay,
            scrollDistance,
            scrollLimit,
            minHeight,
        });

        // Expand again after scrolling (new content may have loaded)
        if (expandDetailsEnabled) {
            const expandResult2 = await expandDetails(page, {
                timeout: Math.min(timeout / 4, 30000),
                limit: 500,
                delay: scrollDelay / 4,
            });
            expandResult.total += expandResult2.total;
            expandResult.detailsExpanded += expandResult2.detailsExpanded;
            expandResult.commentsExpanded += expandResult2.commentsExpanded;
        }

        // Disconnect (not close) -- the chrome plugin owns the browser process.
        browser.disconnect();

        const elapsedSec = (result.elapsedMs / 1000).toFixed(1);
        const finalHeightStr = result.finalHeight.toLocaleString();
        const addedHeight = result.finalHeight - result.startingHeight;
        const addedStr = addedHeight > 0 ? `+${addedHeight.toLocaleString()}px new content` : 'no new content';
        const expandStr = expandResult.total > 0 ? `, expanded ${expandResult.total}` : '';
        const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}${expandStr}) over ${elapsedSec}s`;

        console.error(`Success: ${outputStr}`);
        console.log(JSON.stringify({
            type: 'ArchiveResult',
            status: 'succeeded',
            output_str: outputStr,
        }));
        process.exit(0);

    } catch (e) {
        if (browser) browser.disconnect();
        console.error(`ERROR: ${e.name}: ${e.message}`);
        process.exit(1);
    }
}
|
||||
|
||||
main().catch(e => {
|
||||
console.error(`Fatal error: ${e.message}`);
|
||||
process.exit(1);
|
||||
});
|
||||
@@ -1 +0,0 @@
|
||||
<span class="abx-output-icon abx-output-icon--infiniscroll" title="Infinite Scroll"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M12 5v9"/><path d="M8 10l4 4 4-4"/><circle cx="6" cy="19" r="1" fill="currentColor" stroke="none"/><circle cx="12" cy="19" r="1" fill="currentColor" stroke="none"/><circle cx="18" cy="19" r="1" fill="currentColor" stroke="none"/></svg></span>
|
||||
@@ -1,245 +0,0 @@
|
||||
"""
|
||||
Integration tests for infiniscroll plugin
|
||||
|
||||
Tests verify:
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via chrome validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
4. INFINISCROLL_ENABLED=False skips without JSONL
|
||||
5. Fails gracefully when no chrome session exists
|
||||
6. Full integration test: scrolls page and outputs stats
|
||||
7. Config options work (scroll limit, min height)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import time
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Import shared Chrome test helpers
|
||||
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
get_test_env,
|
||||
chrome_session,
|
||||
)
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
|
||||
TEST_URL = 'https://www.singsing.movie/'
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
assert INFINISCROLL_HOOK is not None, "Infiniscroll hook not found"
|
||||
assert INFINISCROLL_HOOK.exists(), f"Hook not found: {INFINISCROLL_HOOK}"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify dependencies are available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, EnvProvider, BinProviderOverrides
|
||||
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# Verify node is available
|
||||
node_binary = Binary(name='node', binproviders=[EnvProvider()])
|
||||
node_loaded = node_binary.load()
|
||||
assert node_loaded and node_loaded.abspath, "Node.js required for infiniscroll plugin"
|
||||
|
||||
|
||||
def test_config_infiniscroll_disabled_skips():
|
||||
"""Test that INFINISCROLL_ENABLED=False exits without emitting JSONL."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
env = get_test_env()
|
||||
env['INFINISCROLL_ENABLED'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
|
||||
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
|
||||
|
||||
# Should NOT emit any JSONL
|
||||
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
|
||||
assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, got: {jsonl_lines}"
|
||||
|
||||
|
||||
def test_fails_gracefully_without_chrome_session():
|
||||
"""Test that hook fails gracefully when no chrome session exists."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
infiniscroll_dir = tmpdir / 'snapshot' / 'infiniscroll'
|
||||
infiniscroll_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'],
|
||||
cwd=infiniscroll_dir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=get_test_env(),
|
||||
timeout=30
|
||||
)
|
||||
|
||||
# Should fail (exit 1) when no chrome session
|
||||
assert result.returncode != 0, "Should fail when no chrome session exists"
|
||||
# Error could be about chrome/CDP not found, or puppeteer module missing
|
||||
err_lower = result.stderr.lower()
|
||||
assert any(x in err_lower for x in ['chrome', 'cdp', 'puppeteer', 'module']), \
|
||||
f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
|
||||
|
||||
|
||||
def test_scrolls_page_and_outputs_stats():
|
||||
"""Integration test: scroll page and verify JSONL output format."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
with chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-infiniscroll',
|
||||
snapshot_id='snap-infiniscroll',
|
||||
test_url=TEST_URL,
|
||||
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
# Create infiniscroll output directory (sibling to chrome)
|
||||
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
|
||||
infiniscroll_dir.mkdir()
|
||||
|
||||
# Run infiniscroll hook
|
||||
env['INFINISCROLL_SCROLL_LIMIT'] = '3' # Limit scrolls for faster test
|
||||
env['INFINISCROLL_SCROLL_DELAY'] = '500' # Faster scrolling
|
||||
env['INFINISCROLL_MIN_HEIGHT'] = '1000' # Lower threshold for test
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
|
||||
cwd=str(infiniscroll_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
env=env
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}\nStdout: {result.stdout}"
|
||||
|
||||
# Parse JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {result.stdout}"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
|
||||
# Verify output_str format: "scrolled to X,XXXpx (+Y,YYYpx new content) over Z.Zs"
|
||||
output_str = result_json.get('output_str', '')
|
||||
assert output_str.startswith('scrolled to'), f"output_str should start with 'scrolled to': {output_str}"
|
||||
assert 'px' in output_str, f"output_str should contain pixel count: {output_str}"
|
||||
assert re.search(r'over \d+(\.\d+)?s', output_str), f"output_str should contain duration: {output_str}"
|
||||
|
||||
# Verify no files created in output directory
|
||||
output_files = list(infiniscroll_dir.iterdir())
|
||||
assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
|
||||
|
||||
|
||||
def test_config_scroll_limit_honored():
|
||||
"""Test that INFINISCROLL_SCROLL_LIMIT config is respected."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
with chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-scroll-limit',
|
||||
snapshot_id='snap-limit',
|
||||
test_url=TEST_URL,
|
||||
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
|
||||
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
|
||||
infiniscroll_dir.mkdir()
|
||||
|
||||
# Set scroll limit to 2 (use env from setup_chrome_session)
|
||||
env['INFINISCROLL_SCROLL_LIMIT'] = '2'
|
||||
env['INFINISCROLL_SCROLL_DELAY'] = '500'
|
||||
env['INFINISCROLL_MIN_HEIGHT'] = '100000' # High threshold so limit kicks in
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'],
|
||||
cwd=str(infiniscroll_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
env=env
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}"
|
||||
|
||||
# Parse output and verify scroll count
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip().startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json is not None, "Should have JSONL output"
|
||||
output_str = result_json.get('output_str', '')
|
||||
|
||||
# Verify output format and that it completed (scroll limit enforced internally)
|
||||
assert output_str.startswith('scrolled to'), f"Should have valid output_str: {output_str}"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed with scroll limit: {result_json}"
|
||||
|
||||
|
||||
|
||||
def test_config_timeout_honored():
|
||||
"""Test that INFINISCROLL_TIMEOUT config is respected."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
with chrome_session(
|
||||
Path(tmpdir),
|
||||
crawl_id='test-timeout',
|
||||
snapshot_id='snap-timeout',
|
||||
test_url=TEST_URL,
|
||||
) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
|
||||
|
||||
infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
|
||||
infiniscroll_dir.mkdir()
|
||||
|
||||
# Set very short timeout (use env from setup_chrome_session)
|
||||
env['INFINISCROLL_TIMEOUT'] = '3' # 3 seconds
|
||||
env['INFINISCROLL_SCROLL_DELAY'] = '2000' # 2s delay - timeout should trigger
|
||||
env['INFINISCROLL_SCROLL_LIMIT'] = '100' # High limit
|
||||
env['INFINISCROLL_MIN_HEIGHT'] = '100000'
|
||||
|
||||
start_time = time.time()
|
||||
result = subprocess.run(
|
||||
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'],
|
||||
cwd=str(infiniscroll_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
env=env
|
||||
)
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
# Should complete within reasonable time (timeout + buffer)
|
||||
assert elapsed < 15, f"Should respect timeout, took {elapsed:.1f}s"
|
||||
assert result.returncode == 0, f"Should complete even with timeout: {result.stderr}"
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
@@ -1,14 +0,0 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required_plugins": ["chrome"],
|
||||
"properties": {
|
||||
"ISTILLDONTCAREABOUTCOOKIES_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["USE_ISTILLDONTCAREABOUTCOOKIES"],
|
||||
"description": "Enable I Still Don't Care About Cookies browser extension"
|
||||
}
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user