switch to external plugins

This commit is contained in:
Nick Sweeting
2026-03-15 03:45:51 -07:00
parent 07dc880d0b
commit ecb1764590
256 changed files with 516 additions and 31272 deletions

View File

@@ -30,7 +30,8 @@
"WebFetch(domain:python-statemachine.readthedocs.io)",
"Bash(./bin/run_plugin_tests.sh:*)",
"Bash(done)",
"Bash(coverage erase:*)"
"Bash(coverage erase:*)",
"Bash(gh api:*)"
]
},
"hooks": {

View File

@@ -491,6 +491,7 @@ docker run -it -v $PWD:/data archivebox/archivebox help
# optional: import your browser cookies into a persona for logged-in archiving
archivebox persona create --import=chrome personal
# supported: chrome/chromium/brave/edge (Chromium-based only)
# use --profile to target a specific profile (e.g. Default, Profile 1)
# re-running import merges/dedupes cookies.txt (by domain/path/name) but replaces chrome_user_data
```

View File

@@ -18,6 +18,7 @@ from pathlib import Path
# Import uuid_compat early to monkey-patch uuid.uuid7 before Django loads migrations
# This fixes migrations generated on Python 3.14+ that reference uuid.uuid7 directly
from archivebox import uuid_compat # noqa: F401
from abx_plugins import get_plugins_dir
# Force unbuffered output for real-time logs
if hasattr(sys.stdout, 'reconfigure'):
@@ -56,9 +57,13 @@ check_io_encoding()
# Install monkey patches for third-party libraries
from .misc.monkey_patches import * # noqa
# Built-in plugin directories
BUILTIN_PLUGINS_DIR = PACKAGE_DIR / 'plugins'
USER_PLUGINS_DIR = Path(os.getcwd()) / 'plugins'
# Plugin directories
BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve()
USER_PLUGINS_DIR = Path(
os.environ.get('ARCHIVEBOX_USER_PLUGINS_DIR')
or os.environ.get('USER_PLUGINS_DIR')
or os.environ.get('DATA_DIR', os.getcwd())
) / 'custom_plugins'
# These are kept for backwards compatibility with existing code
# that checks for plugins. The new hook system uses discover_hooks()

View File

@@ -33,6 +33,7 @@ import shutil
import platform
import subprocess
import tempfile
import json
from pathlib import Path
from typing import Optional, Iterable
from collections import OrderedDict
@@ -138,6 +139,55 @@ def get_edge_user_data_dir() -> Optional[Path]:
return None
def get_browser_binary(browser: str) -> Optional[str]:
    """Locate the executable for a Chromium-family browser on this machine.

    Checks a list of well-known install locations for the current OS first,
    then falls back to searching PATH via shutil.which so that non-standard
    installs (e.g. Nix, Homebrew-linked, or custom-prefix browsers) are still
    found.

    Args:
        browser: One of 'chrome', 'chromium', 'brave', 'edge' (case-insensitive).

    Returns:
        Absolute path to the browser binary as a string, or None if not found.
    """
    system = platform.system()
    home = Path.home()
    browser = browser.lower()
    if system == 'Darwin':
        candidates = {
            'chrome': ['/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'],
            'chromium': ['/Applications/Chromium.app/Contents/MacOS/Chromium'],
            'brave': ['/Applications/Brave Browser.app/Contents/MacOS/Brave Browser'],
            'edge': ['/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'],
        }.get(browser, [])
    elif system == 'Linux':
        candidates = {
            'chrome': ['/usr/bin/google-chrome', '/usr/bin/google-chrome-stable', '/usr/bin/google-chrome-beta', '/usr/bin/google-chrome-unstable'],
            'chromium': ['/usr/bin/chromium', '/usr/bin/chromium-browser'],
            'brave': ['/usr/bin/brave-browser', '/usr/bin/brave-browser-beta', '/usr/bin/brave-browser-nightly'],
            'edge': ['/usr/bin/microsoft-edge', '/usr/bin/microsoft-edge-stable', '/usr/bin/microsoft-edge-beta', '/usr/bin/microsoft-edge-dev'],
        }.get(browser, [])
    elif system == 'Windows':
        local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
        candidates = {
            'chrome': [
                str(local_app_data / 'Google' / 'Chrome' / 'Application' / 'chrome.exe'),
                'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe',
                'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe',
            ],
            'chromium': [str(local_app_data / 'Chromium' / 'Application' / 'chrome.exe')],
            'brave': [
                str(local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'Application' / 'brave.exe'),
                'C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe',
                'C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe',
            ],
            'edge': [
                str(local_app_data / 'Microsoft' / 'Edge' / 'Application' / 'msedge.exe'),
                'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe',
                'C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe',
            ],
        }.get(browser, [])
    else:
        candidates = []
    for candidate in candidates:
        if candidate and Path(candidate).exists():
            return candidate
    # Fall back to searching PATH for the usual executable names. This finds
    # browsers installed outside the hard-coded locations above.
    path_names = {
        'chrome': ['google-chrome', 'google-chrome-stable', 'chrome'],
        'chromium': ['chromium', 'chromium-browser'],
        'brave': ['brave-browser', 'brave'],
        'edge': ['microsoft-edge', 'msedge'],
    }.get(browser, [])
    for name in path_names:
        found = shutil.which(name)
        if found:
            return found
    return None
BROWSER_PROFILE_FINDERS = {
'chrome': get_chrome_user_data_dir,
'chromium': get_chrome_user_data_dir, # Same locations
@@ -194,7 +244,12 @@ def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
_write_netscape_cookies(existing_file, existing)
def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
def extract_cookies_via_cdp(
user_data_dir: Path,
output_file: Path,
profile_dir: str | None = None,
chrome_binary: str | None = None,
) -> bool:
"""
Launch Chrome with the given user data dir and extract cookies via CDP.
@@ -218,6 +273,8 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
env['NODE_MODULES_DIR'] = str(node_modules_dir)
env['CHROME_USER_DATA_DIR'] = str(user_data_dir)
env['CHROME_HEADLESS'] = 'true'
if chrome_binary:
env['CHROME_BINARY'] = str(chrome_binary)
output_path = output_file
temp_output = None
temp_dir = None
@@ -225,6 +282,23 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
temp_dir = Path(tempfile.mkdtemp(prefix='ab_cookies_'))
temp_output = temp_dir / 'cookies.txt'
output_path = temp_output
if profile_dir:
extra_arg = f'--profile-directory={profile_dir}'
existing_extra = env.get('CHROME_ARGS_EXTRA', '').strip()
args_list = []
if existing_extra:
if existing_extra.startswith('['):
try:
parsed = json.loads(existing_extra)
if isinstance(parsed, list):
args_list.extend(str(x) for x in parsed)
except Exception:
args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()])
else:
args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()])
args_list.append(extra_arg)
env['CHROME_ARGS_EXTRA'] = json.dumps(args_list)
env['COOKIES_OUTPUT_FILE'] = str(output_path)
try:
@@ -322,6 +396,7 @@ def ensure_path_within_personas_dir(persona_path: Path) -> bool:
def create_personas(
names: Iterable[str],
import_from: Optional[str] = None,
profile: Optional[str] = None,
) -> int:
"""
Create Personas from names.
@@ -360,6 +435,15 @@ def create_personas(
rprint(f'[dim]Found {import_from} profile: {source_profile_dir}[/dim]', file=sys.stderr)
if profile is None and (source_profile_dir / 'Default').exists():
profile = 'Default'
browser_binary = get_browser_binary(import_from)
if browser_binary:
rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr)
else:
browser_binary = None
created_count = 0
for name in name_list:
name = name.strip()
@@ -414,7 +498,12 @@ def create_personas(
# Extract cookies via CDP
rprint(f'[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
if extract_cookies_via_cdp(persona_chrome_dir, cookies_file):
if extract_cookies_via_cdp(
persona_chrome_dir,
cookies_file,
profile_dir=profile,
chrome_binary=browser_binary,
):
rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
else:
rprint(f'[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
@@ -652,9 +741,10 @@ def main():
@main.command('create')
@click.argument('names', nargs=-1)
@click.option('--import', 'import_from', help='Import profile from browser (chrome, chromium, brave, edge)')
def create_cmd(names: tuple, import_from: Optional[str]):
@click.option('--profile', help='Profile directory name under the user data dir (e.g. Default, Profile 1)')
def create_cmd(names: tuple, import_from: Optional[str], profile: Optional[str]):
"""Create Personas, optionally importing from a browser profile."""
sys.exit(create_personas(names, import_from=import_from))
sys.exit(create_personas(names, import_from=import_from, profile=profile))
@main.command('list')

View File

@@ -277,7 +277,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
# Show a helpful message when no plugins found
rows['Name'].append('(no plugins found)')
rows['Source'].append('-')
rows['Path'].append(mark_safe('<code>archivebox/plugins/</code> or <code>data/plugins/</code>'))
rows['Path'].append(mark_safe('<code>abx_plugins/plugins/</code> or <code>data/custom_plugins/</code>'))
rows['Hooks'].append('-')
rows['Config'].append('-')

View File

@@ -140,6 +140,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', TagNameListFilter)
fieldsets = (
('Actions', {
'fields': ('admin_actions',),
'classes': ('card', 'wide', 'actions-card'),
}),
('URL', {
'fields': ('url', 'title'),
'classes': ('card', 'wide'),
@@ -168,10 +172,6 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
'fields': ('output_dir',),
'classes': ('card',),
}),
('Actions', {
'fields': ('admin_actions',),
'classes': ('card', 'wide'),
}),
('Archive Results', {
'fields': ('archiveresults_list',),
'classes': ('card', 'wide'),
@@ -179,7 +179,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
)
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
actions = ['add_tags', 'remove_tags', 'resnapshot_snapshot', 'update_snapshots', 'overwrite_snapshots', 'delete_snapshots']
inlines = [] # Removed TagInline, using TagEditorWidget instead
list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)
@@ -301,6 +301,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# obj.pk,
# )
@admin.display(description='')
def admin_actions(self, obj):
summary_url = build_web_url(f'/{obj.archive_path}')
results_url = build_web_url(f'/{obj.archive_path}/index.html#all')
@@ -311,13 +312,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
href="{}"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📄 Summary Page
📄 View Snapshot
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="{}"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📁 Result Files
📁 All files
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="{}"
@@ -329,19 +330,19 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
<span style="border-left: 1px solid #e2e8f0; height: 24px; margin: 0 4px;"></span>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Get missing extractors"
onmouseover="this.style.background='#d1fae5';"
onmouseout="this.style.background='#ecfdf5';">
⬇️ Finish
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #eff6ff; border: 1px solid #bfdbfe; border-radius: 8px; color: #1e40af; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Create a fresh new snapshot of this URL"
onmouseover="this.style.background='#dbeafe';"
onmouseout="this.style.background='#eff6ff';">
🆕 Archive Again
🆕 Archive Now
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Redo failed extractors (missing outputs)"
onmouseover="this.style.background='#d1fae5';"
onmouseout="this.style.background='#ecfdf5';">
🔁 Redo Failed
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fffbeb; border: 1px solid #fde68a; border-radius: 8px; color: #92400e; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
@@ -707,7 +708,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# return super().changelist_view(request, extra_context=None)
@admin.action(
description="⏯️ Finish"
description="🔁 Redo Failed"
)
def update_snapshots(self, request, queryset):
count = queryset.count()
@@ -721,7 +722,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
@admin.action(
description="⬇️ Fresh"
description="🆕 Archive Now"
)
def resnapshot_snapshot(self, request, queryset):
for snapshot in queryset:

View File

@@ -1704,8 +1704,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
Create ArchiveResult records for all enabled hooks.
Uses the hooks system to discover available hooks from:
- archivebox/plugins/*/on_Snapshot__*.{py,sh,js}
- data/plugins/*/on_Snapshot__*.{py,sh,js}
- abx_plugins/plugins/*/on_Snapshot__*.{py,sh,js}
- data/custom_plugins/*/on_Snapshot__*.{py,sh,js}
Creates one ArchiveResult per hook (not per plugin), with hook_name set.
This enables step-based execution where all hooks in a step can run in parallel.
@@ -2486,7 +2486,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
@property
def plugin_module(self) -> Any | None:
# Hook scripts are now used instead of Python plugin modules
# The plugin name maps to hooks in archivebox/plugins/{plugin}/
# The plugin name maps to hooks in abx_plugins/plugins/{plugin}/
return None
def output_exists(self) -> bool:

View File

@@ -349,15 +349,6 @@ def plugin_name(value: str) -> str:
return get_plugin_name(value)
@register.filter
def plugin_display_name(value: str) -> str:
"""
Human-friendly plugin name overrides for UI display.
"""
name = get_plugin_name(value)
if name == 'merkletree':
return 'hashes'
return name
@register.simple_tag(takes_context=True)

View File

@@ -1145,13 +1145,31 @@ def live_progress_view(request):
for proc in running_workers:
env = proc.env or {}
if not isinstance(env, dict):
continue
env = {}
cmd = proc.cmd or []
if proc.worker_type == 'crawl':
crawl_id = env.get('CRAWL_ID')
if not crawl_id:
for i, part in enumerate(cmd):
if part == '--crawl-id' and i + 1 < len(cmd):
crawl_id = cmd[i + 1]
break
if part.startswith('--crawl-id='):
crawl_id = part.split('=', 1)[1]
break
if crawl_id:
crawl_worker_pids[str(crawl_id)] = proc.pid
elif proc.worker_type == 'snapshot':
snapshot_id = env.get('SNAPSHOT_ID')
if not snapshot_id:
for i, part in enumerate(cmd):
if part == '--snapshot-id' and i + 1 < len(cmd):
snapshot_id = cmd[i + 1]
break
if part.startswith('--snapshot-id='):
snapshot_id = part.split('=', 1)[1]
break
if snapshot_id:
snapshot_worker_pids[str(snapshot_id)] = proc.pid
@@ -1243,7 +1261,7 @@ def live_progress_view(request):
'plugin': ar.plugin,
'status': status,
}
if ar.process_id and ar.process and ar.process.status == Process.StatusChoices.RUNNING:
if status == ArchiveResult.StatusChoices.STARTED and ar.process_id and ar.process:
plugin_payload['pid'] = ar.process.pid
if status == ArchiveResult.StatusChoices.STARTED:
plugin_payload['progress'] = progress_value

View File

@@ -6,8 +6,8 @@ with ArchiveBox via CLI arguments and stdout JSON output. This keeps the plugin
system simple and language-agnostic.
Directory structure:
archivebox/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (built-in)
data/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (user)
abx_plugins/plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (built-in package)
data/custom_plugins/<plugin_name>/on_<Event>__<hook_name>.<ext> (user)
Hook contract:
Input: --url=<url> (and other --key=value args)
@@ -66,14 +66,20 @@ from functools import lru_cache
from pathlib import Path
from typing import List, Dict, Any, Optional, TypedDict
from abx_plugins import get_plugins_dir
from django.conf import settings
from django.utils import timezone
from django.utils.safestring import mark_safe
from archivebox.config.constants import CONSTANTS
# Plugin directories
BUILTIN_PLUGINS_DIR = Path(__file__).parent / 'plugins'
USER_PLUGINS_DIR = Path(getattr(settings, 'DATA_DIR', Path.cwd())) / 'plugins'
BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve()
USER_PLUGINS_DIR = Path(
os.environ.get('ARCHIVEBOX_USER_PLUGINS_DIR')
or getattr(settings, 'USER_PLUGINS_DIR', '')
or str(CONSTANTS.USER_PLUGINS_DIR)
).expanduser()
# =============================================================================
@@ -197,11 +203,11 @@ def discover_hooks(
for hook in hooks:
# Get plugin name from parent directory
# e.g., archivebox/plugins/wget/on_Snapshot__50_wget.py -> 'wget'
# e.g., abx_plugins/plugins/wget/on_Snapshot__50_wget.py -> 'wget'
plugin_name = hook.parent.name
# Check if this is a plugin directory (not the root plugins dir)
if plugin_name in ('plugins', '.'):
if hook.parent.resolve() in (BUILTIN_PLUGINS_DIR.resolve(), USER_PLUGINS_DIR.resolve()):
# Hook is in root plugins directory, not a plugin subdir
# Include it by default (no filtering for non-plugin hooks)
enabled_hooks.append(hook)
@@ -581,7 +587,7 @@ def get_plugins() -> List[str]:
The plugin name is the plugin directory name, not the hook script name.
Example:
archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js
abx_plugins/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js
-> plugin = 'chrome'
Sorted alphabetically (plugins control their hook order via numeric prefixes in hook names).
@@ -728,7 +734,7 @@ def discover_plugins_that_provide_interface(
try:
# Import the module dynamically
spec = importlib.util.spec_from_file_location(
f'archivebox.plugins.{plugin_name}.{module_name}',
f'archivebox.dynamic_plugins.{plugin_name}.{module_name}',
module_path
)
if spec is None or spec.loader is None:
@@ -942,7 +948,7 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
# Plugins can provide custom templates for rendering their output in the UI.
# Templates are discovered by filename convention inside each plugin's templates/ dir:
#
# archivebox/plugins/<plugin_name>/
# abx_plugins/plugins/<plugin_name>/
# templates/
# icon.html # Icon for admin table view (small inline HTML)
# card.html # Preview card for snapshot header

View File

@@ -0,0 +1,318 @@
__package__ = 'archivebox.ideas'
import asyncio
import json
import os
import shlex
import signal
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Callable, Mapping, MutableMapping, Optional
from pydantic import BaseModel, Field
try:
from bubus import BaseEvent, EventBus
except Exception as exc: # pragma: no cover - optional dependency
raise ImportError('ProcessPlugin requires bubus to be installed') from exc
try:
from bubus.service import uuid7str
except Exception: # pragma: no cover - optional dependency
from uuid import uuid4 as _uuid4
def uuid7str() -> str:
return str(_uuid4())
def _utcnow() -> datetime:
return datetime.now(timezone.utc)
class ProcessRecord(BaseModel):
    """Serializable record of a spawned subprocess and its on-disk artifacts."""

    # Unique id for this process (uuid7 string when bubus provides it).
    id: str = Field(default_factory=uuid7str)
    # argv list passed to the subprocess (executed without a shell).
    cmd: list[str]
    cwd: str | None = None
    # Full environment the process was launched with.
    env: dict[str, str] = Field(default_factory=dict)
    pid: int | None = None
    started_at: datetime | None = None
    ended_at: datetime | None = None
    exit_code: int | None = None
    # Paths to the log/metadata files written next to the process output.
    stdout_path: str | None = None
    stderr_path: str | None = None
    cmd_path: str | None = None
    pid_path: str | None = None
    # True when the process is monitored in a background task instead of awaited inline.
    is_background: bool = False
    parent_process_id: str | None = None
class ProcessLaunch(BaseEvent[ProcessRecord]):
    """Event requesting that a subprocess be spawned; resolves to a ProcessRecord."""

    cmd: list[str]
    cwd: str | None = None
    # Extra environment entries, merged over the plugin's base env.
    env: dict[str, str] | None = None
    # Seconds before the watcher escalates SIGTERM -> SIGKILL; None = wait forever.
    timeout: float | None = None
    # Directory where stdout/stderr/cmd/pid artifact files are written.
    output_dir: str | None = None
    # Filename prefix for the artifact files (defaults to the process id).
    log_prefix: str | None = None
    # True: return immediately and monitor the process in a background task.
    is_background: bool = False
    parent_process_id: str | None = None
    # Scan stdout lines for JSON objects and re-dispatch them as events.
    parse_stdout_events: bool = True
class ProcessStarted(BaseEvent[None]):
    """Notification emitted immediately after a subprocess has been spawned."""

    process: ProcessRecord
class ProcessExited(BaseEvent[None]):
    """Notification emitted once a subprocess has exited and its streams are drained."""

    process: ProcessRecord
class ProcessKill(BaseEvent[ProcessRecord]):
    """Event requesting termination of a running process; resolves to its final record."""

    process_id: str
    # Signal to send first (default SIGTERM); escalates to SIGKILL after `timeout`.
    signal: int = signal.SIGTERM
    timeout: float | None = 10.0
@dataclass
class _RunningProcess:
    """Internal bookkeeping for a live subprocess owned by ProcessPlugin."""

    process: asyncio.subprocess.Process
    record: ProcessRecord
    # Tasks draining stdout/stderr into log files (awaited during finalization).
    stdout_task: asyncio.Task[None] | None
    stderr_task: asyncio.Task[None] | None
    # Background watcher task (only set for is_background launches).
    watcher_task: asyncio.Task[None] | None
    # event_id of the originating ProcessLaunch, used to parent emitted events.
    parent_event_id: str | None
JsonEventAdapter = Callable[[dict[str, Any], str | None], Optional[BaseEvent[Any]]]
class ProcessPlugin:
    """Spawn and monitor processes using events (no Django required).

    Handles ProcessLaunch and ProcessKill events on an EventBus, spawning
    subprocesses with asyncio, logging their output to files, and emitting
    ProcessStarted/ProcessExited events. JSON lines found on a child's stdout
    can be re-dispatched as events onto the bus.
    """

    def __init__(
        self,
        bus: EventBus,
        *,
        env: Mapping[str, str] | None = None,
        json_event_adapter: JsonEventAdapter | None = None,
    ) -> None:
        self.bus = bus
        # Copy so later mutations of os.environ (or the caller's mapping) don't leak in.
        self.env = dict(env or os.environ)
        # Optional callback that converts a parsed stdout JSON dict into an event.
        self.json_event_adapter = json_event_adapter
        # process_id -> live bookkeeping; entries are removed by _finalize_process().
        self._running: MutableMapping[str, _RunningProcess] = {}

    def register_event_handlers(self) -> None:
        """Subscribe this plugin's handlers on the bus."""
        self.bus.on(ProcessLaunch, self.on_ProcessLaunch)
        self.bus.on(ProcessKill, self.on_ProcessKill)

    async def on_ProcessLaunch(self, event: ProcessLaunch) -> ProcessRecord:
        """Spawn the requested subprocess and return its ProcessRecord.

        Writes <prefix>.sh / <prefix>.pid / <prefix>.stdout.log / <prefix>.stderr.log
        into the output directory. For background launches the record is returned
        immediately and a watcher task finalizes the process later; otherwise the
        call awaits process exit before returning.
        """
        parent_event_id = event.event_id
        proc_id = uuid7str()
        cwd = event.cwd or event.output_dir or os.getcwd()
        output_dir = Path(event.output_dir or cwd)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Event-supplied env entries override the plugin's base env.
        env = {**self.env, **(event.env or {})}
        log_prefix = event.log_prefix or proc_id
        stdout_path = output_dir / f'{log_prefix}.stdout.log'
        stderr_path = output_dir / f'{log_prefix}.stderr.log'
        cmd_path = output_dir / f'{log_prefix}.sh'
        pid_path = output_dir / f'{log_prefix}.pid'
        self._write_cmd_file(cmd_path, event.cmd)
        # start_new_session=True puts the child in its own process group so
        # _terminate_process can signal the whole group via killpg.
        proc = await asyncio.create_subprocess_exec(
            *event.cmd,
            cwd=str(cwd),
            env=env,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            start_new_session=True,
        )
        self._write_pid_file(pid_path, proc.pid)
        record = ProcessRecord(
            id=proc_id,
            cmd=event.cmd,
            cwd=str(cwd),
            env=env,
            pid=proc.pid,
            started_at=_utcnow(),
            stdout_path=str(stdout_path),
            stderr_path=str(stderr_path),
            cmd_path=str(cmd_path),
            pid_path=str(pid_path),
            is_background=event.is_background,
            parent_process_id=event.parent_process_id,
        )
        # Dispatched via the event's own bus attribute rather than self.bus —
        # NOTE(review): presumably bubus attaches the originating bus to each
        # event and these are the same object; confirm against bubus docs.
        await event.event_bus.dispatch(
            ProcessStarted(process=record, event_parent_id=parent_event_id)
        )
        # Only stdout is scanned for JSON events; stderr is log-only.
        stdout_task = asyncio.create_task(
            self._consume_stream(
                proc.stdout, stdout_path, parent_event_id, event.parse_stdout_events
            )
        )
        stderr_task = asyncio.create_task(
            self._consume_stream(proc.stderr, stderr_path, parent_event_id, False)
        )
        running = _RunningProcess(
            process=proc,
            record=record,
            stdout_task=stdout_task,
            stderr_task=stderr_task,
            watcher_task=None,
            parent_event_id=parent_event_id,
        )
        self._running[proc_id] = running
        if event.is_background:
            running.watcher_task = asyncio.create_task(
                self._watch_process(proc_id, event.timeout)
            )
            return record
        # Foreground launch: block until exit (with timeout escalation) before returning.
        await self._watch_process(proc_id, event.timeout)
        return self._running.get(proc_id, running).record

    async def on_ProcessKill(self, event: ProcessKill) -> ProcessRecord:
        """Signal a running process, escalate to SIGKILL on timeout, and finalize it.

        Raises:
            RuntimeError: if no process with the given id is being tracked.
        """
        running = self._running.get(event.process_id)
        if not running:
            raise RuntimeError(f'Process not found: {event.process_id}')
        proc = running.process
        self._terminate_process(proc, event.signal)
        if event.timeout is not None:
            try:
                await asyncio.wait_for(proc.wait(), timeout=event.timeout)
            except asyncio.TimeoutError:
                # Didn't die in time: force-kill.
                self._terminate_process(proc, signal.SIGKILL)
        else:
            await proc.wait()
        await self._finalize_process(event.process_id)
        # _finalize_process pops the entry, so fall back to the captured record.
        return self._running.get(event.process_id, running).record

    async def _watch_process(self, process_id: str, timeout: float | None) -> None:
        """Wait for process exit (optionally bounded), escalate signals, then finalize."""
        running = self._running.get(process_id)
        if not running:
            return
        proc = running.process
        try:
            if timeout is not None:
                await asyncio.wait_for(proc.wait(), timeout=timeout)
            else:
                await proc.wait()
        except asyncio.TimeoutError:
            # Graceful stop first, then SIGKILL if still alive 2s later.
            self._terminate_process(proc, signal.SIGTERM)
            await asyncio.sleep(2)
            if proc.returncode is None:
                self._terminate_process(proc, signal.SIGKILL)
            await proc.wait()
        await self._finalize_process(process_id)

    async def _finalize_process(self, process_id: str) -> None:
        """Drain output tasks, record exit metadata, emit ProcessExited, and untrack."""
        running = self._running.get(process_id)
        if not running:
            return
        proc = running.process
        record = running.record
        # Ensure all output has been written to the log files before reporting exit.
        if running.stdout_task:
            await running.stdout_task
        if running.stderr_task:
            await running.stderr_task
        record.exit_code = proc.returncode
        record.ended_at = _utcnow()
        await self.bus.dispatch(
            ProcessExited(process=record, event_parent_id=running.parent_event_id)
        )
        self._running.pop(process_id, None)

    async def _consume_stream(
        self,
        stream: asyncio.StreamReader | None,
        path: Path,
        parent_event_id: str | None,
        parse_events: bool,
    ) -> None:
        """Copy a child stream to a log file line-by-line, optionally parsing JSON events."""
        if stream is None:
            return
        with path.open('w', encoding='utf-8') as fh:
            while True:
                line = await stream.readline()
                if not line:
                    break  # EOF: child closed the stream
                text = line.decode('utf-8', errors='replace')
                fh.write(text)
                fh.flush()  # keep the log file live-tailable
                if parse_events:
                    await self._maybe_dispatch_json_event(text, parent_event_id)

    async def _maybe_dispatch_json_event(self, line: str, parent_event_id: str | None) -> None:
        """If a stdout line looks like a JSON object, convert it to an event and dispatch it.

        Non-JSON lines and unparseable payloads are silently ignored (best-effort).
        """
        text = line.strip()
        # Cheap pre-check: only attempt json.loads on lines shaped like an object.
        if not text.startswith('{') or not text.endswith('}'):
            return
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            return
        event = None
        if self.json_event_adapter:
            # Caller-provided adapter takes precedence over the generic path.
            event = self.json_event_adapter(data, parent_event_id)
        elif isinstance(data, dict) and 'event_type' in data:
            try:
                event = BaseEvent.model_validate(data)
            except Exception:
                event = None
        if event is None:
            return
        # Parent the event to the originating launch unless already parented.
        if not getattr(event, 'event_parent_id', None) and parent_event_id:
            event.event_parent_id = parent_event_id
        await self.bus.dispatch(event)

    @staticmethod
    def _write_cmd_file(path: Path, cmd: list[str]) -> None:
        """Write the launch command as a shell-quoted one-liner for later inspection."""
        cmd_line = ' '.join(shlex.quote(part) for part in cmd)
        path.write_text(cmd_line + '\n', encoding='utf-8')

    @staticmethod
    def _write_pid_file(path: Path, pid: int) -> None:
        """Write the child's pid and touch the file's mtime to the launch time."""
        path.write_text(str(pid), encoding='utf-8')
        ts = datetime.now().timestamp()
        os.utime(path, (ts, ts))

    @staticmethod
    def _terminate_process(proc: asyncio.subprocess.Process, sig: int) -> None:
        """Send a signal to the process group (falling back to the single pid).

        No-op if the process has already exited; signalling errors are swallowed
        deliberately (the process may have died between the check and the kill).
        """
        if proc.returncode is not None:
            return
        try:
            # Signal the whole group (child was started with start_new_session=True).
            os.killpg(proc.pid, sig)
        except Exception:
            try:
                os.kill(proc.pid, sig)
            except Exception:
                pass
__all__ = [
'ProcessRecord',
'ProcessLaunch',
'ProcessStarted',
'ProcessExited',
'ProcessKill',
'ProcessPlugin',
]

View File

@@ -1,21 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"ACCESSIBILITY_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_ACCESSIBILITY", "USE_ACCESSIBILITY"],
"description": "Enable accessibility tree capture"
},
"ACCESSIBILITY_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for accessibility capture in seconds"
}
}
}

View File

@@ -1,288 +0,0 @@
#!/usr/bin/env node
/**
* Extract accessibility tree and page outline from a URL.
*
* Extracts:
* - Page outline (headings h1-h6, sections, articles)
* - Iframe tree
* - Accessibility snapshot
* - ARIA labels and roles
*
* Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>
* Output: Writes accessibility/accessibility.json
*
* Environment variables:
* SAVE_ACCESSIBILITY: Enable accessibility extraction (default: true)
*/
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
// Extractor metadata
const PLUGIN_NAME = 'accessibility';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'accessibility.json';
const CHROME_SESSION_DIR = '../chrome';
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
// Parse command line arguments
// Parse `--key=value` CLI arguments into a plain object.
// Dashes in keys become underscores; a bare `--flag` maps to `true`.
function parseArgs() {
    const parsed = {};
    for (const token of process.argv.slice(2)) {
        if (!token.startsWith('--')) continue;
        const eq = token.indexOf('=');
        const rawKey = eq === -1 ? token.slice(2) : token.slice(2, eq);
        const value = eq === -1 ? '' : token.slice(eq + 1);
        parsed[rawKey.replace(/-/g, '_')] = value || true;
    }
    return parsed;
}
// Get environment variable with default
// Read an environment variable (trimmed); falls back to `defaultValue`
// when the variable is unset or empty.
function getEnv(name, defaultValue = '') {
    const value = process.env[name] || defaultValue;
    return value.trim();
}
// Interpret an environment variable as a boolean.
// Accepts true/1/yes/on and false/0/no/off (case-insensitive);
// anything else yields `defaultValue`.
function getEnvBool(name, defaultValue = false) {
    switch (getEnv(name, '').toLowerCase()) {
        case 'true':
        case '1':
        case 'yes':
        case 'on':
            return true;
        case 'false':
        case '0':
        case 'no':
        case 'off':
            return false;
        default:
            return defaultValue;
    }
}
// Wait for chrome tab to be fully loaded
// Poll for the chrome plugin's navigation.json marker file, which signals
// that the shared tab has finished loading. Resolves to true as soon as the
// file exists, or false once timeoutMs has elapsed without it appearing.
async function waitForChromeTabLoaded(timeoutMs = 60000) {
    const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
    const startTime = Date.now();
    while (Date.now() - startTime < timeoutMs) {
        if (fs.existsSync(navigationFile)) {
            return true;
        }
        // Wait 100ms before checking again
        await new Promise(resolve => setTimeout(resolve, 100));
    }
    return false;
}
// Get CDP URL from chrome plugin
// Read the CDP websocket URL written by the chrome plugin,
// or null when the session file does not exist yet.
function getCdpUrl() {
    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
    if (!fs.existsSync(cdpFile)) return null;
    return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Verify that the chrome plugin's session artifacts exist and that the
// browser process is still alive. Returns the CDP websocket URL, or throws
// CHROME_SESSION_REQUIRED_ERROR when any part of the session is missing/dead.
function assertChromeSession() {
    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
    const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
    const pidFile = path.join(CHROME_SESSION_DIR, 'chrome.pid');
    if (!fs.existsSync(cdpFile) || !fs.existsSync(targetIdFile) || !fs.existsSync(pidFile)) {
        throw new Error(CHROME_SESSION_REQUIRED_ERROR);
    }
    try {
        const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
        if (!pid || Number.isNaN(pid)) throw new Error('Invalid pid');
        // Signal 0 performs an existence check without delivering a signal;
        // throws if the chrome process has exited.
        process.kill(pid, 0);
    } catch (e) {
        throw new Error(CHROME_SESSION_REQUIRED_ERROR);
    }
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
        throw new Error(CHROME_SESSION_REQUIRED_ERROR);
    }
    return cdpUrl;
}
// Extract accessibility info
// Extract the accessibility tree, a textual page outline, and the iframe tree
// from the page already open in the shared Chrome session, then write the
// combined JSON to accessibility.json in the current directory.
// Returns { success: true, output, accessibilityData } on success,
// or { success: false, error } on any failure (never throws).
async function extractAccessibility(url) {
    // Output directory is current directory (hook already runs in output dir)
    const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
    let browser = null;
    try {
        // Connect to existing Chrome session (throws if chrome plugin hasn't run)
        const cdpUrl = assertChromeSession();
        browser = await puppeteer.connect({
            browserWSEndpoint: cdpUrl,
        });
        // Pick the first http(s) page, falling back to whatever page exists
        const pages = await browser.pages();
        const page = pages.find(p => p.url().startsWith('http')) || pages[0];
        if (!page) {
            return { success: false, error: 'No page found in Chrome session' };
        }
        // Get accessibility snapshot (interesting nodes only)
        const accessibilityTree = await page.accessibility.snapshot({ interestingOnly: true });
        // Extract page outline (headings, sections, etc.) in the page context
        const outline = await page.evaluate(() => {
            const headings = [];
            const elements = document.querySelectorAll(
                'h1, h2, h3, h4, h5, h6, a[name], header, footer, article, main, aside, nav, section, figure, summary, table, form, iframe'
            );
            elements.forEach(elem => {
                // Skip unnamed anchors
                if (elem.tagName.toLowerCase() === 'a' && !elem.name) return;
                const tagName = elem.tagName.toLowerCase();
                const elemId = elem.id || elem.name || elem.getAttribute('aria-label') || elem.role || '';
                const elemClasses = (elem.className || '').toString().trim().split(/\s+/).slice(0, 3).join(' .');
                const action = elem.action?.split('/').pop() || '';
                // Truncate long text content to 128 chars with an ellipsis
                let summary = (elem.innerText || '').slice(0, 128);
                if (summary.length >= 128) summary += '...';
                let prefix = '';
                let title = '';
                // Format headings with # prefix (markdown-style: h2 -> '##')
                const level = parseInt(tagName.replace('h', ''));
                if (!isNaN(level)) {
                    prefix = '#'.repeat(level);
                    title = elem.innerText || elemId || elemClasses;
                } else {
                    // For other elements, create breadcrumb path of up to 5 ancestors,
                    // blanking out generic wrappers (div/span/p/body/html)
                    const parents = [tagName];
                    let node = elem.parentNode;
                    while (node && parents.length < 5) {
                        if (node.tagName) {
                            const tag = node.tagName.toLowerCase();
                            if (!['div', 'span', 'p', 'body', 'html'].includes(tag)) {
                                parents.unshift(tag);
                            } else {
                                parents.unshift('');
                            }
                        }
                        node = node.parentNode;
                    }
                    prefix = parents.join('>');
                    title = elemId ? `#${elemId}` : '';
                    if (!title && elemClasses) title = `.${elemClasses}`;
                    if (action) title += ` /${action}`;
                    if (summary && !title.includes(summary)) title += `: ${summary}`;
                }
                // Clean up title: collapse whitespace runs
                title = title.replace(/\s+/g, ' ').trim();
                if (prefix) {
                    headings.push(`${prefix} ${title}`);
                }
            });
            return headings;
        });
        // Get iframe tree as '>'-indented URLs (depth = number of '>' chars)
        const iframes = [];
        function dumpFrameTree(frame, indent = '>') {
            iframes.push(indent + frame.url());
            for (const child of frame.childFrames()) {
                dumpFrameTree(child, indent + '>');
            }
        }
        dumpFrameTree(page.mainFrame(), '');
        const accessibilityData = {
            url,
            headings: outline,
            iframes,
            tree: accessibilityTree,
        };
        // Write output
        fs.writeFileSync(outputPath, JSON.stringify(accessibilityData, null, 2));
        return { success: true, output: outputPath, accessibilityData };
    } catch (e) {
        return { success: false, error: `${e.name}: ${e.message}` };
    } finally {
        // Disconnect (not close) so the shared Chrome session stays alive
        if (browser) {
            browser.disconnect();
        }
    }
}
/**
 * CLI entry point for the accessibility hook.
 *
 * Parses --url/--snapshot-id, honors the ACCESSIBILITY_ENABLED toggle, waits
 * for the shared Chrome tab to finish loading (chrome_navigate must have run
 * first), runs the extraction, then emits exactly one ArchiveResult JSONL
 * record on stdout. Exit code: 0 on success/skip, 1 on failure.
 *
 * Fix: removed `startTs`/`endTs` locals — both were assigned but never read.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if enabled
    if (!getEnvBool('ACCESSIBILITY_ENABLED', true)) {
      console.log('Skipping accessibility (ACCESSIBILITY_ENABLED=False)');
      // Output clean JSONL (no RESULT_JSON= prefix)
      console.log(JSON.stringify({
        type: 'ArchiveResult',
        status: 'skipped',
        output_str: 'ACCESSIBILITY_ENABLED=False',
      }));
      process.exit(0);
    }
    // Check if Chrome session exists, then wait for page load
    assertChromeSession();
    const pageLoaded = await waitForChromeTabLoaded(60000);
    if (!pageLoaded) {
      throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
    }
    const result = await extractAccessibility(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const headingCount = result.accessibilityData.headings.length;
      const iframeCount = result.accessibilityData.iframes.length;
      console.log(`Accessibility extracted: ${headingCount} headings, ${iframeCount} iframes`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  if (error) console.error(`ERROR: ${error}`);
  // Output clean JSONL (no RESULT_JSON= prefix)
  console.log(JSON.stringify({
    type: 'ArchiveResult',
    status,
    output_str: output || error || '',
  }));
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Entry point: surface any unhandled rejection from main() as a fatal error.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -1 +0,0 @@
<span class="abx-output-icon abx-output-icon--accessibility" title="Accessibility"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="4.5" r="2" fill="currentColor" stroke="none"/><path d="M4 7.5h16"/><path d="M12 7.5v12"/><path d="M7 20l5-6 5 6"/></svg></span>

View File

@@ -1,195 +0,0 @@
"""
Tests for the accessibility plugin.
Tests the real accessibility hook with an actual URL to verify
accessibility tree and page outline extraction.
"""
import json
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
from django.test import TestCase
# Import chrome test helpers
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
from chrome_test_helpers import (
chrome_session,
get_test_env,
get_plugin_dir,
get_hook_script,
)
def chrome_available() -> bool:
    """Return True if any known Chrome/Chromium binary is on the PATH."""
    candidates = ('chromium', 'chromium-browser', 'google-chrome', 'chrome')
    return any(shutil.which(candidate) for candidate in candidates)
# Get the path to the accessibility hook
PLUGIN_DIR = get_plugin_dir(__file__)
ACCESSIBILITY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_accessibility.*')
class TestAccessibilityPlugin(TestCase):
    """Basic sanity checks for the accessibility plugin's hook discovery."""

    def test_accessibility_hook_exists(self):
        """The accessibility hook script must be present in the plugin directory."""
        hook = ACCESSIBILITY_HOOK
        self.assertIsNotNone(hook, "Accessibility hook not found in plugin directory")
        self.assertTrue(hook.exists(), f"Hook not found: {ACCESSIBILITY_HOOK}")
class TestAccessibilityWithChrome(TestCase):
    """Integration tests for accessibility plugin with Chrome.

    These tests invoke the real hook script via subprocess; the first test
    also boots a live Chrome session through chrome_test_helpers.

    Fix: removed a no-op ``try: ... except RuntimeError: raise`` wrapper
    around the chrome_session block — re-raising unchanged is the default
    behavior, so the wrapper only added indentation.
    """

    def setUp(self):
        """Create a scratch directory for hook output files."""
        self.temp_dir = Path(tempfile.mkdtemp())

    def tearDown(self):
        """Remove the scratch directory (best-effort)."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_accessibility_extracts_page_outline(self):
        """Accessibility hook should extract headings and accessibility tree."""
        test_url = 'https://example.com'
        snapshot_id = 'test-accessibility-snapshot'
        with chrome_session(
            self.temp_dir,
            crawl_id='test-accessibility-crawl',
            snapshot_id=snapshot_id,
            test_url=test_url,
            navigate=True,
            timeout=30,
        ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
            # Use the environment from chrome_session (already has CHROME_HEADLESS=true)
            # Run accessibility hook with the active Chrome session
            result = subprocess.run(
                ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                cwd=str(snapshot_chrome_dir),
                capture_output=True,
                text=True,
                timeout=60,
                env=env
            )
            # Check for output file
            accessibility_output = snapshot_chrome_dir / 'accessibility.json'
            accessibility_data = None
            # Try parsing from file first
            if accessibility_output.exists():
                with open(accessibility_output) as f:
                    try:
                        accessibility_data = json.load(f)
                    except json.JSONDecodeError:
                        pass
            # Verify hook ran successfully
            self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
            self.assertNotIn('Traceback', result.stderr)
            # example.com has headings, so we should get accessibility data
            self.assertIsNotNone(accessibility_data, "No accessibility data was generated")
            # Verify we got page outline data
            self.assertIn('headings', accessibility_data, f"Missing headings: {accessibility_data}")
            self.assertIn('url', accessibility_data, f"Missing url: {accessibility_data}")

    def test_accessibility_disabled_skips(self):
        """Test that ACCESSIBILITY_ENABLED=False skips without error."""
        test_url = 'https://example.com'
        snapshot_id = 'test-disabled'
        env = get_test_env()
        env['ACCESSIBILITY_ENABLED'] = 'False'
        result = subprocess.run(
            ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
            cwd=str(self.temp_dir),
            capture_output=True,
            text=True,
            timeout=30,
            env=env
        )
        # Should exit 0 even when disabled
        self.assertEqual(result.returncode, 0, f"Should succeed when disabled: {result.stderr}")
        # Should NOT create output file when disabled
        accessibility_output = self.temp_dir / 'accessibility.json'
        self.assertFalse(accessibility_output.exists(), "Should not create file when disabled")

    def test_accessibility_missing_url_argument(self):
        """Test that missing --url argument causes error."""
        snapshot_id = 'test-missing-url'
        result = subprocess.run(
            ['node', str(ACCESSIBILITY_HOOK), f'--snapshot-id={snapshot_id}'],
            cwd=str(self.temp_dir),
            capture_output=True,
            text=True,
            timeout=30,
            env=get_test_env()
        )
        # Should fail with non-zero exit code
        self.assertNotEqual(result.returncode, 0, "Should fail when URL missing")

    def test_accessibility_missing_snapshot_id_argument(self):
        """Test that missing --snapshot-id argument causes error."""
        test_url = 'https://example.com'
        result = subprocess.run(
            ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}'],
            cwd=str(self.temp_dir),
            capture_output=True,
            text=True,
            timeout=30,
            env=get_test_env()
        )
        # Should fail with non-zero exit code
        self.assertNotEqual(result.returncode, 0, "Should fail when snapshot-id missing")

    def test_accessibility_with_no_chrome_session(self):
        """Test that hook fails gracefully when no Chrome session exists."""
        test_url = 'https://example.com'
        snapshot_id = 'test-no-chrome'
        result = subprocess.run(
            ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
            cwd=str(self.temp_dir),
            capture_output=True,
            text=True,
            timeout=30,
            env=get_test_env()
        )
        # Should fail when no Chrome session
        self.assertNotEqual(result.returncode, 0, "Should fail when no Chrome session exists")
        # Error should mention CDP or Chrome
        err_lower = result.stderr.lower()
        self.assertTrue(
            any(x in err_lower for x in ['chrome', 'cdp', 'cannot find', 'puppeteer']),
            f"Should mention Chrome/CDP in error: {result.stderr}"
        )
# Allow running this test module directly (outside the project test runner).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -1,83 +0,0 @@
#!/usr/bin/env python3
"""
Install a binary using apt package manager.
Usage: on_Binary__install_using_apt_provider.py --binary-id=<uuid> --machine-id=<uuid> --name=<name>
Output: Binary JSONL record to stdout after installation
"""
import json
import sys
import rich_click as click
from abx_pkg import Binary, AptProvider, BinProviderOverrides
# Fix pydantic forward reference issue
AptProvider.model_rebuild()
@click.command()
@click.option('--binary-id', required=True, help="Binary UUID")
@click.option('--machine-id', required=True, help="Machine UUID")
@click.option('--name', required=True, help="Binary name to install")
@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)")
@click.option('--overrides', default=None, help="JSON-encoded overrides dict")
def main(binary_id: str, machine_id: str, name: str, binproviders: str, overrides: str | None):
    """Install binary using apt package manager.

    Emits a Binary JSONL record on stdout on success; human-readable progress
    goes to stderr. Exit codes: 0 = installed (or provider skipped),
    1 = apt unavailable or install failed.
    """
    # Respect the allowed-provider filter: skipping is not an error, so exit 0.
    apt_allowed = binproviders == '*' or 'apt' in binproviders.split(',')
    if not apt_allowed:
        click.echo(f"apt provider not allowed for {name}", err=True)
        sys.exit(0)  # Not an error, just skip

    # Use abx-pkg AptProvider to install binary
    provider = AptProvider()
    if not provider.INSTALLER_BIN:
        click.echo("apt not available on this system", err=True)
        sys.exit(1)

    click.echo(f"Installing {name} via apt...", err=True)
    try:
        # Parse overrides if provided; only the apt-specific sub-dict is used.
        # Malformed JSON is ignored with a warning (best-effort).
        apt_overrides = None
        if overrides:
            try:
                apt_overrides = json.loads(overrides).get('apt', {})
                click.echo(f"Using apt install overrides: {apt_overrides}", err=True)
            except json.JSONDecodeError:
                click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)
        binary = Binary(
            name=name,
            binproviders=[provider],
            overrides={'apt': apt_overrides} if apt_overrides else {},
        ).install()
    except Exception as e:
        click.echo(f"apt install failed: {e}", err=True)
        sys.exit(1)

    if not binary.abspath:
        click.echo(f"{name} not found after apt install", err=True)
        sys.exit(1)

    # Output Binary JSONL record to stdout (the only thing written to stdout)
    record = {
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'apt',
        'machine_id': machine_id,
        'binary_id': binary_id,
    }
    print(json.dumps(record))
    # Log human-readable info to stderr
    click.echo(f"Installed {name} at {binary.abspath}", err=True)
    click.echo(f"  version: {binary.version}", err=True)
    sys.exit(0)
# Standard click entry point: run the installer when invoked as a script.
if __name__ == '__main__':
    main()

View File

@@ -1,154 +0,0 @@
"""
Tests for the apt binary provider plugin.
Tests cover:
1. Hook script execution
2. apt package availability detection
3. JSONL output format
"""
import json
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
from django.test import TestCase
# Get the path to the apt provider hook
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_apt_install.py'), None)
def apt_available() -> bool:
    """Return True if either apt or apt-get can be found on the PATH."""
    return any(shutil.which(tool) is not None for tool in ('apt', 'apt-get'))
def is_linux() -> bool:
    """Return True when the current host OS reports itself as Linux."""
    import platform
    system_name = platform.system()
    return system_name.lower() == 'linux'
class TestAptProviderHook(TestCase):
    """Test the apt binary provider installation hook.

    Each test shells out to the real hook script (INSTALL_HOOK) with the
    current Python interpreter and inspects its exit code / stderr / stdout.
    """

    def setUp(self):
        """Set up test environment."""
        # Scratch directory (string path; tearDown removes it best-effort).
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Clean up."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_hook_script_exists(self):
        """Hook script should exist."""
        # INSTALL_HOOK is resolved at module import via glob and may be None.
        self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")

    def test_hook_skips_when_apt_not_allowed(self):
        """Hook should skip when apt not in allowed binproviders."""
        result = subprocess.run(
            [
                sys.executable, str(INSTALL_HOOK),
                '--name=wget',
                '--binary-id=test-uuid',
                '--machine-id=test-machine',
                '--binproviders=pip,npm',  # apt not allowed
            ],
            capture_output=True,
            text=True,
            timeout=30
        )
        # Should exit cleanly (code 0) when apt not allowed
        self.assertIn('apt provider not allowed', result.stderr)
        self.assertEqual(result.returncode, 0)

    @pytest.mark.skipif(not is_linux(), reason="apt only available on Linux")
    def test_hook_detects_apt(self):
        """Hook should detect apt binary when available."""
        assert apt_available(), "apt not installed"
        # Deliberately use a package name that cannot exist: we only care that
        # the hook got past the "apt not available" check, not that it installs.
        result = subprocess.run(
            [
                sys.executable, str(INSTALL_HOOK),
                '--name=nonexistent-pkg-xyz123',
                '--binary-id=test-uuid',
                '--machine-id=test-machine',
            ],
            capture_output=True,
            text=True,
            timeout=30
        )
        # Should not say apt is not available
        self.assertNotIn('apt not available', result.stderr)

    def test_hook_handles_overrides(self):
        """Hook should accept overrides JSON."""
        # Overrides are keyed by provider name; the hook extracts the 'apt' entry.
        overrides = json.dumps({
            'apt': {'packages': ['custom-package-name']}
        })
        result = subprocess.run(
            [
                sys.executable, str(INSTALL_HOOK),
                '--name=test-pkg',
                '--binary-id=test-uuid',
                '--machine-id=test-machine',
                f'--overrides={overrides}',
            ],
            capture_output=True,
            text=True,
            timeout=30
        )
        # Should not crash parsing overrides
        self.assertNotIn('Traceback', result.stderr)
@pytest.mark.skipif(not is_linux(), reason="apt only available on Linux")
class TestAptProviderSystemBinaries(TestCase):
    """Test apt provider with system binaries."""

    def test_detect_existing_binary(self):
        """apt provider should detect already-installed system binaries."""
        assert apt_available(), "apt not installed"
        # Check for a binary that's almost certainly installed (like 'ls' or 'bash')
        result = subprocess.run(
            [
                sys.executable, str(INSTALL_HOOK),
                '--name=bash',
                '--binary-id=test-uuid',
                '--machine-id=test-machine',
            ],
            capture_output=True,
            text=True,
            timeout=60
        )
        # Parse JSONL output: look for the Binary record the hook prints on success.
        for line in result.stdout.split('\n'):
            line = line.strip()
            if line.startswith('{'):
                try:
                    record = json.loads(line)
                    if record.get('type') == 'Binary' and record.get('name') == 'bash':
                        # Found bash: its reported abspath must exist on disk.
                        self.assertTrue(record.get('abspath'))
                        self.assertTrue(Path(record['abspath']).exists())
                        return
                except json.JSONDecodeError:
                    continue
        # apt may not be able to "install" bash (already installed)
        # Just verify no crash
        self.assertNotIn('Traceback', result.stderr)
# Allow running this test module directly (outside the project test runner).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -1,26 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"ARCHIVEDOTORG_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_ARCHIVEDOTORG", "USE_ARCHIVEDOTORG", "SUBMIT_ARCHIVEDOTORG"],
"description": "Submit URLs to archive.org Wayback Machine"
},
"ARCHIVEDOTORG_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Timeout for archive.org submission in seconds"
},
"ARCHIVEDOTORG_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string"
}
}
}

View File

@@ -1,154 +0,0 @@
#!/usr/bin/env python3
"""
Submit a URL to archive.org for archiving.
Usage: on_Snapshot__archivedotorg.bg.py --url=<url> --snapshot-id=<uuid>
Output: Writes archive.org.txt to $PWD with the archived URL
Environment variables:
ARCHIVEDOTORG_TIMEOUT: Timeout in seconds (default: 60)
USER_AGENT: User agent string
# Fallback to ARCHIVING_CONFIG values if ARCHIVEDOTORG_* not set:
TIMEOUT: Fallback timeout
Note: This extractor uses the 'requests' library which is bundled with ArchiveBox.
It can run standalone if requests is installed: pip install requests
"""
import json
import os
import sys
from pathlib import Path
import rich_click as click
# Extractor metadata
PLUGIN_NAME = 'archivedotorg'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'archive.org.txt'
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable, stripping surrounding whitespace.

    Note: the default is stripped too, matching the original behavior.
    """
    value = os.environ.get(name, default)
    return value.strip()
def get_env_int(name: str, default: int = 0) -> int:
    """Read an integer environment variable, returning *default* on parse failure."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
    """
    Submit URL to archive.org Wayback Machine.

    Writes the resulting archive URL (or, as a manual-retry fallback, the
    submit URL itself) to OUTPUT_FILE in the current working directory.

    Returns: (success, output_path, error_message)
    """
    def log(message: str) -> None:
        # Human-readable progress goes to stderr; stdout is reserved for JSONL.
        print(f'[archivedotorg] {message}', file=sys.stderr)

    try:
        import requests
    except ImportError:
        return False, None, 'requests library not installed'

    # ARCHIVEDOTORG_TIMEOUT wins; fall back to the global TIMEOUT, then 60s.
    timeout = get_env_int('ARCHIVEDOTORG_TIMEOUT') or get_env_int('TIMEOUT', 60)
    user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')

    submit_url = f'https://web.archive.org/save/{url}'
    log(f'Submitting to Wayback Machine (timeout={timeout}s)')
    log(f'GET {submit_url}')
    try:
        response = requests.get(
            submit_url,
            timeout=timeout,
            headers={'User-Agent': user_agent},
            allow_redirects=True,
        )
        log(f'HTTP {response.status_code} final_url={response.url}')
        # Check for successful archive
        # These response headers indicate the save was processed.
        content_location = response.headers.get('Content-Location', '')
        x_archive_orig_url = response.headers.get('X-Archive-Orig-Url', '')
        if content_location:
            log(f'Content-Location: {content_location}')
        if x_archive_orig_url:
            log(f'X-Archive-Orig-Url: {x_archive_orig_url}')
        # Build archive URL
        if content_location:
            # NOTE(review): assumes Content-Location is a root-relative path
            # (e.g. /web/...) — confirm it is never an absolute URL.
            archive_url = f'https://web.archive.org{content_location}'
            Path(OUTPUT_FILE).write_text(archive_url, encoding='utf-8')
            log(f'Saved archive URL -> {archive_url}')
            return True, OUTPUT_FILE, ''
        elif 'web.archive.org' in response.url:
            # We were redirected to an archive page
            Path(OUTPUT_FILE).write_text(response.url, encoding='utf-8')
            log(f'Redirected to archive page -> {response.url}')
            return True, OUTPUT_FILE, ''
        else:
            # Check for errors in response
            if 'RobotAccessControlException' in response.text:
                # Blocked by robots.txt - save submit URL for manual retry
                Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
                log('Blocked by robots.txt, saved submit URL for manual retry')
                return True, OUTPUT_FILE, ''  # Consider this a soft success
            elif response.status_code >= 400:
                return False, None, f'HTTP {response.status_code}'
            else:
                # Save submit URL anyway
                Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
                log('No archive URL returned, saved submit URL for manual retry')
                return True, OUTPUT_FILE, ''
    except requests.Timeout:
        return False, None, f'Request timed out after {timeout} seconds'
    except requests.RequestException as e:
        # Network-level failures (DNS, connection reset, TLS, ...).
        return False, None, f'{type(e).__name__}: {e}'
    except Exception as e:
        # Defensive catch-all so the hook reports an error instead of crashing.
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to submit to archive.org')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Submit a URL to archive.org for archiving.

    Contract: on success, print exactly one ArchiveResult JSONL record to
    stdout and exit 0. On a transient error (network/timeout/HTTP), print
    nothing to stdout and exit 1 so the orchestrator retries later. When the
    feature is disabled, exit 0 with no JSONL.
    """
    # Check if feature is enabled
    enabled = get_env('ARCHIVEDOTORG_ENABLED', 'True').lower() not in ('false', '0', 'no', 'off')
    if not enabled:
        print('Skipping archive.org submission (ARCHIVEDOTORG_ENABLED=False)', file=sys.stderr)
        # Temporary failure (config disabled) - NO JSONL emission
        sys.exit(0)

    try:
        # Run extraction
        success, output, error = submit_to_archivedotorg(url)
        if success:
            # Success - emit ArchiveResult with output file
            print(json.dumps({
                'type': 'ArchiveResult',
                'status': 'succeeded',
                'output_str': output or '',
            }))
            sys.exit(0)
        # Transient error (network, timeout, HTTP error) - emit NO JSONL
        # System will retry later
        print(f'ERROR: {error}', file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Unexpected error - also transient, emit NO JSONL
        print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
        sys.exit(1)
# Standard click entry point: run the submitter when invoked as a script.
if __name__ == '__main__':
    main()

View File

@@ -1,12 +0,0 @@
{% load config_tags %}
{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
{# Render the preview only when the archivedotorg extractor is enabled. #}
{% if enabled %}
<!-- Archive.org thumbnail - iframe preview of archived page -->
{# The iframe is display-only: pointer-events disabled, lazy-loaded, and sandboxed
   to same-origin with no scripts. output_path is supplied by the including view —
   presumably the extractor's saved output; verify against the caller. #}
<div class="extractor-thumbnail archivedotorg-thumbnail" style="width: 100%; height: 100px; overflow: hidden;">
<iframe src="{{ output_path }}"
style="width: 100%; height: 100px; border: none; pointer-events: none;"
loading="lazy"
sandbox="allow-same-origin">
</iframe>
</div>
{% endif %}

View File

@@ -1 +0,0 @@
<span class="abx-output-icon abx-output-icon--archivedotorg" title="Archive.org"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M3 7h18"/><rect x="3" y="7" width="18" height="13" rx="2"/><path d="M9 12h6"/></svg></span>

View File

@@ -1,93 +0,0 @@
"""
Integration tests for archivedotorg plugin
Tests verify standalone archive.org extractor execution.
"""
import json
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archivedotorg.*'), None)
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """The archivedotorg hook script must be discoverable in the plugin dir.

    ARCHIVEDOTORG_HOOK is resolved via ``next(glob, None)`` at import time;
    the previous assertion called ``.exists()`` directly, which raised an
    AttributeError (instead of a clean assertion failure) when the glob
    matched nothing. Guard against None explicitly.
    """
    assert ARCHIVEDOTORG_HOOK is not None, 'archivedotorg hook script not found in plugin dir'
    assert ARCHIVEDOTORG_HOOK.exists()
def test_submits_to_archivedotorg():
    """Run the real hook against archive.org and validate its JSONL contract."""
    with tempfile.TemporaryDirectory() as tmpdir:
        result = subprocess.run(
            [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir, capture_output=True, text=True, timeout=60
        )
        # Live-network test: success (0) and transient failure (1) are both acceptable.
        assert result.returncode in (0, 1)
        # Parse clean JSONL output
        result_json = None
        for line in result.stdout.strip().split('\n'):
            line = line.strip()
            if line.startswith('{'):
                try:
                    record = json.loads(line)
                    if record.get('type') == 'ArchiveResult':
                        result_json = record
                        break
                except json.JSONDecodeError:
                    pass
        if result.returncode == 0:
            # Success - should have ArchiveResult
            assert result_json, "Should have ArchiveResult JSONL output on success"
            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
        else:
            # Transient error - no JSONL output, just stderr
            assert not result_json, "Should NOT emit JSONL on transient error"
            assert result.stderr, "Should have error message in stderr"
def test_config_save_archivedotorg_false_skips():
    """ARCHIVEDOTORG_ENABLED=False must exit 0 without emitting any JSONL."""
    with tempfile.TemporaryDirectory() as tmpdir:
        import os
        env = os.environ.copy()
        env['ARCHIVEDOTORG_ENABLED'] = 'False'
        result = subprocess.run(
            [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=tmpdir, capture_output=True, text=True, env=env, timeout=30
        )
        assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
        # Feature disabled - temporary failure, should NOT emit JSONL
        assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
        # Should NOT emit any JSONL
        jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
        assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
def test_handles_timeout():
    """A 1-second timeout must not hang the hook and must follow the no-JSONL-on-error rule."""
    with tempfile.TemporaryDirectory() as tmpdir:
        import os
        env = os.environ.copy()
        # TIMEOUT is the global fallback the hook uses when ARCHIVEDOTORG_TIMEOUT is unset.
        env['TIMEOUT'] = '1'
        result = subprocess.run(
            [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=tmpdir, capture_output=True, text=True, env=env, timeout=30
        )
        # Timeout is a transient error - should exit 1 with no JSONL
        assert result.returncode in (0, 1), "Should complete without hanging"
        # If it timed out (exit 1), should have no JSONL output
        if result.returncode == 1:
            jsonl_lines = [line for line in result.stdout.strip().split('\n')
                           if line.strip().startswith('{')]
            assert len(jsonl_lines) == 0, "Should not emit JSONL on timeout (transient error)"
# Allow running this test module directly (outside the project test runner).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -1,87 +0,0 @@
#!/usr/bin/env python3
"""
Install a binary using Homebrew package manager.
Usage: on_Binary__install_using_brew_provider.py --binary-id=<uuid> --machine-id=<uuid> --name=<name> [--custom-cmd=<cmd>]
Output: Binary JSONL record to stdout after installation
Environment variables:
MACHINE_ID: Machine UUID (set by orchestrator)
"""
import json
import os
import sys
import rich_click as click
from abx_pkg import Binary, BrewProvider, BinProviderOverrides
# Fix pydantic forward reference issue
BrewProvider.model_rebuild()
@click.command()
@click.option('--machine-id', required=True, help="Machine UUID")
@click.option('--binary-id', required=True, help="Dependency UUID")
@click.option('--name', required=True, help="Binary name to install")
@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command")
@click.option('--overrides', default=None, help="JSON-encoded overrides dict")
def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_cmd: str | None, overrides: str | None):
    """Install binary using Homebrew.

    Emits a Binary JSONL record on stdout on success; human-readable progress
    goes to stderr. Exit codes: 0 = installed (or provider skipped),
    1 = brew unavailable or install failed.

    Note: --custom-cmd is accepted for interface compatibility but is not
    currently used by this hook.

    Fix: the required --machine-id CLI value was previously overwritten
    unconditionally by os.environ['MACHINE_ID'] (possibly empty); the CLI
    value now takes precedence, with the environment as fallback.
    """
    # Respect the allowed-provider filter: skipping is not an error, so exit 0.
    if binproviders != '*' and 'brew' not in binproviders.split(','):
        click.echo(f"brew provider not allowed for {name}", err=True)
        sys.exit(0)

    # Use abx-pkg BrewProvider to install binary
    provider = BrewProvider()
    if not provider.INSTALLER_BIN:
        click.echo("brew not available on this system", err=True)
        sys.exit(1)

    click.echo(f"Installing {name} via brew...", err=True)
    try:
        # Parse overrides if provided (malformed JSON is ignored with a warning)
        overrides_dict = None
        if overrides:
            try:
                overrides_dict = json.loads(overrides)
                click.echo(f"Using custom install overrides: {overrides_dict}", err=True)
            except json.JSONDecodeError:
                click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True)
        binary = Binary(name=name, binproviders=[provider], overrides=overrides_dict or {}).install()
    except Exception as e:
        click.echo(f"brew install failed: {e}", err=True)
        sys.exit(1)

    if not binary.abspath:
        click.echo(f"{name} not found after brew install", err=True)
        sys.exit(1)

    # Prefer the explicit CLI value; fall back to the orchestrator-set env var.
    machine_id = machine_id or os.environ.get('MACHINE_ID', '')

    # Output Binary JSONL record to stdout
    record = {
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'brew',
        'machine_id': machine_id,
        'binary_id': binary_id,
    }
    print(json.dumps(record))
    # Log human-readable info to stderr
    click.echo(f"Installed {name} at {binary.abspath}", err=True)
    click.echo(f"  version: {binary.version}", err=True)
    sys.exit(0)
# Standard click entry point: run the installer when invoked as a script.
if __name__ == '__main__':
    main()

File diff suppressed because it is too large Load Diff

View File

@@ -1,157 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"CHROME_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_CHROME"],
"description": "Enable Chromium browser integration for archiving"
},
"CHROME_BINARY": {
"type": "string",
"default": "chromium",
"x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"],
"description": "Path to Chromium binary"
},
"CHROME_NODE_BINARY": {
"type": "string",
"default": "node",
"x-fallback": "NODE_BINARY",
"description": "Path to Node.js binary (for Puppeteer)"
},
"CHROME_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for Chrome operations in seconds"
},
"CHROME_HEADLESS": {
"type": "boolean",
"default": true,
"description": "Run Chrome in headless mode"
},
"CHROME_SANDBOX": {
"type": "boolean",
"default": true,
"description": "Enable Chrome sandbox (disable in Docker with --no-sandbox)"
},
"CHROME_RESOLUTION": {
"type": "string",
"default": "1440,2000",
"pattern": "^\\d+,\\d+$",
"x-fallback": "RESOLUTION",
"description": "Browser viewport resolution (width,height)"
},
"CHROME_USER_DATA_DIR": {
"type": "string",
"default": "",
"description": "Path to Chrome user data directory for persistent sessions (derived from ACTIVE_PERSONA if not set)"
},
"CHROME_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string for Chrome"
},
"CHROME_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [
"--no-first-run",
"--no-default-browser-check",
"--disable-default-apps",
"--disable-sync",
"--disable-infobars",
"--disable-blink-features=AutomationControlled",
"--disable-component-update",
"--disable-domain-reliability",
"--disable-breakpad",
"--disable-client-side-phishing-detection",
"--disable-hang-monitor",
"--disable-speech-synthesis-api",
"--disable-speech-api",
"--disable-print-preview",
"--disable-notifications",
"--disable-desktop-notifications",
"--disable-popup-blocking",
"--disable-prompt-on-repost",
"--disable-external-intent-requests",
"--disable-session-crashed-bubble",
"--disable-search-engine-choice-screen",
"--disable-datasaver-prompt",
"--ash-no-nudges",
"--hide-crash-restore-bubble",
"--suppress-message-center-popups",
"--noerrdialogs",
"--no-pings",
"--silent-debugger-extension-api",
"--deny-permission-prompts",
"--safebrowsing-disable-auto-update",
"--metrics-recording-only",
"--password-store=basic",
"--use-mock-keychain",
"--disable-cookie-encryption",
"--font-render-hinting=none",
"--force-color-profile=srgb",
"--disable-partial-raster",
"--disable-skia-runtime-opts",
"--disable-2d-canvas-clip-aa",
"--enable-webgl",
"--hide-scrollbars",
"--export-tagged-pdf",
"--generate-pdf-document-outline",
"--disable-lazy-loading",
"--disable-renderer-backgrounding",
"--disable-background-networking",
"--disable-background-timer-throttling",
"--disable-backgrounding-occluded-windows",
"--disable-ipc-flooding-protection",
"--disable-extensions-http-throttling",
"--disable-field-trial-config",
"--disable-back-forward-cache",
"--autoplay-policy=no-user-gesture-required",
"--disable-gesture-requirement-for-media-playback",
"--lang=en-US,en;q=0.9",
"--log-level=2",
"--enable-logging=stderr"
],
"x-aliases": ["CHROME_DEFAULT_ARGS"],
"description": "Default Chrome command-line arguments (static flags only, dynamic args like --user-data-dir are added at runtime)"
},
"CHROME_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["CHROME_EXTRA_ARGS"],
"description": "Extra arguments to append to Chrome command (for user customization)"
},
"CHROME_PAGELOAD_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "CHROME_TIMEOUT",
"description": "Timeout for page navigation/load in seconds"
},
"CHROME_WAIT_FOR": {
"type": "string",
"default": "networkidle2",
"enum": ["domcontentloaded", "load", "networkidle0", "networkidle2"],
"description": "Page load completion condition (domcontentloaded, load, networkidle0, networkidle2)"
},
"CHROME_DELAY_AFTER_LOAD": {
"type": "number",
"default": 0,
"minimum": 0,
"description": "Extra delay in seconds after page load completes before archiving (useful for JS-heavy SPAs)"
},
"CHROME_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates (disable for self-signed certs)"
}
}
}

View File

@@ -1,254 +0,0 @@
#!/usr/bin/env node
/**
* Extract cookies from Chrome via CDP and write to Netscape cookies.txt format.
*
* This script launches Chrome with a given user data directory, connects via CDP,
* extracts all cookies, and writes them to a cookies.txt file in Netscape format.
*
* Usage:
* CHROME_USER_DATA_DIR=/path/to/profile COOKIES_OUTPUT_FILE=/path/to/cookies.txt node extract_cookies.js
*
* Environment variables:
* CHROME_USER_DATA_DIR: Path to Chrome user data directory (required)
* COOKIES_OUTPUT_FILE: Path to output cookies.txt file (required)
* CHROME_HEADLESS: Run in headless mode (default: true)
* NODE_MODULES_DIR: Path to node_modules for module resolution
*/
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) {
module.paths.unshift(process.env.NODE_MODULES_DIR);
}
const fs = require('fs');
const path = require('path');
const {
findAnyChromiumBinary,
launchChromium,
killChrome,
getEnv,
} = require('./chrome_utils.js');
/**
 * Serialize one CDP cookie object into a single Netscape cookies.txt line.
 *
 * Columns (tab-separated): domain, includeSubdomains, path, secure, expiry,
 * name, value. Session cookies are written with an expiry of 0.
 *
 * @param {Object} cookie - Cookie object as returned by Network.getAllCookies
 * @returns {string} - Netscape format cookie line
 */
function cookieToNetscape(cookie) {
  // Domain cookies (non host-only) carry a leading dot.
  // NOTE(review): CDP's Network.Cookie may not expose `hostOnly`; when the
  // field is absent every bare domain gets a '.' prefix — confirm intended.
  const dotted = cookie.domain.startsWith('.') || cookie.hostOnly
    ? cookie.domain
    : '.' + cookie.domain;
  const fields = [
    dotted,
    // includeSubdomains mirrors the leading-dot convention.
    dotted.startsWith('.') ? 'TRUE' : 'FALSE',
    cookie.path || '/',
    cookie.secure ? 'TRUE' : 'FALSE',
    // CDP reports expiry in (possibly fractional) seconds since epoch.
    cookie.expires && cookie.expires > 0 ? String(Math.floor(cookie.expires)) : '0',
    cookie.name,
    cookie.value,
  ];
  return fields.join('\t');
}
/**
 * Write an array of CDP cookies to a Netscape-format cookies.txt file.
 *
 * @param {Array} cookies - Array of CDP cookie objects
 * @param {string} outputPath - Path to output file
 */
function writeCookiesFile(cookies, outputPath) {
  // Fixed header identifying the file format and its producer.
  const header = [
    '# Netscape HTTP Cookie File',
    '# https://curl.se/docs/http-cookies.html',
    '# This file was generated by ArchiveBox persona cookie extraction',
    '#',
    '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
    '',
  ];
  const body = cookies.map(cookieToNetscape);
  fs.writeFileSync(outputPath, header.concat(body).join('\n') + '\n');
}
/**
 * Entry point: launch Chrome on CHROME_USER_DATA_DIR, pull every cookie out
 * via CDP `Network.getAllCookies`, write them to COOKIES_OUTPUT_FILE in
 * Netscape format, then shut Chrome down and remove the temp output dir.
 * Exits non-zero on any missing prerequisite or runtime failure.
 */
async function main() {
  const userDataDir = getEnv('CHROME_USER_DATA_DIR');
  const outputFile = getEnv('COOKIES_OUTPUT_FILE');
  // Validate required environment up front so failures are immediate and clear.
  if (!userDataDir) {
    console.error('ERROR: CHROME_USER_DATA_DIR environment variable is required');
    process.exit(1);
  }
  if (!outputFile) {
    console.error('ERROR: COOKIES_OUTPUT_FILE environment variable is required');
    process.exit(1);
  }
  if (!fs.existsSync(userDataDir)) {
    console.error(`ERROR: User data directory does not exist: ${userDataDir}`);
    process.exit(1);
  }
  const binary = findAnyChromiumBinary();
  if (!binary) {
    console.error('ERROR: Chromium-based browser binary not found');
    process.exit(1);
  }
  // All progress is logged to stderr so stdout stays clean.
  console.error(`[*] Extracting cookies from: ${userDataDir}`);
  console.error(`[*] Output file: ${outputFile}`);
  console.error(`[*] Using browser: ${binary}`);
  // Create a temporary output directory for Chrome files
  const outputDir = fs.mkdtempSync(path.join(require('os').tmpdir(), 'chrome-cookies-'));
  let chromePid = null;
  try {
    // Launch Chrome with the user data directory
    const result = await launchChromium({
      binary,
      outputDir,
      userDataDir,
      headless: true,
      killZombies: false, // Don't kill other Chrome instances
    });
    if (!result.success) {
      console.error(`ERROR: Failed to launch Chrome: ${result.error}`);
      process.exit(1);
    }
    chromePid = result.pid;
    const cdpUrl = result.cdpUrl;
    const port = result.port;
    console.error(`[*] Chrome launched (PID: ${chromePid})`);
    console.error(`[*] CDP URL: ${cdpUrl}`);
    // Connect to CDP and get cookies
    const http = require('http');
    // Use CDP directly via HTTP to get all cookies:
    // query /json/list for a debuggable target, then send
    // Network.getAllCookies over its websocket.
    const getCookies = () => {
      return new Promise((resolve, reject) => {
        const req = http.request(
          {
            hostname: '127.0.0.1',
            port: port,
            path: '/json/list',
            method: 'GET',
          },
          (res) => {
            let data = '';
            res.on('data', (chunk) => (data += chunk));
            res.on('end', () => {
              try {
                const targets = JSON.parse(data);
                // Find a page target (fall back to any target at all)
                const pageTarget = targets.find(t => t.type === 'page') || targets[0];
                if (!pageTarget) {
                  reject(new Error('No page target found'));
                  return;
                }
                // Connect via WebSocket and send CDP command
                const WebSocket = require('ws');
                const ws = new WebSocket(pageTarget.webSocketDebuggerUrl);
                ws.on('open', () => {
                  ws.send(JSON.stringify({
                    id: 1,
                    method: 'Network.getAllCookies',
                  }));
                });
                ws.on('message', (message) => {
                  const response = JSON.parse(message);
                  // Only react to the reply matching our request id.
                  if (response.id === 1) {
                    ws.close();
                    if (response.result && response.result.cookies) {
                      resolve(response.result.cookies);
                    } else {
                      reject(new Error('Failed to get cookies: ' + JSON.stringify(response)));
                    }
                  }
                });
                ws.on('error', (err) => {
                  reject(err);
                });
              } catch (e) {
                reject(e);
              }
            });
          }
        );
        req.on('error', reject);
        req.end();
      });
    };
    // Wait a moment for the browser to fully initialize
    await new Promise(r => setTimeout(r, 2000));
    console.error('[*] Fetching cookies via CDP...');
    const cookies = await getCookies();
    console.error(`[+] Retrieved ${cookies.length} cookies`);
    // Write cookies to file
    writeCookiesFile(cookies, outputFile);
    console.error(`[+] Wrote cookies to: ${outputFile}`);
    // Clean up: kill Chrome first, then null the pid so the catch branch
    // doesn't double-kill.
    await killChrome(chromePid, outputDir);
    chromePid = null;
    // Remove temp directory
    fs.rmSync(outputDir, { recursive: true, force: true });
    console.error('[+] Cookie extraction complete');
    process.exit(0);
  } catch (error) {
    console.error(`ERROR: ${error.message}`);
    // Clean up on error (best effort; Chrome may already be gone)
    if (chromePid) {
      await killChrome(chromePid, outputDir);
    }
    try {
      fs.rmSync(outputDir, { recursive: true, force: true });
    } catch (e) {}
    process.exit(1);
  }
}
// Script entry point: surface any unhandled rejection as a fatal error.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -1,34 +0,0 @@
#!/usr/bin/env python3
"""
Emit Chromium Binary dependency for the crawl.
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
--load-extension and --disable-extensions-except flags, which are needed for
loading unpacked extensions in headless mode.
"""
import json
import os
import sys
def main():
    """Emit a Binary dependency record for chromium, unless Chrome is disabled.

    Honors CHROME_ENABLED (false/0/no/off => skip). On the enabled path,
    prints a single JSONL record describing the chromium binary dependency
    and its preferred binproviders, then exits 0.
    """
    # Respect CHROME_ENABLED=false/0/no/off to skip emitting the dependency.
    if os.environ.get('CHROME_ENABLED', 'true').lower() in ('false', '0', 'no', 'off'):
        sys.exit(0)

    record = {
        'type': 'Binary',
        'name': 'chromium',
        'binproviders': 'puppeteer,env',
        'overrides': {
            'puppeteer': ['chromium@latest', '--install-deps'],
        },
    }
    print(json.dumps(record))
    sys.exit(0)
# Script entry point: emit the dependency record when run directly as a hook.
if __name__ == '__main__':
    main()

View File

@@ -1,427 +0,0 @@
#!/usr/bin/env node
/**
* Launch a shared Chromium browser session for the entire crawl.
*
* This runs once per crawl and keeps Chromium alive for all snapshots to share.
* Each snapshot creates its own tab via on_Snapshot__10_chrome_tab.bg.js.
*
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
* --load-extension and --disable-extensions-except flags.
*
* Usage: on_Crawl__90_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Output: Writes to current directory (executor creates chrome/ dir):
* - cdp_url.txt: WebSocket URL for CDP connection
* - chrome.pid: Chromium process ID (for cleanup)
* - port.txt: Debug port number
* - extensions.json: Loaded extensions metadata
*
* Environment variables:
* NODE_MODULES_DIR: Path to node_modules directory for module resolution
* CHROME_BINARY: Path to Chromium binary (falls back to auto-detection)
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_HEADLESS: Run in headless mode (default: true)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_EXTENSIONS_DIR: Directory containing Chrome extensions
*/
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) {
module.paths.unshift(process.env.NODE_MODULES_DIR);
}
const fs = require('fs');
const path = require('path');
const http = require('http');
const puppeteer = require('puppeteer');
const {
findChromium,
launchChromium,
killChrome,
getEnv,
getEnvBool,
getExtensionId,
writePidWithMtime,
getExtensionsDir,
} = require('./chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'chrome_launch';
const OUTPUT_DIR = '.';
// Global state for cleanup
let chromePid = null;
let browserInstance = null;
/**
 * Parse Netscape cookies.txt content into CDP-style cookie objects.
 *
 * Understands the `#HttpOnly_` prefix, skips comments and blank lines, and
 * counts malformed lines (fewer than 7 tab-separated fields, or missing
 * name/domain) as skipped.
 *
 * @param {string} contents - Raw cookies.txt text
 * @returns {{cookies: Array, skipped: number}}
 */
function parseCookiesTxt(contents) {
  const parsed = [];
  let malformed = 0;
  for (const raw of contents.split(/\r?\n/)) {
    let entry = raw.trim();
    if (!entry) continue;
    // `#HttpOnly_` marks a real cookie flagged httpOnly; any other leading
    // '#' is a comment line.
    let httpOnly = false;
    if (entry.startsWith('#HttpOnly_')) {
      httpOnly = true;
      entry = entry.slice('#HttpOnly_'.length);
    } else if (entry.startsWith('#')) {
      continue;
    }
    const fields = entry.split('\t');
    if (fields.length < 7) {
      malformed += 1;
      continue;
    }
    const [rawDomain, rawSubdomains, rawPath, rawSecure, rawExpiry, name, value] = fields;
    if (!name || !rawDomain) {
      malformed += 1;
      continue;
    }
    // Normalize the leading dot so it agrees with the includeSubdomains flag.
    const subdomains = (rawSubdomains || '').toUpperCase() === 'TRUE';
    let domain = rawDomain;
    if (subdomains && !domain.startsWith('.')) domain = `.${domain}`;
    if (!subdomains && domain.startsWith('.')) domain = domain.slice(1);
    const cookie = {
      name,
      value,
      domain,
      path: rawPath || '/',
      secure: (rawSecure || '').toUpperCase() === 'TRUE',
      httpOnly,
    };
    // Session cookies (expiry 0 / unparseable) get no `expires` field.
    const expiry = parseInt(rawExpiry, 10);
    if (!isNaN(expiry) && expiry > 0) {
      cookie.expires = expiry;
    }
    parsed.push(cookie);
  }
  return { cookies: parsed, skipped: malformed };
}
/**
 * Import cookies from a Netscape cookies.txt file into the running browser
 * via CDP `Network.setCookies`, in chunks of 200.
 *
 * Best-effort: every failure path logs to stderr and returns rather than
 * throwing, so cookie import never aborts the crawl.
 *
 * @param {Object} browser - Connected puppeteer Browser instance
 * @param {string} cookiesFile - Path to the cookies.txt file (may be falsy)
 * @param {string} userDataDir - Profile dir; only used here for the warning
 *   that cookies won't persist without one
 */
async function importCookiesFromFile(browser, cookiesFile, userDataDir) {
  if (!cookiesFile) return;
  if (!fs.existsSync(cookiesFile)) {
    console.error(`[!] Cookies file not found: ${cookiesFile}`);
    return;
  }
  let contents = '';
  try {
    contents = fs.readFileSync(cookiesFile, 'utf-8');
  } catch (e) {
    console.error(`[!] Failed to read COOKIES_TXT_FILE: ${e.message}`);
    return;
  }
  const { cookies, skipped } = parseCookiesTxt(contents);
  if (cookies.length === 0) {
    console.error('[!] No cookies found to import');
    return;
  }
  console.error(`[*] Importing ${cookies.length} cookies from ${cookiesFile}...`);
  if (skipped) {
    console.error(`[*] Skipped ${skipped} malformed cookie line(s)`);
  }
  if (!userDataDir) {
    console.error('[!] CHROME_USER_DATA_DIR not set; cookies will not persist beyond this session');
  }
  // A throwaway page is needed to obtain a CDP session for Network.setCookies.
  const page = await browser.newPage();
  const client = await page.target().createCDPSession();
  await client.send('Network.enable');
  // Chunk the import so one bad batch doesn't lose everything.
  const chunkSize = 200;
  let imported = 0;
  for (let i = 0; i < cookies.length; i += chunkSize) {
    const chunk = cookies.slice(i, i + chunkSize);
    try {
      await client.send('Network.setCookies', { cookies: chunk });
      imported += chunk.length;
    } catch (e) {
      console.error(`[!] Failed to import cookies ${i + 1}-${i + chunk.length}: ${e.message}`);
    }
  }
  await page.close();
  console.error(`[+] Imported ${imported}/${cookies.length} cookies`);
}
/**
 * Extract the devtools debug port from a CDP websocket URL.
 *
 * @param {string} cdpUrl - e.g. ws://127.0.0.1:9222/devtools/browser/<id>
 * @returns {string|null} Port as a string, or null when not found
 */
function getPortFromCdpUrl(cdpUrl) {
  if (!cdpUrl) return null;
  const found = /:(\d+)\/devtools\//.exec(cdpUrl);
  return found ? found[1] : null;
}
/**
 * Fetch the list of debuggable targets from Chrome's devtools HTTP endpoint
 * (GET http://127.0.0.1:<port>/json/list).
 *
 * @param {string} cdpUrl - CDP websocket URL (the port is extracted from it)
 * @returns {Promise<Array>} Array of target descriptors; [] when no port can
 *   be derived or the response is not an array
 */
async function fetchDevtoolsTargets(cdpUrl) {
  const port = getPortFromCdpUrl(cdpUrl);
  if (!port) return [];
  const urlPath = '/json/list';
  return new Promise((resolve, reject) => {
    const req = http.get(
      { hostname: '127.0.0.1', port, path: urlPath },
      (res) => {
        // Accumulate the body, then parse it once the response ends.
        let data = '';
        res.on('data', (chunk) => (data += chunk));
        res.on('end', () => {
          try {
            const targets = JSON.parse(data);
            resolve(Array.isArray(targets) ? targets : []);
          } catch (e) {
            reject(e);
          }
        });
      }
    );
    req.on('error', reject);
  });
}
/**
 * Discover which custom extensions actually loaded by inspecting the
 * devtools /json/list targets, and set a `loaded` flag on each entry in
 * `installedExtensions` (mutated in place).
 *
 * @param {string} cdpUrl - CDP websocket URL of the running browser
 * @param {Array} installedExtensions - Extension metadata objects with `id`
 */
async function discoverExtensionTargets(cdpUrl, installedExtensions) {
  // Extension IDs treated as built-in/bundled; excluded so only the
  // extensions we loaded ourselves are reported.
  const builtinIds = [
    'nkeimhogjdpnpccoofpliimaahmaaome',
    'fignfifoniblkonapihmkfakmlgkbkcf',
    'ahfgeienlihckogmohjhadlkjgocpleb',
    'mhjfbmdgcfjbbpaeojofohoefgiehjai',
  ];
  let targets = [];
  // Retry up to 10 times, 500ms apart, while the browser finishes starting.
  for (let i = 0; i < 10; i += 1) {
    try {
      targets = await fetchDevtoolsTargets(cdpUrl);
      if (targets.length > 0) break;
    } catch (e) {
      // Ignore and retry
    }
    await new Promise(r => setTimeout(r, 500));
  }
  // Keep only chrome-extension:// targets whose ID is not a builtin.
  const customExtTargets = targets.filter(t => {
    const url = t.url || '';
    if (!url.startsWith('chrome-extension://')) return false;
    const extId = url.split('://')[1].split('/')[0];
    return !builtinIds.includes(extId);
  });
  console.error(`[+] Found ${customExtTargets.length} custom extension target(s) via /json/list`);
  for (const target of customExtTargets) {
    const url = target.url || '';
    const extId = url.split('://')[1].split('/')[0];
    console.error(`[+] Extension target: ${extId} (${target.type || 'unknown'})`);
  }
  // Mark each known extension as loaded iff its ID showed up at runtime.
  const runtimeIds = new Set(customExtTargets.map(t => (t.url || '').split('://')[1].split('/')[0]));
  for (const ext of installedExtensions) {
    if (ext.id) {
      ext.loaded = runtimeIds.has(ext.id);
    }
  }
  if (customExtTargets.length === 0 && installedExtensions.length > 0) {
    console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`);
    console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`);
  }
}
// Parse --key=value command-line flags into an object.
// Dashes in flag names become underscores; bare --flags become true.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [flag, ...rest] = token.slice(2).split('=');
    parsed[flag.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Cleanup handler for SIGTERM/SIGINT: gracefully close the puppeteer
// browser connection if one exists, then kill the Chrome process, then exit.
async function cleanup() {
  console.error('[*] Cleaning up Chrome session...');
  // Try graceful browser close first (flushes profile state cleanly).
  if (browserInstance) {
    try {
      console.error('[*] Closing browser gracefully...');
      await browserInstance.close();
      browserInstance = null;
      console.error('[+] Browser closed gracefully');
    } catch (e) {
      console.error(`[!] Graceful close failed: ${e.message}`);
    }
  }
  // Kill Chrome process (fallback even if graceful close failed).
  if (chromePid) {
    await killChrome(chromePid, OUTPUT_DIR);
  }
  process.exit(0);
}
// Register signal handlers: both SIGTERM (normal hook teardown) and SIGINT
// (interactive Ctrl-C) funnel through cleanup() so Chrome is always shut down.
process.on('SIGTERM', cleanup);
process.on('SIGINT', cleanup);
/**
 * Entry point: launch one shared Chromium instance for the whole crawl.
 *
 * Finds a Chromium binary, loads any cached unpacked extensions, launches
 * the browser (writing cdp_url.txt / chrome.pid / port.txt to OUTPUT_DIR),
 * optionally imports cookies from COOKIES_TXT_FILE, writes extensions.json,
 * then stays alive until SIGTERM so cleanup() can tear the browser down.
 */
async function main() {
  const args = parseArgs();
  const crawlId = args.crawl_id;
  try {
    const binary = findChromium();
    if (!binary) {
      // Machine-readable hints for the dependency-resolution layer.
      console.error('ERROR: Chromium binary not found');
      console.error('DEPENDENCY_NEEDED=chromium');
      console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
      console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest');
      process.exit(1);
    }
    // Get Chromium version (best effort; truncated to 64 chars)
    let version = '';
    try {
      const { execSync } = require('child_process');
      version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 })
        .trim()
        .slice(0, 64);
    } catch (e) {}
    console.error(`[*] Using browser: ${binary}`);
    if (version) console.error(`[*] Version: ${version}`);
    // Load installed extensions
    const extensionsDir = getExtensionsDir();
    const userDataDir = getEnv('CHROME_USER_DATA_DIR');
    const cookiesFile = getEnv('COOKIES_TXT_FILE') || getEnv('COOKIES_FILE');
    if (userDataDir) {
      console.error(`[*] Using user data dir: ${userDataDir}`);
    }
    if (cookiesFile) {
      console.error(`[*] Using cookies file: ${cookiesFile}`);
    }
    // Collect unpacked extension paths from *.extension.json cache files.
    const installedExtensions = [];
    const extensionPaths = [];
    if (fs.existsSync(extensionsDir)) {
      const files = fs.readdirSync(extensionsDir);
      for (const file of files) {
        if (file.endsWith('.extension.json')) {
          try {
            const extPath = path.join(extensionsDir, file);
            const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8'));
            if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) {
              installedExtensions.push(extData);
              extensionPaths.push(extData.unpacked_path);
              console.error(`[*] Loading extension: ${extData.name || file}`);
            }
          } catch (e) {
            console.warn(`[!] Skipping invalid extension cache: ${file}`);
          }
        }
      }
    }
    if (installedExtensions.length > 0) {
      console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
    }
    // Ensure extension IDs are available without chrome://extensions
    for (const ext of installedExtensions) {
      if (!ext.id && ext.unpacked_path) {
        try {
          ext.id = getExtensionId(ext.unpacked_path);
        } catch (e) {
          console.error(`[!] Failed to compute extension id for ${ext.name}: ${e.message}`);
        }
      }
    }
    // Note: PID file is written by run_hook() with hook-specific name
    // Snapshot.cleanup() kills all *.pid processes when done
    if (!fs.existsSync(OUTPUT_DIR)) {
      fs.mkdirSync(OUTPUT_DIR, { recursive: true });
    }
    // Launch Chromium using consolidated function
    // userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set
    const result = await launchChromium({
      binary,
      outputDir: OUTPUT_DIR,
      userDataDir,
      extensionPaths,
    });
    if (!result.success) {
      console.error(`ERROR: ${result.error}`);
      process.exit(1);
    }
    chromePid = result.pid;
    const cdpUrl = result.cdpUrl;
    // Discover extension targets at launch (no chrome://extensions)
    if (extensionPaths.length > 0) {
      await new Promise(r => setTimeout(r, 2000));
      console.error('[*] Discovering extension targets via devtools /json/list...');
      await discoverExtensionTargets(cdpUrl, installedExtensions);
    }
    // Only connect to CDP when cookies import is needed to reduce crash risk.
    if (cookiesFile) {
      console.error(`[*] Connecting puppeteer to CDP for cookie import...`);
      const browser = await puppeteer.connect({
        browserWSEndpoint: cdpUrl,
        defaultViewport: null,
      });
      browserInstance = browser;
      // Import cookies into Chrome profile at crawl start
      await importCookiesFromFile(browser, cookiesFile, userDataDir);
      try {
        browser.disconnect();
      } catch (e) {}
      browserInstance = null;
    } else {
      console.error('[*] Skipping puppeteer CDP connection (no cookies to import)');
    }
    // Write extensions metadata with actual IDs
    if (installedExtensions.length > 0) {
      fs.writeFileSync(
        path.join(OUTPUT_DIR, 'extensions.json'),
        JSON.stringify(installedExtensions, null, 2)
      );
    }
    console.error(`[+] Chromium session started for crawl ${crawlId}`);
    console.error(`[+] CDP URL: ${cdpUrl}`);
    console.error(`[+] PID: ${chromePid}`);
    // Stay alive to handle cleanup on SIGTERM
    console.log('[*] Chromium launch hook staying alive to handle cleanup...');
    setInterval(() => {}, 1000000);
  } catch (e) {
    console.error(`ERROR: ${e.name}: ${e.message}`);
    process.exit(1);
  }
}
// Script entry point: any unhandled rejection is fatal.
main().catch((e) => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -1,264 +0,0 @@
#!/usr/bin/env node
/**
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
*
* Connects to the crawl-level Chrome session (from on_Crawl__90_chrome_launch.bg.js)
* and creates a new tab. This hook does NOT launch its own Chrome instance.
*
* Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> --crawl-id=<uuid>
* Output: Creates chrome/ directory under snapshot output dir with:
* - cdp_url.txt: WebSocket URL for CDP connection
* - chrome.pid: Chrome process ID (from crawl)
* - target_id.txt: Target ID of this snapshot's tab
* - url.txt: The URL to be navigated to
*
* Environment variables:
* CRAWL_OUTPUT_DIR: Crawl output directory (to find crawl's Chrome session)
* CHROME_BINARY: Path to Chromium binary (optional, for version info)
*
* This is a background hook that stays alive until SIGTERM so the tab
* can be closed cleanly at the end of the snapshot run.
*/
const fs = require('fs');
const path = require('path');
const { execSync } = require('child_process');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer');
const { getEnv, getEnvInt } = require('./chrome_utils.js');
// Extractor metadata
const PLUGIN_NAME = 'chrome_tab';
const OUTPUT_DIR = '.'; // Hook already runs in chrome/ output directory
const CHROME_SESSION_DIR = '.';
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
let finalStatus = 'failed';
let finalOutput = '';
let finalError = '';
let cmdVersion = '';
let finalized = false;
// Parse command line arguments: turns ["--url=x", "--flag"] into
// { url: "x", flag: true }, converting dashes in names to underscores.
function parseArgs() {
  const out = {};
  for (const raw of process.argv.slice(2)) {
    if (!raw.startsWith('--')) continue;
    const body = raw.slice(2);
    const eq = body.indexOf('=');
    const key = (eq === -1 ? body : body.slice(0, eq)).replace(/-/g, '_');
    const value = eq === -1 ? '' : body.slice(eq + 1);
    // An empty value (bare flag or trailing '=') collapses to true.
    out[key] = value || true;
  }
  return out;
}
/**
 * Print the final ArchiveResult JSONL record to stdout exactly once.
 *
 * Reads the module-level finalStatus/finalOutput/finalError/cmdVersion; the
 * `finalized` guard makes repeated calls (signal handler + normal exit) a
 * no-op after the first.
 *
 * @param {string} [statusOverride] - Status to report instead of finalStatus
 */
function emitResult(statusOverride) {
  if (finalized) return;
  finalized = true;
  const status = statusOverride || finalStatus;
  let outputStr;
  if (status === 'succeeded') {
    outputStr = finalOutput;
  } else {
    // On failure prefer the error text, then whatever output exists.
    outputStr = finalError || finalOutput || '';
  }
  const record = {
    type: 'ArchiveResult',
    status,
    output_str: outputStr,
  };
  if (cmdVersion) {
    record.cmd_version = cmdVersion;
  }
  console.log(JSON.stringify(record));
}
// Cleanup handler for SIGTERM - close this snapshot's tab
/**
 * Best-effort teardown: reconnect to the shared browser via the recorded
 * cdp_url.txt, close the tab matching target_id.txt, emit the final
 * ArchiveResult record, and exit with a code reflecting finalStatus.
 *
 * @param {string} [signal] - Name of the signal that triggered cleanup
 */
async function cleanup(signal) {
  if (signal) {
    console.error(`\nReceived ${signal}, closing chrome tab...`);
  }
  try {
    const cdpFile = path.join(OUTPUT_DIR, 'cdp_url.txt');
    const targetIdFile = path.join(OUTPUT_DIR, 'target_id.txt');
    if (fs.existsSync(cdpFile) && fs.existsSync(targetIdFile)) {
      const cdpUrl = fs.readFileSync(cdpFile, 'utf8').trim();
      const targetId = fs.readFileSync(targetIdFile, 'utf8').trim();
      const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
      const pages = await browser.pages();
      // Match this snapshot's tab by its recorded target id.
      const page = pages.find(p => p.target()._targetId === targetId);
      if (page) {
        await page.close();
      }
      browser.disconnect();
    }
  } catch (e) {
    // Best effort
  }
  emitResult();
  process.exit(finalStatus === 'succeeded' ? 0 : 1);
}
// Register signal handlers
// Wrapped in arrow fns so cleanup() knows which signal fired (for logging).
process.on('SIGTERM', () => cleanup('SIGTERM'));
process.on('SIGINT', () => cleanup('SIGINT'));
// Try to find the crawl's Chrome session.
// Reads $CRAWL_OUTPUT_DIR/chrome/{cdp_url.txt,chrome.pid}, validates both,
// and verifies the PID is still alive; throws CHROME_SESSION_REQUIRED_ERROR
// on any missing or invalid piece.
function getCrawlChromeSession() {
  // Use CRAWL_OUTPUT_DIR env var set by get_config() in configset.py
  const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', '');
  if (!crawlOutputDir) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }
  const crawlChromeDir = path.join(crawlOutputDir, 'chrome');
  const cdpFile = path.join(crawlChromeDir, 'cdp_url.txt');
  const pidFile = path.join(crawlChromeDir, 'chrome.pid');
  if (!fs.existsSync(cdpFile)) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }
  if (!fs.existsSync(pidFile)) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }
  const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
  const pid = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10);
  if (!cdpUrl) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }
  if (!pid || Number.isNaN(pid)) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }
  // Verify the process is still running
  try {
    process.kill(pid, 0); // Signal 0 = check if process exists
  } catch (e) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }
  return { cdpUrl, pid };
}
/**
 * Poll for the crawl's Chrome session until it appears or the timeout lapses.
 *
 * @param {number} timeoutMs - Maximum time to wait
 * @param {number} [intervalMs=250] - Poll interval between attempts
 * @returns {Promise<{cdpUrl: string, pid: number}>}
 * @throws The most recent session error once the timeout is exceeded
 */
async function waitForCrawlChromeSession(timeoutMs, intervalMs = 250) {
  const deadline = Date.now() + timeoutMs;
  let lastError = null;
  while (Date.now() < deadline) {
    try {
      return getCrawlChromeSession();
    } catch (err) {
      lastError = err;
    }
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
  // Preserve the most informative error; fall back to the generic one.
  throw lastError || new Error(CHROME_SESSION_REQUIRED_ERROR);
}
// Create a new tab in an existing Chrome session.
/**
 * Connects puppeteer to the crawl's shared browser, opens a fresh tab for
 * this snapshot, records the session files (cdp_url.txt, chrome.pid,
 * target_id.txt, url.txt) in OUTPUT_DIR, then disconnects leaving the tab
 * open for downstream hooks.
 *
 * @param {string} cdpUrl - CDP websocket URL of the shared browser
 * @param {string} url - URL this snapshot will navigate to (recorded only)
 * @param {number} pid - Chrome process ID from the crawl session
 * @returns {Promise<Object>} { success, output, cdpUrl, targetId, pid }
 */
async function createTabInExistingChrome(cdpUrl, url, pid) {
  console.log(`[*] Connecting to existing Chrome session: ${cdpUrl}`);
  // Connect Puppeteer to the running Chrome
  const browser = await puppeteer.connect({
    browserWSEndpoint: cdpUrl,
    defaultViewport: null,
  });
  // Create a new tab for this snapshot
  const page = await browser.newPage();
  // Get the page target ID
  const target = page.target();
  const targetId = target._targetId;
  // Write session info
  fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), cdpUrl);
  fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(pid));
  fs.writeFileSync(path.join(OUTPUT_DIR, 'target_id.txt'), targetId);
  fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url);
  // Disconnect Puppeteer (Chrome and tab stay alive)
  browser.disconnect();
  return { success: true, output: OUTPUT_DIR, cdpUrl, targetId, pid };
}
/**
 * Entry point: wait for the crawl-level Chrome session, open a tab for this
 * snapshot, record the session files, then block until SIGTERM so cleanup()
 * can close the tab and emit the final ArchiveResult.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  const crawlId = args.crawl_id || getEnv('CRAWL_ID', '');
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__10_chrome_tab.bg.js --url=<url> --snapshot-id=<uuid> [--crawl-id=<uuid>]');
    process.exit(1);
  }
  let status = 'failed';
  let output = '';
  let error = '';
  let version = '';
  try {
    // Get Chrome version (best effort; truncated to 64 chars)
    try {
      const binary = getEnv('CHROME_BINARY', '').trim();
      if (binary) {
        version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64);
      }
    } catch (e) {
      version = '';
    }
    // Try to use existing crawl Chrome session (wait for readiness)
    const timeoutSeconds = getEnvInt('CHROME_TAB_TIMEOUT', getEnvInt('CHROME_TIMEOUT', getEnvInt('TIMEOUT', 60)));
    const crawlSession = await waitForCrawlChromeSession(timeoutSeconds * 1000);
    console.log(`[*] Found existing Chrome session from crawl ${crawlId}`);
    const result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      console.log(`[+] Chrome tab ready`);
      console.log(`[+] CDP URL: ${result.cdpUrl}`);
      console.log(`[+] Page target ID: ${result.targetId}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  if (error) {
    console.error(`ERROR: ${error}`);
  }
  // Stash results in module globals so emitResult()/cleanup() can report them.
  finalStatus = status;
  finalOutput = output || '';
  finalError = error || '';
  cmdVersion = version || '';
  if (status !== 'succeeded') {
    emitResult(status);
    process.exit(1);
  }
  console.log('[*] Chrome tab created, waiting for cleanup signal...');
  await new Promise(() => {}); // Keep alive until SIGTERM
}
// Script entry point: any unhandled rejection is fatal.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -1,77 +0,0 @@
#!/usr/bin/env node
/**
* Wait for Chrome session files to exist (cdp_url.txt + target_id.txt).
*
* This is a foreground hook that blocks until the Chrome tab is ready,
* so downstream hooks can safely connect to CDP.
*
* Usage: on_Snapshot__11_chrome_wait.js --url=<url> --snapshot-id=<uuid>
*/
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const {
getEnvInt,
waitForChromeSession,
readCdpUrl,
readTargetId,
} = require('./chrome_utils.js');
const CHROME_SESSION_DIR = '.';
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
// Collect --key=value / --flag arguments into an object; dashes in key
// names are normalized to underscores and bare flags become true.
function parseArgs() {
  return process.argv.slice(2).reduce((acc, arg) => {
    if (arg.startsWith('--')) {
      const [key, ...rest] = arg.slice(2).split('=');
      acc[key.replace(/-/g, '_')] = rest.join('=') || true;
    }
    return acc;
  }, {});
}
/**
 * Entry point: block until the Chrome session files (cdp_url.txt +
 * target_id.txt) exist, then emit a succeeded/failed ArchiveResult so
 * downstream hooks know CDP is safe to use.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__11_chrome_wait.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  // Timeout resolution order: CHROME_TAB_TIMEOUT > CHROME_TIMEOUT > TIMEOUT > 60s.
  const timeoutSeconds = getEnvInt('CHROME_TAB_TIMEOUT', getEnvInt('CHROME_TIMEOUT', getEnvInt('TIMEOUT', 60)));
  const timeoutMs = timeoutSeconds * 1000;
  console.error(`[chrome_wait] Waiting for Chrome session (timeout=${timeoutSeconds}s)...`);
  const ready = await waitForChromeSession(CHROME_SESSION_DIR, timeoutMs);
  if (!ready) {
    const error = CHROME_SESSION_REQUIRED_ERROR;
    console.error(`[chrome_wait] ERROR: ${error}`);
    console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error }));
    process.exit(1);
  }
  // Double-check both session files are readable and non-empty.
  const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
  const targetId = readTargetId(CHROME_SESSION_DIR);
  if (!cdpUrl || !targetId) {
    const error = CHROME_SESSION_REQUIRED_ERROR;
    console.error(`[chrome_wait] ERROR: ${error}`);
    console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error }));
    process.exit(1);
  }
  console.error(`[chrome_wait] Chrome session ready (cdp_url=${cdpUrl.slice(0, 32)}..., target_id=${targetId}).`);
  console.log(JSON.stringify({ type: 'ArchiveResult', status: 'succeeded', output_str: 'chrome session ready' }));
  process.exit(0);
}
// Script entry point: any unhandled rejection is fatal.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -1,225 +0,0 @@
#!/usr/bin/env node
/**
* Navigate the Chrome browser to the target URL.
*
* This is a simple hook that ONLY navigates - nothing else.
* Pre-load hooks (21-29) should set up their own CDP listeners.
* Post-load hooks (31+) can then read from the loaded page.
*
* Usage: on_Snapshot__30_chrome_navigate.js --url=<url> --snapshot-id=<uuid>
* Output: Writes page_loaded.txt marker when navigation completes
*
* Environment variables:
* CHROME_PAGELOAD_TIMEOUT: Timeout in seconds (default: 60)
* CHROME_DELAY_AFTER_LOAD: Extra delay after load in seconds (default: 0)
* CHROME_WAIT_FOR: Wait condition (default: networkidle2)
*/
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer');
const PLUGIN_NAME = 'chrome_navigate';
const CHROME_SESSION_DIR = '.';
const OUTPUT_DIR = '.';
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
// Parse --key=value CLI arguments (dashes in keys become underscores;
// bare --flags become true; non-flag arguments are ignored).
function parseArgs() {
  const collected = {};
  process.argv.slice(2)
    .filter((arg) => arg.startsWith('--'))
    .forEach((arg) => {
      const [name, ...pieces] = arg.slice(2).split('=');
      collected[name.replace(/-/g, '_')] = pieces.join('=') || true;
    });
  return collected;
}
// Read an environment variable as a trimmed string, with an optional default.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
// Read an environment variable as an integer; fall back to defaultValue
// when the variable is unset or not parseable.
function getEnvInt(name, defaultValue = 0) {
  const parsed = parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Read an environment variable as a float; fall back to defaultValue
// when the variable is unset or not parseable.
function getEnvFloat(name, defaultValue = 0) {
  const parsed = parseFloat(getEnv(name, String(defaultValue)));
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
/**
 * Poll until both cdp_url.txt and target_id.txt exist in the session dir
 * (the chrome_tab hook writes them once the snapshot's tab is ready).
 *
 * @param {number} [timeoutMs=60000] - Maximum time to wait
 * @returns {Promise<boolean>} true once both files appear, false on timeout
 */
async function waitForChromeTabOpen(timeoutMs = 60000) {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
  const startTime = Date.now();
  while (Date.now() - startTime < timeoutMs) {
    if (fs.existsSync(cdpFile) && fs.existsSync(targetIdFile)) {
      return true;
    }
    // Wait 100ms before checking again
    await new Promise(resolve => setTimeout(resolve, 100));
  }
  return false;
}
// Read the tab's CDP websocket URL written by the chrome_tab hook,
// or null when the session file doesn't exist yet.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  return fs.existsSync(cdpFile) ? fs.readFileSync(cdpFile, 'utf8').trim() : null;
}
// Read this snapshot's tab target id, or null when not yet written.
function getPageId() {
  const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
  return fs.existsSync(targetIdFile) ? fs.readFileSync(targetIdFile, 'utf8').trim() : null;
}
// Return a validated puppeteer waitUntil condition from CHROME_WAIT_FOR,
// defaulting to 'networkidle2' for unset or unrecognized values.
function getWaitCondition() {
  const requested = getEnv('CHROME_WAIT_FOR', 'networkidle2').toLowerCase();
  switch (requested) {
    case 'domcontentloaded':
    case 'load':
    case 'networkidle0':
    case 'networkidle2':
      return requested;
    default:
      return 'networkidle2';
  }
}
// Promise-based sleep: resolves after the given number of milliseconds.
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Connect to the snapshot's existing Chrome tab and navigate it to `url`.
 *
 * Writes navigation.json plus page_loaded.txt / final_url.txt marker files
 * on success. Never throws: errors are captured into the returned object.
 *
 * @param {string} url - URL to navigate to
 * @param {string} cdpUrl - CDP websocket URL of the shared browser
 * @returns {Promise<Object>} { success, finalUrl?, status?, error?, waitUntil, elapsed }
 */
async function navigate(url, cdpUrl) {
  // Timeout resolution: CHROME_PAGELOAD_TIMEOUT > CHROME_TIMEOUT > TIMEOUT > 60s.
  const timeout = (getEnvInt('CHROME_PAGELOAD_TIMEOUT') || getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
  const delayAfterLoad = getEnvFloat('CHROME_DELAY_AFTER_LOAD', 0) * 1000;
  const waitUntil = getWaitCondition();
  const targetId = getPageId();
  let browser = null;
  const navStartTime = Date.now();
  try {
    browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
    const pages = await browser.pages();
    if (pages.length === 0) {
      return { success: false, error: 'No pages found in browser', waitUntil, elapsed: Date.now() - navStartTime };
    }
    // Find page by target ID if available
    let page = null;
    if (targetId) {
      page = pages.find(p => {
        const target = p.target();
        return target && target._targetId === targetId;
      });
    }
    // Fall back to the most recently opened tab when the target isn't found.
    if (!page) {
      page = pages[pages.length - 1];
    }
    // Navigate
    console.log(`Navigating to ${url} (wait: ${waitUntil}, timeout: ${timeout}ms)`);
    const response = await page.goto(url, { waitUntil, timeout });
    // Optional delay (useful for JS-heavy SPAs that render after load)
    if (delayAfterLoad > 0) {
      console.log(`Waiting ${delayAfterLoad}ms after load...`);
      await sleep(delayAfterLoad);
    }
    const finalUrl = page.url();
    const status = response ? response.status() : null;
    const elapsed = Date.now() - navStartTime;
    // Write navigation state as JSON
    const navigationState = {
      waitUntil,
      elapsed,
      url,
      finalUrl,
      status,
      timestamp: new Date().toISOString()
    };
    fs.writeFileSync(path.join(OUTPUT_DIR, 'navigation.json'), JSON.stringify(navigationState, null, 2));
    // Write marker files for backwards compatibility
    fs.writeFileSync(path.join(OUTPUT_DIR, 'page_loaded.txt'), new Date().toISOString());
    fs.writeFileSync(path.join(OUTPUT_DIR, 'final_url.txt'), finalUrl);
    browser.disconnect();
    return { success: true, finalUrl, status, waitUntil, elapsed };
  } catch (e) {
    // Disconnect (not close) so the shared browser stays alive for others.
    if (browser) browser.disconnect();
    const elapsed = Date.now() - navStartTime;
    return { success: false, error: `${e.name}: ${e.message}`, waitUntil, elapsed };
  }
}
/**
 * Entry point: wait for the crawl-level Chrome tab to be ready, navigate it,
 * then emit an ArchiveResult JSONL record on stdout and exit 0/1 accordingly.
 */
async function main() {
    const args = parseArgs();
    const url = args.url;
    const snapshotId = args.snapshot_id;
    if (!url || !snapshotId) {
        console.error('Usage: on_Snapshot__30_chrome_navigate.js --url=<url> --snapshot-id=<uuid>');
        process.exit(1);
    }
    const startTs = new Date();
    let status = 'failed';
    let output = null;
    let error = '';
    // Wait for chrome tab to be open (up to 60s)
    const tabOpen = await waitForChromeTabOpen(60000);
    if (!tabOpen) {
        console.error(`ERROR: ${CHROME_SESSION_REQUIRED_ERROR}`);
        process.exit(1);
    }
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
        console.error(`ERROR: ${CHROME_SESSION_REQUIRED_ERROR}`);
        process.exit(1);
    }
    const result = await navigate(url, cdpUrl);
    if (result.success) {
        status = 'succeeded';
        output = 'navigation.json';
        console.log(`Page loaded: ${result.finalUrl} (HTTP ${result.status}) in ${result.elapsed}ms (waitUntil: ${result.waitUntil})`);
    } else {
        error = result.error;
        // Save navigation state even on failure so downstream hooks can inspect it
        const navigationState = {
            waitUntil: result.waitUntil,
            elapsed: result.elapsed,
            url,
            error: result.error,
            timestamp: new Date().toISOString()
        };
        fs.writeFileSync(path.join(OUTPUT_DIR, 'navigation.json'), JSON.stringify(navigationState, null, 2));
    }
    const endTs = new Date();  // NOTE(review): startTs/endTs are computed but unused here
    if (error) console.error(`ERROR: ${error}`);
    // Output clean JSONL (no RESULT_JSON= prefix)
    console.log(JSON.stringify({
        type: 'ArchiveResult',
        status,
        output_str: output || error || '',
    }));
    process.exit(status === 'succeeded' ? 0 : 1);
}
// Entry point: any unhandled rejection from main() is reported and treated as fatal.
main().catch(e => {
    console.error(`Fatal error: ${e.message}`);
    process.exit(1);
});

View File

@@ -1 +0,0 @@
<span class="abx-output-icon abx-output-icon--chrome" title="Chrome"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="4.5" width="18" height="15" rx="2"/><path d="M3 9h18"/><circle cx="7" cy="7" r="1" fill="currentColor" stroke="none"/><circle cx="11" cy="7" r="1" fill="currentColor" stroke="none"/></svg></span>

File diff suppressed because it is too large Load Diff

View File

@@ -1,722 +0,0 @@
"""
Integration tests for chrome plugin
Tests verify:
1. Chromium install via @puppeteer/browsers
2. Verify deps with abx-pkg
3. Chrome hooks exist
4. Chromium launches at crawl level
5. Tab creation at snapshot level
6. Tab navigation works
7. Tab cleanup on SIGTERM
8. Chromium cleanup on crawl end
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
--load-extension and --disable-extensions-except flags, which are needed for
loading unpacked extensions in headless mode.
"""
import json
import os
import signal
import subprocess
import sys
import time
from pathlib import Path
import pytest
import tempfile
import shutil
import platform
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
find_chromium_binary,
install_chromium_with_hooks,
CHROME_PLUGIN_DIR as PLUGIN_DIR,
CHROME_LAUNCH_HOOK,
CHROME_TAB_HOOK,
CHROME_NAVIGATE_HOOK,
)
def _get_cookies_via_cdp(port: int, env: dict) -> list[dict]:
    """Return all cookies from the Chrome instance listening on *port*.

    Runs an inline Node.js script that connects to the first page target's
    CDP websocket and issues ``Network.getAllCookies``.  The script prints
    the cookie list as JSON on stdout; any failure exits non-zero and is
    surfaced via the assert below.
    """
    node_script = r"""
const http = require('http');
const WebSocket = require('ws');
const port = process.env.CDP_PORT;
function getTargets() {
return new Promise((resolve, reject) => {
const req = http.get(`http://127.0.0.1:${port}/json/list`, (res) => {
let data = '';
res.on('data', (chunk) => (data += chunk));
res.on('end', () => {
try {
resolve(JSON.parse(data));
} catch (e) {
reject(e);
}
});
});
req.on('error', reject);
});
}
(async () => {
const targets = await getTargets();
const pageTarget = targets.find(t => t.type === 'page') || targets[0];
if (!pageTarget) {
console.error('No page target found');
process.exit(2);
}
const ws = new WebSocket(pageTarget.webSocketDebuggerUrl);
const timer = setTimeout(() => {
console.error('Timeout waiting for cookies');
process.exit(3);
}, 10000);
ws.on('open', () => {
ws.send(JSON.stringify({ id: 1, method: 'Network.getAllCookies' }));
});
ws.on('message', (data) => {
const msg = JSON.parse(data);
if (msg.id === 1) {
clearTimeout(timer);
ws.close();
if (!msg.result || !msg.result.cookies) {
console.error('No cookies in response');
process.exit(4);
}
process.stdout.write(JSON.stringify(msg.result.cookies));
process.exit(0);
}
});
ws.on('error', (err) => {
console.error(String(err));
process.exit(5);
});
})().catch((err) => {
console.error(String(err));
process.exit(1);
});
"""
    result = subprocess.run(
        ['node', '-e', node_script],
        capture_output=True,
        text=True,
        timeout=30,
        env=env | {'CDP_PORT': str(port)},  # Node script reads CDP_PORT
    )
    assert result.returncode == 0, f"Failed to read cookies via CDP: {result.stderr}\nStdout: {result.stdout}"
    # Stdout may be empty on an empty cookie jar; fall back to '[]'
    return json.loads(result.stdout or '[]')
@pytest.fixture(scope="session", autouse=True)
def ensure_chromium_and_puppeteer_installed(tmp_path_factory):
    """Session fixture: ensure Chromium + puppeteer are installed before tests.

    Points DATA_DIR at a temp dir if unset, installs Chromium via the plugin
    hooks, and exports CHROME_BINARY plus node-related paths into os.environ
    for every test in the session.
    """
    if not os.environ.get('DATA_DIR'):
        test_data_dir = tmp_path_factory.mktemp('chrome_test_data')
        os.environ['DATA_DIR'] = str(test_data_dir)
    env = get_test_env()
    # Let install errors propagate directly; the previous
    # `except RuntimeError as e: raise RuntimeError(str(e))` re-raised the same
    # exception type while discarding the original traceback context.
    chromium_binary = install_chromium_with_hooks(env)
    if not chromium_binary:
        raise RuntimeError("Chromium not found after install")
    os.environ['CHROME_BINARY'] = chromium_binary
    # Propagate node resolution paths so subprocess hooks can find modules
    for key in ('NODE_MODULES_DIR', 'NODE_PATH', 'PATH'):
        if env.get(key):
            os.environ[key] = env[key]
def test_hook_scripts_exist():
    """All three chrome hook scripts must be present in the plugin."""
    for hook in (CHROME_LAUNCH_HOOK, CHROME_TAB_HOOK, CHROME_NAVIGATE_HOOK):
        assert hook.exists(), f"Hook not found: {hook}"
def test_verify_chromium_available():
    """CHROME_BINARY (or a discovered binary) must exist and report a version."""
    binary = os.environ.get('CHROME_BINARY') or find_chromium_binary()
    assert binary, "Chromium binary should be available (set by fixture or found)"
    assert Path(binary).exists(), f"Chromium binary should exist at {binary}"
    # Sanity-check that the binary is actually a Chrome/Chromium build
    proc = subprocess.run(
        [binary, '--version'],
        capture_output=True,
        text=True,
        timeout=10,
    )
    assert proc.returncode == 0, f"Failed to get Chromium version: {proc.stderr}"
    assert 'Chromium' in proc.stdout or 'Chrome' in proc.stdout, f"Unexpected version output: {proc.stdout}"
def test_chrome_launch_and_tab_creation():
    """Integration: launch Chrome at crawl level, then open a tab at snapshot level.

    Verifies the launch hook writes cdp_url.txt/chrome.pid/port.txt, that the
    Chrome process is alive, and that the tab hook records a target id for the
    snapshot.  Cleanup runs in a ``finally`` so Chrome is killed even when an
    assertion fails mid-test (previously cleanup only ran on full success).
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()
        # Get test environment with NODE_MODULES_DIR set
        env = get_test_env()
        env['CHROME_HEADLESS'] = 'true'
        # Launch Chrome at crawl level (background process)
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-123'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env
        )
        chrome_pid = None
        try:
            # Wait up to 15s for Chrome to start (bail out if the hook dies early)
            for _ in range(15):
                if chrome_launch_process.poll() is not None:
                    stdout, stderr = chrome_launch_process.communicate()
                    pytest.fail(f"Chrome launch process exited early:\nStdout: {stdout}\nStderr: {stderr}")
                if (chrome_dir / 'cdp_url.txt').exists():
                    break
                time.sleep(1)
            # If launch failed, gather as much diagnostic context as possible
            if not (chrome_dir / 'cdp_url.txt').exists():
                try:
                    stdout, stderr = chrome_launch_process.communicate(timeout=1)
                except subprocess.TimeoutExpired:
                    stdout = stderr = "(process still running)"
                if chrome_dir.exists():
                    files = list(chrome_dir.iterdir())
                    if (chrome_dir / 'chrome.pid').exists():
                        chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
                        try:
                            os.kill(chrome_pid, 0)  # signal 0 = existence probe
                            chrome_alive = "yes"
                        except OSError:
                            chrome_alive = "no"
                        pytest.fail(f"cdp_url.txt missing after 15s. Chrome dir files: {files}. Chrome process {chrome_pid} alive: {chrome_alive}\nLaunch stdout: {stdout}\nLaunch stderr: {stderr}")
                    else:
                        pytest.fail(f"cdp_url.txt missing. Chrome dir exists with files: {files}\nLaunch stdout: {stdout}\nLaunch stderr: {stderr}")
                else:
                    pytest.fail(f"Chrome dir {chrome_dir} doesn't exist\nLaunch stdout: {stdout}\nLaunch stderr: {stderr}")
            assert (chrome_dir / 'cdp_url.txt').exists(), "cdp_url.txt should exist"
            assert (chrome_dir / 'chrome.pid').exists(), "chrome.pid should exist"
            assert (chrome_dir / 'port.txt').exists(), "port.txt should exist"
            cdp_url = (chrome_dir / 'cdp_url.txt').read_text().strip()
            chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
            assert cdp_url.startswith('ws://'), f"CDP URL should be WebSocket URL: {cdp_url}"
            assert chrome_pid > 0, "Chrome PID should be valid"
            # Verify the Chrome process is actually running
            try:
                os.kill(chrome_pid, 0)
            except OSError:
                pytest.fail(f"Chrome process {chrome_pid} is not running")
            # Create snapshot directory and open a tab at snapshot level
            snapshot_dir = Path(tmpdir) / 'snapshot1'
            snapshot_dir.mkdir()
            snapshot_chrome_dir = snapshot_dir / 'chrome'
            snapshot_chrome_dir.mkdir()
            env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
            result = subprocess.run(
                ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-123', '--crawl-id=test-crawl-123'],
                cwd=str(snapshot_chrome_dir),
                capture_output=True,
                text=True,
                timeout=60,
                env=env
            )
            assert result.returncode == 0, f"Tab creation failed: {result.stderr}\nStdout: {result.stdout}"
            # Verify tab creation outputs
            assert (snapshot_chrome_dir / 'cdp_url.txt').exists(), "Snapshot cdp_url.txt should exist"
            assert (snapshot_chrome_dir / 'target_id.txt').exists(), "target_id.txt should exist"
            assert (snapshot_chrome_dir / 'url.txt').exists(), "url.txt should exist"
            target_id = (snapshot_chrome_dir / 'target_id.txt').read_text().strip()
            assert len(target_id) > 0, "Target ID should not be empty"
        finally:
            # Best-effort cleanup; bare `except:` replaced with `except Exception:`
            try:
                chrome_launch_process.send_signal(signal.SIGTERM)
                chrome_launch_process.wait(timeout=5)
            except Exception:
                pass
            if chrome_pid:
                try:
                    os.kill(chrome_pid, signal.SIGKILL)
                except OSError:
                    pass
def test_cookies_imported_on_launch():
    """Integration: cookies from COOKIES_TXT_FILE are imported at crawl start.

    Writes a minimal Netscape-format cookies.txt, launches Chrome with it, and
    polls via CDP until the known cookie appears.  Cleanup now runs in a
    ``finally`` and uses ``except Exception`` instead of a bare ``except:``.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()
        # Minimal Netscape-format cookie jar with one known cookie
        cookies_file = Path(tmpdir) / 'cookies.txt'
        cookies_file.write_text(
            '\n'.join([
                '# Netscape HTTP Cookie File',
                '# https://curl.se/docs/http-cookies.html',
                '# This file was generated by a test',
                '',
                'example.com\tTRUE\t/\tFALSE\t2147483647\tabx_test_cookie\thello',
                '',
            ])
        )
        profile_dir = Path(tmpdir) / 'profile'
        env = get_test_env()
        env.update({
            'CHROME_HEADLESS': 'true',
            'CHROME_USER_DATA_DIR': str(profile_dir),
            'COOKIES_TXT_FILE': str(cookies_file),
        })
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-cookies'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env
        )
        chrome_pid = None
        try:
            # Wait up to 15s for the debug port file to appear
            for _ in range(15):
                if (chrome_dir / 'port.txt').exists():
                    break
                time.sleep(1)
            assert (chrome_dir / 'port.txt').exists(), "port.txt should exist"
            chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
            port = int((chrome_dir / 'port.txt').read_text().strip())
            # Poll via CDP until the imported cookie shows up (import is async)
            cookie_found = False
            for _ in range(15):
                cookies = _get_cookies_via_cdp(port, env)
                cookie_found = any(
                    c.get('name') == 'abx_test_cookie' and c.get('value') == 'hello'
                    for c in cookies
                )
                if cookie_found:
                    break
                time.sleep(1)
            assert cookie_found, "Imported cookie should be present in Chrome session"
        finally:
            # Best-effort cleanup even if an assertion above failed
            try:
                chrome_launch_process.send_signal(signal.SIGTERM)
                chrome_launch_process.wait(timeout=5)
            except Exception:
                pass
            if chrome_pid:
                try:
                    os.kill(chrome_pid, signal.SIGKILL)
                except OSError:
                    pass
def test_chrome_navigation():
    """Integration: create a tab and navigate it to a URL; verify nav outputs.

    Improvements over the previous version: the blind ``time.sleep(3)`` before
    reading chrome.pid is replaced with a poll loop (it raised
    FileNotFoundError on slow machines), cleanup runs in ``finally``, and the
    bare ``except:`` is narrowed to ``except Exception:``.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()
        # Launch Chrome (background process)
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-nav'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=get_test_env() | {'CHROME_HEADLESS': 'true'}
        )
        chrome_pid = None
        try:
            # Poll for the PID file instead of a fixed sleep
            for _ in range(15):
                if (chrome_dir / 'chrome.pid').exists():
                    break
                time.sleep(1)
            assert (chrome_dir / 'chrome.pid').exists(), "chrome.pid should exist after launch"
            chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
            # Create snapshot and tab
            snapshot_dir = Path(tmpdir) / 'snapshot1'
            snapshot_dir.mkdir()
            snapshot_chrome_dir = snapshot_dir / 'chrome'
            snapshot_chrome_dir.mkdir()
            result = subprocess.run(
                ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123', '--crawl-id=test-crawl-nav'],
                cwd=str(snapshot_chrome_dir),
                capture_output=True,
                text=True,
                timeout=60,
                env=get_test_env() | {'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
            )
            assert result.returncode == 0, f"Tab creation failed: {result.stderr}"
            # Navigate to URL
            result = subprocess.run(
                ['node', str(CHROME_NAVIGATE_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123'],
                cwd=str(snapshot_chrome_dir),
                capture_output=True,
                text=True,
                timeout=120,
                env=get_test_env() | {'CHROME_PAGELOAD_TIMEOUT': '30', 'CHROME_WAIT_FOR': 'load'}
            )
            assert result.returncode == 0, f"Navigation failed: {result.stderr}\nStdout: {result.stdout}"
            # Verify navigation outputs
            assert (snapshot_chrome_dir / 'navigation.json').exists(), "navigation.json should exist"
            assert (snapshot_chrome_dir / 'page_loaded.txt').exists(), "page_loaded.txt should exist"
            nav_data = json.loads((snapshot_chrome_dir / 'navigation.json').read_text())
            assert nav_data.get('status') in [200, 301, 302], f"Should get valid HTTP status: {nav_data}"
            assert nav_data.get('finalUrl'), "Should have final URL"
        finally:
            # Best-effort cleanup even if an assertion above failed
            try:
                chrome_launch_process.send_signal(signal.SIGTERM)
                chrome_launch_process.wait(timeout=5)
            except Exception:
                pass
            if chrome_pid:
                try:
                    os.kill(chrome_pid, signal.SIGKILL)
                except OSError:
                    pass
def test_tab_cleanup_on_sigterm():
    """Integration: the tab hook exits cleanly on SIGTERM, leaving Chrome alive.

    Cleanup now runs in a ``finally`` and the bare ``except:`` clauses are
    narrowed to ``except Exception:``.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()
        # Launch Chrome (background process)
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cleanup'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=get_test_env() | {'CHROME_HEADLESS': 'true'}
        )
        chrome_pid = None
        try:
            # Wait for Chrome to launch
            time.sleep(3)
            chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
            # Start the tab hook in the background so we can signal it
            snapshot_dir = Path(tmpdir) / 'snapshot1'
            snapshot_dir.mkdir()
            snapshot_chrome_dir = snapshot_dir / 'chrome'
            snapshot_chrome_dir.mkdir()
            tab_process = subprocess.Popen(
                ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-cleanup', '--crawl-id=test-cleanup'],
                cwd=str(snapshot_chrome_dir),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                env=get_test_env() | {'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
            )
            # Give the tab time to be created, then ask the hook to shut down
            time.sleep(3)
            tab_process.send_signal(signal.SIGTERM)
            stdout, stderr = tab_process.communicate(timeout=10)
            assert tab_process.returncode == 0, f"Tab process should exit cleanly: {stderr}"
            # Closing one tab must not take down the shared Chrome instance
            try:
                os.kill(chrome_pid, 0)
            except OSError:
                pytest.fail("Chrome should still be running after tab cleanup")
        finally:
            # Best-effort cleanup even if an assertion above failed
            try:
                chrome_launch_process.send_signal(signal.SIGTERM)
                chrome_launch_process.wait(timeout=5)
            except Exception:
                pass
            if chrome_pid:
                try:
                    os.kill(chrome_pid, signal.SIGKILL)
                except OSError:
                    pass
def test_multiple_snapshots_share_chrome():
    """Integration: several snapshots open tabs in one shared Chrome instance.

    Each tab must report the crawl-level Chrome PID and CDP URL but a unique
    target id.  Cleanup runs in ``finally``; bare ``except:`` narrowed to
    ``except Exception:``; unused ``snapshot_dirs`` accumulator removed.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()
        # Launch Chrome at crawl level
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-multi-crawl'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=get_test_env() | {'CHROME_HEADLESS': 'true'}
        )
        chrome_pid = None
        try:
            # Wait for the launch hook to publish the CDP endpoint
            for _ in range(15):
                if (chrome_dir / 'cdp_url.txt').exists():
                    break
                time.sleep(1)
            chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
            crawl_cdp_url = (chrome_dir / 'cdp_url.txt').read_text().strip()
            # Open one tab per snapshot; all must share the crawl's Chrome
            target_ids = []
            for snap_num in range(3):
                snapshot_dir = Path(tmpdir) / f'snapshot{snap_num}'
                snapshot_dir.mkdir()
                snapshot_chrome_dir = snapshot_dir / 'chrome'
                snapshot_chrome_dir.mkdir()
                result = subprocess.run(
                    ['node', str(CHROME_TAB_HOOK), f'--url=https://example.com/{snap_num}', f'--snapshot-id=snap-{snap_num}', '--crawl-id=test-multi-crawl'],
                    cwd=str(snapshot_chrome_dir),
                    capture_output=True,
                    text=True,
                    timeout=60,
                    env=get_test_env() | {'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'}
                )
                assert result.returncode == 0, f"Tab {snap_num} creation failed: {result.stderr}"
                # Each snapshot records its own tab alongside the shared Chrome info
                assert (snapshot_chrome_dir / 'target_id.txt').exists()
                assert (snapshot_chrome_dir / 'cdp_url.txt').exists()
                assert (snapshot_chrome_dir / 'chrome.pid').exists()
                target_id = (snapshot_chrome_dir / 'target_id.txt').read_text().strip()
                snapshot_cdp_url = (snapshot_chrome_dir / 'cdp_url.txt').read_text().strip()
                snapshot_pid = int((snapshot_chrome_dir / 'chrome.pid').read_text().strip())
                target_ids.append(target_id)
                assert snapshot_pid == chrome_pid, f"Snapshot {snap_num} should use crawl Chrome PID"
                assert snapshot_cdp_url == crawl_cdp_url, f"Snapshot {snap_num} should use crawl CDP URL"
            # Each snapshot must have its own tab (unique target ids)
            assert len(set(target_ids)) == 3, f"All snapshots should have unique tabs: {target_ids}"
            # Chrome itself must survive all three tab creations
            try:
                os.kill(chrome_pid, 0)
            except OSError:
                pytest.fail("Chrome should still be running after creating 3 tabs")
        finally:
            # Best-effort cleanup even if an assertion above failed
            try:
                chrome_launch_process.send_signal(signal.SIGTERM)
                chrome_launch_process.wait(timeout=5)
            except Exception:
                pass
            if chrome_pid:
                try:
                    os.kill(chrome_pid, signal.SIGKILL)
                except OSError:
                    pass
def test_chrome_cleanup_on_crawl_end():
    """SIGTERM to the launch hook must trigger cleanup that kills Chrome."""
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_root = Path(tmpdir) / 'crawl'
        crawl_root.mkdir()
        session_dir = crawl_root / 'chrome'
        session_dir.mkdir()
        # Start the launch hook in the background
        launch_proc = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-end'],
            cwd=str(session_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=get_test_env() | {'CHROME_HEADLESS': 'true'}
        )
        # Give Chrome a moment to start
        time.sleep(3)
        pid_file = session_dir / 'chrome.pid'
        assert pid_file.exists(), "Chrome PID file should exist"
        browser_pid = int(pid_file.read_text().strip())
        try:
            os.kill(browser_pid, 0)  # existence probe
        except OSError:
            pytest.fail("Chrome should be running")
        # SIGTERM the hook; its handler is responsible for tearing Chrome down
        launch_proc.send_signal(signal.SIGTERM)
        launch_proc.communicate(timeout=10)
        # Allow the cleanup handler time to finish
        time.sleep(3)
        # After cleanup the browser process must be gone
        try:
            os.kill(browser_pid, 0)
            pytest.fail("Chrome should be killed after SIGTERM")
        except OSError:
            pass  # expected: Chrome is dead
def test_zombie_prevention_hook_killed():
    """Integration test: Chrome is killed even if hook process is SIGKILL'd.

    Simulates the launch hook dying without running its cleanup handler, then
    replicates Crawl.cleanup()'s pid-file sweep (SIGTERM, wait, SIGKILL the
    process group) and asserts the orphaned Chrome is reaped.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        crawl_dir = Path(tmpdir) / 'crawl'
        crawl_dir.mkdir()
        chrome_dir = crawl_dir / 'chrome'
        chrome_dir.mkdir()
        # Launch Chrome
        chrome_launch_process = subprocess.Popen(
            ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-zombie'],
            cwd=str(chrome_dir),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=get_test_env() | {'CHROME_HEADLESS': 'true'}
        )
        # Wait up to 15s for the pid file to appear
        for i in range(15):
            if (chrome_dir / 'chrome.pid').exists():
                break
            time.sleep(1)
        assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist"
        chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
        hook_pid = chrome_launch_process.pid  # Use the Popen process PID instead of hook.pid file
        # Verify both Chrome and hook are running (signal 0 = existence probe)
        try:
            os.kill(chrome_pid, 0)
            os.kill(hook_pid, 0)
        except OSError:
            pytest.fail("Both Chrome and hook should be running")
        # Simulate hook getting SIGKILL'd (no chance to run cleanup)
        os.kill(hook_pid, signal.SIGKILL)
        time.sleep(1)
        # Chrome should still be running (orphaned)
        try:
            os.kill(chrome_pid, 0)
        except OSError:
            pytest.fail("Chrome should still be running after hook SIGKILL")
        # Simulate Crawl.cleanup() using the actual cleanup logic
        def is_process_alive(pid):
            """Check if a process exists."""
            try:
                os.kill(pid, 0)
                return True
            except (OSError, ProcessLookupError):
                return False
        for pid_file in chrome_dir.glob('**/*.pid'):
            try:
                pid = int(pid_file.read_text().strip())
                # Step 1: SIGTERM for graceful shutdown
                # NOTE(review): os.killpg() expects a process-group id; passing the
                # pid works only when the target is a group leader — confirm hooks
                # start Chrome with setsid/detached so pid == pgid.
                try:
                    try:
                        os.killpg(pid, signal.SIGTERM)
                    except (OSError, ProcessLookupError):
                        os.kill(pid, signal.SIGTERM)
                except ProcessLookupError:
                    pid_file.unlink(missing_ok=True)
                    continue
                # Step 2: Wait for graceful shutdown
                time.sleep(2)
                # Step 3: Check if still alive
                if not is_process_alive(pid):
                    pid_file.unlink(missing_ok=True)
                    continue
                # Step 4: Force kill ENTIRE process group with SIGKILL
                try:
                    try:
                        # Always kill entire process group with SIGKILL
                        os.killpg(pid, signal.SIGKILL)
                    except (OSError, ProcessLookupError):
                        os.kill(pid, signal.SIGKILL)
                except ProcessLookupError:
                    pid_file.unlink(missing_ok=True)
                    continue
                # Step 5: Wait and verify death
                time.sleep(1)
                if not is_process_alive(pid):
                    pid_file.unlink(missing_ok=True)
            except (ValueError, OSError):
                pass
        # Chrome should now be dead
        try:
            os.kill(chrome_pid, 0)
            pytest.fail("Chrome should be killed after cleanup")
        except OSError:
            # Expected - Chrome is dead
            pass
# Allow running this test module directly (python <file>.py) without the pytest CLI.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -1,260 +0,0 @@
"""
Tests for chrome_test_helpers.py functions.
These tests verify the Python helper functions used across Chrome plugin tests.
"""
import os
import pytest
import tempfile
from pathlib import Path
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
get_machine_type,
get_lib_dir,
get_node_modules_dir,
get_extensions_dir,
find_chromium_binary,
get_plugin_dir,
get_hook_script,
parse_jsonl_output,
)
def test_get_machine_type():
    """get_machine_type() yields an 'arch-os' identifier string."""
    mt = get_machine_type()
    assert isinstance(mt, str)
    assert '-' in mt, "Machine type should be in format: arch-os"
    # Must name a known architecture and a known OS
    assert any(arch in mt for arch in ('arm64', 'x86_64')), "Should contain valid architecture"
    assert any(osname in mt for osname in ('darwin', 'linux', 'win32')), "Should contain valid OS"
def test_get_lib_dir_with_env_var():
    """get_lib_dir() respects the LIB_DIR env var."""
    with tempfile.TemporaryDirectory() as tmpdir:
        custom_lib = Path(tmpdir) / 'custom_lib'
        custom_lib.mkdir()
        old_lib_dir = os.environ.get('LIB_DIR')
        try:
            os.environ['LIB_DIR'] = str(custom_lib)
            assert get_lib_dir() == custom_lib
        finally:
            # `is not None` so a previously-empty LIB_DIR is restored instead
            # of popped (the old `if old_lib_dir:` check dropped falsy values).
            if old_lib_dir is not None:
                os.environ['LIB_DIR'] = old_lib_dir
            else:
                os.environ.pop('LIB_DIR', None)
def test_get_node_modules_dir_with_env_var():
    """get_node_modules_dir() respects the NODE_MODULES_DIR env var."""
    with tempfile.TemporaryDirectory() as tmpdir:
        custom_nm = Path(tmpdir) / 'node_modules'
        custom_nm.mkdir()
        old_nm_dir = os.environ.get('NODE_MODULES_DIR')
        try:
            os.environ['NODE_MODULES_DIR'] = str(custom_nm)
            assert get_node_modules_dir() == custom_nm
        finally:
            # `is not None` so a previously-empty value is restored, not popped
            # (the old `if old_nm_dir:` check dropped falsy values).
            if old_nm_dir is not None:
                os.environ['NODE_MODULES_DIR'] = old_nm_dir
            else:
                os.environ.pop('NODE_MODULES_DIR', None)
def test_get_extensions_dir_default():
    """get_extensions_dir() returns a personas/chrome_extensions path string."""
    result = get_extensions_dir()
    assert isinstance(result, str)
    for fragment in ('personas', 'chrome_extensions'):
        assert fragment in result
def test_get_extensions_dir_with_custom_persona():
    """get_extensions_dir() honors the ACTIVE_PERSONA and DATA_DIR env vars."""
    old_persona = os.environ.get('ACTIVE_PERSONA')
    old_data_dir = os.environ.get('DATA_DIR')
    try:
        os.environ['ACTIVE_PERSONA'] = 'TestPersona'
        os.environ['DATA_DIR'] = '/tmp/test'
        ext_dir = get_extensions_dir()
        assert 'TestPersona' in ext_dir
        assert '/tmp/test' in ext_dir
    finally:
        # `is not None` so previously-empty values are restored instead of
        # popped (the old truthiness checks dropped empty-string env vars).
        if old_persona is not None:
            os.environ['ACTIVE_PERSONA'] = old_persona
        else:
            os.environ.pop('ACTIVE_PERSONA', None)
        if old_data_dir is not None:
            os.environ['DATA_DIR'] = old_data_dir
        else:
            os.environ.pop('DATA_DIR', None)
def test_get_test_env_returns_dict():
    """get_test_env() includes every path key the test suite relies on."""
    env = get_test_env()
    assert isinstance(env, dict)
    expected_keys = (
        'MACHINE_TYPE',
        'LIB_DIR',
        'NODE_MODULES_DIR',
        'NODE_PATH',
        'NPM_BIN_DIR',
        'CHROME_EXTENSIONS_DIR',
    )
    for key in expected_keys:
        assert key in env
    # Node resolves modules via NODE_PATH, which must mirror NODE_MODULES_DIR
    assert env['NODE_PATH'] == env['NODE_MODULES_DIR']
def test_get_test_env_paths_are_absolute():
    """All path-like values from get_test_env() are absolute paths."""
    env = get_test_env()
    for key in ('LIB_DIR', 'NODE_MODULES_DIR', 'NODE_PATH'):
        assert Path(env[key]).is_absolute()
def test_find_chromium_binary():
    """find_chromium_binary() yields an absolute path string, or a falsy value."""
    located = find_chromium_binary()
    if not located:
        return  # acceptable: chromium may not be installed in this environment
    assert isinstance(located, str)
    assert os.path.isabs(located)
def test_get_plugin_dir():
    """get_plugin_dir() resolves this test file to the chrome plugin directory."""
    plugin_dir = get_plugin_dir(__file__)
    assert plugin_dir.exists()
    assert plugin_dir.is_dir()
    # Must resolve to <...>/plugins/chrome
    assert plugin_dir.name == 'chrome'
    assert plugin_dir.parent.name == 'plugins'
def test_get_hook_script_finds_existing_hook():
    """get_hook_script() locates the chrome launch hook when it is present."""
    from archivebox.plugins.chrome.tests.chrome_test_helpers import CHROME_PLUGIN_DIR
    hook = get_hook_script(CHROME_PLUGIN_DIR, 'on_Crawl__*_chrome_launch.*')
    if hook is None:
        return  # hook may be absent in some test environments
    assert hook.exists()
    assert hook.is_file()
    assert 'chrome_launch' in hook.name
def test_get_hook_script_returns_none_for_missing():
    """get_hook_script() gives None when no hook matches the pattern."""
    from archivebox.plugins.chrome.tests.chrome_test_helpers import CHROME_PLUGIN_DIR
    missing = get_hook_script(CHROME_PLUGIN_DIR, 'nonexistent_hook_*_pattern.*')
    assert missing is None
def test_parse_jsonl_output_valid():
    """parse_jsonl_output() returns the FIRST record from valid JSONL."""
    raw = '''{"type": "ArchiveResult", "status": "succeeded", "output": "test1"}
{"type": "ArchiveResult", "status": "failed", "error": "test2"}
'''
    parsed = parse_jsonl_output(raw)
    assert parsed is not None
    assert parsed['type'] == 'ArchiveResult'
    # Only the first matching record is returned
    assert parsed['status'] == 'succeeded'
    assert parsed['output'] == 'test1'
def test_parse_jsonl_output_with_non_json_lines():
    """Non-JSON lines are skipped; the first parseable record is returned."""
    mixed = '''Some non-JSON output
{"type": "ArchiveResult", "status": "succeeded"}
More non-JSON
{"type": "ArchiveResult", "status": "failed"}
'''
    record = parse_jsonl_output(mixed)
    assert record is not None
    assert record['type'] == 'ArchiveResult'
    assert record['status'] == 'succeeded'
def test_parse_jsonl_output_empty():
    """Empty input yields None rather than raising."""
    assert parse_jsonl_output('') is None
def test_parse_jsonl_output_filters_by_type():
    """record_type='ArchiveResult' skips records of other types."""
    raw = '''{"type": "LogEntry", "data": "log1"}
{"type": "ArchiveResult", "data": "result1"}
{"type": "ArchiveResult", "data": "result2"}
'''
    record = parse_jsonl_output(raw, record_type='ArchiveResult')
    assert record is not None
    assert record['type'] == 'ArchiveResult'
    assert record['data'] == 'result1'  # first ArchiveResult wins, LogEntry skipped
def test_parse_jsonl_output_filters_custom_type():
    """Filtering works for arbitrary record types, not just ArchiveResult."""
    raw = '''{"type": "ArchiveResult", "data": "result1"}
{"type": "LogEntry", "data": "log1"}
{"type": "ArchiveResult", "data": "result2"}
'''
    record = parse_jsonl_output(raw, record_type='LogEntry')
    assert record is not None
    assert record['type'] == 'LogEntry'
    assert record['data'] == 'log1'
def test_machine_type_consistency():
    """get_machine_type() is stable across repeated calls."""
    first, second = get_machine_type(), get_machine_type()
    assert first == second, "Machine type should be stable across calls"
def test_lib_dir_is_directory():
    """get_lib_dir() returns a Path when DATA_DIR points at a real layout."""
    with tempfile.TemporaryDirectory() as tmpdir:
        old_data_dir = os.environ.get('DATA_DIR')
        try:
            os.environ['DATA_DIR'] = tmpdir
            # Create the expected lib/<machine_type> directory structure
            machine_type = get_machine_type()
            (Path(tmpdir) / 'lib' / machine_type).mkdir(parents=True, exist_ok=True)
            result = get_lib_dir()
            assert isinstance(result, Path)
        finally:
            # `is not None` so a previously-empty DATA_DIR is restored instead
            # of popped (the old `if old_data_dir:` check dropped falsy values).
            if old_data_dir is not None:
                os.environ['DATA_DIR'] = old_data_dir
            else:
                os.environ.pop('DATA_DIR', None)
# Allow running this test module directly (python <file>.py) without the pytest CLI.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -1,21 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"CONSOLELOG_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_CONSOLELOG", "USE_CONSOLELOG"],
"description": "Enable console log capture"
},
"CONSOLELOG_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for console log capture in seconds"
}
}
}

View File

@@ -1,201 +0,0 @@
#!/usr/bin/env node
/**
* Capture console output from a page.
*
* This hook sets up CDP listeners BEFORE chrome_navigate loads the page,
* then waits for navigation to complete. The listeners stay active through
* navigation and capture all console output.
*
* Usage: on_Snapshot__21_consolelog.js --url=<url> --snapshot-id=<uuid>
* Output: Writes console.jsonl
*/
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
// Import shared utilities from chrome_utils.js
const {
getEnvBool,
getEnvInt,
parseArgs,
connectToPage,
waitForPageLoaded,
} = require('../chrome/chrome_utils.js');
const PLUGIN_NAME = 'consolelog';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'console.jsonl';
const CHROME_SESSION_DIR = '../chrome';
let browser = null;
let page = null;
let logCount = 0;
let errorCount = 0;
let requestFailCount = 0;
let shuttingDown = false;
/**
 * Best-effort serialization of puppeteer console message args.
 * Tries jsonValue() first, then String(arg), then a placeholder.
 */
async function serializeArgs(args) {
    const out = [];
    for (const handle of args) {
        try {
            out.push(await handle.jsonValue());
        } catch (jsonErr) {
            try {
                out.push(String(handle));
            } catch (strErr) {
                out.push('[Unserializable]');
            }
        }
    }
    return out;
}
// Connect to the shared Chrome session and attach console/pageerror/requestfailed
// listeners that stream one JSONL entry per event straight into console.jsonl.
// Must be called BEFORE chrome_navigate loads the page so no events are missed.
// Returns { browser, page } so the caller can disconnect on shutdown.
async function setupListeners() {
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  // CONSOLELOG_TIMEOUT is in seconds; connectToPage expects milliseconds
  const timeout = getEnvInt('CONSOLELOG_TIMEOUT', 30) * 1000;
  fs.writeFileSync(outputPath, ''); // Clear existing
  // Connect to Chrome page using shared utility
  const { browser, page } = await connectToPage({
    chromeSessionDir: CHROME_SESSION_DIR,
    timeoutMs: timeout,
    puppeteer,
  });
  // Set up listeners that write directly to file
  page.on('console', async (msg) => {
    try {
      const logEntry = {
        timestamp: new Date().toISOString(),
        type: msg.type(),
        text: msg.text(),
        args: await serializeArgs(msg.args()),
        location: msg.location(),
      };
      fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
      logCount += 1;
    } catch (e) {
      // Ignore errors
    }
  });
  // Uncaught exceptions thrown by page scripts
  page.on('pageerror', (error) => {
    try {
      const logEntry = {
        timestamp: new Date().toISOString(),
        type: 'error',
        text: error.message,
        stack: error.stack || '',
      };
      fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
      errorCount += 1;
    } catch (e) {
      // Ignore
    }
  });
  // Network requests that never completed (blocked, aborted, DNS failure, ...)
  page.on('requestfailed', (request) => {
    try {
      const failure = request.failure();
      const logEntry = {
        timestamp: new Date().toISOString(),
        type: 'request_failed',
        text: `Request failed: ${request.url()}`,
        error: failure ? failure.errorText : 'Unknown error',
        url: request.url(),
      };
      fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
      requestFailCount += 1;
    } catch (e) {
      // Ignore
    }
  });
  return { browser, page };
}
// Emit the final ArchiveResult JSONL record on stdout exactly once.
// The `shuttingDown` guard prevents a double-emit when both SIGTERM and
// SIGINT arrive during teardown.
function emitResult(status = 'succeeded') {
  if (shuttingDown) return;
  shuttingDown = true;
  const counts = `${logCount} console, ${errorCount} errors, ${requestFailCount} failed requests`;
  console.log(JSON.stringify({
    type: 'ArchiveResult',
    status,
    output_str: `${OUTPUT_FILE} (${counts})`,
  }));
}
// Signal handler: flush the final result, then detach from Chrome with
// disconnect() (not close() -- other hooks share the browser session) and
// exit cleanly.
async function handleShutdown(signal) {
  console.error(`\nReceived ${signal}, emitting final results...`);
  emitResult('succeeded');
  if (browser) {
    try {
      browser.disconnect();
    } catch (e) {}
  }
  process.exit(0);
}
// Entry point: validate args/config, attach listeners before navigation,
// then block forever -- the orchestrator terminates this hook via SIGTERM,
// which triggers handleShutdown() to emit the final ArchiveResult.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__21_consolelog.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  if (!getEnvBool('CONSOLELOG_ENABLED', true)) {
    console.error('Skipping (CONSOLELOG_ENABLED=False)');
    console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'CONSOLELOG_ENABLED=False'}));
    process.exit(0);
  }
  try {
    // Set up listeners BEFORE navigation
    const connection = await setupListeners();
    browser = connection.browser;
    page = connection.page;
    // Register signal handlers for graceful shutdown
    process.on('SIGTERM', () => handleShutdown('SIGTERM'));
    process.on('SIGINT', () => handleShutdown('SIGINT'));
    // Wait for chrome_navigate to complete (non-fatal)
    try {
      const timeout = getEnvInt('CONSOLELOG_TIMEOUT', 30) * 1000;
      // 4x grace: navigation may legitimately take longer than the capture timeout
      await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
    } catch (e) {
      console.error(`WARN: ${e.message}`);
    }
    // console.error('Consolelog active, waiting for cleanup signal...');
    await new Promise(() => {}); // Keep alive until SIGTERM
    return;
  } catch (e) {
    const error = `${e.name}: ${e.message}`;
    console.error(`ERROR: ${error}`);
    console.log(JSON.stringify({
      type: 'ArchiveResult',
      status: 'failed',
      output_str: error,
    }));
    process.exit(1);
  }
}
// Top-level launcher: any unhandled rejection is fatal for this hook.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -1 +0,0 @@
<span class="abx-output-icon abx-output-icon--consolelog" title="Console Log"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="4.5" width="18" height="15" rx="2"/><path d="M7 12l2 2-2 2"/><path d="M11 16h6"/></svg></span>

View File

@@ -1,127 +0,0 @@
"""
Tests for the consolelog plugin.
Tests the real consolelog hook with an actual URL to verify
console output capture.
"""
import json
import shutil
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from django.test import TestCase
# Import chrome test helpers
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
from chrome_test_helpers import (
chrome_session,
CHROME_NAVIGATE_HOOK,
get_plugin_dir,
get_hook_script,
)
# Get the path to the consolelog hook
PLUGIN_DIR = get_plugin_dir(__file__)
CONSOLELOG_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_consolelog.*')
class TestConsolelogPlugin(TestCase):
    """Test the consolelog plugin."""

    def test_consolelog_hook_exists(self):
        """Consolelog hook script should exist."""
        # CONSOLELOG_HOOK is resolved at import time by globbing the plugin
        # directory; None means the hook file was renamed or removed.
        self.assertIsNotNone(CONSOLELOG_HOOK, "Consolelog hook not found in plugin directory")
        self.assertTrue(CONSOLELOG_HOOK.exists(), f"Hook not found: {CONSOLELOG_HOOK}")
class TestConsolelogWithChrome(TestCase):
    """Integration tests for consolelog plugin with Chrome."""

    def setUp(self):
        """Set up test environment."""
        self.temp_dir = Path(tempfile.mkdtemp())

    def tearDown(self):
        """Clean up."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_consolelog_captures_output(self):
        """Consolelog hook should capture console output from page."""
        # data: URL logs a known marker so the test never touches the network
        test_url = 'data:text/html,<script>console.log("archivebox-console-test")</script>'
        snapshot_id = 'test-consolelog-snapshot'
        with chrome_session(
            self.temp_dir,
            crawl_id='test-consolelog-crawl',
            snapshot_id=snapshot_id,
            test_url=test_url,
            navigate=False,
            timeout=30,
        ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env):
            console_dir = snapshot_chrome_dir.parent / 'consolelog'
            console_dir.mkdir(exist_ok=True)
            # Run consolelog hook with the active Chrome session (background hook)
            # Popen (not run): the hook blocks until SIGTERM, so it must execute
            # concurrently with the navigation started below.
            result = subprocess.Popen(
                ['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                cwd=str(console_dir),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                env=env
            )
            nav_result = subprocess.run(
                ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                cwd=str(snapshot_chrome_dir),
                capture_output=True,
                text=True,
                timeout=120,
                env=env
            )
            self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}")
            # Check for output file
            console_output = console_dir / 'console.jsonl'
            # Allow it to run briefly, then terminate (background hook)
            for _ in range(10):
                if console_output.exists() and console_output.stat().st_size > 0:
                    break
                time.sleep(1)
            if result.poll() is None:
                result.terminate()
                try:
                    stdout, stderr = result.communicate(timeout=5)
                except subprocess.TimeoutExpired:
                    result.kill()
                    stdout, stderr = result.communicate()
            else:
                stdout, stderr = result.communicate()
            # At minimum, verify no crash
            self.assertNotIn('Traceback', stderr)
            # If output file exists, verify it's valid JSONL and has output
            if console_output.exists():
                with open(console_output) as f:
                    content = f.read().strip()
                self.assertTrue(content, "Console output should not be empty")
                for line in content.split('\n'):
                    if line.strip():
                        try:
                            record = json.loads(line)
                            # Verify structure
                            self.assertIn('timestamp', record)
                            self.assertIn('type', record)
                        except json.JSONDecodeError:
                            pass  # Some lines may be incomplete
if __name__ == '__main__':
    # Bugfix: pytest was referenced here but never imported at module top,
    # so running this file directly raised NameError. Import lazily since
    # pytest is only needed for direct invocation.
    import pytest
    pytest.main([__file__, '-v'])

View File

@@ -1,98 +0,0 @@
#!/usr/bin/env python3
"""
Install a binary using a custom bash command.
This provider runs arbitrary shell commands to install binaries
that don't fit into standard package managers.
Usage: on_Binary__install_using_custom_bash.py --binary-id=<uuid> --machine-id=<uuid> --name=<name> --custom-cmd=<cmd>
Output: Binary JSONL record to stdout after installation
Environment variables:
MACHINE_ID: Machine UUID (set by orchestrator)
"""
import json
import os
import subprocess
import sys
import rich_click as click
from abx_pkg import Binary, EnvProvider
@click.command()
@click.option('--binary-id', required=True, help="Binary UUID")
@click.option('--machine-id', required=True, help="Machine UUID")
@click.option('--name', required=True, help="Binary name to install")
@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', required=True, help="Custom bash command to run")
def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_cmd: str):
    """Install binary using custom bash command.

    Runs ``custom_cmd`` in a shell, then verifies the binary resolves on
    $PATH via abx-pkg and emits a Binary JSONL record on stdout.
    Exit codes: 0 = installed or skipped (provider not allowed), 1 = failure.
    """
    # Respect the allowed-providers filter: exit 0 (skip, not error) when the
    # 'custom' provider is not permitted for this binary.
    if binproviders != '*' and 'custom' not in binproviders.split(','):
        click.echo(f"custom provider not allowed for {name}", err=True)
        sys.exit(0)
    if not custom_cmd:
        click.echo("custom provider requires --custom-cmd", err=True)
        sys.exit(1)
    click.echo(f"Installing {name} via custom command: {custom_cmd}", err=True)
    try:
        result = subprocess.run(
            custom_cmd,
            shell=True,  # custom_cmd is an operator-supplied shell snippet by design
            timeout=600,  # 10 minute timeout for custom installs
        )
        if result.returncode != 0:
            click.echo(f"Custom install failed (exit={result.returncode})", err=True)
            sys.exit(1)
    except subprocess.TimeoutExpired:
        click.echo("Custom install timed out", err=True)
        sys.exit(1)
    # Use abx-pkg to load the binary and get its info
    provider = EnvProvider()
    try:
        binary = Binary(name=name, binproviders=[provider]).load()
    except Exception:
        # Some binaries don't support --version probing; retry with a stub
        # version override so load() can still resolve the abspath.
        try:
            binary = Binary(
                name=name,
                binproviders=[provider],
                overrides={'env': {'version': '0.0.1'}},
            ).load()
        except Exception as e:
            click.echo(f"{name} not found after custom install: {e}", err=True)
            sys.exit(1)
    if not binary.abspath:
        click.echo(f"{name} not found after custom install", err=True)
        sys.exit(1)
    # BUGFIX: previously this line unconditionally replaced the required
    # --machine-id CLI argument with os.environ.get('MACHINE_ID', ''),
    # emitting an empty machine_id whenever the env var was unset.
    # Prefer the orchestrator-provided env var, fall back to the CLI value.
    machine_id = os.environ.get('MACHINE_ID') or machine_id
    # Output Binary JSONL record to stdout
    record = {
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'custom',
        'machine_id': machine_id,
        'binary_id': binary_id,
    }
    print(json.dumps(record))
    # Log human-readable info to stderr
    click.echo(f"Installed {name} at {binary.abspath}", err=True)
    click.echo(f" version: {binary.version}", err=True)
    sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -1,149 +0,0 @@
"""
Tests for the custom binary provider plugin.
Tests the custom bash binary installer with safe commands.
"""
import json
import os
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
from django.test import TestCase
# Get the path to the custom provider hook
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_custom_install.py'), None)
class TestCustomProviderHook(TestCase):
    """Test the custom binary provider hook.

    Each test invokes the hook script as a subprocess (matching how the
    orchestrator runs it) and inspects exit code, stderr, and JSONL stdout.
    """

    def setUp(self):
        """Set up test environment."""
        self.temp_dir = tempfile.mkdtemp()

    def tearDown(self):
        """Clean up."""
        import shutil
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_hook_script_exists(self):
        """Hook script should exist."""
        # INSTALL_HOOK is resolved at import time by globbing the plugin dir
        self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")

    def test_hook_skips_when_custom_not_allowed(self):
        """Hook should skip when custom not in allowed binproviders."""
        env = os.environ.copy()
        env['DATA_DIR'] = self.temp_dir
        result = subprocess.run(
            [
                sys.executable, str(INSTALL_HOOK),
                '--name=echo',
                '--binary-id=test-uuid',
                '--machine-id=test-machine',
                '--binproviders=pip,apt',  # custom not allowed
                '--custom-cmd=echo hello',
            ],
            capture_output=True,
            text=True,
            timeout=30,
            env=env
        )
        # Should exit cleanly (code 0) when custom not allowed
        self.assertEqual(result.returncode, 0)
        self.assertIn('custom provider not allowed', result.stderr)

    def test_hook_runs_custom_command_and_finds_binary(self):
        """Hook should run custom command and find the binary in PATH."""
        env = os.environ.copy()
        env['DATA_DIR'] = self.temp_dir
        # Use a simple echo command that doesn't actually install anything
        # Then check for 'echo' which is already in PATH
        result = subprocess.run(
            [
                sys.executable, str(INSTALL_HOOK),
                '--name=echo',
                '--binary-id=test-uuid',
                '--machine-id=test-machine',
                '--custom-cmd=echo "custom install simulation"',
            ],
            capture_output=True,
            text=True,
            timeout=30,
            env=env
        )
        # Should succeed since echo is in PATH
        self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
        # Parse JSONL output: accept the first Binary record for 'echo'
        for line in result.stdout.split('\n'):
            line = line.strip()
            if line.startswith('{'):
                try:
                    record = json.loads(line)
                    if record.get('type') == 'Binary' and record.get('name') == 'echo':
                        self.assertEqual(record['binprovider'], 'custom')
                        self.assertTrue(record['abspath'])
                        return
                except json.JSONDecodeError:
                    continue
        self.fail("No Binary JSONL record found in output")

    def test_hook_fails_for_missing_binary_after_command(self):
        """Hook should fail if binary not found after running custom command."""
        env = os.environ.copy()
        env['DATA_DIR'] = self.temp_dir
        result = subprocess.run(
            [
                sys.executable, str(INSTALL_HOOK),
                '--name=nonexistent_binary_xyz123',
                '--binary-id=test-uuid',
                '--machine-id=test-machine',
                '--custom-cmd=echo "failed install"',  # Doesn't actually install
            ],
            capture_output=True,
            text=True,
            timeout=30,
            env=env
        )
        # Should fail since binary not found after command
        self.assertEqual(result.returncode, 1)
        self.assertIn('not found', result.stderr.lower())

    def test_hook_fails_for_failing_command(self):
        """Hook should fail if custom command returns non-zero exit code."""
        env = os.environ.copy()
        env['DATA_DIR'] = self.temp_dir
        result = subprocess.run(
            [
                sys.executable, str(INSTALL_HOOK),
                '--name=echo',
                '--binary-id=test-uuid',
                '--machine-id=test-machine',
                '--custom-cmd=exit 1',  # Command that fails
            ],
            capture_output=True,
            text=True,
            timeout=30,
            env=env
        )
        # Should fail with exit code 1
        self.assertEqual(result.returncode, 1)
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -1,21 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"DNS_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_DNS", "USE_DNS"],
"description": "Enable DNS traffic recording during page load"
},
"DNS_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for DNS recording in seconds"
}
}
}

View File

@@ -1,265 +0,0 @@
#!/usr/bin/env node
/**
* Record all DNS traffic (hostname -> IP resolutions) during page load.
*
* This hook sets up CDP listeners BEFORE chrome_navigate loads the page,
* then waits for navigation to complete. The listeners capture all DNS
* resolutions by extracting hostname/IP pairs from network responses.
*
* Usage: on_Snapshot__22_dns.js --url=<url> --snapshot-id=<uuid>
* Output: Writes dns.jsonl with one line per DNS resolution record
*/
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
// Import shared utilities from chrome_utils.js
const {
getEnvBool,
getEnvInt,
parseArgs,
connectToPage,
waitForPageLoaded,
} = require('../chrome/chrome_utils.js');
const PLUGIN_NAME = 'dns';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'dns.jsonl';
const CHROME_SESSION_DIR = '../chrome';
let browser = null;
let page = null;
let recordCount = 0;
let shuttingDown = false;
// Return the hostname portion of a URL string, or null when the string
// cannot be parsed as a WHATWG URL.
function extractHostname(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch (err) {
    return null;
  }
  return parsed.hostname;
}
// Attach CDP Network listeners that record hostname -> IP resolutions to
// dns.jsonl, deduplicated per (hostname, ip) pair. Also records NXDOMAIN
// entries for requests that failed with name-resolution errors.
// NOTE(review): the `targetUrl` parameter is unused in this body -- presumably
// kept for interface symmetry with other hooks; confirm before removing.
// Returns { browser, page, client } so the caller can disconnect on shutdown.
async function setupListener(targetUrl) {
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  // DNS_TIMEOUT is in seconds; connectToPage expects milliseconds
  const timeout = getEnvInt('DNS_TIMEOUT', 30) * 1000;
  // Initialize output file
  fs.writeFileSync(outputPath, '');
  // Track seen hostname -> IP mappings to avoid duplicates per request
  const seenResolutions = new Map();
  // Track request IDs to their URLs for correlation
  const requestUrls = new Map();
  // Connect to Chrome page using shared utility
  const { browser, page } = await connectToPage({
    chromeSessionDir: CHROME_SESSION_DIR,
    timeoutMs: timeout,
    puppeteer,
  });
  // Get CDP session for low-level network events
  const client = await page.target().createCDPSession();
  // Enable network domain to receive events
  await client.send('Network.enable');
  // Listen for request events to track URLs
  client.on('Network.requestWillBeSent', (params) => {
    requestUrls.set(params.requestId, params.request.url);
  });
  // Listen for response events which contain remoteIPAddress (the resolved IP)
  client.on('Network.responseReceived', (params) => {
    try {
      const response = params.response;
      const url = response.url;
      const remoteIPAddress = response.remoteIPAddress;
      const remotePort = response.remotePort;
      if (!url || !remoteIPAddress) {
        return;
      }
      const hostname = extractHostname(url);
      if (!hostname) {
        return;
      }
      // Skip if IP address is same as hostname (already an IP)
      if (hostname === remoteIPAddress) {
        return;
      }
      // Create a unique key for this resolution
      const resolutionKey = `${hostname}:${remoteIPAddress}`;
      // Skip if we've already recorded this resolution
      if (seenResolutions.has(resolutionKey)) {
        return;
      }
      seenResolutions.set(resolutionKey, true);
      // Determine record type (A for IPv4, AAAA for IPv6)
      const isIPv6 = remoteIPAddress.includes(':');
      const recordType = isIPv6 ? 'AAAA' : 'A';
      // Create DNS record
      const timestamp = new Date().toISOString();
      const dnsRecord = {
        ts: timestamp,
        hostname: hostname,
        ip: remoteIPAddress,
        port: remotePort || null,
        type: recordType,
        protocol: url.startsWith('https://') ? 'https' : 'http',
        url: url,
        requestId: params.requestId,
      };
      // Append to output file
      fs.appendFileSync(outputPath, JSON.stringify(dnsRecord) + '\n');
      recordCount += 1;
    } catch (e) {
      // Ignore errors
    }
  });
  // Listen for failed requests too - they still involve DNS
  client.on('Network.loadingFailed', (params) => {
    try {
      const requestId = params.requestId;
      const url = requestUrls.get(requestId);
      if (!url) {
        return;
      }
      const hostname = extractHostname(url);
      if (!hostname) {
        return;
      }
      // Check if this is a DNS-related failure
      const errorText = params.errorText || '';
      if (errorText.includes('net::ERR_NAME_NOT_RESOLVED') ||
          errorText.includes('net::ERR_NAME_RESOLUTION_FAILED')) {
        // Create a unique key for this failed resolution
        const resolutionKey = `${hostname}:NXDOMAIN`;
        // Skip if we've already recorded this NXDOMAIN
        if (seenResolutions.has(resolutionKey)) {
          return;
        }
        seenResolutions.set(resolutionKey, true);
        const timestamp = new Date().toISOString();
        const dnsRecord = {
          ts: timestamp,
          hostname: hostname,
          ip: null,
          port: null,
          type: 'NXDOMAIN',
          protocol: url.startsWith('https://') ? 'https' : 'http',
          url: url,
          requestId: requestId,
          error: errorText,
        };
        fs.appendFileSync(outputPath, JSON.stringify(dnsRecord) + '\n');
        recordCount += 1;
      }
    } catch (e) {
      // Ignore errors
    }
  });
  return { browser, page, client };
}
// Emit the final ArchiveResult JSONL record on stdout exactly once.
// Guarded by `shuttingDown` so overlapping signals can't double-emit.
function emitResult(status = 'succeeded') {
  if (shuttingDown) return;
  shuttingDown = true;
  console.log(JSON.stringify({
    type: 'ArchiveResult',
    status,
    output_str: `${OUTPUT_FILE} (${recordCount} DNS records)`,
  }));
}
// Signal handler: flush the final result, detach from Chrome (disconnect,
// not close -- the session is shared with other hooks), then exit 0.
async function handleShutdown(signal) {
  console.error(`\nReceived ${signal}, emitting final results...`);
  emitResult('succeeded');
  if (browser) {
    try {
      browser.disconnect();
    } catch (e) {}
  }
  process.exit(0);
}
// Entry point: validate args/config, attach the CDP listener before
// navigation, then block forever -- the orchestrator terminates this hook
// via SIGTERM, which triggers handleShutdown() to emit the result.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__22_dns.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  if (!getEnvBool('DNS_ENABLED', true)) {
    console.error('Skipping (DNS_ENABLED=False)');
    console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'DNS_ENABLED=False'}));
    process.exit(0);
  }
  try {
    // Set up listener BEFORE navigation
    const connection = await setupListener(url);
    browser = connection.browser;
    page = connection.page;
    // Register signal handlers for graceful shutdown
    process.on('SIGTERM', () => handleShutdown('SIGTERM'));
    process.on('SIGINT', () => handleShutdown('SIGINT'));
    // Wait for chrome_navigate to complete (non-fatal)
    try {
      const timeout = getEnvInt('DNS_TIMEOUT', 30) * 1000;
      // 4x grace: navigation may legitimately take longer than the capture timeout
      await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500);
    } catch (e) {
      console.error(`WARN: ${e.message}`);
    }
    // console.error('DNS listener active, waiting for cleanup signal...');
    await new Promise(() => {}); // Keep alive until SIGTERM
    return;
  } catch (e) {
    const error = `${e.name}: ${e.message}`;
    console.error(`ERROR: ${error}`);
    console.log(JSON.stringify({
      type: 'ArchiveResult',
      status: 'failed',
      output_str: error,
    }));
    process.exit(1);
  }
}
// Top-level launcher: any unhandled rejection is fatal for this hook.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -1 +0,0 @@
<span class="abx-output-icon abx-output-icon--dns" title="DNS"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="6" cy="12" r="2"/><circle cx="18" cy="6" r="2"/><circle cx="18" cy="18" r="2"/><path d="M8 12h6"/><path d="M16 8l-2 2"/><path d="M16 16l-2-2"/></svg></span>

View File

@@ -1,126 +0,0 @@
"""
Tests for the DNS plugin.
Tests the real DNS hook with an actual URL to verify
DNS resolution capture.
"""
import json
import shutil
import subprocess
import sys
import tempfile
import time
from pathlib import Path
from django.test import TestCase
# Import chrome test helpers
sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests'))
from chrome_test_helpers import (
chrome_session,
CHROME_NAVIGATE_HOOK,
get_plugin_dir,
get_hook_script,
)
# Get the path to the DNS hook
PLUGIN_DIR = get_plugin_dir(__file__)
DNS_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dns.*')
class TestDNSPlugin(TestCase):
    """Test the DNS plugin."""

    def test_dns_hook_exists(self):
        """DNS hook script should exist."""
        # DNS_HOOK is resolved at import time by globbing the plugin dir;
        # None means the hook file was renamed or removed.
        self.assertIsNotNone(DNS_HOOK, "DNS hook not found in plugin directory")
        self.assertTrue(DNS_HOOK.exists(), f"Hook not found: {DNS_HOOK}")
class TestDNSWithChrome(TestCase):
    """Integration tests for DNS plugin with Chrome."""

    def setUp(self):
        """Set up test environment."""
        self.temp_dir = Path(tempfile.mkdtemp())

    def tearDown(self):
        """Clean up."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_dns_records_captured(self):
        """DNS hook should capture DNS records from a real URL."""
        # NOTE(review): requires outbound network access to example.com
        test_url = 'https://example.com'
        snapshot_id = 'test-dns-snapshot'
        with chrome_session(
            self.temp_dir,
            crawl_id='test-dns-crawl',
            snapshot_id=snapshot_id,
            test_url=test_url,
            navigate=False,
            timeout=30,
        ) as (_process, _pid, snapshot_chrome_dir, env):
            dns_dir = snapshot_chrome_dir.parent / 'dns'
            dns_dir.mkdir(exist_ok=True)
            # Popen (not run): the hook blocks until SIGTERM, so it must
            # execute concurrently with the navigation started below.
            result = subprocess.Popen(
                ['node', str(DNS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                cwd=str(dns_dir),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                env=env
            )
            nav_result = subprocess.run(
                ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
                cwd=str(snapshot_chrome_dir),
                capture_output=True,
                text=True,
                timeout=120,
                env=env
            )
            self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}")
            dns_output = dns_dir / 'dns.jsonl'
            # Poll up to 30s for the hook to write at least one record
            for _ in range(30):
                if dns_output.exists() and dns_output.stat().st_size > 0:
                    break
                time.sleep(1)
            if result.poll() is None:
                result.terminate()
                try:
                    stdout, stderr = result.communicate(timeout=5)
                except subprocess.TimeoutExpired:
                    result.kill()
                    stdout, stderr = result.communicate()
            else:
                stdout, stderr = result.communicate()
            self.assertNotIn('Traceback', stderr)
            self.assertTrue(dns_output.exists(), "dns.jsonl not created")
            content = dns_output.read_text().strip()
            self.assertTrue(content, "DNS output should not be empty")
            records = []
            for line in content.split('\n'):
                line = line.strip()
                if not line:
                    continue
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError:
                    pass
            self.assertTrue(records, "No DNS records parsed")
            has_ip_record = any(r.get('hostname') and r.get('ip') for r in records)
            self.assertTrue(has_ip_record, f"No DNS record with hostname + ip: {records}")
if __name__ == '__main__':
    # Bugfix: pytest was referenced here but never imported at module top,
    # so running this file directly raised NameError. Import lazily since
    # pytest is only needed for direct invocation.
    import pytest
    pytest.main([__file__, '-v'])

View File

@@ -1,21 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"DOM_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_DOM", "USE_DOM"],
"description": "Enable DOM capture"
},
"DOM_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for DOM capture in seconds"
}
}
}

View File

@@ -1,184 +0,0 @@
#!/usr/bin/env node
/**
* Dump the DOM of a URL using Chrome/Puppeteer.
*
* Requires a Chrome session (from chrome plugin) and connects to it via CDP.
*
* Usage: on_Snapshot__53_dom.js --url=<url> --snapshot-id=<uuid>
* Output: Writes dom/output.html
*
* Environment variables:
* DOM_ENABLED: Enable DOM extraction (default: true)
*/
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const {
getEnvBool,
parseArgs,
readCdpUrl,
} = require('../chrome/chrome_utils.js');
// Check if DOM is enabled BEFORE requiring puppeteer
if (!getEnvBool('DOM_ENABLED', true)) {
console.error('Skipping DOM (DOM_ENABLED=False)');
// Temporary failure (config disabled) - NO JSONL emission
process.exit(0);
}
// Now safe to require puppeteer
const puppeteer = require('puppeteer-core');
// Extractor metadata
const PLUGIN_NAME = 'dom';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'output.html';
const CHROME_SESSION_DIR = '../chrome';
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = '../staticfile';
// Return true when the staticfile extractor already succeeded for this
// snapshot, detected by scanning its stdout.log for a JSONL ArchiveResult
// with status 'succeeded'. Used to skip DOM capture for direct file URLs.
function hasStaticFileOutput() {
  if (!fs.existsSync(STATICFILE_DIR)) return false;
  const stdoutPath = path.join(STATICFILE_DIR, 'stdout.log');
  if (!fs.existsSync(stdoutPath)) return false;
  const stdout = fs.readFileSync(stdoutPath, 'utf8');
  for (const line of stdout.split('\n')) {
    const trimmed = line.trim();
    // Only JSON object lines can be ArchiveResult records
    if (!trimmed.startsWith('{')) continue;
    try {
      const record = JSON.parse(trimmed);
      if (record.type === 'ArchiveResult' && record.status === 'succeeded') {
        return true;
      }
    } catch (e) {}
  }
  return false;
}
// Wait for chrome tab to be fully loaded
async function waitForChromeTabLoaded(timeoutMs = 60000) {
const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
const startTime = Date.now();
while (Date.now() - startTime < timeoutMs) {
if (fs.existsSync(navigationFile)) {
return true;
}
// Wait 100ms before checking again
await new Promise(resolve => setTimeout(resolve, 100));
}
return false;
}
// Capture the current DOM of the already-navigated Chrome tab and write it
// to output.html. Connects to the existing CDP session (never launches its
// own Chrome). Returns { success, output } or { success: false, error }.
// NOTE(review): the `url` parameter is unused here -- the page is selected
// from the existing session, not navigated; confirm before removing.
async function dumpDom(url) {
  // Output directory is current directory (hook already runs in output dir)
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  let browser = null;
  let page = null;
  try {
    // Connect to existing Chrome session (required)
    const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome plugin must run first)' };
    }
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
      defaultViewport: null,
    });
    // Get existing pages or create new one; prefer the tab that holds a
    // real http(s) URL over chrome:// internals
    const pages = await browser.pages();
    page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      page = await browser.newPage();
    }
    // Get the full DOM content
    const domContent = await page.content();
    // <100 bytes is treated as a failed/blank capture
    if (domContent && domContent.length > 100) {
      fs.writeFileSync(outputPath, domContent, 'utf8');
      return { success: true, output: outputPath };
    } else {
      return { success: false, error: 'DOM content too short or empty' };
    }
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // disconnect (not close): the Chrome session is shared with other hooks
    if (browser) {
      browser.disconnect();
    }
  }
}
// Entry point: skip permanently if staticfile already handled the URL,
// otherwise wait for navigation to finish and dump the DOM.
// Transient failures exit 1 WITHOUT emitting JSONL (so the orchestrator
// can retry); permanent outcomes emit an ArchiveResult record.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__53_dom.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  try {
    // Check if staticfile extractor already handled this (permanent skip)
    if (hasStaticFileOutput()) {
      console.error(`Skipping DOM - staticfile extractor already downloaded this`);
      // Permanent skip - emit ArchiveResult with status='skipped'
      console.log(JSON.stringify({
        type: 'ArchiveResult',
        status: 'skipped',
        output_str: 'staticfile already handled',
      }));
      process.exit(0);
    }
    const cdpUrl = readCdpUrl(CHROME_SESSION_DIR);
    if (!cdpUrl) {
      throw new Error('No Chrome session found (chrome plugin must run first)');
    }
    // Wait for page to be fully loaded
    const pageLoaded = await waitForChromeTabLoaded(60000);
    if (!pageLoaded) {
      throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
    }
    const result = await dumpDom(url);
    if (result.success) {
      // Success - emit ArchiveResult
      const size = fs.statSync(result.output).size;
      console.error(`DOM saved (${size} bytes)`);
      console.log(JSON.stringify({
        type: 'ArchiveResult',
        status: 'succeeded',
        output_str: result.output,
      }));
      process.exit(0);
    } else {
      // Transient error - emit NO JSONL
      console.error(`ERROR: ${result.error}`);
      process.exit(1);
    }
  } catch (e) {
    // Transient error - emit NO JSONL
    console.error(`ERROR: ${e.name}: ${e.message}`);
    process.exit(1);
  }
}
// Top-level launcher: any unhandled rejection is fatal for this hook.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -1,8 +0,0 @@
<!-- DOM thumbnail - scaled down iframe preview of captured DOM HTML -->
<div class="extractor-thumbnail dom-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
<iframe src="{{ output_path }}"
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
loading="lazy"
sandbox="allow-same-origin">
</iframe>
</div>

View File

@@ -1 +0,0 @@
<span class="abx-output-icon abx-output-icon--dom" title="DOM"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M8 9l-3 3 3 3"/><path d="M16 9l3 3-3 3"/><path d="M10 20l4-16"/></svg></span>

View File

@@ -1,185 +0,0 @@
"""
Integration tests for dom plugin
Tests verify:
1. Hook script exists
2. Dependencies installed via chrome validation hooks
3. Verify deps with abx-pkg
4. DOM extraction works on https://example.com
5. JSONL output is correct
6. Filesystem output contains actual page content
7. Config options work
"""
import json
import os
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
get_plugin_dir,
get_hook_script,
run_hook_and_parse,
LIB_DIR,
NODE_MODULES_DIR,
PLUGINS_ROOT,
chrome_session,
)
PLUGIN_DIR = get_plugin_dir(__file__)
DOM_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dom.*')
NPM_PROVIDER_HOOK = get_hook_script(PLUGINS_ROOT / 'npm', 'on_Binary__install_using_npm_provider.py')
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """Verify on_Snapshot hook exists."""
    # DOM_HOOK is resolved at import time by globbing the plugin directory
    assert DOM_HOOK.exists(), f"Hook not found: {DOM_HOOK}"
def test_verify_deps_with_abx_pkg():
    """Verify dependencies are available via abx-pkg after hook installation."""
    from abx_pkg import Binary, EnvProvider, BinProviderOverrides
    # Pydantic model with forward refs must be rebuilt before instantiation
    EnvProvider.model_rebuild()
    # Verify node is available
    node_binary = Binary(name='node', binproviders=[EnvProvider()])
    node_loaded = node_binary.load()
    assert node_loaded and node_loaded.abspath, "Node.js required for dom plugin"
def test_extracts_dom_from_example_com():
"""Test full workflow: extract DOM from real example.com via hook."""
# Prerequisites checked by earlier test
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
with chrome_session(tmpdir, test_url=TEST_URL) as (_process, _pid, snapshot_chrome_dir, env):
dom_dir = snapshot_chrome_dir.parent / 'dom'
dom_dir.mkdir(exist_ok=True)
# Run DOM extraction hook
result = subprocess.run(
['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
cwd=dom_dir,
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
# Parse clean JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Verify filesystem output (hook writes directly to working dir)
dom_file = dom_dir / 'output.html'
assert dom_file.exists(), f"output.html not created. Files: {list(tmpdir.iterdir())}"
# Verify HTML content contains REAL example.com text
html_content = dom_file.read_text(errors='ignore')
assert len(html_content) > 200, f"HTML content too short: {len(html_content)} bytes"
assert '<html' in html_content.lower(), "Missing <html> tag"
assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML"
assert ('this domain' in html_content.lower() or
'illustrative examples' in html_content.lower()), \
"Missing example.com description text"
def test_config_save_dom_false_skips():
"""Test that DOM_ENABLED=False exits without emitting JSONL."""
import os
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = os.environ.copy()
env['DOM_ENABLED'] = 'False'
result = subprocess.run(
['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=30
)
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
# Feature disabled - temporary failure, should NOT emit JSONL
assert 'Skipping DOM' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
# Should NOT emit any JSONL
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
def test_staticfile_present_skips():
"""Test that dom skips when staticfile already downloaded."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Create directory structure like real ArchiveBox:
# tmpdir/
# staticfile/ <- staticfile extractor output
# dom/ <- dom extractor runs here, looks for ../staticfile
staticfile_dir = tmpdir / 'staticfile'
staticfile_dir.mkdir()
(staticfile_dir / 'stdout.log').write_text('{"type":"ArchiveResult","status":"succeeded","output_str":"index.html"}\n')
dom_dir = tmpdir / 'dom'
dom_dir.mkdir()
result = subprocess.run(
['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=teststatic'],
cwd=dom_dir, # Run from dom subdirectory
capture_output=True,
text=True,
timeout=30
,
env=get_test_env())
assert result.returncode == 0, "Should exit 0 when permanently skipping"
# Permanent skip - should emit ArchiveResult with status='skipped'
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, "Should emit ArchiveResult JSONL for permanent skip"
assert result_json['status'] == 'skipped', f"Should have status='skipped': {result_json}"
assert 'staticfile' in result_json.get('output_str', '').lower(), "Should mention staticfile in output_str"
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -1,72 +0,0 @@
#!/usr/bin/env python3
"""
Check if a binary is already available in the system PATH.
This is the simplest "provider" - it doesn't install anything,
it just discovers binaries that are already installed.
Usage: on_Binary__install_using_env_provider.py --binary-id=<uuid> --machine-id=<uuid> --name=<name>
Output: Binary JSONL record to stdout if binary found in PATH
Environment variables:
    MACHINE_ID: Machine UUID fallback (set by orchestrator), used only when
                the --machine-id CLI argument is empty
"""
import json
import os
import sys
import rich_click as click
from abx_pkg import Binary, EnvProvider


@click.command()
@click.option('--machine-id', required=True, help="Machine UUID")
@click.option('--binary-id', required=True, help="Dependency UUID")
@click.option('--name', required=True, help="Binary name to find")
@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)")
@click.option('--overrides', default=None, help="JSON-encoded overrides dict (unused)")
def main(binary_id: str, machine_id: str, name: str, binproviders: str, overrides: str | None):
    """Check if binary is available in PATH and record it."""
    # Check if env provider is allowed; skipping is not an error (exit 0).
    if binproviders != '*' and 'env' not in binproviders.split(','):
        click.echo(f"env provider not allowed for {name}", err=True)
        sys.exit(0)  # Not an error, just skip

    # Use abx-pkg EnvProvider to find the binary on $PATH
    provider = EnvProvider()
    try:
        binary = Binary(name=name, binproviders=[provider]).load()
    except Exception as e:
        click.echo(f"{name} not found in PATH: {e}", err=True)
        sys.exit(1)

    if not binary.abspath:
        click.echo(f"{name} not found in PATH", err=True)
        sys.exit(1)

    # BUGFIX: the required --machine-id CLI argument was previously
    # overwritten unconditionally with os.environ['MACHINE_ID'] (often
    # empty), producing records with a blank machine_id. Prefer the CLI
    # value and fall back to the environment only when it is missing.
    machine_id = machine_id or os.environ.get('MACHINE_ID', '')

    # Output Binary JSONL record to stdout
    record = {
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        'machine_id': machine_id,
        'binary_id': binary_id,
    }
    print(json.dumps(record))

    # Log human-readable info to stderr
    click.echo(f"Found {name} at {binary.abspath}", err=True)
    click.echo(f"  version: {binary.version}", err=True)
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -1,159 +0,0 @@
"""
Tests for the env binary provider plugin.
Tests the real env provider hook with actual system binaries.
"""
import json
import os
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
from django.test import TestCase
# Get the path to the env provider hook
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_env_install.py'), None)
class TestEnvProviderHook(TestCase):
"""Test the env binary provider hook."""
def setUp(self):
"""Set up test environment."""
self.temp_dir = tempfile.mkdtemp()
def tearDown(self):
"""Clean up."""
import shutil
shutil.rmtree(self.temp_dir, ignore_errors=True)
def test_hook_script_exists(self):
"""Hook script should exist."""
self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}")
def test_hook_finds_python(self):
"""Hook should find python3 binary in PATH."""
env = os.environ.copy()
env['DATA_DIR'] = self.temp_dir
result = subprocess.run(
[
sys.executable, str(INSTALL_HOOK),
'--name=python3',
'--binary-id=test-uuid',
'--machine-id=test-machine',
],
capture_output=True,
text=True,
timeout=30,
env=env
)
# Should succeed and output JSONL
self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
# Parse JSONL output
for line in result.stdout.split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'Binary' and record.get('name') == 'python3':
self.assertEqual(record['binprovider'], 'env')
self.assertTrue(record['abspath'])
self.assertTrue(Path(record['abspath']).exists())
return
except json.JSONDecodeError:
continue
self.fail("No Binary JSONL record found in output")
def test_hook_finds_bash(self):
"""Hook should find bash binary in PATH."""
env = os.environ.copy()
env['DATA_DIR'] = self.temp_dir
result = subprocess.run(
[
sys.executable, str(INSTALL_HOOK),
'--name=bash',
'--binary-id=test-uuid',
'--machine-id=test-machine',
],
capture_output=True,
text=True,
timeout=30,
env=env
)
# Should succeed and output JSONL
self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
# Parse JSONL output
for line in result.stdout.split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'Binary' and record.get('name') == 'bash':
self.assertEqual(record['binprovider'], 'env')
self.assertTrue(record['abspath'])
return
except json.JSONDecodeError:
continue
self.fail("No Binary JSONL record found in output")
def test_hook_fails_for_missing_binary(self):
"""Hook should fail for binary not in PATH."""
env = os.environ.copy()
env['DATA_DIR'] = self.temp_dir
result = subprocess.run(
[
sys.executable, str(INSTALL_HOOK),
'--name=nonexistent_binary_xyz123',
'--binary-id=test-uuid',
'--machine-id=test-machine',
],
capture_output=True,
text=True,
timeout=30,
env=env
)
# Should fail with exit code 1
self.assertEqual(result.returncode, 1)
self.assertIn('not found', result.stderr.lower())
def test_hook_skips_when_env_not_allowed(self):
"""Hook should skip when env not in allowed binproviders."""
env = os.environ.copy()
env['DATA_DIR'] = self.temp_dir
result = subprocess.run(
[
sys.executable, str(INSTALL_HOOK),
'--name=python3',
'--binary-id=test-uuid',
'--machine-id=test-machine',
'--binproviders=pip,apt', # env not allowed
],
capture_output=True,
text=True,
timeout=30,
env=env
)
# Should exit cleanly (code 0) when env not allowed
self.assertEqual(result.returncode, 0)
self.assertIn('env provider not allowed', result.stderr)
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -1,26 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"FAVICON_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_FAVICON", "USE_FAVICON"],
"description": "Enable favicon downloading"
},
"FAVICON_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for favicon fetch in seconds"
},
"FAVICON_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string"
}
}
}

View File

@@ -1,153 +0,0 @@
#!/usr/bin/env python3
"""
Extract favicon from a URL.
Usage: on_Snapshot__favicon.bg.py --url=<url> --snapshot-id=<uuid>
Output: Writes favicon.ico to $PWD
Environment variables:
FAVICON_TIMEOUT: Timeout in seconds (default: 30)
USER_AGENT: User agent string
# Fallback to ARCHIVING_CONFIG values if FAVICON_* not set:
TIMEOUT: Fallback timeout
Note: This extractor uses the 'requests' library which is bundled with ArchiveBox.
It can run standalone if requests is installed: pip install requests
"""
import json
import os
import re
import sys
from pathlib import Path
from urllib.parse import urljoin, urlparse
import rich_click as click
# Extractor metadata
PLUGIN_NAME = 'favicon'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'favicon.ico'
def get_env(name: str, default: str = '') -> str:
    """Return the value of env var *name*, whitespace-stripped, or *default*."""
    return os.environ.get(name, default).strip()


def get_env_int(name: str, default: int = 0) -> int:
    """Return env var *name* parsed as an int, or *default* on a bad value."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
def get_favicon(url: str) -> tuple[bool, str | None, str]:
    """
    Fetch the favicon for *url* and write it to OUTPUT_FILE in $PWD.

    Tries, in order: icon URLs scraped from the page's <link rel="icon"> tags,
    the conventional /favicon.ico, /favicon.png and /apple-touch-icon.png
    locations, and finally Google's s2 favicon service as a fallback.

    Returns: (success, output_path, error_message)
    """
    try:
        import requests
    except ImportError:
        return False, None, 'requests library not installed'

    # Config: FAVICON_* values take precedence, generic values are fallbacks
    # (mirrors the x-fallback declarations in this plugin's config schema).
    timeout = get_env_int('FAVICON_TIMEOUT') or get_env_int('TIMEOUT', 30)
    # FIX: FAVICON_USER_AGENT is declared in config.json (x-fallback:
    # USER_AGENT) but was never read; honor it like FAVICON_TIMEOUT above.
    user_agent = (
        get_env('FAVICON_USER_AGENT')
        or get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    )
    headers = {'User-Agent': user_agent}

    # Build list of possible favicon URLs (conventional locations)
    parsed = urlparse(url)
    base_url = f"{parsed.scheme}://{parsed.netloc}"
    favicon_urls = [
        urljoin(base_url, '/favicon.ico'),
        urljoin(base_url, '/favicon.png'),
        urljoin(base_url, '/apple-touch-icon.png'),
    ]

    # Try to extract favicon URLs from HTML link tags; these are prepended so
    # declared icons are tried before the conventional paths.
    try:
        response = requests.get(url, timeout=timeout, headers=headers)
        if response.ok:
            # Look for <link rel="icon" href="...">
            for match in re.finditer(
                r'<link[^>]+rel=["\'](?:shortcut )?icon["\'][^>]+href=["\']([^"\']+)["\']',
                response.text,
                re.I
            ):
                favicon_urls.insert(0, urljoin(url, match.group(1)))
            # Also check reverse attribute order: href before rel
            for match in re.finditer(
                r'<link[^>]+href=["\']([^"\']+)["\'][^>]+rel=["\'](?:shortcut )?icon["\']',
                response.text,
                re.I
            ):
                favicon_urls.insert(0, urljoin(url, match.group(1)))
    except Exception:
        pass  # Continue with default favicon URLs

    # Try each URL until we find one that works
    for favicon_url in favicon_urls:
        try:
            response = requests.get(favicon_url, timeout=15, headers=headers)
            if response.ok and len(response.content) > 0:
                Path(OUTPUT_FILE).write_bytes(response.content)
                return True, OUTPUT_FILE, ''
        except Exception:
            continue

    # Try Google's favicon service as fallback (generates a generic icon)
    try:
        google_url = f'https://www.google.com/s2/favicons?domain={parsed.netloc}'
        response = requests.get(google_url, timeout=15, headers=headers)
        if response.ok and len(response.content) > 0:
            Path(OUTPUT_FILE).write_bytes(response.content)
            return True, OUTPUT_FILE, ''
    except Exception:
        pass

    return False, None, 'No favicon found'
@click.command()
@click.option('--url', required=True, help='URL to extract favicon from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Extract favicon from a URL."""
    output: str | None = None
    error = ''
    try:
        success, output, error = get_favicon(url)
    except Exception as e:
        success = False
        error = f'{type(e).__name__}: {e}'
    status = 'succeeded' if success else 'failed'

    # Human-readable error goes to stderr; stdout stays clean JSONL.
    if error:
        print(f'ERROR: {error}', file=sys.stderr)

    # Output clean JSONL (no RESULT_JSON= prefix)
    print(json.dumps({
        'type': 'ArchiveResult',
        'status': status,
        'output_str': output or error or '',
    }))
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()

View File

@@ -1,9 +0,0 @@
<!-- Favicon thumbnail - small favicon preview -->
<div class="extractor-thumbnail favicon-thumbnail" style="width: 100%; height: 100px; display: flex; align-items: center; justify-content: center; background: #fff;">
{% if output_path %}
<img src="{{ output_path }}"
alt="Favicon"
style="width: 30px; height: 30px; max-width: 30px; max-height: 30px; object-fit: contain;"
loading="lazy">
{% endif %}
</div>

View File

@@ -1 +0,0 @@
<span class="abx-output-icon abx-output-icon--favicon" title="Favicon"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M12 3l2.5 5.5 6 .5-4.5 3.8 1.5 5.7L12 15.5 6.5 18.5 8 12.8 3.5 9l6-.5z"/></svg></span>

View File

@@ -1,293 +0,0 @@
"""
Integration tests for favicon plugin
Tests verify:
1. Plugin script exists
2. requests library is available
3. Favicon extraction works for real example.com
4. Output file is actual image data
5. Tries multiple favicon URLs
6. Falls back to Google's favicon service
7. Config options work (TIMEOUT, USER_AGENT)
8. Handles failures gracefully
"""
import json
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_plugin_dir,
get_hook_script,
parse_jsonl_output,
)
PLUGIN_DIR = get_plugin_dir(__file__)
FAVICON_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_favicon.*')
TEST_URL = 'https://example.com'
def test_hook_script_exists():
"""Verify hook script exists."""
assert FAVICON_HOOK.exists(), f"Hook script not found: {FAVICON_HOOK}"
def test_requests_library_available():
"""Test that requests library is available."""
result = subprocess.run(
[sys.executable, '-c', 'import requests; print(requests.__version__)'],
capture_output=True,
text=True
)
if result.returncode != 0:
pass
assert len(result.stdout.strip()) > 0, "Should report requests version"
def test_extracts_favicon_from_example_com():
"""Test full workflow: extract favicon from real example.com.
Note: example.com doesn't have a favicon and Google's service may also fail,
so we test that the extraction completes and reports appropriate status.
"""
# Check requests is available
check_result = subprocess.run(
[sys.executable, '-c', 'import requests'],
capture_output=True
)
if check_result.returncode != 0:
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Run favicon extraction
result = subprocess.run(
[sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=60
)
# May succeed (if Google service works) or fail (if no favicon)
assert result.returncode in (0, 1), "Should complete extraction attempt"
# Parse clean JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, "Should have ArchiveResult JSONL output"
# If it succeeded, verify the favicon file
if result_json['status'] == 'succeeded':
favicon_file = tmpdir / 'favicon.ico'
assert favicon_file.exists(), "favicon.ico not created"
# Verify file is not empty and contains actual image data
file_size = favicon_file.stat().st_size
assert file_size > 0, "Favicon file should not be empty"
assert file_size < 1024 * 1024, f"Favicon file suspiciously large: {file_size} bytes"
# Check for common image magic bytes
favicon_data = favicon_file.read_bytes()
# ICO, PNG, GIF, JPEG, or WebP
is_image = (
favicon_data[:4] == b'\x00\x00\x01\x00' or # ICO
favicon_data[:8] == b'\x89PNG\r\n\x1a\n' or # PNG
favicon_data[:3] == b'GIF' or # GIF
favicon_data[:2] == b'\xff\xd8' or # JPEG
favicon_data[8:12] == b'WEBP' # WebP
)
assert is_image, "Favicon file should be a valid image format"
else:
# Failed as expected
assert result_json['status'] == 'failed', f"Should report failure: {result_json}"
def test_config_timeout_honored():
"""Test that TIMEOUT config is respected."""
check_result = subprocess.run(
[sys.executable, '-c', 'import requests'],
capture_output=True
)
if check_result.returncode != 0:
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set very short timeout (but example.com should still succeed)
import os
env = os.environ.copy()
env['TIMEOUT'] = '5'
result = subprocess.run(
[sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=30
)
# Should complete (success or fail, but not hang)
assert result.returncode in (0, 1), "Should complete without hanging"
def test_config_user_agent():
"""Test that USER_AGENT config is used."""
check_result = subprocess.run(
[sys.executable, '-c', 'import requests'],
capture_output=True
)
if check_result.returncode != 0:
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set custom user agent
import os
env = os.environ.copy()
env['USER_AGENT'] = 'TestBot/1.0'
result = subprocess.run(
[sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env,
timeout=60
)
# Should succeed (example.com doesn't block)
if result.returncode == 0:
# Parse clean JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
if result_json:
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
def test_handles_https_urls():
"""Test that HTTPS URLs work correctly."""
check_result = subprocess.run(
[sys.executable, '-c', 'import requests'],
capture_output=True
)
if check_result.returncode != 0:
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
result = subprocess.run(
[sys.executable, str(FAVICON_HOOK), '--url', 'https://example.org', '--snapshot-id', 'testhttps'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=60
)
if result.returncode == 0:
favicon_file = tmpdir / 'favicon.ico'
if favicon_file.exists():
assert favicon_file.stat().st_size > 0
def test_handles_missing_favicon_gracefully():
"""Test that favicon plugin handles sites without favicons gracefully.
Note: The plugin falls back to Google's favicon service, which generates
a generic icon even if the site doesn't have one, so extraction usually succeeds.
"""
check_result = subprocess.run(
[sys.executable, '-c', 'import requests'],
capture_output=True
)
if check_result.returncode != 0:
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Try a URL that likely doesn't have a favicon
result = subprocess.run(
[sys.executable, str(FAVICON_HOOK), '--url', 'https://example.com/nonexistent', '--snapshot-id', 'test404'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=60
)
# May succeed (Google fallback) or fail gracefully
assert result.returncode in (0, 1), "Should complete (may succeed or fail)"
if result.returncode != 0:
combined = result.stdout + result.stderr
assert 'No favicon found' in combined or 'ERROR=' in combined
def test_reports_missing_requests_library():
"""Test that script reports error when requests library is missing."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Run with PYTHONPATH cleared to simulate missing requests
import os
env = os.environ.copy()
# Keep only minimal PATH, clear PYTHONPATH
env['PYTHONPATH'] = '/nonexistent'
result = subprocess.run(
[sys.executable, '-S', str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
cwd=tmpdir,
capture_output=True,
text=True,
env=env
)
# Should fail and report missing requests
if result.returncode != 0:
combined = result.stdout + result.stderr
# May report missing requests or other import errors
assert 'requests' in combined.lower() or 'import' in combined.lower() or 'ERROR=' in combined
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -1,51 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"FORUMDL_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_FORUMDL", "USE_FORUMDL"],
"description": "Enable forum downloading with forum-dl"
},
"FORUMDL_BINARY": {
"type": "string",
"default": "forum-dl",
"description": "Path to forum-dl binary"
},
"FORUMDL_TIMEOUT": {
"type": "integer",
"default": 3600,
"minimum": 30,
"x-fallback": "TIMEOUT",
"description": "Timeout for forum downloads in seconds"
},
"FORUMDL_OUTPUT_FORMAT": {
"type": "string",
"default": "jsonl",
"enum": ["jsonl", "warc", "mbox", "maildir", "mh", "mmdf", "babyl"],
"description": "Output format for forum downloads"
},
"FORUMDL_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"FORUMDL_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["FORUMDL_DEFAULT_ARGS"],
"description": "Default forum-dl arguments"
},
"FORUMDL_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["FORUMDL_EXTRA_ARGS"],
"description": "Extra arguments to append to forum-dl command"
}
}
}

View File

@@ -1,31 +0,0 @@
#!/usr/bin/env python3
"""
Wrapper for forum-dl that applies Pydantic v2 compatibility patches.

forum-dl 0.3.0 serializes entries via the Pydantic v1 API
(json(models_as_dict=False)), which breaks under Pydantic v2. When a v2
install is detected, JsonlWriter is monkey-patched to use model_dump_json()
before forum-dl's entrypoint is imported.
"""
import sys

# Apply Pydantic v2 compatibility patch BEFORE importing forum_dl
try:
    from forum_dl.writers.jsonl import JsonlWriter
    from pydantic import BaseModel

    if hasattr(BaseModel, 'model_dump_json'):  # Pydantic v2 detected
        def _serialize_with_v2_api(self, entry):
            """Use Pydantic v2's model_dump_json() instead of deprecated json(models_as_dict=False)"""
            return entry.model_dump_json()

        JsonlWriter._serialize_entry = _serialize_with_v2_api
except (ImportError, AttributeError):
    # forum-dl not installed or already compatible - no patch needed
    pass

# Now import and run forum-dl's main function
from forum_dl import main

if __name__ == '__main__':
    sys.exit(main())

View File

@@ -1,81 +0,0 @@
#!/usr/bin/env python3
"""
Emit forum-dl Binary dependency for the crawl.
"""
import json
import os
import sys
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary(name: str, binproviders: str, overrides: dict | None = None):
"""Output Binary JSONL record for a dependency."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders,
'machine_id': machine_id,
}
if overrides:
record['overrides'] = overrides
print(json.dumps(record))
def main():
forumdl_enabled = get_env_bool('FORUMDL_ENABLED', True)
if not forumdl_enabled:
sys.exit(0)
output_binary(
name='forum-dl',
binproviders='pip,env',
overrides={
'pip': {
'packages': [
'--no-deps',
'--prefer-binary',
'forum-dl',
'chardet==5.2.0',
'pydantic',
'pydantic-core',
'typing-extensions',
'annotated-types',
'typing-inspection',
'beautifulsoup4',
'soupsieve',
'lxml',
'requests',
'urllib3',
'certifi',
'idna',
'charset-normalizer',
'tenacity',
'python-dateutil',
'six',
'html2text',
'warcio',
]
}
},
)
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -1,266 +0,0 @@
#!/usr/bin/env python3
"""
Download forum content from a URL using forum-dl.
Usage: on_Snapshot__04_forumdl.bg.py --url=<url> --snapshot-id=<uuid>
Output: Downloads forum content to $PWD/
Environment variables:
FORUMDL_ENABLED: Enable forum downloading (default: True)
FORUMDL_BINARY: Path to forum-dl binary (default: forum-dl)
FORUMDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
FORUMDL_OUTPUT_FORMAT: Output format (default: jsonl)
FORUMDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
FORUMDL_ARGS: Default forum-dl arguments (JSON array)
FORUMDL_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
import json
import os
import shutil
import subprocess
import sys
import threading
from pathlib import Path
import rich_click as click
# Monkey patch forum-dl for Pydantic v2 compatibility:
# forum-dl 0.3.0 uses deprecated json(models_as_dict=False) which doesn't work in Pydantic v2
try:
    from forum_dl.writers.jsonl import JsonlWriter
    from pydantic import BaseModel
    # Check if we're using Pydantic v2 (has model_dump_json)
    if hasattr(BaseModel, 'model_dump_json'):
        # Patch JsonlWriter to use Pydantic v2 API
        # NOTE(review): the original method is saved here but never restored or
        # called again -- presumably kept for debugging; confirm before removal.
        original_serialize = JsonlWriter._serialize_entry
        def _patched_serialize_entry(self, entry):
            # Use Pydantic v2's model_dump_json() instead of deprecated json(models_as_dict=False)
            return entry.model_dump_json()
        JsonlWriter._serialize_entry = _patched_serialize_entry
except (ImportError, AttributeError):
    # forum-dl not installed or already compatible
    pass

# Extractor metadata
PLUGIN_NAME = 'forumdl'
BIN_NAME = 'forum-dl'
BIN_PROVIDERS = 'pip,env'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def get_env_int(name: str, default: int = 0) -> int:
try:
return int(get_env(name, str(default)))
except ValueError:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
"""Parse a JSON array from environment variable."""
val = get_env(name, '')
if not val:
return default if default is not None else []
try:
result = json.loads(val)
if isinstance(result, list):
return [str(item) for item in result]
return default if default is not None else []
except json.JSONDecodeError:
return default if default is not None else []
def get_binary_shebang(binary_path: str) -> str | None:
"""Return interpreter from shebang line if present (e.g., /path/to/python)."""
try:
with open(binary_path, 'r', encoding='utf-8') as f:
first_line = f.readline().strip()
if first_line.startswith('#!'):
return first_line[2:].strip().split(' ')[0]
except Exception:
pass
return None
def resolve_binary_path(binary: str) -> str | None:
"""Resolve binary to an absolute path if possible."""
if not binary:
return None
if Path(binary).is_file():
return binary
return shutil.which(binary)
def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Download forum content from *url* using forum-dl.

    "Not a forum" outcomes (unsupported URL, no content, no extractor, clean
    exit without output) are treated as success with no output path.
    Returns: (success, output_path, error_message)
    """
    # Resolve config: FORUMDL_* first, generic fallbacks second
    # (x-fallback handling mirrors this plugin's config schema).
    timeout = get_env_int('FORUMDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
    if get_env('FORUMDL_CHECK_SSL_VALIDITY'):
        check_ssl = get_env_bool('FORUMDL_CHECK_SSL_VALIDITY', True)
    else:
        check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
    base_args = get_env_array('FORUMDL_ARGS', [])
    extra_args = get_env_array('FORUMDL_ARGS_EXTRA', [])
    output_format = get_env('FORUMDL_OUTPUT_FORMAT', 'jsonl')

    # Hook already runs inside the snapshot output directory.
    output_dir = Path(OUTPUT_DIR)
    special_names = {
        'warc': 'forum.warc.gz',
        'jsonl': 'forum.jsonl',
        'maildir': 'forum',  # maildir output is a directory
    }
    output_file = output_dir / special_names.get(output_format, f'forum.{output_format}')

    # Prefer the bundled Pydantic-v2-compatible wrapper when present.
    wrapper_path = Path(__file__).parent / 'forum-dl-wrapper.py'
    resolved_binary = resolve_binary_path(binary) or binary
    if wrapper_path.exists():
        interpreter = get_binary_shebang(resolved_binary) or sys.executable
        cmd = [interpreter, str(wrapper_path), *base_args, '-f', output_format, '-o', str(output_file)]
    else:
        cmd = [resolved_binary, *base_args, '-f', output_format, '-o', str(output_file)]
    if not check_ssl:
        cmd.append('--no-check-certificate')
    if extra_args:
        cmd.extend(extra_args)
    cmd.append(url)

    try:
        print(f'[forumdl] Starting download (timeout={timeout}s)', file=sys.stderr)
        captured: list[str] = []
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
        )

        def _tee_output() -> None:
            # Mirror child output to our stderr while keeping a copy.
            if not process.stdout:
                return
            for line in process.stdout:
                captured.append(line)
                sys.stderr.write(line)

        reader = threading.Thread(target=_tee_output, daemon=True)
        reader.start()
        try:
            process.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            process.kill()
            reader.join(timeout=1)
            return False, None, f'Timed out after {timeout} seconds'
        reader.join(timeout=1)
        combined_output = ''.join(captured)

        # A non-empty output file is unambiguous success.
        if output_file.exists() and output_file.stat().st_size > 0:
            return True, str(output_file), ''

        stderr = combined_output
        stderr_lower = stderr.lower()
        # These are NOT errors - page simply has no downloadable forum content
        if 'unsupported url' in stderr_lower:
            return True, None, ''  # Not a forum site - success, no output
        if 'no content' in stderr_lower:
            return True, None, ''  # No forum found - success, no output
        if 'extractornotfounderror' in stderr_lower:
            return True, None, ''  # No forum extractor for this URL - success, no output
        if process.returncode == 0:
            return True, None, ''  # forum-dl exited cleanly, just no forum - success
        # These ARE errors - something went wrong
        if '404' in stderr:
            return False, None, '404 Not Found'
        if '403' in stderr:
            return False, None, '403 Forbidden'
        if 'unable to extract' in stderr_lower:
            return False, None, 'Unable to extract forum info'
        return False, None, f'forum-dl error: {stderr}'
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to download forum from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Download forum content from a URL using forum-dl.

    Emits an ArchiveResult JSONL record on stdout only when the attempt
    concludes (success, or graceful "no forum here"); transient failures log
    to stderr and exit 1 WITHOUT emitting JSONL so the attempt can be retried.
    """
    # NOTE: the old locals `output = None; status = 'failed'; error = ''` were
    # dead code (status was never read; output/error are always assigned by
    # save_forum before use), so they have been removed.
    try:
        # Respect the plugin kill-switch: a disabled config is a temporary
        # condition, so exit cleanly without emitting any JSONL.
        if not get_env_bool('FORUMDL_ENABLED', True):
            print('Skipping forum-dl (FORUMDL_ENABLED=False)', file=sys.stderr)
            sys.exit(0)

        # Get binary from environment
        binary = get_env('FORUMDL_BINARY', 'forum-dl')

        # Run extraction
        success, output, error = save_forum(url, binary)

        if success:
            # Success - emit ArchiveResult
            print(json.dumps({
                'type': 'ArchiveResult',
                'status': 'succeeded',
                'output_str': output or '',
            }))
            sys.exit(0)

        # Transient error - emit NO JSONL
        print(f'ERROR: {error}', file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Transient error - emit NO JSONL
        print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
        sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -1,7 +0,0 @@
<!-- Forum thumbnail - shows icon placeholder -->
<!-- Static markup only: forum output has no preview image, so the emoji
     placeholder is always rendered (no JS, no template variables). -->
<div class="extractor-thumbnail forumdl-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
    <div style="display: flex; flex-direction: column; align-items: center; color: #888; font-size: 12px;">
        <span style="font-size: 32px;">💬</span>
        <span>Forum</span>
    </div>
</div>

View File

@@ -1,147 +0,0 @@
<!-- Fullscreen forum view - renders JSONL forum posts -->
<!-- NOTE: "{{ output_path }}" below is a server-side template variable; it must
     resolve to the forum-dl JSONL output file for this snapshot. -->
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Forum Thread</title>
    <style>
        body {
            margin: 0;
            padding: 20px;
            background: #0d1117;
            color: #c9d1d9;
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif;
            line-height: 1.6;
        }
        .header {
            max-width: 1000px;
            margin: 0 auto 30px;
            text-align: center;
            padding: 20px;
            border-bottom: 1px solid #30363d;
        }
        .icon {
            font-size: 48px;
            margin-bottom: 10px;
        }
        h1 {
            margin: 0;
            font-size: 28px;
            color: #f0f6fc;
        }
        .container {
            max-width: 1000px;
            margin: 0 auto;
        }
        .post {
            background: #161b22;
            border: 1px solid #30363d;
            border-radius: 6px;
            margin-bottom: 16px;
            padding: 16px;
            transition: border-color 0.2s;
        }
        .post:hover {
            border-color: #58a6ff;
        }
        .post-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin-bottom: 12px;
            padding-bottom: 12px;
            border-bottom: 1px solid #21262d;
        }
        .post-author {
            font-weight: 600;
            color: #58a6ff;
            font-size: 14px;
        }
        .post-date {
            color: #8b949e;
            font-size: 12px;
        }
        .post-title {
            margin: 0 0 12px 0;
            font-size: 18px;
            font-weight: 600;
            color: #f0f6fc;
        }
        .post-content {
            color: #c9d1d9;
            word-wrap: break-word;
        }
        .post-content img {
            max-width: 100%;
            height: auto;
            border-radius: 4px;
        }
        .post-content a {
            color: #58a6ff;
            text-decoration: none;
        }
        .post-content a:hover {
            text-decoration: underline;
        }
        .loading {
            text-align: center;
            padding: 40px;
            color: #8b949e;
        }
    </style>
</head>
<body>
    <div class="header">
        <div class="icon">💬</div>
        <h1>Forum Thread</h1>
    </div>
    <div class="container">
        <div id="forum-posts" class="loading">Loading posts...</div>
    </div>
    <script>
        // Fetch the snapshot's JSONL file, parse one JSON object per line,
        // and render each record as a post card.
        (async function() {
            try {
                const response = await fetch('{{ output_path }}');
                const text = await response.text();
                const posts = text.trim().split('\n').filter(line => line).map(line => JSON.parse(line));
                const container = document.getElementById('forum-posts');
                container.innerHTML = '';
                container.className = '';
                posts.forEach(post => {
                    const postDiv = document.createElement('div');
                    postDiv.className = 'post';
                    // Field names vary between forum-dl extractors, so fall back gracefully.
                    const author = post.author || 'Anonymous';
                    const date = post.date ? new Date(post.date).toLocaleString() : '';
                    const title = post.title || '';
                    const content = post.content || post.body || '';
                    // NOTE(review): author/date/title are HTML-escaped, but `content` is
                    // inserted as raw HTML so archived markup renders. Confirm the JSONL
                    // source is trusted before reusing this template elsewhere.
                    postDiv.innerHTML = `
                        <div class="post-header">
                            <span class="post-author">${escapeHtml(author)}</span>
                            <span class="post-date">${escapeHtml(date)}</span>
                        </div>
                        ${title ? `<h2 class="post-title">${escapeHtml(title)}</h2>` : ''}
                        <div class="post-content">${content}</div>
                    `;
                    container.appendChild(postDiv);
                });
                if (posts.length === 0) {
                    container.innerHTML = '<div class="loading">No posts found</div>';
                }
            } catch(e) {
                document.getElementById('forum-posts').innerHTML = '<div class="loading">Error loading posts: ' + e.message + '</div>';
            }
        })();
        // Declared after its use above; function declarations are hoisted, so
        // the IIFE can call it safely.
        function escapeHtml(text) {
            const div = document.createElement('div');
            div.textContent = text;
            return div.innerHTML;
        }
    </script>
</body>
</html>

View File

@@ -1 +0,0 @@
<span class="abx-output-icon abx-output-icon--forumdl" title="Forum"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M4 5h16v10H7l-3 3V5z"/></svg></span>

View File

@@ -1,317 +0,0 @@
"""
Integration tests for forumdl plugin
Tests verify:
1. Hook script exists
2. Dependencies installed via validation hooks
3. Verify deps with abx-pkg
4. Forum extraction works on forum URLs
5. JSONL output is correct
6. Config options work
7. Handles non-forum URLs gracefully
"""
import json
import os
import subprocess
import sys
import tempfile
import time
import uuid
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
FORUMDL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_forumdl.*'), None)
TEST_URL = 'https://example.com'
# Module-level cache for binary path
_forumdl_binary_path = None
_forumdl_lib_root = None
def get_forumdl_binary_path():
    """Get the installed forum-dl binary path from cache or by running installation.

    Resolution order:
      1. module-level cache ``_forumdl_binary_path`` (populated on first success)
      2. abx-pkg lookup via the pip/env binproviders
      3. a real install by invoking the pip plugin's on_Binary hook, seeded
         with any overrides emitted by this plugin's on_Crawl install hook

    Returns the absolute path as a str, or None when every strategy fails.
    """
    global _forumdl_binary_path
    if _forumdl_binary_path:
        return _forumdl_binary_path
    # Try to find forum-dl binary using abx-pkg
    from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
    try:
        binary = Binary(
            name='forum-dl',
            binproviders=[PipProvider(), EnvProvider()]
        ).load()
        if binary and binary.abspath:
            _forumdl_binary_path = str(binary.abspath)
            return _forumdl_binary_path
    except Exception:
        # Lookup failure just means we fall through to installation below.
        pass
    # If not found, try to install via pip using the crawl hook overrides
    pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__11_pip_install.py'
    crawl_hook = PLUGIN_DIR / 'on_Crawl__25_forumdl_install.py'
    if pip_hook.exists():
        binary_id = str(uuid.uuid4())
        machine_id = str(uuid.uuid4())
        overrides = None
        if crawl_hook.exists():
            # Run the crawl hook and scrape its stdout for the Binary JSONL
            # record carrying pip overrides (e.g. a pinned package spec).
            crawl_result = subprocess.run(
                [sys.executable, str(crawl_hook)],
                capture_output=True,
                text=True,
                timeout=30,
            )
            for crawl_line in crawl_result.stdout.strip().split('\n'):
                if crawl_line.strip().startswith('{'):
                    try:
                        crawl_record = json.loads(crawl_line)
                        if crawl_record.get('type') == 'Binary' and crawl_record.get('name') == 'forum-dl':
                            overrides = crawl_record.get('overrides')
                            break
                    except json.JSONDecodeError:
                        continue
        # Create a persistent temp LIB_DIR for the pip provider
        import platform
        global _forumdl_lib_root
        if not _forumdl_lib_root:
            _forumdl_lib_root = tempfile.mkdtemp(prefix='forumdl-lib-')
        # Normalize the arch/os pair the same way the pip hook lays out lib dirs
        # (e.g. 'arm64-darwin', 'x86_64-linux').
        machine = platform.machine().lower()
        system = platform.system().lower()
        if machine in ('arm64', 'aarch64'):
            machine = 'arm64'
        elif machine in ('x86_64', 'amd64'):
            machine = 'x86_64'
        machine_type = f"{machine}-{system}"
        lib_dir = Path(_forumdl_lib_root) / 'lib' / machine_type
        lib_dir.mkdir(parents=True, exist_ok=True)
        env = os.environ.copy()
        env['LIB_DIR'] = str(lib_dir)
        env['DATA_DIR'] = str(Path(_forumdl_lib_root) / 'data')
        cmd = [
            sys.executable, str(pip_hook),
            '--binary-id', binary_id,
            '--machine-id', machine_id,
            '--name', 'forum-dl'
        ]
        if overrides:
            cmd.append(f'--overrides={json.dumps(overrides)}')
        # Installation can legitimately take minutes (pip resolve + build).
        install_result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=300,
            env=env,
        )
        # Parse Binary from pip installation
        for install_line in install_result.stdout.strip().split('\n'):
            if install_line.strip():
                try:
                    install_record = json.loads(install_line)
                    if install_record.get('type') == 'Binary' and install_record.get('name') == 'forum-dl':
                        _forumdl_binary_path = install_record.get('abspath')
                        return _forumdl_binary_path
                except json.JSONDecodeError:
                    pass
    return None
def test_hook_script_exists():
    """Verify the on_Snapshot forumdl hook exists next to this test."""
    # FORUMDL_HOOK is None when the module-level glob found nothing; guard it
    # so the failure is a clean assertion instead of AttributeError on None.
    assert FORUMDL_HOOK is not None, f"No on_Snapshot__*_forumdl.* hook found in {PLUGIN_DIR}"
    assert FORUMDL_HOOK.exists(), f"Hook not found: {FORUMDL_HOOK}"
def test_verify_deps_with_abx_pkg():
    """Verify forum-dl is installed by calling the REAL installation hooks."""
    # get_forumdl_binary_path() caches, looks up via abx-pkg, and finally
    # installs through the pip plugin hook; None means all strategies failed.
    binary_path = get_forumdl_binary_path()
    if not binary_path:
        # Fail with context about the known cchardet build issue on 3.14+.
        assert False, (
            "forum-dl installation failed. Install hook should install forum-dl automatically. "
            "Note: forum-dl has a dependency on cchardet which may not compile on Python 3.14+ "
            "due to removed longintrepr.h header."
        )
    assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}"
def test_handles_non_forum_url():
    """Test that forum-dl extractor handles non-forum URLs gracefully via hook."""
    binary_path = get_forumdl_binary_path()
    if not binary_path:
        # The old `if not binary_path: pass` fell through and crashed on
        # Path(None) with a TypeError; skip explicitly instead.
        pytest.skip('forum-dl binary not available')
    assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        env = os.environ.copy()
        env['FORUMDL_BINARY'] = binary_path
        # Run forum-dl extraction hook on non-forum URL
        result = subprocess.run(
            [sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=60,
        )
        # Should exit 0 even for non-forum URL (graceful handling)
        assert result.returncode == 0, f"Should handle non-forum URL gracefully: {result.stderr}"
        # Parse clean JSONL output for the ArchiveResult record
        result_json = None
        for line in result.stdout.strip().split('\n'):
            line = line.strip()
            if not line.startswith('{'):
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                result_json = record
                break
        assert result_json, "Should have ArchiveResult JSONL output"
        assert result_json['status'] == 'succeeded', f"Should succeed even for non-forum URL: {result_json}"
def test_config_save_forumdl_false_skips():
    """FORUMDL_ENABLED=False must exit 0 and emit no JSONL records."""
    with tempfile.TemporaryDirectory() as tmpdir:
        child_env = {**os.environ, 'FORUMDL_ENABLED': 'False'}
        proc = subprocess.run(
            [sys.executable, str(FORUMDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=child_env,
            timeout=30,
        )
        assert proc.returncode == 0, f"Should exit 0 when feature disabled: {proc.stderr}"
        # A disabled feature is a temporary condition: the hook logs the skip
        # reason to stderr but must not emit any ArchiveResult JSONL.
        assert 'Skipping' in proc.stderr or 'False' in proc.stderr, "Should log skip reason to stderr"
        jsonl_lines = [ln for ln in proc.stdout.strip().split('\n') if ln.strip().startswith('{')]
        assert not jsonl_lines, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
def test_config_timeout():
    """Test that FORUMDL_TIMEOUT config is respected."""
    binary_path = get_forumdl_binary_path()
    if not binary_path:
        # The old `if not binary_path: pass` fell through and crashed on
        # Path(None) with a TypeError; skip explicitly instead.
        pytest.skip('forum-dl binary not available')
    assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env['FORUMDL_BINARY'] = binary_path
        env['FORUMDL_TIMEOUT'] = '5'
        start_time = time.time()
        result = subprocess.run(
            [sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=10,  # Should complete in 5s, use 10s as safety margin
        )
        elapsed_time = time.time() - start_time
        assert result.returncode == 0, f"Should complete without hanging: {result.stderr}"
        # Allow 1 second overhead for subprocess startup and Python interpreter
        assert elapsed_time <= 6.0, f"Should complete within 6 seconds (5s timeout + 1s overhead), took {elapsed_time:.2f}s"
def test_real_forum_url():
    """Test that forum-dl extracts content from a real HackerNews thread with jsonl output.

    Uses our Pydantic v2 compatible wrapper to fix forum-dl 0.3.0's incompatibility.
    NOTE(review): network-dependent test - requires news.ycombinator.com to be reachable.
    """
    import os
    binary_path = get_forumdl_binary_path()
    assert binary_path, "forum-dl binary not available"
    assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Use HackerNews - one of the most reliable forum-dl extractors
        forum_url = 'https://news.ycombinator.com/item?id=1'
        env = os.environ.copy()
        env['FORUMDL_BINARY'] = binary_path
        env['FORUMDL_TIMEOUT'] = '60'
        env['FORUMDL_OUTPUT_FORMAT'] = 'jsonl'  # Use jsonl format
        # HTML output could be added via: env['FORUMDL_ARGS_EXTRA'] = json.dumps(['--files-output', './files'])
        start_time = time.time()
        # The hook runs in cwd=tmpdir, so all output lands inside the temp dir.
        result = subprocess.run(
            [sys.executable, str(FORUMDL_HOOK), '--url', forum_url, '--snapshot-id', 'testforum'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=90
        )
        elapsed_time = time.time() - start_time
        # Should succeed with our Pydantic v2 wrapper
        assert result.returncode == 0, f"Should extract forum successfully: {result.stderr}"
        # Parse JSONL output
        result_json = None
        for line in result.stdout.strip().split('\n'):
            line = line.strip()
            if line.startswith('{'):
                try:
                    record = json.loads(line)
                    if record.get('type') == 'ArchiveResult':
                        result_json = record
                        break
                except json.JSONDecodeError:
                    pass
        assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}"
        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
        # Check that forum files were downloaded
        output_files = list(tmpdir.glob('**/*'))
        forum_files = [f for f in output_files if f.is_file()]
        assert len(forum_files) > 0, f"Should have downloaded at least one forum file. Files: {output_files}"
        # Verify the JSONL file has content
        jsonl_file = tmpdir / 'forum.jsonl'
        assert jsonl_file.exists(), "Should have created forum.jsonl"
        assert jsonl_file.stat().st_size > 0, "forum.jsonl should not be empty"
        print(f"Successfully extracted {len(forum_files)} file(s) in {elapsed_time:.2f}s")
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -1,54 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"GALLERYDL_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_GALLERYDL", "USE_GALLERYDL"],
"description": "Enable gallery downloading with gallery-dl"
},
"GALLERYDL_BINARY": {
"type": "string",
"default": "gallery-dl",
"description": "Path to gallery-dl binary"
},
"GALLERYDL_TIMEOUT": {
"type": "integer",
"default": 3600,
"minimum": 30,
"x-fallback": "TIMEOUT",
"description": "Timeout for gallery downloads in seconds"
},
"GALLERYDL_COOKIES_FILE": {
"type": "string",
"default": "",
"x-fallback": "COOKIES_FILE",
"description": "Path to cookies file"
},
"GALLERYDL_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"GALLERYDL_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [
"--write-metadata",
"--write-info-json"
],
"x-aliases": ["GALLERYDL_DEFAULT_ARGS"],
"description": "Default gallery-dl arguments"
},
"GALLERYDL_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["GALLERYDL_EXTRA_ARGS"],
"description": "Extra arguments to append to gallery-dl command"
}
}
}

View File

@@ -1,48 +0,0 @@
#!/usr/bin/env python3
"""
Emit gallery-dl Binary dependency for the crawl.
"""
import json
import os
import sys
def get_env(name: str, default: str = '') -> str:
    """Return the named environment variable (or *default*), whitespace-stripped."""
    value = os.environ.get(name, default)
    return value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean environment variable; unrecognized values yield *default*."""
    # Inline the env read (get_env(name, '') == os.environ.get(name, '').strip()).
    raw = os.environ.get(name, '').strip().lower()
    if raw in {'true', '1', 'yes', 'on'}:
        return True
    if raw in {'false', '0', 'no', 'off'}:
        return False
    return default
def output_binary(name: str, binproviders: str):
    """Output Binary JSONL record for a dependency on stdout."""
    # Key order matches the original record so the emitted JSON is identical.
    print(json.dumps({
        'type': 'Binary',
        'name': name,
        'binproviders': binproviders,
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }))
def main():
    """Emit the gallery-dl Binary dependency record unless the plugin is disabled."""
    # Both paths exit 0: a disabled plugin is not an error, it just emits nothing.
    if get_env_bool('GALLERYDL_ENABLED', True):
        output_binary(name='gallery-dl', binproviders='pip,brew,apt,env')
    sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -1,261 +0,0 @@
#!/usr/bin/env python3
"""
Download image galleries from a URL using gallery-dl.
Usage: on_Snapshot__03_gallerydl.bg.py --url=<url> --snapshot-id=<uuid>
Output: Downloads gallery images to $PWD/gallerydl/
Environment variables:
GALLERYDL_ENABLED: Enable gallery-dl gallery extraction (default: True)
GALLERYDL_BINARY: Path to gallery-dl binary (default: gallery-dl)
GALLERYDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
GALLERYDL_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
GALLERYDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
GALLERYDL_ARGS: Default gallery-dl arguments (JSON array)
GALLERYDL_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
import json
import os
import subprocess
import sys
import threading
from pathlib import Path
import rich_click as click
# Extractor metadata
PLUGIN_NAME = 'gallerydl'
BIN_NAME = 'gallery-dl'
BIN_PROVIDERS = 'pip,env'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
    """Look up *name* in the environment, defaulting and stripping whitespace."""
    raw = os.environ.get(name)
    if raw is None:
        raw = default
    return raw.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret an environment variable as a boolean flag.

    Accepts true/1/yes/on and false/0/no/off (case-insensitive); anything
    else - including an unset variable - returns *default*.
    """
    # Inline env read: equivalent to get_env(name, '') followed by .lower().
    flag = os.environ.get(name, '').strip().lower()
    truthy = ('true', '1', 'yes', 'on')
    falsy = ('false', '0', 'no', 'off')
    if flag in truthy:
        return True
    if flag in falsy:
        return False
    return default
def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer environment variable; missing or invalid yields *default*."""
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
"""Parse a JSON array from environment variable."""
val = get_env(name, '')
if not val:
return default if default is not None else []
try:
result = json.loads(val)
if isinstance(result, list):
return [str(item) for item in result]
return default if default is not None else []
except json.JSONDecodeError:
return default if default is not None else []
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
    """Check if the staticfile extractor already downloaded this URL.

    Returns True only when ../staticfile/stdout.log contains an ArchiveResult
    JSONL record with status 'succeeded'.
    """
    log_file = Path(STATICFILE_DIR) / 'stdout.log'
    if not log_file.exists():
        return False
    for raw_line in log_file.read_text(errors='ignore').splitlines():
        candidate = raw_line.strip()
        if not candidate.startswith('{'):
            continue
        try:
            record = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded':
            return True
    return False
def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Download gallery using gallery-dl.

    Returns: (success, output_path, error_message)
      - (True, path, '')   at least one gallery file was downloaded
      - (True, None, '')   nothing to download here (NOT an error)
      - (False, None, msg) transient or real failure
    """
    # Get config from env (with GALLERYDL_ prefix, x-fallback handled by config loader)
    timeout = get_env_int('GALLERYDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
    check_ssl = get_env_bool('GALLERYDL_CHECK_SSL_VALIDITY', True) if get_env('GALLERYDL_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
    gallerydl_args = get_env_array('GALLERYDL_ARGS', [])
    gallerydl_args_extra = get_env_array('GALLERYDL_ARGS_EXTRA', [])
    cookies_file = get_env('GALLERYDL_COOKIES_FILE') or get_env('COOKIES_FILE', '')
    # Output directory is current directory (hook already runs in output dir)
    output_dir = Path(OUTPUT_DIR)
    # Build command
    # Use -D for exact directory (flat structure) instead of -d (nested structure)
    cmd = [
        binary,
        *gallerydl_args,
        '-D', str(output_dir),
    ]
    if not check_ssl:
        cmd.append('--no-check-certificate')
    if cookies_file and Path(cookies_file).exists():
        cmd.extend(['-C', cookies_file])
    if gallerydl_args_extra:
        cmd.extend(gallerydl_args_extra)
    cmd.append(url)
    try:
        print(f'[gallerydl] Starting download (timeout={timeout}s)', file=sys.stderr)
        output_lines: list[str] = []
        # Merge stderr into stdout so a single reader can both stream progress
        # to our stderr and buffer it for error classification below.
        process = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
        )
        def _read_output() -> None:
            # Runs in a daemon thread so a stalled pipe cannot block the
            # timeout handling in process.wait() below.
            if not process.stdout:
                return
            for line in process.stdout:
                output_lines.append(line)
                sys.stderr.write(line)
        reader = threading.Thread(target=_read_output, daemon=True)
        reader.start()
        try:
            process.wait(timeout=timeout)
        except subprocess.TimeoutExpired:
            process.kill()
            reader.join(timeout=1)
            return False, None, f'Timed out after {timeout} seconds'
        reader.join(timeout=1)
        combined_output = ''.join(output_lines)
        # Check if any gallery files were downloaded (search recursively)
        gallery_extensions = (
            '.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg',
            '.mp4', '.webm', '.mkv', '.avi', '.mov', '.flv',
            '.json', '.txt', '.zip',
        )
        downloaded_files = [
            f for f in output_dir.rglob('*')
            if f.is_file() and f.suffix.lower() in gallery_extensions
        ]
        if downloaded_files:
            # Return first image file, or first file if no images
            image_files = [
                f for f in downloaded_files
                if f.suffix.lower() in ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')
            ]
            output = str(image_files[0]) if image_files else str(downloaded_files[0])
            return True, output, ''
        else:
            stderr = combined_output
            # These are NOT errors - page simply has no downloadable gallery
            # Return success with no output (legitimate "nothing to download")
            stderr_lower = stderr.lower()
            if 'unsupported url' in stderr_lower:
                return True, None, ''  # Not a gallery site - success, no output
            if 'no results' in stderr_lower:
                return True, None, ''  # No gallery found - success, no output
            if process.returncode == 0:
                return True, None, ''  # gallery-dl exited cleanly, just no gallery - success
            # These ARE errors - something went wrong
            if '404' in stderr:
                return False, None, '404 Not Found'
            if '403' in stderr:
                return False, None, '403 Forbidden'
            if 'unable to extract' in stderr_lower:
                return False, None, 'Unable to extract gallery info'
            return False, None, f'gallery-dl error: {stderr}'
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to download gallery from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Download image gallery from a URL using gallery-dl.

    Emits an ArchiveResult JSONL record on stdout for success or a permanent
    skip; transient failures log to stderr and exit 1 WITHOUT emitting JSONL
    so the attempt can be retried.
    """
    # NOTE: the old locals `output = None; status = 'failed'; error = ''` were
    # dead code (status was never read; output/error are always assigned by
    # save_gallery before use), so they have been removed.
    try:
        # Disabled config is a temporary condition: no JSONL, clean exit.
        if not get_env_bool('GALLERYDL_ENABLED', True):
            print('Skipping gallery-dl (GALLERYDL_ENABLED=False)', file=sys.stderr)
            sys.exit(0)

        # Permanent skip: the staticfile extractor already downloaded this URL.
        if has_staticfile_output():
            print('Skipping gallery-dl - staticfile extractor already downloaded this', file=sys.stderr)
            print(json.dumps({
                'type': 'ArchiveResult',
                'status': 'skipped',
                'output_str': 'staticfile already handled',
            }))
            sys.exit(0)

        # Get binary from environment
        binary = get_env('GALLERYDL_BINARY', 'gallery-dl')

        # Run extraction
        success, output, error = save_gallery(url, binary)

        if success:
            # Success - emit ArchiveResult
            print(json.dumps({
                'type': 'ArchiveResult',
                'status': 'succeeded',
                'output_str': output or '',
            }))
            sys.exit(0)

        # Transient error - emit NO JSONL
        print(f'ERROR: {error}', file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Transient error - emit NO JSONL
        print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
        sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -1,11 +0,0 @@
<!-- Gallery thumbnail - shows first image or placeholder -->
<!-- "{{ output_path }}" is a server-side template variable. If the image fails
     to load, the onerror handler hides it and reveals the emoji placeholder. -->
<div class="extractor-thumbnail gallerydl-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
    <img src="{{ output_path }}"
         style="width: 100%; height: 100px; object-fit: contain;"
         alt="Gallery thumbnail"
         onerror="this.style.display='none'; this.nextElementSibling.style.display='flex';">
    <div style="display: none; flex-direction: column; align-items: center; color: #888; font-size: 12px;">
        <span style="font-size: 32px;">🖼️</span>
        <span>Gallery</span>
    </div>
</div>

View File

@@ -1,28 +0,0 @@
<!-- Fullscreen gallery view - shows image in full size -->
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Gallery</title>
<style>
body {
margin: 0;
padding: 0;
background: #000;
display: flex;
align-items: center;
justify-content: center;
min-height: 100vh;
}
img {
max-width: 100%;
max-height: 100vh;
object-fit: contain;
}
</style>
</head>
<body>
<img src="{{ output_path }}" alt="Gallery image">
</body>
</html>

View File

@@ -1 +0,0 @@
<span class="abx-output-icon abx-output-icon--gallerydl" title="Gallery"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="5" width="18" height="14" rx="2"/><circle cx="8" cy="10" r="1.5" fill="currentColor" stroke="none"/><path d="M21 17l-5-5-5 5"/></svg></span>

View File

@@ -1,190 +0,0 @@
"""
Integration tests for gallerydl plugin
Tests verify:
1. Hook script exists
2. Dependencies installed via validation hooks
3. Verify deps with abx-pkg
4. Gallery extraction works on gallery URLs
5. JSONL output is correct
6. Config options work
7. Handles non-gallery URLs gracefully
"""
import json
import subprocess
import sys
import tempfile
import time
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
GALLERYDL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_gallerydl.*'), None)
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """Verify the on_Snapshot gallerydl hook exists next to this test."""
    # GALLERYDL_HOOK is None when the module-level glob found nothing; guard it
    # so the failure is a clean assertion instead of AttributeError on None.
    assert GALLERYDL_HOOK is not None, f"No on_Snapshot__*_gallerydl.* hook found in {PLUGIN_DIR}"
    assert GALLERYDL_HOOK.exists(), f"Hook not found: {GALLERYDL_HOOK}"
def test_verify_deps_with_abx_pkg():
    """Verify gallery-dl is available via abx-pkg."""
    from abx_pkg import Binary, PipProvider, EnvProvider

    gallerydl_binary = Binary(name='gallery-dl', binproviders=[PipProvider(), EnvProvider()])
    gallerydl_loaded = gallerydl_binary.load()
    # BUG FIX: the old version collected missing binaries and then did
    # `if missing_binaries: pass`, so the test passed unconditionally.
    # Fail loudly when gallery-dl cannot be resolved.
    assert gallerydl_loaded and gallerydl_loaded.abspath, (
        "gallery-dl binary not found via pip or env binproviders"
    )
def test_handles_non_gallery_url():
    """Test that gallery-dl extractor handles non-gallery URLs gracefully via hook."""
    # Prerequisites (binary availability) checked by test_verify_deps_with_abx_pkg
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Run gallery-dl extraction hook on non-gallery URL
        result = subprocess.run(
            [sys.executable, str(GALLERYDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60,
        )
        # Should exit 0 even for non-gallery URL
        assert result.returncode == 0, f"Should handle non-gallery URL gracefully: {result.stderr}"
        # Parse clean JSONL output for the ArchiveResult record
        # (the stray no-op `pass` the old loop carried is gone)
        result_json = None
        for line in result.stdout.strip().split('\n'):
            line = line.strip()
            if not line.startswith('{'):
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                result_json = record
                break
        assert result_json, "Should have ArchiveResult JSONL output"
        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
def test_config_save_gallery_dl_false_skips():
    """GALLERYDL_ENABLED=False must exit 0 and emit no JSONL records."""
    import os
    with tempfile.TemporaryDirectory() as tmpdir:
        child_env = {**os.environ, 'GALLERYDL_ENABLED': 'False'}
        proc = subprocess.run(
            [sys.executable, str(GALLERYDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=child_env,
            timeout=30,
        )
        assert proc.returncode == 0, f"Should exit 0 when feature disabled: {proc.stderr}"
        # A disabled feature is a temporary condition: the hook logs the skip
        # reason to stderr but must not emit any ArchiveResult JSONL.
        assert 'Skipping' in proc.stderr or 'False' in proc.stderr, "Should log skip reason to stderr"
        jsonl_lines = [ln for ln in proc.stdout.strip().split('\n') if ln.strip().startswith('{')]
        assert not jsonl_lines, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
def test_config_timeout():
    """Test that GALLERYDL_TIMEOUT config is respected."""
    import os
    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        # BUG FIX: the hook and config schema read GALLERYDL_TIMEOUT;
        # the previous GALLERY_DL_TIMEOUT name was silently ignored.
        env['GALLERYDL_TIMEOUT'] = '5'
        start_time = time.time()
        result = subprocess.run(
            [sys.executable, str(GALLERYDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=10,  # Should complete in 5s, use 10s as safety margin
        )
        elapsed_time = time.time() - start_time
        assert result.returncode == 0, f"Should complete without hanging: {result.stderr}"
        # Allow 1 second overhead for subprocess startup and Python interpreter
        assert elapsed_time <= 6.0, f"Should complete within 6 seconds (5s timeout + 1s overhead), took {elapsed_time:.2f}s"
def test_real_gallery_url():
    """Test that gallery-dl can extract images from a real Flickr gallery URL.

    NOTE: network-dependent - requires flickr.com to be reachable.
    """
    import os
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Use a real Flickr photo page
        gallery_url = 'https://www.flickr.com/photos/gregorydolivet/55002388567/in/explore-2025-12-25/'
        env = os.environ.copy()
        # BUG FIX: the hook and config schema read GALLERYDL_TIMEOUT;
        # the previous GALLERY_DL_TIMEOUT name was silently ignored.
        env['GALLERYDL_TIMEOUT'] = '60'  # Give it time to download
        start_time = time.time()
        result = subprocess.run(
            [sys.executable, str(GALLERYDL_HOOK), '--url', gallery_url, '--snapshot-id', 'testflickr'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=90,
        )
        elapsed_time = time.time() - start_time
        # Should succeed
        assert result.returncode == 0, f"Should extract gallery successfully: {result.stderr}"
        # Parse JSONL output for the ArchiveResult record
        result_json = None
        for line in result.stdout.strip().split('\n'):
            line = line.strip()
            if not line.startswith('{'):
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                result_json = record
                break
        assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}"
        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
        # Check that some files were downloaded
        output_files = list(tmpdir.glob('**/*'))
        image_files = [f for f in output_files if f.is_file() and f.suffix.lower() in ('.jpg', '.jpeg', '.png', '.gif', '.webp')]
        assert len(image_files) > 0, f"Should have downloaded at least one image. Files: {output_files}"
        print(f"Successfully extracted {len(image_files)} image(s) in {elapsed_time:.2f}s")
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -1,44 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"GIT_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_GIT", "USE_GIT"],
"description": "Enable git repository cloning"
},
"GIT_BINARY": {
"type": "string",
"default": "git",
"description": "Path to git binary"
},
"GIT_TIMEOUT": {
"type": "integer",
"default": 120,
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Timeout for git operations in seconds"
},
"GIT_DOMAINS": {
"type": "string",
"default": "github.com,gitlab.com,bitbucket.org,gist.github.com,codeberg.org,gitea.com,git.sr.ht",
"description": "Comma-separated list of domains to treat as git repositories"
},
"GIT_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": ["clone", "--depth=1", "--recursive"],
"x-aliases": ["GIT_DEFAULT_ARGS"],
"description": "Default git arguments"
},
"GIT_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["GIT_EXTRA_ARGS"],
"description": "Extra arguments to append to git command"
}
}
}

View File

@@ -1,48 +0,0 @@
#!/usr/bin/env python3
"""
Emit git Binary dependency for the crawl.
"""
import json
import os
import sys
def get_env(name: str, default: str = '') -> str:
    """Fetch an environment variable's value, stripped; *default* if unset."""
    value = os.environ.get(name)
    if value is None:
        value = default
    return value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var; unset or unrecognized values yield *default*."""
    # Inlined env read (same semantics as get_env: missing -> '', then strip).
    val = os.environ.get(name, '').strip().lower()
    if val in ('true', '1', 'yes', 'on'):
        return True
    if val in ('false', '0', 'no', 'off'):
        return False
    return default
def output_binary(name: str, binproviders: str):
    """Print a single Binary JSONL record for a dependency to stdout."""
    record = {
        'type': 'Binary',
        'name': name,
        'binproviders': binproviders,
        # MACHINE_ID is injected by the hook runner; empty when run standalone.
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))
def main():
    """Emit the git Binary dependency record unless GIT_ENABLED is false."""
    if not get_env_bool('GIT_ENABLED', True):
        sys.exit(0)
    output_binary(name='git', binproviders='apt,brew,env')
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -1,145 +0,0 @@
#!/usr/bin/env python3
"""
Clone a git repository from a URL.
Usage: on_Snapshot__05_git.bg.py --url=<url> --snapshot-id=<uuid>
Output: Clones repository to $PWD/repo
Environment variables:
GIT_BINARY: Path to git binary
GIT_TIMEOUT: Timeout in seconds (default: 120)
GIT_ARGS: Default git arguments (JSON array, default: ["clone", "--depth=1", "--recursive"])
GIT_ARGS_EXTRA: Extra arguments to append (JSON array, default: [])
# Fallback to ARCHIVING_CONFIG values if GIT_* not set:
TIMEOUT: Fallback timeout
"""
import json
import os
import subprocess
import sys
from pathlib import Path
import rich_click as click
# Extractor metadata
PLUGIN_NAME = 'git'
BIN_NAME = 'git'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
    """Return the named environment variable with whitespace trimmed."""
    raw = os.environ.get(name, default)
    return raw.strip()
def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer env var; missing or invalid values yield *default*."""
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
"""Parse a JSON array from environment variable."""
val = get_env(name, '')
if not val:
return default if default is not None else []
try:
result = json.loads(val)
if isinstance(result, list):
return [str(item) for item in result]
return default if default is not None else []
except json.JSONDecodeError:
return default if default is not None else []
def is_git_url(url: str) -> bool:
    """Check if URL looks like a git repository.

    Honors the plugin's GIT_DOMAINS config (comma-separated hostnames,
    matching the default declared in config.schema.json) instead of the
    previous hard-coded substring match, which both ignored the config and
    false-positived on URLs merely *containing* e.g. 'github.com' in a path.
    """
    from urllib.parse import urlsplit  # local import: stdlib, used only here

    lowered = url.lower()
    # Explicit git transports are always treated as repos.
    if lowered.startswith(('git://', 'ssh://git@')):
        return True
    parsed = urlsplit(lowered)
    # Repo URLs conventionally end in .git regardless of host.
    if parsed.path.endswith('.git'):
        return True
    default_domains = (
        'github.com,gitlab.com,bitbucket.org,gist.github.com,'
        'codeberg.org,gitea.com,git.sr.ht'
    )
    domains = [
        d.strip()
        for d in os.environ.get('GIT_DOMAINS', default_domains).split(',')
        if d.strip()
    ]
    return (parsed.hostname or '') in domains
def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
    """Clone a git repository into the current working directory.

    Args:
        url: repository URL to clone.
        binary: path to the git executable.

    Returns:
        (success, output_path, error_message) — output_path is OUTPUT_DIR
        ('.') on success, None on failure.
    """
    # GIT_TIMEOUT wins when set (and nonzero); otherwise fall back to TIMEOUT.
    timeout = get_env_int('GIT_TIMEOUT') or get_env_int('TIMEOUT', 120)
    git_args = get_env_array('GIT_ARGS', ["clone", "--depth=1", "--recursive"])
    git_args_extra = get_env_array('GIT_ARGS_EXTRA', [])
    cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR]
    try:
        result = subprocess.run(cmd, timeout=timeout)
        # BUGFIX: OUTPUT_DIR is '.', which always exists, so the old
        # `Path(OUTPUT_DIR).is_dir()` check was vacuous. A successful clone
        # into '.' creates a .git directory — check for that instead.
        if result.returncode == 0 and (Path(OUTPUT_DIR) / '.git').is_dir():
            return True, OUTPUT_DIR, ''
        return False, None, f'git clone failed (exit={result.returncode})'
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='Git repository URL')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Clone a git repository from a URL."""
    status, output, error = 'failed', None, ''
    try:
        if not is_git_url(url):
            # Non-repo URL: emit a 'skipped' ArchiveResult and exit cleanly.
            print(f'Skipping git clone for non-git URL: {url}', file=sys.stderr)
            print(json.dumps({
                'type': 'ArchiveResult',
                'status': 'skipped',
                'output_str': 'Not a git URL',
            }))
            sys.exit(0)
        success, output, error = clone_git(url, get_env('GIT_BINARY', 'git'))
        status = 'succeeded' if success else 'failed'
    except Exception as e:
        status, error = 'failed', f'{type(e).__name__}: {e}'
    if error:
        print(f'ERROR: {error}', file=sys.stderr)
    # Emit clean JSONL (no RESULT_JSON= prefix) for the hook runner.
    print(json.dumps({
        'type': 'ArchiveResult',
        'status': status,
        'output_str': output or error or '',
    }))
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()

View File

@@ -1,5 +0,0 @@
<!-- Git thumbnail - shows git repository icon and info -->
<div class="extractor-thumbnail git-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #f6f8fa; display: flex; flex-direction: column; align-items: center; justify-content: center; padding: 10px;">
<span style="font-size: 32px;">📂</span>
<span style="font-size: 11px; color: #586069; margin-top: 4px;">Git Repository</span>
</div>

View File

@@ -1 +0,0 @@
<span class="abx-output-icon abx-output-icon--git" title="Git"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="6" cy="6" r="2"/><circle cx="6" cy="18" r="2"/><circle cx="18" cy="12" r="2"/><path d="M8 6h5a3 3 0 0 1 3 3v1"/><path d="M8 18h5a3 3 0 0 0 3-3v-1"/></svg></span>

View File

@@ -1,130 +0,0 @@
"""
Integration tests for git plugin
Tests verify:
pass
1. Validate hook checks for git binary
2. Verify deps with abx-pkg
3. Standalone git extractor execution
"""
import json
import shutil
import subprocess
import sys
import tempfile
import time
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
GIT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_git.*'), None)
TEST_URL = 'https://github.com/ArchiveBox/abx-pkg.git'
def test_hook_script_exists():
    """The on_Snapshot git hook must be discoverable in the plugin dir."""
    # GIT_HOOK is produced by next(glob, None) at module level, so it is None
    # when no hook file matched — assert that explicitly to get a clear
    # failure message instead of AttributeError on None.exists().
    assert GIT_HOOK is not None, f"No on_Snapshot__*_git.* hook found in {PLUGIN_DIR}"
    assert GIT_HOOK.exists()
def test_verify_deps_with_abx_pkg():
    """Verify git is available via abx-pkg."""
    # NOTE(review): BinProviderOverrides is imported but unused — confirm it can be dropped.
    from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
    # Resolve the git binary through the same provider chain the plugin declares.
    git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
    git_loaded = git_binary.load()
    assert git_loaded and git_loaded.abspath, "git is required for git plugin tests"
def test_reports_missing_git():
    """With PATH emptied, a failing hook run should mention the missing git dependency."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Point PATH at a nonexistent dir so the hook cannot find git.
        env = {'PATH': '/nonexistent'}
        result = subprocess.run(
            [sys.executable, str(GIT_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
            cwd=tmpdir, capture_output=True, text=True, env=env
        )
        # Only assert on the error text when the hook actually failed.
        if result.returncode != 0:
            combined = result.stdout + result.stderr
            assert 'DEPENDENCY_NEEDED' in combined or 'git' in combined.lower() or 'ERROR=' in combined
def test_handles_non_git_url():
    """A non-git URL should produce a failed/skipped ArchiveResult, not a crash."""
    assert shutil.which('git'), "git binary not available"
    with tempfile.TemporaryDirectory() as tmpdir:
        result = subprocess.run(
            [sys.executable, str(GIT_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'],
            cwd=tmpdir, capture_output=True, text=True, timeout=30
        )
        # Should fail or skip for non-git URL
        assert result.returncode in (0, 1)
        # Parse clean JSONL output
        result_json = None
        for line in result.stdout.strip().split('\n'):
            line = line.strip()
            if line.startswith('{'):
                pass  # NOTE(review): vestigial no-op — confirm it can be removed
                try:
                    record = json.loads(line)
                    if record.get('type') == 'ArchiveResult':
                        result_json = record
                        break
                except json.JSONDecodeError:
                    # Non-JSON line starting with '{' — ignore and keep scanning.
                    pass
        if result_json:
            # Should report failure or skip for non-git URL
            assert result_json['status'] in ['failed', 'skipped'], f"Should fail or skip: {result_json}"
def test_real_git_repo():
    """Test that git can clone a real GitHub repository."""
    import os
    assert shutil.which('git'), "git binary not available"
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Use a real but small GitHub repository
        git_url = 'https://github.com/ArchiveBox/abx-pkg'
        env = os.environ.copy()
        env['GIT_TIMEOUT'] = '120'  # Give it time to clone
        start_time = time.time()
        result = subprocess.run(
            [sys.executable, str(GIT_HOOK), '--url', git_url, '--snapshot-id', 'testgit'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=180
        )
        elapsed_time = time.time() - start_time
        # Should succeed
        assert result.returncode == 0, f"Should clone repository successfully: {result.stderr}"
        # Parse JSONL output
        result_json = None
        for line in result.stdout.strip().split('\n'):
            line = line.strip()
            if line.startswith('{'):
                try:
                    record = json.loads(line)
                    if record.get('type') == 'ArchiveResult':
                        result_json = record
                        break
                except json.JSONDecodeError:
                    # Ignore non-JSON lines that happen to start with '{'.
                    pass
        assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}"
        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
        # Check that the git repo was cloned
        git_dirs = list(tmpdir.glob('**/.git'))
        assert len(git_dirs) > 0, f"Should have cloned a git repository. Contents: {list(tmpdir.rglob('*'))}"
        print(f"Successfully cloned repository in {elapsed_time:.2f}s")
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -1,20 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"HASHES_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_HASHES", "USE_HASHES"],
"description": "Enable merkle tree hash generation"
},
"HASHES_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for merkle tree generation in seconds"
}
}
}

View File

@@ -1,185 +0,0 @@
#!/usr/bin/env python3
"""
Create a hashed Merkle tree of all archived outputs.
This plugin runs after all extractors complete (priority 93) and generates
a cryptographic Merkle hash tree of all files in the snapshot directory.
Output: hashes.json containing root_hash, tree structure, file list, metadata
Usage: on_Snapshot__93_hashes.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SAVE_HASHES: Enable hash merkle tree generation (default: true)
DATA_DIR: ArchiveBox data directory
ARCHIVE_DIR: Archive output directory
"""
import os
import sys
import json
import hashlib
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple, Any
import click
def sha256_file(filepath: Path) -> str:
    """Return the hex SHA256 digest of a file, or 64 zeros if unreadable."""
    digest = hashlib.sha256()
    try:
        with open(filepath, 'rb') as fh:
            for chunk in iter(lambda: fh.read(65536), b''):
                digest.update(chunk)
    except (OSError, PermissionError):
        # Unreadable files get a sentinel all-zero hash instead of aborting
        # the whole tree build.
        return '0' * 64
    return digest.hexdigest()
def sha256_data(data: bytes) -> str:
    """Return the hex SHA256 digest of raw bytes."""
    h = hashlib.sha256()
    h.update(data)
    return h.hexdigest()
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
    """Recursively collect (relative_path, sha256, size) for every regular file.

    Symlinks are skipped, and directories named in *exclude_dirs* (default:
    hashes/.git/__pycache__) are pruned from the walk. Results are sorted by
    path string so the resulting Merkle tree is deterministic.
    """
    skip = exclude_dirs or ['hashes', '.git', '__pycache__']
    collected: List[Tuple[Path, str, int]] = []
    for root, dirs, filenames in os.walk(snapshot_dir):
        # Prune excluded dirs in place so os.walk never descends into them.
        dirs[:] = [d for d in dirs if d not in skip]
        for fname in filenames:
            full_path = Path(root) / fname
            rel = full_path.relative_to(snapshot_dir)
            if full_path.is_symlink():
                continue
            digest = sha256_file(full_path)
            size = full_path.stat().st_size if full_path.exists() else 0
            collected.append((rel, digest, size))
    collected.sort(key=lambda entry: str(entry[0]))
    return collected
def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
    """Build a Merkle tree bottom-up from leaf hashes.

    Returns (root_hash, tree_levels): tree_levels[0] is the leaves and the
    last level holds the single root. An empty leaf list yields the hash of
    the empty byte string and a single empty level.
    """
    if not file_hashes:
        return sha256_data(b''), [[]]
    levels: List[List[str]] = [file_hashes.copy()]
    while len(levels[-1]) > 1:
        prev = levels[-1]
        parents = []
        for idx in range(0, len(prev), 2):
            left = prev[idx]
            # An odd node at the end of a level is paired with itself.
            right = prev[idx + 1] if idx + 1 < len(prev) else left
            parents.append(sha256_data((left + right).encode('utf-8')))
        levels.append(parents)
    return levels[-1][0], levels
def create_hashes(snapshot_dir: Path) -> Dict[str, Any]:
    """Hash every file under *snapshot_dir* and assemble the Merkle record."""
    entries = collect_files(snapshot_dir)
    root_hash, tree_levels = build_merkle_tree([digest for _, digest, _ in entries])
    return {
        'root_hash': root_hash,
        'tree_levels': tree_levels,
        'files': [
            {'path': str(rel), 'hash': digest, 'size': size}
            for rel, digest, size in entries
        ],
        'metadata': {
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'file_count': len(entries),
            'total_size': sum(size for _, _, size in entries),
            'tree_depth': len(tree_levels),
        },
    }
@click.command()
@click.option('--url', required=True, help='URL being archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Generate Merkle tree of all archived outputs."""
    status, output, error = 'failed', None, ''
    root_hash = None
    file_count = 0
    try:
        # Honor the HASHES_ENABLED config flag (default: enabled).
        enabled = os.getenv('HASHES_ENABLED', 'true').lower() in ('true', '1', 'yes', 'on')
        if not enabled:
            status = 'skipped'
            click.echo(json.dumps({'status': status, 'output': 'HASHES_ENABLED=false'}))
            sys.exit(0)
        # CWD is the extractor output dir (<snapshot>/hashes/); its parent is
        # the snapshot directory whose contents we hash.
        output_dir = Path.cwd()
        snapshot_dir = output_dir.parent
        if not snapshot_dir.exists():
            raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')
        output_dir.mkdir(exist_ok=True)
        merkle_data = create_hashes(snapshot_dir)
        with open(output_dir / 'hashes.json', 'w', encoding='utf-8') as f:
            json.dump(merkle_data, f, indent=2)
        status, output = 'succeeded', 'hashes.json'
        root_hash = merkle_data['root_hash']
        file_count = merkle_data['metadata']['file_count']
    except Exception as e:
        status, error = 'failed', f'{type(e).__name__}: {e}'
        click.echo(f'Error: {error}', err=True)
    # Emit one JSON result line for the hook runner.
    click.echo(json.dumps({
        'status': status,
        'output': output,
        'error': error or None,
        'root_hash': root_hash,
        'file_count': file_count,
    }))
    sys.exit(0 if status in ('succeeded', 'skipped') else 1)


if __name__ == '__main__':
    main()

View File

@@ -1 +0,0 @@
<span class="abx-output-icon abx-output-icon--hashes" title="Authenticity Hashes"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="5" r="2"/><circle cx="6" cy="18" r="2"/><circle cx="18" cy="18" r="2"/><path d="M12 7v6"/><path d="M12 13l-4 3"/><path d="M12 13l4 3"/></svg></span>

View File

@@ -1,157 +0,0 @@
"""
Tests for the hashes plugin.
Tests the real merkle tree generation with actual files.
"""
import json
import os
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
from django.test import TestCase
# Get the path to the hashes hook
PLUGIN_DIR = Path(__file__).parent.parent
HASHES_HOOK = PLUGIN_DIR / 'on_Snapshot__93_hashes.py'
class TestHashesPlugin(TestCase):
    """Test the hashes plugin."""

    def test_hashes_hook_exists(self):
        """Hashes hook script should exist."""
        self.assertTrue(HASHES_HOOK.exists(), f"Hook not found: {HASHES_HOOK}")

    def test_hashes_generates_tree_for_files(self):
        """Hashes hook should generate merkle tree for files in snapshot directory."""
        with tempfile.TemporaryDirectory() as temp_dir:
            # Create a mock snapshot directory structure
            snapshot_dir = Path(temp_dir) / 'snapshot'
            snapshot_dir.mkdir()
            # Create output directory for hashes
            output_dir = snapshot_dir / 'hashes'
            output_dir.mkdir()
            # Create some test files
            (snapshot_dir / 'index.html').write_text('<html><body>Test</body></html>')
            (snapshot_dir / 'screenshot.png').write_bytes(b'\x89PNG\r\n\x1a\n' + b'\x00' * 100)
            subdir = snapshot_dir / 'media'
            subdir.mkdir()
            (subdir / 'video.mp4').write_bytes(b'\x00\x00\x00\x18ftypmp42')
            # Run the hook from the output directory
            env = os.environ.copy()
            env['HASHES_ENABLED'] = 'true'
            result = subprocess.run(
                [
                    sys.executable, str(HASHES_HOOK),
                    '--url=https://example.com',
                    '--snapshot-id=test-snapshot',
                ],
                capture_output=True,
                text=True,
                cwd=str(output_dir),  # Hook expects to run from output dir
                env=env,
                timeout=30
            )
            # Should succeed
            self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
            # Check output file exists
            output_file = output_dir / 'hashes.json'
            self.assertTrue(output_file.exists(), "hashes.json not created")
            # Parse and verify output
            with open(output_file) as f:
                data = json.load(f)
            self.assertIn('root_hash', data)
            self.assertIn('files', data)
            self.assertIn('metadata', data)
            # Should have indexed our test files
            file_paths = [f['path'] for f in data['files']]
            self.assertIn('index.html', file_paths)
            self.assertIn('screenshot.png', file_paths)
            # Verify metadata
            self.assertGreater(data['metadata']['file_count'], 0)
            self.assertGreater(data['metadata']['total_size'], 0)

    def test_hashes_skips_when_disabled(self):
        """Hashes hook should skip when HASHES_ENABLED=false."""
        with tempfile.TemporaryDirectory() as temp_dir:
            snapshot_dir = Path(temp_dir) / 'snapshot'
            snapshot_dir.mkdir()
            output_dir = snapshot_dir / 'hashes'
            output_dir.mkdir()
            env = os.environ.copy()
            env['HASHES_ENABLED'] = 'false'
            result = subprocess.run(
                [
                    sys.executable, str(HASHES_HOOK),
                    '--url=https://example.com',
                    '--snapshot-id=test-snapshot',
                ],
                capture_output=True,
                text=True,
                cwd=str(output_dir),
                env=env,
                timeout=30
            )
            # Should succeed (exit 0) but skip
            self.assertEqual(result.returncode, 0)
            self.assertIn('skipped', result.stdout)

    def test_hashes_handles_empty_directory(self):
        """Hashes hook should handle empty snapshot directory."""
        with tempfile.TemporaryDirectory() as temp_dir:
            snapshot_dir = Path(temp_dir) / 'snapshot'
            snapshot_dir.mkdir()
            output_dir = snapshot_dir / 'hashes'
            output_dir.mkdir()
            env = os.environ.copy()
            env['HASHES_ENABLED'] = 'true'
            result = subprocess.run(
                [
                    sys.executable, str(HASHES_HOOK),
                    '--url=https://example.com',
                    '--snapshot-id=test-snapshot',
                ],
                capture_output=True,
                text=True,
                cwd=str(output_dir),
                env=env,
                timeout=30
            )
            # Should succeed even with empty directory
            self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
            # Check output file exists
            output_file = output_dir / 'hashes.json'
            self.assertTrue(output_file.exists())
            with open(output_file) as f:
                data = json.load(f)
            # Should have empty file list
            self.assertEqual(data['metadata']['file_count'], 0)
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -1,21 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"HEADERS_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_HEADERS", "USE_HEADERS"],
"description": "Enable HTTP headers capture"
},
"HEADERS_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for headers capture in seconds"
}
}
}

View File

@@ -1,247 +0,0 @@
#!/usr/bin/env node
/**
* Capture original request + response headers for the main navigation.
*
* This hook sets up CDP listeners BEFORE chrome_navigate loads the page,
* then waits for navigation to complete. It records the first top-level
* request headers and the corresponding response headers (with :status).
*
* Usage: on_Snapshot__27_headers.bg.js --url=<url> --snapshot-id=<uuid>
* Output: Writes headers.json
*/
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');
// Import shared utilities from chrome_utils.js
const {
getEnvBool,
getEnvInt,
parseArgs,
connectToPage,
waitForPageLoaded,
} = require('../chrome/chrome_utils.js');
const PLUGIN_NAME = 'headers';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'headers.json';
const CHROME_SESSION_DIR = '../chrome';
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
let browser = null;
let page = null;
let client = null;
let shuttingDown = false;
let headersWritten = false;
let requestId = null;
let requestUrl = null;
let requestHeaders = null;
let responseHeaders = null;
let responseStatus = null;
let responseStatusText = null;
let responseUrl = null;
let originalUrl = null;
// Return the final (post-redirect) URL: prefer the chrome plugin's
// final_url.txt artifact, falling back to the connected page's current URL
// (or null before a page connection exists).
function getFinalUrl() {
  const finalUrlFile = path.join(CHROME_SESSION_DIR, 'final_url.txt');
  if (fs.existsSync(finalUrlFile)) {
    return fs.readFileSync(finalUrlFile, 'utf8').trim();
  }
  return page ? page.url() : null;
}
// Write headers.json exactly once. No-op until response headers have been
// captured; mirrors them under both `response_headers` and the legacy
// `headers` key, and injects an HTTP/2-style `:status` pseudo header when
// the response did not already carry one.
function writeHeadersFile() {
  if (headersWritten) return;
  if (!responseHeaders) return;
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  const responseHeadersWithStatus = {
    ...(responseHeaders || {}),
  };
  if (responseStatus !== null && responseStatus !== undefined &&
      responseHeadersWithStatus[':status'] === undefined) {
    responseHeadersWithStatus[':status'] = String(responseStatus);
  }
  const record = {
    url: requestUrl || originalUrl,
    final_url: getFinalUrl(),
    status: responseStatus !== undefined ? responseStatus : null,
    request_headers: requestHeaders || {},
    response_headers: responseHeadersWithStatus,
    headers: responseHeadersWithStatus, // backwards compatibility
  };
  if (responseStatusText) {
    record.statusText = responseStatusText;
  }
  if (responseUrl) {
    record.response_url = responseUrl;
  }
  fs.writeFileSync(outputPath, JSON.stringify(record, null, 2));
  headersWritten = true;
}
// Attach CDP Network listeners to the shared Chrome tab BEFORE chrome_navigate
// loads the page, so the very first top-level document request is observed.
// Throws CHROME_SESSION_REQUIRED_ERROR if the chrome plugin's session
// artifacts are missing or its recorded pid is no longer alive.
async function setupListener(url) {
  const timeout = getEnvInt('HEADERS_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
  const pidFile = path.join(CHROME_SESSION_DIR, 'chrome.pid');
  if (!fs.existsSync(cdpFile) || !fs.existsSync(targetIdFile) || !fs.existsSync(pidFile)) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }
  try {
    // process.kill(pid, 0) only probes liveness; it sends no signal.
    const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
    if (!pid || Number.isNaN(pid)) throw new Error('Invalid pid');
    process.kill(pid, 0);
  } catch (e) {
    throw new Error(CHROME_SESSION_REQUIRED_ERROR);
  }
  const { browser, page } = await connectToPage({
    chromeSessionDir: CHROME_SESSION_DIR,
    timeoutMs: timeout,
    puppeteer,
  });
  client = await page.target().createCDPSession();
  await client.send('Network.enable');
  client.on('Network.requestWillBeSent', (params) => {
    try {
      // A redirect surfaces the previous response on the follow-up request
      // event; capture those headers if none have been recorded yet.
      if (requestId && !responseHeaders && params.redirectResponse && params.requestId === requestId) {
        responseHeaders = params.redirectResponse.headers || {};
        responseStatus = params.redirectResponse.status || null;
        responseStatusText = params.redirectResponse.statusText || null;
        responseUrl = params.redirectResponse.url || null;
        writeHeadersFile();
      }
      // Only latch onto the FIRST top-level http(s) Document request.
      if (requestId) return;
      if (params.type && params.type !== 'Document') return;
      if (!params.request || !params.request.url) return;
      if (!params.request.url.startsWith('http')) return;
      requestId = params.requestId;
      requestUrl = params.request.url;
      requestHeaders = params.request.headers || {};
    } catch (e) {
      // Ignore errors
    }
  });
  client.on('Network.responseReceived', (params) => {
    try {
      // Record only the response matching the latched navigation request.
      if (!requestId || params.requestId !== requestId || responseHeaders) return;
      const response = params.response || {};
      responseHeaders = response.headers || {};
      responseStatus = response.status || null;
      responseStatusText = response.statusText || null;
      responseUrl = response.url || null;
      writeHeadersFile();
    } catch (e) {
      // Ignore errors
    }
  });
  return { browser, page };
}
// Print the final ArchiveResult JSONL record at most once; `shuttingDown`
// doubles as the "result already emitted" latch.
function emitResult(status = 'succeeded', outputStr = OUTPUT_FILE) {
  if (shuttingDown) return;
  shuttingDown = true;
  console.log(JSON.stringify({
    type: 'ArchiveResult',
    status,
    output_str: outputStr,
  }));
}
// Signal handler: flush any captured headers to disk, emit the final
// ArchiveResult, disconnect (not close — the browser is shared) and exit
// 0/1 depending on whether headers.json was written.
async function handleShutdown(signal) {
  console.error(`\nReceived ${signal}, emitting final results...`);
  if (!headersWritten) {
    writeHeadersFile();
  }
  if (headersWritten) {
    emitResult('succeeded', OUTPUT_FILE);
  } else {
    emitResult('failed', 'No headers captured');
  }
  if (browser) {
    try {
      browser.disconnect();
    } catch (e) {}
  }
  process.exit(headersWritten ? 0 : 1);
}
// Entry point: parse CLI args, attach CDP listeners, then park until the
// hook runner sends SIGTERM/SIGINT (this is a .bg.js background hook).
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__27_headers.bg.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  originalUrl = url;
  if (!getEnvBool('HEADERS_ENABLED', true)) {
    console.error('Skipping (HEADERS_ENABLED=False)');
    console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'HEADERS_ENABLED=False'}));
    process.exit(0);
  }
  try {
    // Set up listeners BEFORE navigation
    const connection = await setupListener(url);
    browser = connection.browser;
    page = connection.page;
    // Register signal handlers for graceful shutdown
    process.on('SIGTERM', () => handleShutdown('SIGTERM'));
    process.on('SIGINT', () => handleShutdown('SIGINT'));
    // Wait for chrome_navigate to complete (non-fatal)
    try {
      const timeout = getEnvInt('HEADERS_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000;
      // NOTE(review): `timeout` is already in ms here, so `timeout * 4`
      // waits 4x the configured window — confirm the multiplier is intended.
      await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 200);
    } catch (e) {
      console.error(`WARN: ${e.message}`);
    }
    // Keep alive until SIGTERM
    await new Promise(() => {});
    return;
  } catch (e) {
    const errorMessage = (e && e.message)
      ? `${e.name || 'Error'}: ${e.message}`
      : String(e || 'Unknown error');
    console.error(`ERROR: ${errorMessage}`);
    console.log(JSON.stringify({
      type: 'ArchiveResult',
      status: 'failed',
      output_str: errorMessage,
    }));
    process.exit(1);
  }
}

main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -1 +0,0 @@
<span class="abx-output-icon abx-output-icon--headers" title="Headers"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="4" cy="7" r="1" fill="currentColor" stroke="none"/><circle cx="4" cy="12" r="1" fill="currentColor" stroke="none"/><circle cx="4" cy="17" r="1" fill="currentColor" stroke="none"/><path d="M7 7h13"/><path d="M7 12h13"/><path d="M7 17h13"/></svg></span>

View File

@@ -1,409 +0,0 @@
"""
Integration tests for headers plugin
Tests verify:
pass
1. Plugin script exists and is executable
2. Node.js is available
3. Headers extraction works for real example.com
4. Output JSON contains actual HTTP headers
5. Config options work (TIMEOUT, USER_AGENT)
"""
import json
import shutil
import subprocess
import tempfile
import time
from pathlib import Path
import pytest
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
CHROME_NAVIGATE_HOOK,
get_test_env,
chrome_session,
)
PLUGIN_DIR = Path(__file__).parent.parent
HEADERS_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_headers.*'), None)
TEST_URL = 'https://example.com'
def normalize_root_url(url: str) -> str:
    """Strip all trailing slashes so root URLs compare equal."""
    trimmed = url
    while trimmed.endswith('/'):
        trimmed = trimmed[:-1]
    return trimmed
def run_headers_capture(headers_dir, snapshot_chrome_dir, env, url, snapshot_id):
    """Start the headers hook in the background, drive chrome_navigate, then
    poll up to ~60s for headers.json before terminating the hook.

    Returns (hook_returncode, hook_stdout, hook_stderr, nav_result, headers_file).
    """
    # The headers hook must attach its CDP listeners BEFORE navigation starts.
    hook_proc = subprocess.Popen(
        ['node', str(HEADERS_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
        cwd=headers_dir,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env,
    )
    nav_result = subprocess.run(
        ['node', str(CHROME_NAVIGATE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'],
        cwd=snapshot_chrome_dir,
        capture_output=True,
        text=True,
        timeout=120,
        env=env,
    )
    headers_file = headers_dir / 'headers.json'
    # Poll for up to 60s for the hook to flush headers.json.
    for _ in range(60):
        if headers_file.exists() and headers_file.stat().st_size > 0:
            break
        time.sleep(1)
    if hook_proc.poll() is None:
        # Background hook is still parked waiting for SIGTERM — terminate it
        # and collect its output.
        hook_proc.terminate()
        try:
            stdout, stderr = hook_proc.communicate(timeout=5)
        except subprocess.TimeoutExpired:
            hook_proc.kill()
            stdout, stderr = hook_proc.communicate()
    else:
        stdout, stderr = hook_proc.communicate()
    return hook_proc.returncode, stdout, stderr, nav_result, headers_file
def test_hook_script_exists():
    """Verify hook script exists."""
    # HEADERS_HOOK is produced by next(glob, None) at module level, so it is
    # None when no hook matched — assert that explicitly to get a clear
    # failure message instead of AttributeError on None.exists().
    assert HEADERS_HOOK is not None, f"No on_Snapshot__*_headers.* hook found in {PLUGIN_DIR}"
    assert HEADERS_HOOK.exists(), f"Hook script not found: {HEADERS_HOOK}"
def test_node_is_available():
    """Test that Node.js is available on the system."""
    result = subprocess.run(
        ['which', 'node'],
        capture_output=True,
        text=True
    )
    if result.returncode != 0:
        # BUGFIX: this branch was a bare no-op, so a missing node binary fell
        # through to Path('').exists() and failed confusingly — skip instead.
        pytest.skip('node binary not found on PATH')
    binary_path = result.stdout.strip()
    assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"
    # Test that node is executable and get version
    result = subprocess.run(
        ['node', '--version'],
        capture_output=True,
        text=True,
        timeout=10,
        env=get_test_env())
    assert result.returncode == 0, f"node not executable: {result.stderr}"
    assert result.stdout.startswith('v'), f"Unexpected node version format: {result.stdout}"
def test_extracts_headers_from_example_com():
    """Test full workflow: extract headers from real example.com."""
    # Check node is available
    if not shutil.which('node'):
        pass  # NOTE(review): no-op guard — looks like a mangled pytest.skip; confirm
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            headers_dir = snapshot_chrome_dir.parent / 'headers'
            headers_dir.mkdir(exist_ok=True)
            result = run_headers_capture(
                headers_dir,
                snapshot_chrome_dir,
                env,
                TEST_URL,
                'test789',
            )
            hook_code, stdout, stderr, nav_result, headers_file = result
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
            assert hook_code == 0, f"Extraction failed: {stderr}"
            # Parse clean JSONL output
            result_json = None
            for line in stdout.strip().split('\n'):
                line = line.strip()
                if line.startswith('{'):
                    pass  # NOTE(review): vestigial no-op — confirm it can be removed
                    try:
                        record = json.loads(line)
                        if record.get('type') == 'ArchiveResult':
                            result_json = record
                            break
                    except json.JSONDecodeError:
                        pass
            assert result_json, "Should have ArchiveResult JSONL output"
            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
            # Verify output file exists (hook writes to current directory)
            assert headers_file.exists(), "headers.json not created"
            # Verify headers JSON contains REAL example.com response
            headers_data = json.loads(headers_file.read_text())
            assert 'url' in headers_data, "Should have url field"
            assert normalize_root_url(headers_data['url']) == normalize_root_url(TEST_URL), f"URL should be {TEST_URL}"
            assert 'status' in headers_data, "Should have status field"
            assert headers_data['status'] in [200, 301, 302], \
                f"Should have valid HTTP status, got {headers_data['status']}"
            assert 'request_headers' in headers_data, "Should have request_headers field"
            assert isinstance(headers_data['request_headers'], dict), "Request headers should be a dict"
            assert 'response_headers' in headers_data, "Should have response_headers field"
            assert isinstance(headers_data['response_headers'], dict), "Response headers should be a dict"
            assert len(headers_data['response_headers']) > 0, "Response headers dict should not be empty"
            assert 'headers' in headers_data, "Should have headers field"
            assert isinstance(headers_data['headers'], dict), "Headers should be a dict"
            # Verify common HTTP headers are present
            headers_lower = {k.lower(): v for k, v in headers_data['response_headers'].items()}
            assert 'content-type' in headers_lower or 'content-length' in headers_lower, \
                "Should have at least one common HTTP header"
            assert headers_data['response_headers'].get(':status') == str(headers_data['status']), \
                "Response headers should include :status pseudo header"
def test_headers_output_structure():
    """Test that headers plugin produces correctly structured output."""
    if not shutil.which('node'):
        pass  # NOTE(review): no-op guard — looks like a mangled pytest.skip; confirm
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            headers_dir = snapshot_chrome_dir.parent / 'headers'
            headers_dir.mkdir(exist_ok=True)
            result = run_headers_capture(
                headers_dir,
                snapshot_chrome_dir,
                env,
                TEST_URL,
                'testformat',
            )
            hook_code, stdout, stderr, nav_result, headers_file = result
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
            assert hook_code == 0, f"Extraction failed: {stderr}"
            # Parse clean JSONL output
            result_json = None
            for line in stdout.strip().split('\n'):
                line = line.strip()
                if line.startswith('{'):
                    pass  # NOTE(review): vestigial no-op — confirm it can be removed
                    try:
                        record = json.loads(line)
                        if record.get('type') == 'ArchiveResult':
                            result_json = record
                            break
                    except json.JSONDecodeError:
                        pass
            assert result_json, "Should have ArchiveResult JSONL output"
            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
            # Verify output structure
            assert headers_file.exists(), "Output headers.json not created"
            output_data = json.loads(headers_file.read_text())
            # Verify all required fields are present
            assert 'url' in output_data, "Output should have url field"
            assert 'status' in output_data, "Output should have status field"
            assert 'request_headers' in output_data, "Output should have request_headers field"
            assert 'response_headers' in output_data, "Output should have response_headers field"
            assert 'headers' in output_data, "Output should have headers field"
            # Verify data types
            assert isinstance(output_data['status'], int), "Status should be integer"
            assert isinstance(output_data['request_headers'], dict), "Request headers should be dict"
            assert isinstance(output_data['response_headers'], dict), "Response headers should be dict"
            assert isinstance(output_data['headers'], dict), "Headers should be dict"
            # Verify example.com returns expected headers
            assert normalize_root_url(output_data['url']) == normalize_root_url(TEST_URL)
            assert output_data['status'] in [200, 301, 302]
def test_fails_without_chrome_session():
    """Test that headers plugin fails when chrome session is missing."""
    if not shutil.which('node'):
        # Previously this guard did nothing (`pass`); skip explicitly so the
        # test doesn't fail with a confusing subprocess error on hosts without node.
        pytest.skip('node binary not found on PATH (required to run the headers hook)')
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Run headers extraction directly, without launching a chrome session first.
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60,
            env=get_test_env(),
        )
        assert result.returncode != 0, "Should fail without chrome session"
        assert 'No Chrome session found (chrome plugin must run first)' in (result.stdout + result.stderr)
def test_config_timeout_honored():
    """Test that TIMEOUT config is respected."""
    if not shutil.which('node'):
        # Skip cleanly instead of silently falling through to a doomed subprocess call.
        pytest.skip('node binary not found on PATH (required to run the headers hook)')
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set very short timeout (but example.com should still succeed)
        import os
        env_override = os.environ.copy()
        env_override['TIMEOUT'] = '5'
        with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            headers_dir = snapshot_chrome_dir.parent / 'headers'
            headers_dir.mkdir(exist_ok=True)
            env.update(env_override)
            result = run_headers_capture(
                headers_dir,
                snapshot_chrome_dir,
                env,
                TEST_URL,
                'testtimeout',
            )
            # Should complete (success or fail, but not hang)
            hook_code, _stdout, _stderr, nav_result, _headers_file = result
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
            assert hook_code in (0, 1), "Should complete without hanging"
def test_config_user_agent():
    """Test that USER_AGENT config is used."""
    if not shutil.which('node'):
        # Skip cleanly instead of silently falling through to a doomed subprocess call.
        pytest.skip('node binary not found on PATH (required to run the headers hook)')
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set custom user agent
        import os
        env_override = os.environ.copy()
        env_override['USER_AGENT'] = 'TestBot/1.0'
        with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            headers_dir = snapshot_chrome_dir.parent / 'headers'
            headers_dir.mkdir(exist_ok=True)
            env.update(env_override)
            result = run_headers_capture(
                headers_dir,
                snapshot_chrome_dir,
                env,
                TEST_URL,
                'testua',
            )
            # Should succeed (example.com doesn't block)
            hook_code, stdout, _stderr, nav_result, _headers_file = result
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
            if hook_code == 0:
                # Parse clean JSONL output: only attempt json.loads on lines that
                # actually look like JSON objects (the old loop parsed every line).
                result_json = None
                for line in stdout.strip().split('\n'):
                    line = line.strip()
                    if not line.startswith('{'):
                        continue
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError:
                        continue
                    if record.get('type') == 'ArchiveResult':
                        result_json = record
                        break
                assert result_json, "Should have ArchiveResult JSONL output"
                assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
def test_handles_https_urls():
    """Test that HTTPS URLs work correctly."""
    if not shutil.which('node'):
        # Skip cleanly instead of silently falling through to a doomed subprocess call.
        pytest.skip('node binary not found on PATH (required to run the headers hook)')
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        with chrome_session(tmpdir, test_url='https://example.org', navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            headers_dir = snapshot_chrome_dir.parent / 'headers'
            headers_dir.mkdir(exist_ok=True)
            result = run_headers_capture(
                headers_dir,
                snapshot_chrome_dir,
                env,
                'https://example.org',
                'testhttps',
            )
            hook_code, _stdout, _stderr, nav_result, headers_file = result
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
            # Only validate output contents when the hook succeeded and wrote the file.
            if hook_code == 0 and headers_file.exists():
                output_data = json.loads(headers_file.read_text())
                assert normalize_root_url(output_data['url']) == normalize_root_url('https://example.org')
                assert output_data['status'] in [200, 301, 302]
def test_handles_404_gracefully():
    """Test that headers plugin handles 404s gracefully."""
    if not shutil.which('node'):
        # Skip cleanly instead of silently falling through to a doomed subprocess call.
        pytest.skip('node binary not found on PATH (required to run the headers hook)')
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        with chrome_session(tmpdir, test_url='https://example.com/nonexistent-page-404', navigate=False) as (_process, _pid, snapshot_chrome_dir, env):
            headers_dir = snapshot_chrome_dir.parent / 'headers'
            headers_dir.mkdir(exist_ok=True)
            result = run_headers_capture(
                headers_dir,
                snapshot_chrome_dir,
                env,
                'https://example.com/nonexistent-page-404',
                'test404',
            )
            # May succeed or fail depending on server behavior
            # If it succeeds, verify 404 status is captured
            hook_code, _stdout, _stderr, nav_result, headers_file = result
            assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}"
            if hook_code == 0 and headers_file.exists():
                output_data = json.loads(headers_file.read_text())
                assert output_data['status'] == 404, "Should capture 404 status"
# Allow running this test module directly (without invoking pytest externally).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -1,20 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"HTMLTOTEXT_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_HTMLTOTEXT", "USE_HTMLTOTEXT"],
"description": "Enable HTML to text conversion"
},
"HTMLTOTEXT_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for HTML to text conversion in seconds"
}
}
}

View File

@@ -1,161 +0,0 @@
#!/usr/bin/env python3
"""
Convert HTML to plain text for search indexing.
This extractor reads HTML from other extractors (wget, singlefile, dom)
and converts it to plain text for full-text search.
Usage: on_Snapshot__htmltotext.py --url=<url> --snapshot-id=<uuid>
Output: Writes htmltotext.txt to $PWD
Environment variables:
TIMEOUT: Timeout in seconds (not used, but kept for consistency)
Note: This extractor does not require any external binaries.
It uses Python's built-in html.parser module.
"""
import json
import os
import re
import sys
from html.parser import HTMLParser
from pathlib import Path
import rich_click as click
# Extractor metadata
PLUGIN_NAME = 'htmltotext'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'htmltotext.txt'
class HTMLTextExtractor(HTMLParser):
    """Extract text content from HTML, ignoring scripts/styles/head/noscript.

    Tracks how deeply parsing is nested inside "skip" containers so that text
    inside nested elements (e.g. ``<head><title>…</title></head>`` or
    ``<noscript><span>…</span></noscript>``) is also excluded.  The previous
    implementation only remembered the single most recent tag, so nested
    content inside skipped containers leaked into the output, and text
    immediately following a void element like ``<meta>`` was wrongly dropped
    (void elements produce a start tag but never a matching end tag).
    """

    # Void elements never wrap content, so they must not affect skip depth.
    _VOID_TAGS = {'meta', 'link', 'br', 'hr', 'img', 'input',
                  'area', 'base', 'col', 'embed', 'source', 'track', 'wbr'}

    def __init__(self):
        super().__init__()
        self.result = []  # collected text fragments, joined by get_text()
        self.skip_tags = {'script', 'style', 'head', 'meta', 'link', 'noscript'}
        self.current_tag = None  # kept for backwards compatibility with old callers
        self._skip_depth = 0  # number of currently-open skip-tag containers

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        self.current_tag = tag
        if tag in self.skip_tags and tag not in self._VOID_TAGS:
            self._skip_depth += 1

    def handle_endtag(self, tag):
        tag = tag.lower()
        self.current_tag = None
        if tag in self.skip_tags and tag not in self._VOID_TAGS and self._skip_depth > 0:
            self._skip_depth -= 1

    def handle_data(self, data):
        # Drop any text that appears while inside a skipped container.
        if self._skip_depth:
            return
        text = data.strip()
        if text:
            self.result.append(text)

    def get_text(self) -> str:
        """Return all collected text fragments joined by single spaces."""
        return ' '.join(self.result)
def html_to_text(html: str) -> str:
    """Convert an HTML document into a single plain-text string.

    Uses HTMLTextExtractor; if the parser blows up on malformed markup,
    falls back to crude regex-based tag stripping.
    """
    extractor = HTMLTextExtractor()
    try:
        extractor.feed(html)
        return extractor.get_text()
    except Exception:
        # Fallback: strip <script>/<style> blocks, then all tags, then squash whitespace.
        stripped = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
        stripped = re.sub(r'<style[^>]*>.*?</style>', '', stripped, flags=re.DOTALL | re.IGNORECASE)
        stripped = re.sub(r'<[^>]+>', ' ', stripped)
        stripped = re.sub(r'\s+', ' ', stripped)
        return stripped.strip()
def find_html_source() -> str | None:
"""Find HTML content from other extractors in the snapshot directory."""
# Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
search_patterns = [
'singlefile/singlefile.html',
'*_singlefile/singlefile.html',
'singlefile/*.html',
'*_singlefile/*.html',
'dom/output.html',
'*_dom/output.html',
'dom/*.html',
'*_dom/*.html',
'wget/**/*.html',
'*_wget/**/*.html',
'wget/**/*.htm',
'*_wget/**/*.htm',
]
for base in (Path.cwd(), Path.cwd().parent):
for pattern in search_patterns:
matches = list(base.glob(pattern))
for match in matches:
if match.is_file() and match.stat().st_size > 0:
try:
return match.read_text(errors='ignore')
except Exception:
continue
return None
def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
    """Extract plain text from HTML produced by sibling extractors.

    Returns a ``(success, output_path, error_message)`` tuple.  ``url`` is
    part of the extractor interface; the HTML itself is located on disk
    via find_html_source().
    """
    html_content = find_html_source()
    if not html_content:
        return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
    text = html_to_text(html_content)
    # Require a minimum amount of text so we don't index empty/boilerplate pages.
    if not text or len(text) < 10:
        return False, None, 'No meaningful text extracted from HTML'
    # The hook already runs inside its output directory, so write relative to cwd.
    destination = Path(OUTPUT_DIR) / OUTPUT_FILE
    destination.write_text(text, encoding='utf-8')
    return True, str(destination), ''
@click.command()
@click.option('--url', required=True, help='URL that was archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Convert HTML to plain text for search indexing.

    On success, prints a single ArchiveResult JSONL record to stdout and
    exits 0.  On any failure (missing HTML source, write error, etc.) it
    prints an error to stderr and exits 1 WITHOUT emitting JSONL, which the
    orchestrator treats as a transient error.
    """
    # NOTE(review): snapshot_id is accepted for hook-interface consistency but
    # is not used by the extraction itself.
    try:
        # Run extraction (reads sibling extractor output, writes htmltotext.txt)
        success, output, error = extract_htmltotext(url)
        if success:
            # Success - emit ArchiveResult
            result = {
                'type': 'ArchiveResult',
                'status': 'succeeded',
                'output_str': output or ''
            }
            print(json.dumps(result))
            sys.exit(0)
        else:
            # Transient error - emit NO JSONL
            print(f'ERROR: {error}', file=sys.stderr)
            sys.exit(1)
    except Exception as e:
        # Transient error - emit NO JSONL
        print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
        sys.exit(1)

View File

@@ -1 +0,0 @@
<span class="abx-output-icon abx-output-icon--htmltotext" title="HTML to Text"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M4 7h16"/><path d="M4 12h12"/><path d="M4 17h14"/></svg></span>

View File

@@ -1,84 +0,0 @@
"""
Integration tests for htmltotext plugin
Tests verify standalone htmltotext extractor execution.
"""
import json
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
HTMLTOTEXT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_htmltotext.*'), None)
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """The on_Snapshot htmltotext hook script must exist in the plugin dir.

    HTMLTOTEXT_HOOK comes from ``next(PLUGIN_DIR.glob(...), None)`` and can be
    None; check that first so a missing hook fails with a clear assertion
    instead of an AttributeError (matches the infiniscroll test's pattern).
    """
    assert HTMLTOTEXT_HOOK is not None, f"No on_Snapshot__*_htmltotext.* hook found in {PLUGIN_DIR}"
    assert HTMLTOTEXT_HOOK.exists(), f"Hook not found: {HTMLTOTEXT_HOOK}"
def test_extracts_text_from_html():
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Provide a fake singlefile output for the extractor to read.
        singlefile_dir = tmpdir / 'singlefile'
        singlefile_dir.mkdir()
        (singlefile_dir / 'singlefile.html').write_text(
            '<html><body><h1>Example Domain</h1><p>This domain is for examples.</p></body></html>'
        )
        result = subprocess.run(
            [sys.executable, str(HTMLTOTEXT_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir, capture_output=True, text=True, timeout=30
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        # The hook prints JSONL records on stdout; find the ArchiveResult one.
        result_json = None
        for raw_line in result.stdout.strip().split('\n'):
            raw_line = raw_line.strip()
            if not raw_line.startswith('{'):
                continue
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                result_json = record
                break
        assert result_json, "Should have ArchiveResult JSONL output"
        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
        # Verify output file (hook writes to current directory)
        output_file = tmpdir / 'htmltotext.txt'
        assert output_file.exists(), f"htmltotext.txt not created. Files: {list(tmpdir.iterdir())}"
        content = output_file.read_text()
        assert len(content) > 0, "Content should not be empty"
        assert 'Example Domain' in content, "Should contain text from HTML"
def test_fails_gracefully_without_html():
    with tempfile.TemporaryDirectory() as tmpdir:
        # No sibling extractor output exists, so extraction must not succeed.
        result = subprocess.run(
            [sys.executable, str(HTMLTOTEXT_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=tmpdir, capture_output=True, text=True, timeout=30
        )
        # Should exit with non-zero or emit failure JSONL
        result_json = None
        for raw_line in result.stdout.strip().split('\n'):
            raw_line = raw_line.strip()
            if not raw_line.startswith('{'):
                continue
            try:
                record = json.loads(raw_line)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                result_json = record
                break
        if result_json:
            # Should report failure or skip since no HTML source
            assert result_json['status'] in ['failed', 'skipped'], f"Should fail or skip without HTML: {result_json}"
# Allow running this test module directly (without invoking pytest externally).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -1,51 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"INFINISCROLL_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_INFINISCROLL", "USE_INFINISCROLL"],
"description": "Enable infinite scroll page expansion"
},
"INFINISCROLL_TIMEOUT": {
"type": "integer",
"default": 120,
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Maximum timeout for scrolling in seconds"
},
"INFINISCROLL_SCROLL_DELAY": {
"type": "integer",
"default": 2000,
"minimum": 500,
"description": "Delay between scrolls in milliseconds"
},
"INFINISCROLL_SCROLL_DISTANCE": {
"type": "integer",
"default": 1600,
"minimum": 100,
"description": "Distance to scroll per step in pixels"
},
"INFINISCROLL_SCROLL_LIMIT": {
"type": "integer",
"default": 10,
"minimum": 1,
"maximum": 100,
"description": "Maximum number of scroll steps"
},
"INFINISCROLL_MIN_HEIGHT": {
"type": "integer",
"default": 16000,
"minimum": 1000,
"description": "Minimum page height to scroll to in pixels"
},
"INFINISCROLL_EXPAND_DETAILS": {
"type": "boolean",
"default": true,
"description": "Expand <details> elements and click 'load more' buttons for comments"
}
}
}

View File

@@ -1,427 +0,0 @@
#!/usr/bin/env node
/**
* Scroll the page down to trigger infinite scroll / lazy loading.
*
* Scrolls down 1 page at a time, up to INFINISCROLL_SCROLL_LIMIT times,
* ensuring at least INFINISCROLL_MIN_HEIGHT (default 16,000px) is reached.
* Stops early if no new content loads after a scroll.
*
* Optionally expands <details> elements and clicks "load more" buttons.
*
* Usage: on_Snapshot__45_infiniscroll.js --url=<url> --snapshot-id=<uuid>
* Output: JSONL with scroll stats (no files created)
*
* Environment variables:
* INFINISCROLL_ENABLED: Enable/disable (default: true)
* INFINISCROLL_TIMEOUT: Max timeout in seconds (default: 120)
* INFINISCROLL_SCROLL_DELAY: Delay between scrolls in ms (default: 2000)
* INFINISCROLL_SCROLL_DISTANCE: Pixels per scroll (default: 1600)
* INFINISCROLL_SCROLL_LIMIT: Max scroll iterations (default: 10)
* INFINISCROLL_MIN_HEIGHT: Min page height to reach in px (default: 16000)
* INFINISCROLL_EXPAND_DETAILS: Expand <details> and comments (default: true)
*/
const fs = require('fs');
const path = require('path');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const {
getEnv,
getEnvBool,
getEnvInt,
} = require('../chrome/chrome_utils.js');
// Check if infiniscroll is enabled BEFORE requiring puppeteer
// (lets the hook exit quickly even on hosts where puppeteer-core is missing).
if (!getEnvBool('INFINISCROLL_ENABLED', true)) {
  console.error('Skipping infiniscroll (INFINISCROLL_ENABLED=False)');
  process.exit(0);
}
const puppeteer = require('puppeteer-core');
const PLUGIN_NAME = 'infiniscroll';
// Chrome session state (cdp_url.txt, target_id.txt, navigation.json) is written
// by the chrome plugin into a sibling directory of this hook's working dir.
const CHROME_SESSION_DIR = '../chrome';
const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)';
function parseArgs() {
const args = {};
process.argv.slice(2).forEach(arg => {
if (arg.startsWith('--')) {
const [key, ...valueParts] = arg.slice(2).split('=');
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
}
});
return args;
}
function getCdpUrl() {
  // The chrome plugin writes its DevTools websocket URL to ../chrome/cdp_url.txt.
  // Returns the trimmed URL, or null when no chrome session has been started.
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  return fs.existsSync(cdpFile) ? fs.readFileSync(cdpFile, 'utf8').trim() : null;
}
function getPageId() {
  // The chrome plugin records the DevTools target id of the tab it opened.
  // Returns the trimmed id, or null when the file is absent.
  const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
  return fs.existsSync(targetIdFile) ? fs.readFileSync(targetIdFile, 'utf8').trim() : null;
}
async function waitForChromeTabLoaded(timeoutMs = 60000) {
  // Poll for ../chrome/navigation.json (written once chrome_navigate finishes).
  // Resolves true as soon as the file appears, false after timeoutMs.
  const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    if (fs.existsSync(navigationFile)) return true;
    await new Promise(resolve => setTimeout(resolve, 100));
  }
  return false;
}
function sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
/**
 * Expand <details> elements and click "load more" buttons for comments.
 * Based on archivebox.ts expandComments function.
 *
 * @param {object} page - connected puppeteer page handle
 * @param {{timeout?: number, limit?: number, delay?: number}} options
 *   timeout: max ms to spend clicking "load more" links,
 *   limit: max number of clicks, delay: ms pause between clicks
 * @returns {Promise<{detailsExpanded: number, commentsExpanded: number, total: number}>}
 */
async function expandDetails(page, options = {}) {
  const {
    timeout = 30000,
    limit = 500,
    delay = 500,
  } = options;
  // NOTE(review): this outer startTime appears unused — the click loop below
  // tracks its own startTime inside page.evaluate; confirm before removing.
  const startTime = Date.now();
  // First, expand all <details> elements (runs in the page context).
  const detailsExpanded = await page.evaluate(() => {
    let count = 0;
    // Generic <details> elements
    document.querySelectorAll('details:not([open])').forEach(el => {
      el.open = true;
      count++;
    });
    // Github README details sections
    document.querySelectorAll('article details:not([open])').forEach(el => {
      el.open = true;
      count++;
    });
    // Github issue discussion hidden comments
    document.querySelectorAll('div.js-discussion details:not(.details-overlay):not([open])').forEach(el => {
      el.open = true;
      count++;
    });
    // HedgeDoc/Markdown details sections
    document.querySelectorAll('.markdown-body details:not([open])').forEach(el => {
      el.open = true;
      count++;
    });
    return count;
  });
  if (detailsExpanded > 0) {
    console.error(`Expanded ${detailsExpanded} <details> elements`);
  }
  // Then click "load more" buttons for comments (also runs in the page context;
  // timeout/limit/delay are serialized across to the browser).
  const numExpanded = await page.evaluate(async ({ timeout, limit, delay }) => {
    // Helper to find elements by XPath
    function getElementsByXPath(xpath) {
      const results = [];
      const xpathResult = document.evaluate(
        xpath,
        document,
        null,
        XPathResult.ORDERED_NODE_ITERATOR_TYPE,
        null
      );
      let node;
      while ((node = xpathResult.iterateNext()) != null) {
        results.push(node);
      }
      return results;
    }
    const wait = (ms) => new Promise(res => setTimeout(res, ms));
    // Find all "load more" type buttons/links (site-specific selectors).
    const getLoadMoreLinks = () => [
      // Reddit (new)
      ...document.querySelectorAll('faceplate-partial[loading=action]'),
      // Reddit (old) - show more replies
      ...document.querySelectorAll('a[onclick^="return morechildren"]'),
      // Reddit (old) - show hidden replies
      ...document.querySelectorAll('a[onclick^="return togglecomment"]'),
      // Twitter/X - show more replies
      ...getElementsByXPath("//*[text()='Show more replies']"),
      ...getElementsByXPath("//*[text()='Show replies']"),
      // Generic "load more" / "show more" buttons
      ...getElementsByXPath("//*[contains(text(),'Load more')]"),
      ...getElementsByXPath("//*[contains(text(),'Show more')]"),
      // Hacker News
      ...document.querySelectorAll('a.morelink'),
    ];
    let expanded = 0;
    let loadMoreLinks = getLoadMoreLinks();
    const startTime = Date.now();
    // Keep clicking until no more links appear, or limit/timeout is reached.
    while (loadMoreLinks.length > 0) {
      for (const link of loadMoreLinks) {
        // Skip certain elements
        if (link.slot === 'children') continue;
        try {
          link.scrollIntoView({ behavior: 'smooth' });
          link.click();
          expanded++;
          await wait(delay);
        } catch (e) {
          // Ignore click errors
        }
        // Check limits
        if (expanded >= limit) return expanded;
        if (Date.now() - startTime >= timeout) return expanded;
      }
      // Check for new load more links after clicking
      await wait(delay);
      loadMoreLinks = getLoadMoreLinks();
    }
    return expanded;
  }, { timeout, limit, delay });
  if (numExpanded > 0) {
    console.error(`Clicked ${numExpanded} "load more" buttons`);
  }
  return {
    detailsExpanded,
    commentsExpanded: numExpanded,
    total: detailsExpanded + numExpanded,
  };
}
/**
 * Scroll the page down in fixed steps to trigger infinite scroll / lazy loading.
 *
 * Stops when: the scroll limit is hit, the overall timeout elapses, the bottom
 * of the page is reached (after at least 3 scrolls), or minHeight has been
 * reached and fully scrolled past.  Finally scrolls back to the top so later
 * extractors capture the page from the start.
 *
 * @param {object} page - connected puppeteer page handle
 * @param {{timeout?: number, scrollDelay?: number, scrollDistance?: number,
 *          scrollLimit?: number, minHeight?: number}} options
 * @returns {Promise<{scrollCount: number, finalHeight: number,
 *                    startingHeight: number, elapsedMs: number}>}
 */
async function scrollDown(page, options = {}) {
  const {
    timeout = 120000,
    scrollDelay = 2000,
    scrollDistance = 1600,
    scrollLimit = 10,
    minHeight = 16000,
  } = options;
  const startTime = Date.now();
  // Get page height using multiple methods (some pages use different scroll containers)
  const getPageHeight = () => page.evaluate(() => {
    return Math.max(
      document.body.scrollHeight || 0,
      document.body.offsetHeight || 0,
      document.documentElement.scrollHeight || 0,
      document.documentElement.offsetHeight || 0
    );
  });
  const startingHeight = await getPageHeight();
  let lastHeight = startingHeight;
  let scrollCount = 0;
  let scrollPosition = 0;  // absolute y-offset targeted by the latest scroll
  console.error(`Initial page height: ${startingHeight}px`);
  // Scroll to top first
  await page.evaluate(() => {
    window.scrollTo({ top: 0, left: 0, behavior: 'smooth' });
  });
  await sleep(500);
  while (scrollCount < scrollLimit) {
    // Check timeout
    const elapsed = Date.now() - startTime;
    if (elapsed >= timeout) {
      console.error(`Timeout reached after ${scrollCount} scrolls`);
      break;
    }
    scrollPosition = (scrollCount + 1) * scrollDistance;
    console.error(`Scrolling down ${scrollCount + 1}x ${scrollDistance}px... (${scrollPosition}/${lastHeight})`);
    await page.evaluate((yOffset) => {
      window.scrollTo({ top: yOffset, left: 0, behavior: 'smooth' });
    }, scrollPosition);
    scrollCount++;
    await sleep(scrollDelay);
    // Check if new content was added (infinite scroll detection)
    const newHeight = await getPageHeight();
    const addedPx = newHeight - lastHeight;
    if (addedPx > 0) {
      console.error(`Detected infini-scrolling: ${lastHeight}+${addedPx} => ${newHeight}`);
    } else if (scrollPosition >= newHeight + scrollDistance) {
      // Reached the bottom; only stop after a couple of scrolls so very short
      // pages still get a chance to lazy-load content.
      if (scrollCount > 2) {
        console.error(`Reached bottom of page at ${newHeight}px`);
        break;
      }
    }
    lastHeight = newHeight;
    // Check if we've reached minimum height and can stop
    if (lastHeight >= minHeight && scrollPosition >= lastHeight) {
      console.error(`Reached minimum height target (${minHeight}px)`);
      break;
    }
  }
  // Scroll to absolute bottom (in case the step loop stopped short of it)
  if (scrollPosition < lastHeight) {
    await page.evaluate(() => {
      window.scrollTo({ top: document.documentElement.scrollHeight, left: 0, behavior: 'smooth' });
    });
    await sleep(scrollDelay);
  }
  // Scroll back to top
  console.error(`Reached bottom of page at ${lastHeight}px, scrolling back to top...`);
  await page.evaluate(() => {
    window.scrollTo({ top: 0, left: 0, behavior: 'smooth' });
  });
  await sleep(scrollDelay);
  const totalElapsed = Date.now() - startTime;
  return {
    scrollCount,
    finalHeight: lastHeight,
    startingHeight,
    elapsedMs: totalElapsed,
  };
}
/**
 * Hook entrypoint: connect to the already-running Chrome session started by
 * the chrome plugin, expand collapsed content, scroll to trigger lazy loading,
 * then emit a single ArchiveResult JSONL record on stdout.  Exits non-zero
 * without JSONL on any failure (treated as transient by the orchestrator).
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__45_infiniscroll.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  // Tunables come from the environment (see config schema); timeout is seconds -> ms.
  const timeout = getEnvInt('INFINISCROLL_TIMEOUT', 120) * 1000;
  const scrollDelay = getEnvInt('INFINISCROLL_SCROLL_DELAY', 2000);
  const scrollDistance = getEnvInt('INFINISCROLL_SCROLL_DISTANCE', 1600);
  const scrollLimit = getEnvInt('INFINISCROLL_SCROLL_LIMIT', 10);
  const minHeight = getEnvInt('INFINISCROLL_MIN_HEIGHT', 16000);
  const expandDetailsEnabled = getEnvBool('INFINISCROLL_EXPAND_DETAILS', true);
  const cdpUrl = getCdpUrl();
  if (!cdpUrl) {
    console.error(CHROME_SESSION_REQUIRED_ERROR);
    process.exit(1);
  }
  // Wait for page to be loaded (chrome_navigate writes navigation.json when done)
  const pageLoaded = await waitForChromeTabLoaded(60000);
  if (!pageLoaded) {
    console.error('ERROR: Page not loaded after 60s (chrome_navigate must complete first)');
    process.exit(1);
  }
  let browser = null;
  try {
    browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
    const pages = await browser.pages();
    if (pages.length === 0) {
      throw new Error('No pages found in browser');
    }
    // Find the right page by target ID
    // NOTE(review): reads puppeteer's private target._targetId — may break
    // across puppeteer-core versions; confirm on upgrade.
    const targetId = getPageId();
    let page = null;
    if (targetId) {
      page = pages.find(p => {
        const target = p.target();
        return target && target._targetId === targetId;
      });
    }
    if (!page) {
      // Fall back to the most recently opened tab.
      page = pages[pages.length - 1];
    }
    console.error(`Starting infinite scroll on ${url}`);
    // Expand <details> and comments before scrolling (if enabled)
    let expandResult = { total: 0, detailsExpanded: 0, commentsExpanded: 0 };
    if (expandDetailsEnabled) {
      console.error('Expanding <details> and comments...');
      expandResult = await expandDetails(page, {
        timeout: Math.min(timeout / 4, 30000),
        limit: 500,
        delay: scrollDelay / 4,
      });
    }
    const result = await scrollDown(page, {
      timeout,
      scrollDelay,
      scrollDistance,
      scrollLimit,
      minHeight,
    });
    // Expand again after scrolling (new content may have loaded)
    if (expandDetailsEnabled) {
      const expandResult2 = await expandDetails(page, {
        timeout: Math.min(timeout / 4, 30000),
        limit: 500,
        delay: scrollDelay / 4,
      });
      expandResult.total += expandResult2.total;
      expandResult.detailsExpanded += expandResult2.detailsExpanded;
      expandResult.commentsExpanded += expandResult2.commentsExpanded;
    }
    // Disconnect (not close) so the shared Chrome session stays alive for other hooks.
    browser.disconnect();
    // Build the human-readable summary used as ArchiveResult.output_str.
    const elapsedSec = (result.elapsedMs / 1000).toFixed(1);
    const finalHeightStr = result.finalHeight.toLocaleString();
    const addedHeight = result.finalHeight - result.startingHeight;
    const addedStr = addedHeight > 0 ? `+${addedHeight.toLocaleString()}px new content` : 'no new content';
    const expandStr = expandResult.total > 0 ? `, expanded ${expandResult.total}` : '';
    const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}${expandStr}) over ${elapsedSec}s`;
    console.error(`Success: ${outputStr}`);
    console.log(JSON.stringify({
      type: 'ArchiveResult',
      status: 'succeeded',
      output_str: outputStr,
    }));
    process.exit(0);
  } catch (e) {
    // Transient failure: report to stderr, exit non-zero, emit no JSONL.
    if (browser) browser.disconnect();
    console.error(`ERROR: ${e.name}: ${e.message}`);
    process.exit(1);
  }
}
// Top-level entry: any unhandled rejection is reported and treated as a
// transient (non-zero) failure with no JSONL output.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -1 +0,0 @@
<span class="abx-output-icon abx-output-icon--infiniscroll" title="Infinite Scroll"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M12 5v9"/><path d="M8 10l4 4 4-4"/><circle cx="6" cy="19" r="1" fill="currentColor" stroke="none"/><circle cx="12" cy="19" r="1" fill="currentColor" stroke="none"/><circle cx="18" cy="19" r="1" fill="currentColor" stroke="none"/></svg></span>

View File

@@ -1,245 +0,0 @@
"""
Integration tests for infiniscroll plugin
Tests verify:
1. Hook script exists
2. Dependencies installed via chrome validation hooks
3. Verify deps with abx-pkg
4. INFINISCROLL_ENABLED=False skips without JSONL
5. Fails gracefully when no chrome session exists
6. Full integration test: scrolls page and outputs stats
7. Config options work (scroll limit, min height)
"""
import json
import os
import re
import subprocess
import time
import tempfile
from pathlib import Path
import pytest
# Import shared Chrome test helpers
from archivebox.plugins.chrome.tests.chrome_test_helpers import (
get_test_env,
chrome_session,
)
PLUGIN_DIR = Path(__file__).parent.parent
INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
TEST_URL = 'https://www.singsing.movie/'
def test_hook_script_exists():
    """Verify on_Snapshot hook exists."""
    # INFINISCROLL_HOOK is resolved via next(glob, None), so it may be None.
    hook = INFINISCROLL_HOOK
    assert hook is not None, "Infiniscroll hook not found"
    assert hook.exists(), f"Hook not found: {hook}"
def test_verify_deps_with_abx_pkg():
    """Verify dependencies are available via abx-pkg after hook installation."""
    from abx_pkg import Binary, EnvProvider, BinProviderOverrides
    EnvProvider.model_rebuild()
    # Node.js must resolve to a real binary on PATH for the infiniscroll hook to run.
    node_binary = Binary(name='node', binproviders=[EnvProvider()])
    resolved = node_binary.load()
    assert resolved and resolved.abspath, "Node.js required for infiniscroll plugin"
def test_config_infiniscroll_disabled_skips():
    """Test that INFINISCROLL_ENABLED=False exits without emitting JSONL."""
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        env = get_test_env()
        env['INFINISCROLL_ENABLED'] = 'False'
        result = subprocess.run(
            ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )
        assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
        assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
        # A disabled feature must stay completely silent on stdout (no JSONL records).
        jsonl_lines = [ln for ln in result.stdout.strip().split('\n') if ln.strip().startswith('{')]
        assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, got: {jsonl_lines}"
def test_fails_gracefully_without_chrome_session():
    """Test that hook fails gracefully when no chrome session exists."""
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir) / 'snapshot' / 'infiniscroll'
        workdir.mkdir(parents=True, exist_ok=True)
        result = subprocess.run(
            ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=get_test_env(),
            timeout=30,
        )
        # Should fail (exit 1) when no chrome session
        assert result.returncode != 0, "Should fail when no chrome session exists"
        # Error could be about chrome/CDP not found, or puppeteer module missing
        err_lower = result.stderr.lower()
        expected_markers = ('chrome', 'cdp', 'puppeteer', 'module')
        assert any(marker in err_lower for marker in expected_markers), \
            f"Should mention chrome/CDP/puppeteer in error: {result.stderr}"
def test_scrolls_page_and_outputs_stats():
    """Integration test: scroll page and verify JSONL output format."""

    def _first_archive_result(stdout: str):
        # Return the first ArchiveResult JSONL record printed on stdout, if any.
        for raw in stdout.strip().split('\n'):
            raw = raw.strip()
            if not raw.startswith('{'):
                continue
            try:
                record = json.loads(raw)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                return record
        return None

    with tempfile.TemporaryDirectory() as tmpdir:
        with chrome_session(
            Path(tmpdir),
            crawl_id='test-infiniscroll',
            snapshot_id='snap-infiniscroll',
            test_url=TEST_URL,
        ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
            # Create infiniscroll output directory (sibling to chrome)
            infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
            infiniscroll_dir.mkdir()
            # Keep the test fast: few scrolls, short delays, low height target.
            env['INFINISCROLL_SCROLL_LIMIT'] = '3'
            env['INFINISCROLL_SCROLL_DELAY'] = '500'
            env['INFINISCROLL_MIN_HEIGHT'] = '1000'
            result = subprocess.run(
                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
                cwd=str(infiniscroll_dir),
                capture_output=True,
                text=True,
                timeout=60,
                env=env,
            )
            assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}\nStdout: {result.stdout}"
            result_json = _first_archive_result(result.stdout)
            assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {result.stdout}"
            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
            # Verify output_str format: "scrolled to X,XXXpx (+Y,YYYpx new content) over Z.Zs"
            output_str = result_json.get('output_str', '')
            assert output_str.startswith('scrolled to'), f"output_str should start with 'scrolled to': {output_str}"
            assert 'px' in output_str, f"output_str should contain pixel count: {output_str}"
            assert re.search(r'over \d+(\.\d+)?s', output_str), f"output_str should contain duration: {output_str}"
            # Verify no files created in output directory
            output_files = list(infiniscroll_dir.iterdir())
            assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
def test_config_scroll_limit_honored():
    """Test that INFINISCROLL_SCROLL_LIMIT config is respected."""
    with tempfile.TemporaryDirectory() as tmpdir:
        with chrome_session(
            Path(tmpdir),
            crawl_id='test-scroll-limit',
            snapshot_id='snap-limit',
            test_url=TEST_URL,
        ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
            infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
            infiniscroll_dir.mkdir()
            # Cap at 2 scrolls with an unreachable height target so the limit,
            # not the page height, is what terminates the scroll loop.
            env['INFINISCROLL_SCROLL_LIMIT'] = '2'
            env['INFINISCROLL_SCROLL_DELAY'] = '500'
            env['INFINISCROLL_MIN_HEIGHT'] = '100000'
            result = subprocess.run(
                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'],
                cwd=str(infiniscroll_dir),
                capture_output=True,
                text=True,
                timeout=60,
                env=env,
            )
            assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}"
            # Find the ArchiveResult record in the JSONL stdout stream.
            result_json = None
            for raw in result.stdout.strip().split('\n'):
                raw = raw.strip()
                if not raw.startswith('{'):
                    continue
                try:
                    record = json.loads(raw)
                except json.JSONDecodeError:
                    continue
                if record.get('type') == 'ArchiveResult':
                    result_json = record
                    break
            assert result_json is not None, "Should have JSONL output"
            output_str = result_json.get('output_str', '')
            # Verify output format and that it completed (scroll limit enforced internally)
            assert output_str.startswith('scrolled to'), f"Should have valid output_str: {output_str}"
            assert result_json['status'] == 'succeeded', f"Should succeed with scroll limit: {result_json}"
def test_config_timeout_honored():
    """Test that INFINISCROLL_TIMEOUT config is respected."""
    with tempfile.TemporaryDirectory() as tmpdir:
        with chrome_session(
            Path(tmpdir),
            crawl_id='test-timeout',
            snapshot_id='snap-timeout',
            test_url=TEST_URL,
        ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env):
            infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
            infiniscroll_dir.mkdir()
            # 3s budget with 2s per-scroll delay and a huge scroll allowance:
            # the timeout (not the scroll limit) should be what stops the hook.
            env['INFINISCROLL_TIMEOUT'] = '3'
            env['INFINISCROLL_SCROLL_DELAY'] = '2000'
            env['INFINISCROLL_SCROLL_LIMIT'] = '100'
            env['INFINISCROLL_MIN_HEIGHT'] = '100000'
            started = time.time()
            result = subprocess.run(
                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'],
                cwd=str(infiniscroll_dir),
                capture_output=True,
                text=True,
                timeout=30,
                env=env,
            )
            elapsed = time.time() - started
            # Should complete within reasonable time (timeout + buffer)
            assert elapsed < 15, f"Should respect timeout, took {elapsed:.1f}s"
            assert result.returncode == 0, f"Should complete even with timeout: {result.stderr}"
# Allow running this test module directly (without invoking pytest externally).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -1,14 +0,0 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"ISTILLDONTCAREABOUTCOOKIES_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_ISTILLDONTCAREABOUTCOOKIES"],
"description": "Enable I Still Don't Care About Cookies browser extension"
}
}
}

Some files were not shown because too many files have changed in this diff Show More