diff --git a/.claude/settings.local.json b/.claude/settings.local.json index abce917c..77ce73ec 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -30,7 +30,8 @@ "WebFetch(domain:python-statemachine.readthedocs.io)", "Bash(./bin/run_plugin_tests.sh:*)", "Bash(done)", - "Bash(coverage erase:*)" + "Bash(coverage erase:*)", + "Bash(gh api:*)" ] }, "hooks": { diff --git a/README.md b/README.md index 9a74338e..40598258 100644 --- a/README.md +++ b/README.md @@ -491,6 +491,7 @@ docker run -it -v $PWD:/data archivebox/archivebox help # optional: import your browser cookies into a persona for logged-in archiving archivebox persona create --import=chrome personal # supported: chrome/chromium/brave/edge (Chromium-based only) +# use --profile to target a specific profile (e.g. Default, Profile 1) # re-running import merges/dedupes cookies.txt (by domain/path/name) but replaces chrome_user_data ``` diff --git a/archivebox/__init__.py b/archivebox/__init__.py index 7d471b40..40eb6692 100755 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -18,6 +18,7 @@ from pathlib import Path # Import uuid_compat early to monkey-patch uuid.uuid7 before Django loads migrations # This fixes migrations generated on Python 3.14+ that reference uuid.uuid7 directly from archivebox import uuid_compat # noqa: F401 +from abx_plugins import get_plugins_dir # Force unbuffered output for real-time logs if hasattr(sys.stdout, 'reconfigure'): @@ -56,9 +57,13 @@ check_io_encoding() # Install monkey patches for third-party libraries from .misc.monkey_patches import * # noqa -# Built-in plugin directories -BUILTIN_PLUGINS_DIR = PACKAGE_DIR / 'plugins' -USER_PLUGINS_DIR = Path(os.getcwd()) / 'plugins' +# Plugin directories +BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve() +USER_PLUGINS_DIR = Path( + os.environ.get('ARCHIVEBOX_USER_PLUGINS_DIR') + or os.environ.get('USER_PLUGINS_DIR') + or os.environ.get('DATA_DIR', os.getcwd()) +) / 'custom_plugins' # These are 
kept for backwards compatibility with existing code # that checks for plugins. The new hook system uses discover_hooks() diff --git a/archivebox/cli/archivebox_persona.py b/archivebox/cli/archivebox_persona.py index 4a53e513..1e1d4e60 100644 --- a/archivebox/cli/archivebox_persona.py +++ b/archivebox/cli/archivebox_persona.py @@ -33,6 +33,7 @@ import shutil import platform import subprocess import tempfile +import json from pathlib import Path from typing import Optional, Iterable from collections import OrderedDict @@ -138,6 +139,55 @@ def get_edge_user_data_dir() -> Optional[Path]: return None +def get_browser_binary(browser: str) -> Optional[str]: + system = platform.system() + home = Path.home() + browser = browser.lower() + + if system == 'Darwin': + candidates = { + 'chrome': ['/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'], + 'chromium': ['/Applications/Chromium.app/Contents/MacOS/Chromium'], + 'brave': ['/Applications/Brave Browser.app/Contents/MacOS/Brave Browser'], + 'edge': ['/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge'], + }.get(browser, []) + elif system == 'Linux': + candidates = { + 'chrome': ['/usr/bin/google-chrome', '/usr/bin/google-chrome-stable', '/usr/bin/google-chrome-beta', '/usr/bin/google-chrome-unstable'], + 'chromium': ['/usr/bin/chromium', '/usr/bin/chromium-browser'], + 'brave': ['/usr/bin/brave-browser', '/usr/bin/brave-browser-beta', '/usr/bin/brave-browser-nightly'], + 'edge': ['/usr/bin/microsoft-edge', '/usr/bin/microsoft-edge-stable', '/usr/bin/microsoft-edge-beta', '/usr/bin/microsoft-edge-dev'], + }.get(browser, []) + elif system == 'Windows': + local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local')) + candidates = { + 'chrome': [ + str(local_app_data / 'Google' / 'Chrome' / 'Application' / 'chrome.exe'), + 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe', + 'C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe', + ], + 'chromium': 
[str(local_app_data / 'Chromium' / 'Application' / 'chrome.exe')], + 'brave': [ + str(local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'Application' / 'brave.exe'), + 'C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe', + 'C:\\Program Files (x86)\\BraveSoftware\\Brave-Browser\\Application\\brave.exe', + ], + 'edge': [ + str(local_app_data / 'Microsoft' / 'Edge' / 'Application' / 'msedge.exe'), + 'C:\\Program Files\\Microsoft\\Edge\\Application\\msedge.exe', + 'C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe', + ], + }.get(browser, []) + else: + candidates = [] + + for candidate in candidates: + if candidate and Path(candidate).exists(): + return candidate + + return None + + BROWSER_PROFILE_FINDERS = { 'chrome': get_chrome_user_data_dir, 'chromium': get_chrome_user_data_dir, # Same locations @@ -194,7 +244,12 @@ def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None: _write_netscape_cookies(existing_file, existing) -def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool: +def extract_cookies_via_cdp( + user_data_dir: Path, + output_file: Path, + profile_dir: str | None = None, + chrome_binary: str | None = None, +) -> bool: """ Launch Chrome with the given user data dir and extract cookies via CDP. 
@@ -218,6 +273,8 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool: env['NODE_MODULES_DIR'] = str(node_modules_dir) env['CHROME_USER_DATA_DIR'] = str(user_data_dir) env['CHROME_HEADLESS'] = 'true' + if chrome_binary: + env['CHROME_BINARY'] = str(chrome_binary) output_path = output_file temp_output = None temp_dir = None @@ -225,6 +282,23 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool: temp_dir = Path(tempfile.mkdtemp(prefix='ab_cookies_')) temp_output = temp_dir / 'cookies.txt' output_path = temp_output + if profile_dir: + extra_arg = f'--profile-directory={profile_dir}' + existing_extra = env.get('CHROME_ARGS_EXTRA', '').strip() + args_list = [] + if existing_extra: + if existing_extra.startswith('['): + try: + parsed = json.loads(existing_extra) + if isinstance(parsed, list): + args_list.extend(str(x) for x in parsed) + except Exception: + args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()]) + else: + args_list.extend([s.strip() for s in existing_extra.split(',') if s.strip()]) + args_list.append(extra_arg) + env['CHROME_ARGS_EXTRA'] = json.dumps(args_list) + env['COOKIES_OUTPUT_FILE'] = str(output_path) try: @@ -322,6 +396,7 @@ def ensure_path_within_personas_dir(persona_path: Path) -> bool: def create_personas( names: Iterable[str], import_from: Optional[str] = None, + profile: Optional[str] = None, ) -> int: """ Create Personas from names. 
@@ -360,6 +435,15 @@ def create_personas( rprint(f'[dim]Found {import_from} profile: {source_profile_dir}[/dim]', file=sys.stderr) + if profile is None and (source_profile_dir / 'Default').exists(): + profile = 'Default' + + browser_binary = get_browser_binary(import_from) + if browser_binary: + rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr) + else: + browser_binary = None + created_count = 0 for name in name_list: name = name.strip() @@ -414,7 +498,12 @@ def create_personas( # Extract cookies via CDP rprint(f'[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr) - if extract_cookies_via_cdp(persona_chrome_dir, cookies_file): + if extract_cookies_via_cdp( + persona_chrome_dir, + cookies_file, + profile_dir=profile, + chrome_binary=browser_binary, + ): rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr) else: rprint(f'[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr) @@ -652,9 +741,10 @@ def main(): @main.command('create') @click.argument('names', nargs=-1) @click.option('--import', 'import_from', help='Import profile from browser (chrome, chromium, brave, edge)') -def create_cmd(names: tuple, import_from: Optional[str]): +@click.option('--profile', help='Profile directory name under the user data dir (e.g. 
Default, Profile 1)') +def create_cmd(names: tuple, import_from: Optional[str], profile: Optional[str]): """Create Personas, optionally importing from a browser profile.""" - sys.exit(create_personas(names, import_from=import_from)) + sys.exit(create_personas(names, import_from=import_from, profile=profile)) @main.command('list') diff --git a/archivebox/config/views.py b/archivebox/config/views.py index 67805c7d..316e1aa3 100644 --- a/archivebox/config/views.py +++ b/archivebox/config/views.py @@ -277,7 +277,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext: # Show a helpful message when no plugins found rows['Name'].append('(no plugins found)') rows['Source'].append('-') - rows['Path'].append(mark_safe('archivebox/plugins/ or data/plugins/')) + rows['Path'].append(mark_safe('abx_plugins/plugins/ or data/custom_plugins/')) rows['Hooks'].append('-') rows['Config'].append('-') diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py index 25c89e15..6d01c25b 100644 --- a/archivebox/core/admin_snapshots.py +++ b/archivebox/core/admin_snapshots.py @@ -140,6 +140,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', TagNameListFilter) fieldsets = ( + ('Actions', { + 'fields': ('admin_actions',), + 'classes': ('card', 'wide', 'actions-card'), + }), ('URL', { 'fields': ('url', 'title'), 'classes': ('card', 'wide'), @@ -168,10 +172,6 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): 'fields': ('output_dir',), 'classes': ('card',), }), - ('Actions', { - 'fields': ('admin_actions',), - 'classes': ('card', 'wide'), - }), ('Archive Results', { 'fields': ('archiveresults_list',), 'classes': ('card', 'wide'), @@ -179,7 +179,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): ) ordering = ['-created_at'] - actions = ['add_tags', 
'remove_tags', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] + actions = ['add_tags', 'remove_tags', 'resnapshot_snapshot', 'update_snapshots', 'overwrite_snapshots', 'delete_snapshots'] inlines = [] # Removed TagInline, using TagEditorWidget instead list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000) @@ -301,6 +301,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): # obj.pk, # ) + @admin.display(description='') def admin_actions(self, obj): summary_url = build_web_url(f'/{obj.archive_path}') results_url = build_web_url(f'/{obj.archive_path}/index.html#all') @@ -311,13 +312,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): href="{}" onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';" onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';"> - šŸ“„ Summary Page + šŸ“„ View Snapshot - šŸ“ Result Files + šŸ“ All files - - ā¬‡ļø Finish - - šŸ†• Archive Again + šŸ†• Archive Now + + + šŸ” Redo Failed Any | None: # Hook scripts are now used instead of Python plugin modules - # The plugin name maps to hooks in archivebox/plugins/{plugin}/ + # The plugin name maps to hooks in abx_plugins/plugins/{plugin}/ return None def output_exists(self) -> bool: diff --git a/archivebox/core/templatetags/core_tags.py b/archivebox/core/templatetags/core_tags.py index e9a38023..859a4c6f 100644 --- a/archivebox/core/templatetags/core_tags.py +++ b/archivebox/core/templatetags/core_tags.py @@ -349,15 +349,6 @@ def plugin_name(value: str) -> str: return get_plugin_name(value) -@register.filter -def plugin_display_name(value: str) -> str: - """ - Human-friendly plugin name overrides for UI display. 
- """ - name = get_plugin_name(value) - if name == 'merkletree': - return 'hashes' - return name @register.simple_tag(takes_context=True) diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 7225cd8e..fb7fabe7 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -1145,13 +1145,31 @@ def live_progress_view(request): for proc in running_workers: env = proc.env or {} if not isinstance(env, dict): - continue + env = {} + + cmd = proc.cmd or [] if proc.worker_type == 'crawl': crawl_id = env.get('CRAWL_ID') + if not crawl_id: + for i, part in enumerate(cmd): + if part == '--crawl-id' and i + 1 < len(cmd): + crawl_id = cmd[i + 1] + break + if part.startswith('--crawl-id='): + crawl_id = part.split('=', 1)[1] + break if crawl_id: crawl_worker_pids[str(crawl_id)] = proc.pid elif proc.worker_type == 'snapshot': snapshot_id = env.get('SNAPSHOT_ID') + if not snapshot_id: + for i, part in enumerate(cmd): + if part == '--snapshot-id' and i + 1 < len(cmd): + snapshot_id = cmd[i + 1] + break + if part.startswith('--snapshot-id='): + snapshot_id = part.split('=', 1)[1] + break if snapshot_id: snapshot_worker_pids[str(snapshot_id)] = proc.pid @@ -1243,7 +1261,7 @@ def live_progress_view(request): 'plugin': ar.plugin, 'status': status, } - if ar.process_id and ar.process and ar.process.status == Process.StatusChoices.RUNNING: + if status == ArchiveResult.StatusChoices.STARTED and ar.process_id and ar.process: plugin_payload['pid'] = ar.process.pid if status == ArchiveResult.StatusChoices.STARTED: plugin_payload['progress'] = progress_value diff --git a/archivebox/hooks.py b/archivebox/hooks.py index b8429c11..1fab24af 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -6,8 +6,8 @@ with ArchiveBox via CLI arguments and stdout JSON output. This keeps the plugin system simple and language-agnostic. Directory structure: - archivebox/plugins//on___. (built-in) - data/plugins//on___. (user) + abx_plugins/plugins//on___. 
(built-in package) + data/custom_plugins//on___. (user) Hook contract: Input: --url= (and other --key=value args) @@ -66,14 +66,20 @@ from functools import lru_cache from pathlib import Path from typing import List, Dict, Any, Optional, TypedDict +from abx_plugins import get_plugins_dir from django.conf import settings from django.utils import timezone from django.utils.safestring import mark_safe +from archivebox.config.constants import CONSTANTS # Plugin directories -BUILTIN_PLUGINS_DIR = Path(__file__).parent / 'plugins' -USER_PLUGINS_DIR = Path(getattr(settings, 'DATA_DIR', Path.cwd())) / 'plugins' +BUILTIN_PLUGINS_DIR = Path(get_plugins_dir()).resolve() +USER_PLUGINS_DIR = Path( + os.environ.get('ARCHIVEBOX_USER_PLUGINS_DIR') + or getattr(settings, 'USER_PLUGINS_DIR', '') + or str(CONSTANTS.USER_PLUGINS_DIR) +).expanduser() # ============================================================================= @@ -197,11 +203,11 @@ def discover_hooks( for hook in hooks: # Get plugin name from parent directory - # e.g., archivebox/plugins/wget/on_Snapshot__50_wget.py -> 'wget' + # e.g., abx_plugins/plugins/wget/on_Snapshot__50_wget.py -> 'wget' plugin_name = hook.parent.name # Check if this is a plugin directory (not the root plugins dir) - if plugin_name in ('plugins', '.'): + if hook.parent.resolve() in (BUILTIN_PLUGINS_DIR.resolve(), USER_PLUGINS_DIR.resolve()): # Hook is in root plugins directory, not a plugin subdir # Include it by default (no filtering for non-plugin hooks) enabled_hooks.append(hook) @@ -581,7 +587,7 @@ def get_plugins() -> List[str]: The plugin name is the plugin directory name, not the hook script name. Example: - archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js + abx_plugins/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js -> plugin = 'chrome' Sorted alphabetically (plugins control their hook order via numeric prefixes in hook names). 
@@ -728,7 +734,7 @@ def discover_plugins_that_provide_interface( try: # Import the module dynamically spec = importlib.util.spec_from_file_location( - f'archivebox.plugins.{plugin_name}.{module_name}', + f'archivebox.dynamic_plugins.{plugin_name}.{module_name}', module_path ) if spec is None or spec.loader is None: @@ -942,7 +948,7 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[ # Plugins can provide custom templates for rendering their output in the UI. # Templates are discovered by filename convention inside each plugin's templates/ dir: # -# archivebox/plugins// +# abx_plugins/plugins// # templates/ # icon.html # Icon for admin table view (small inline HTML) # card.html # Preview card for snapshot header diff --git a/archivebox/plugins/search_backend_ripgrep/__init__.py b/archivebox/ideas/__init__.py similarity index 100% rename from archivebox/plugins/search_backend_ripgrep/__init__.py rename to archivebox/ideas/__init__.py diff --git a/archivebox/ideas/process_plugin.py b/archivebox/ideas/process_plugin.py new file mode 100644 index 00000000..cca7e743 --- /dev/null +++ b/archivebox/ideas/process_plugin.py @@ -0,0 +1,318 @@ +__package__ = 'archivebox.ideas' + +import asyncio +import json +import os +import shlex +import signal +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, Callable, Mapping, MutableMapping, Optional + +from pydantic import BaseModel, Field + +try: + from bubus import BaseEvent, EventBus +except Exception as exc: # pragma: no cover - optional dependency + raise ImportError('ProcessPlugin requires bubus to be installed') from exc + +try: + from bubus.service import uuid7str +except Exception: # pragma: no cover - optional dependency + from uuid import uuid4 as _uuid4 + + def uuid7str() -> str: + return str(_uuid4()) + + +def _utcnow() -> datetime: + return datetime.now(timezone.utc) + + +class ProcessRecord(BaseModel): + id: str = 
Field(default_factory=uuid7str) + cmd: list[str] + cwd: str | None = None + env: dict[str, str] = Field(default_factory=dict) + pid: int | None = None + started_at: datetime | None = None + ended_at: datetime | None = None + exit_code: int | None = None + stdout_path: str | None = None + stderr_path: str | None = None + cmd_path: str | None = None + pid_path: str | None = None + is_background: bool = False + parent_process_id: str | None = None + + +class ProcessLaunch(BaseEvent[ProcessRecord]): + cmd: list[str] + cwd: str | None = None + env: dict[str, str] | None = None + timeout: float | None = None + output_dir: str | None = None + log_prefix: str | None = None + is_background: bool = False + parent_process_id: str | None = None + parse_stdout_events: bool = True + + +class ProcessStarted(BaseEvent[None]): + process: ProcessRecord + + +class ProcessExited(BaseEvent[None]): + process: ProcessRecord + + +class ProcessKill(BaseEvent[ProcessRecord]): + process_id: str + signal: int = signal.SIGTERM + timeout: float | None = 10.0 + + +@dataclass +class _RunningProcess: + process: asyncio.subprocess.Process + record: ProcessRecord + stdout_task: asyncio.Task[None] | None + stderr_task: asyncio.Task[None] | None + watcher_task: asyncio.Task[None] | None + parent_event_id: str | None + + +JsonEventAdapter = Callable[[dict[str, Any], str | None], Optional[BaseEvent[Any]]] + + +class ProcessPlugin: + """Spawn and monitor processes using events (no Django required).""" + + def __init__( + self, + bus: EventBus, + *, + env: Mapping[str, str] | None = None, + json_event_adapter: JsonEventAdapter | None = None, + ) -> None: + self.bus = bus + self.env = dict(env or os.environ) + self.json_event_adapter = json_event_adapter + self._running: MutableMapping[str, _RunningProcess] = {} + + def register_event_handlers(self) -> None: + self.bus.on(ProcessLaunch, self.on_ProcessLaunch) + self.bus.on(ProcessKill, self.on_ProcessKill) + + async def on_ProcessLaunch(self, event: 
ProcessLaunch) -> ProcessRecord: + parent_event_id = event.event_id + proc_id = uuid7str() + cwd = event.cwd or event.output_dir or os.getcwd() + output_dir = Path(event.output_dir or cwd) + output_dir.mkdir(parents=True, exist_ok=True) + + env = {**self.env, **(event.env or {})} + + log_prefix = event.log_prefix or proc_id + stdout_path = output_dir / f'{log_prefix}.stdout.log' + stderr_path = output_dir / f'{log_prefix}.stderr.log' + cmd_path = output_dir / f'{log_prefix}.sh' + pid_path = output_dir / f'{log_prefix}.pid' + + self._write_cmd_file(cmd_path, event.cmd) + + proc = await asyncio.create_subprocess_exec( + *event.cmd, + cwd=str(cwd), + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + start_new_session=True, + ) + + self._write_pid_file(pid_path, proc.pid) + + record = ProcessRecord( + id=proc_id, + cmd=event.cmd, + cwd=str(cwd), + env=env, + pid=proc.pid, + started_at=_utcnow(), + stdout_path=str(stdout_path), + stderr_path=str(stderr_path), + cmd_path=str(cmd_path), + pid_path=str(pid_path), + is_background=event.is_background, + parent_process_id=event.parent_process_id, + ) + + await event.event_bus.dispatch( + ProcessStarted(process=record, event_parent_id=parent_event_id) + ) + + stdout_task = asyncio.create_task( + self._consume_stream( + proc.stdout, stdout_path, parent_event_id, event.parse_stdout_events + ) + ) + stderr_task = asyncio.create_task( + self._consume_stream(proc.stderr, stderr_path, parent_event_id, False) + ) + + running = _RunningProcess( + process=proc, + record=record, + stdout_task=stdout_task, + stderr_task=stderr_task, + watcher_task=None, + parent_event_id=parent_event_id, + ) + self._running[proc_id] = running + + if event.is_background: + running.watcher_task = asyncio.create_task( + self._watch_process(proc_id, event.timeout) + ) + return record + + await self._watch_process(proc_id, event.timeout) + return self._running.get(proc_id, running).record + + async def on_ProcessKill(self, event: 
ProcessKill) -> ProcessRecord: + running = self._running.get(event.process_id) + if not running: + raise RuntimeError(f'Process not found: {event.process_id}') + + proc = running.process + self._terminate_process(proc, event.signal) + + if event.timeout is not None: + try: + await asyncio.wait_for(proc.wait(), timeout=event.timeout) + except asyncio.TimeoutError: + self._terminate_process(proc, signal.SIGKILL) + else: + await proc.wait() + + await self._finalize_process(event.process_id) + return self._running.get(event.process_id, running).record + + async def _watch_process(self, process_id: str, timeout: float | None) -> None: + running = self._running.get(process_id) + if not running: + return + proc = running.process + try: + if timeout is not None: + await asyncio.wait_for(proc.wait(), timeout=timeout) + else: + await proc.wait() + except asyncio.TimeoutError: + self._terminate_process(proc, signal.SIGTERM) + await asyncio.sleep(2) + if proc.returncode is None: + self._terminate_process(proc, signal.SIGKILL) + await proc.wait() + await self._finalize_process(process_id) + + async def _finalize_process(self, process_id: str) -> None: + running = self._running.get(process_id) + if not running: + return + + proc = running.process + record = running.record + + if running.stdout_task: + await running.stdout_task + if running.stderr_task: + await running.stderr_task + + record.exit_code = proc.returncode + record.ended_at = _utcnow() + + await self.bus.dispatch( + ProcessExited(process=record, event_parent_id=running.parent_event_id) + ) + + self._running.pop(process_id, None) + + async def _consume_stream( + self, + stream: asyncio.StreamReader | None, + path: Path, + parent_event_id: str | None, + parse_events: bool, + ) -> None: + if stream is None: + return + with path.open('w', encoding='utf-8') as fh: + while True: + line = await stream.readline() + if not line: + break + text = line.decode('utf-8', errors='replace') + fh.write(text) + fh.flush() + if 
parse_events: + await self._maybe_dispatch_json_event(text, parent_event_id) + + async def _maybe_dispatch_json_event(self, line: str, parent_event_id: str | None) -> None: + text = line.strip() + if not text.startswith('{') or not text.endswith('}'): + return + try: + data = json.loads(text) + except json.JSONDecodeError: + return + + event = None + if self.json_event_adapter: + event = self.json_event_adapter(data, parent_event_id) + elif isinstance(data, dict) and 'event_type' in data: + try: + event = BaseEvent.model_validate(data) + except Exception: + event = None + + if event is None: + return + + if not getattr(event, 'event_parent_id', None) and parent_event_id: + event.event_parent_id = parent_event_id + await self.bus.dispatch(event) + + @staticmethod + def _write_cmd_file(path: Path, cmd: list[str]) -> None: + cmd_line = ' '.join(shlex.quote(part) for part in cmd) + path.write_text(cmd_line + '\n', encoding='utf-8') + + @staticmethod + def _write_pid_file(path: Path, pid: int) -> None: + path.write_text(str(pid), encoding='utf-8') + ts = datetime.now().timestamp() + os.utime(path, (ts, ts)) + + @staticmethod + def _terminate_process(proc: asyncio.subprocess.Process, sig: int) -> None: + if proc.returncode is not None: + return + try: + os.killpg(proc.pid, sig) + except Exception: + try: + os.kill(proc.pid, sig) + except Exception: + pass + + +__all__ = [ + 'ProcessRecord', + 'ProcessLaunch', + 'ProcessStarted', + 'ProcessExited', + 'ProcessKill', + 'ProcessPlugin', +] diff --git a/archivebox/plugins/accessibility/config.json b/archivebox/plugins/accessibility/config.json deleted file mode 100644 index 208d2332..00000000 --- a/archivebox/plugins/accessibility/config.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "ACCESSIBILITY_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": 
["SAVE_ACCESSIBILITY", "USE_ACCESSIBILITY"], - "description": "Enable accessibility tree capture" - }, - "ACCESSIBILITY_TIMEOUT": { - "type": "integer", - "default": 30, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for accessibility capture in seconds" - } - } -} diff --git a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js deleted file mode 100755 index 7b73a422..00000000 --- a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js +++ /dev/null @@ -1,288 +0,0 @@ -#!/usr/bin/env node -/** - * Extract accessibility tree and page outline from a URL. - * - * Extracts: - * - Page outline (headings h1-h6, sections, articles) - * - Iframe tree - * - Accessibility snapshot - * - ARIA labels and roles - * - * Usage: on_Snapshot__39_accessibility.js --url= --snapshot-id= - * Output: Writes accessibility/accessibility.json - * - * Environment variables: - * SAVE_ACCESSIBILITY: Enable accessibility extraction (default: true) - */ - -const fs = require('fs'); -const path = require('path'); -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -const puppeteer = require('puppeteer-core'); - -// Extractor metadata -const PLUGIN_NAME = 'accessibility'; -const OUTPUT_DIR = '.'; -const OUTPUT_FILE = 'accessibility.json'; -const CHROME_SESSION_DIR = '../chrome'; -const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)'; - -// Parse command line arguments -function parseArgs() { - const args = {}; - process.argv.slice(2).forEach(arg => { - if (arg.startsWith('--')) { - const [key, ...valueParts] = arg.slice(2).split('='); - args[key.replace(/-/g, '_')] = valueParts.join('=') || true; - } - }); - return args; -} - -// Get environment variable with default -function getEnv(name, defaultValue = '') { - return (process.env[name] || 
defaultValue).trim(); -} - -function getEnvBool(name, defaultValue = false) { - const val = getEnv(name, '').toLowerCase(); - if (['true', '1', 'yes', 'on'].includes(val)) return true; - if (['false', '0', 'no', 'off'].includes(val)) return false; - return defaultValue; -} - -// Wait for chrome tab to be fully loaded -async function waitForChromeTabLoaded(timeoutMs = 60000) { - const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json'); - const startTime = Date.now(); - - while (Date.now() - startTime < timeoutMs) { - if (fs.existsSync(navigationFile)) { - return true; - } - // Wait 100ms before checking again - await new Promise(resolve => setTimeout(resolve, 100)); - } - - return false; -} - -// Get CDP URL from chrome plugin -function getCdpUrl() { - const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); - if (fs.existsSync(cdpFile)) { - return fs.readFileSync(cdpFile, 'utf8').trim(); - } - return null; -} - -function assertChromeSession() { - const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); - const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt'); - const pidFile = path.join(CHROME_SESSION_DIR, 'chrome.pid'); - if (!fs.existsSync(cdpFile) || !fs.existsSync(targetIdFile) || !fs.existsSync(pidFile)) { - throw new Error(CHROME_SESSION_REQUIRED_ERROR); - } - try { - const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10); - if (!pid || Number.isNaN(pid)) throw new Error('Invalid pid'); - process.kill(pid, 0); - } catch (e) { - throw new Error(CHROME_SESSION_REQUIRED_ERROR); - } - const cdpUrl = getCdpUrl(); - if (!cdpUrl) { - throw new Error(CHROME_SESSION_REQUIRED_ERROR); - } - return cdpUrl; -} - -// Extract accessibility info -async function extractAccessibility(url) { - // Output directory is current directory (hook already runs in output dir) - const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); - - let browser = null; - - try { - // Connect to existing Chrome session - const cdpUrl = 
assertChromeSession(); - - browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, - }); - - // Get the page - const pages = await browser.pages(); - const page = pages.find(p => p.url().startsWith('http')) || pages[0]; - - if (!page) { - return { success: false, error: 'No page found in Chrome session' }; - } - - // Get accessibility snapshot - const accessibilityTree = await page.accessibility.snapshot({ interestingOnly: true }); - - // Extract page outline (headings, sections, etc.) - const outline = await page.evaluate(() => { - const headings = []; - const elements = document.querySelectorAll( - 'h1, h2, h3, h4, h5, h6, a[name], header, footer, article, main, aside, nav, section, figure, summary, table, form, iframe' - ); - - elements.forEach(elem => { - // Skip unnamed anchors - if (elem.tagName.toLowerCase() === 'a' && !elem.name) return; - - const tagName = elem.tagName.toLowerCase(); - const elemId = elem.id || elem.name || elem.getAttribute('aria-label') || elem.role || ''; - const elemClasses = (elem.className || '').toString().trim().split(/\s+/).slice(0, 3).join(' .'); - const action = elem.action?.split('/').pop() || ''; - - let summary = (elem.innerText || '').slice(0, 128); - if (summary.length >= 128) summary += '...'; - - let prefix = ''; - let title = ''; - - // Format headings with # prefix - const level = parseInt(tagName.replace('h', '')); - if (!isNaN(level)) { - prefix = '#'.repeat(level); - title = elem.innerText || elemId || elemClasses; - } else { - // For other elements, create breadcrumb path - const parents = [tagName]; - let node = elem.parentNode; - while (node && parents.length < 5) { - if (node.tagName) { - const tag = node.tagName.toLowerCase(); - if (!['div', 'span', 'p', 'body', 'html'].includes(tag)) { - parents.unshift(tag); - } else { - parents.unshift(''); - } - } - node = node.parentNode; - } - prefix = parents.join('>'); - - title = elemId ? 
`#${elemId}` : ''; - if (!title && elemClasses) title = `.${elemClasses}`; - if (action) title += ` /${action}`; - if (summary && !title.includes(summary)) title += `: ${summary}`; - } - - // Clean up title - title = title.replace(/\s+/g, ' ').trim(); - - if (prefix) { - headings.push(`${prefix} ${title}`); - } - }); - - return headings; - }); - - // Get iframe tree - const iframes = []; - function dumpFrameTree(frame, indent = '>') { - iframes.push(indent + frame.url()); - for (const child of frame.childFrames()) { - dumpFrameTree(child, indent + '>'); - } - } - dumpFrameTree(page.mainFrame(), ''); - - const accessibilityData = { - url, - headings: outline, - iframes, - tree: accessibilityTree, - }; - - // Write output - fs.writeFileSync(outputPath, JSON.stringify(accessibilityData, null, 2)); - - return { success: true, output: outputPath, accessibilityData }; - - } catch (e) { - return { success: false, error: `${e.name}: ${e.message}` }; - } finally { - if (browser) { - browser.disconnect(); - } - } -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__39_accessibility.js --url= --snapshot-id='); - process.exit(1); - } - - const startTs = new Date(); - let status = 'failed'; - let output = null; - let error = ''; - - try { - // Check if enabled - if (!getEnvBool('ACCESSIBILITY_ENABLED', true)) { - console.log('Skipping accessibility (ACCESSIBILITY_ENABLED=False)'); - // Output clean JSONL (no RESULT_JSON= prefix) - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'skipped', - output_str: 'ACCESSIBILITY_ENABLED=False', - })); - process.exit(0); - } - - // Check if Chrome session exists, then wait for page load - assertChromeSession(); - const pageLoaded = await waitForChromeTabLoaded(60000); - if (!pageLoaded) { - throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); - } - - const 
result = await extractAccessibility(url); - - if (result.success) { - status = 'succeeded'; - output = result.output; - const headingCount = result.accessibilityData.headings.length; - const iframeCount = result.accessibilityData.iframes.length; - console.log(`Accessibility extracted: ${headingCount} headings, ${iframeCount} iframes`); - } else { - status = 'failed'; - error = result.error; - } - } catch (e) { - error = `${e.name}: ${e.message}`; - status = 'failed'; - } - - const endTs = new Date(); - - if (error) console.error(`ERROR: ${error}`); - - // Output clean JSONL (no RESULT_JSON= prefix) - console.log(JSON.stringify({ - type: 'ArchiveResult', - status, - output_str: output || error || '', - })); - - process.exit(status === 'succeeded' ? 0 : 1); -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/accessibility/templates/icon.html b/archivebox/plugins/accessibility/templates/icon.html deleted file mode 100644 index e1c30fa0..00000000 --- a/archivebox/plugins/accessibility/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/accessibility/tests/test_accessibility.py b/archivebox/plugins/accessibility/tests/test_accessibility.py deleted file mode 100644 index cccfa215..00000000 --- a/archivebox/plugins/accessibility/tests/test_accessibility.py +++ /dev/null @@ -1,195 +0,0 @@ -""" -Tests for the accessibility plugin. - -Tests the real accessibility hook with an actual URL to verify -accessibility tree and page outline extraction. 
-""" - -import json -import shutil -import subprocess -import sys -import tempfile -from pathlib import Path - -import pytest -from django.test import TestCase - -# Import chrome test helpers -sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests')) -from chrome_test_helpers import ( - chrome_session, - get_test_env, - get_plugin_dir, - get_hook_script, -) - - -def chrome_available() -> bool: - """Check if Chrome/Chromium is available.""" - for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']: - if shutil.which(name): - return True - return False - - -# Get the path to the accessibility hook -PLUGIN_DIR = get_plugin_dir(__file__) -ACCESSIBILITY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_accessibility.*') - - -class TestAccessibilityPlugin(TestCase): - """Test the accessibility plugin.""" - - def test_accessibility_hook_exists(self): - """Accessibility hook script should exist.""" - self.assertIsNotNone(ACCESSIBILITY_HOOK, "Accessibility hook not found in plugin directory") - self.assertTrue(ACCESSIBILITY_HOOK.exists(), f"Hook not found: {ACCESSIBILITY_HOOK}") - - -class TestAccessibilityWithChrome(TestCase): - """Integration tests for accessibility plugin with Chrome.""" - - def setUp(self): - """Set up test environment.""" - self.temp_dir = Path(tempfile.mkdtemp()) - - def tearDown(self): - """Clean up.""" - shutil.rmtree(self.temp_dir, ignore_errors=True) - - def test_accessibility_extracts_page_outline(self): - """Accessibility hook should extract headings and accessibility tree.""" - test_url = 'https://example.com' - snapshot_id = 'test-accessibility-snapshot' - - try: - with chrome_session( - self.temp_dir, - crawl_id='test-accessibility-crawl', - snapshot_id=snapshot_id, - test_url=test_url, - navigate=True, - timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): - # Use the environment from chrome_session (already has CHROME_HEADLESS=true) - - # Run accessibility hook with the active 
Chrome session - result = subprocess.run( - ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=60, - env=env - ) - - # Check for output file - accessibility_output = snapshot_chrome_dir / 'accessibility.json' - - accessibility_data = None - - # Try parsing from file first - if accessibility_output.exists(): - with open(accessibility_output) as f: - try: - accessibility_data = json.load(f) - except json.JSONDecodeError: - pass - - # Verify hook ran successfully - self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}") - self.assertNotIn('Traceback', result.stderr) - - # example.com has headings, so we should get accessibility data - self.assertIsNotNone(accessibility_data, "No accessibility data was generated") - - # Verify we got page outline data - self.assertIn('headings', accessibility_data, f"Missing headings: {accessibility_data}") - self.assertIn('url', accessibility_data, f"Missing url: {accessibility_data}") - - except RuntimeError: - raise - - def test_accessibility_disabled_skips(self): - """Test that ACCESSIBILITY_ENABLED=False skips without error.""" - test_url = 'https://example.com' - snapshot_id = 'test-disabled' - - env = get_test_env() - env['ACCESSIBILITY_ENABLED'] = 'False' - - result = subprocess.run( - ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(self.temp_dir), - capture_output=True, - text=True, - timeout=30, - env=env - ) - - # Should exit 0 even when disabled - self.assertEqual(result.returncode, 0, f"Should succeed when disabled: {result.stderr}") - - # Should NOT create output file when disabled - accessibility_output = self.temp_dir / 'accessibility.json' - self.assertFalse(accessibility_output.exists(), "Should not create file when disabled") - - def test_accessibility_missing_url_argument(self): - """Test that missing --url argument causes error.""" - 
snapshot_id = 'test-missing-url' - - result = subprocess.run( - ['node', str(ACCESSIBILITY_HOOK), f'--snapshot-id={snapshot_id}'], - cwd=str(self.temp_dir), - capture_output=True, - text=True, - timeout=30, - env=get_test_env() - ) - - # Should fail with non-zero exit code - self.assertNotEqual(result.returncode, 0, "Should fail when URL missing") - - def test_accessibility_missing_snapshot_id_argument(self): - """Test that missing --snapshot-id argument causes error.""" - test_url = 'https://example.com' - - result = subprocess.run( - ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}'], - cwd=str(self.temp_dir), - capture_output=True, - text=True, - timeout=30, - env=get_test_env() - ) - - # Should fail with non-zero exit code - self.assertNotEqual(result.returncode, 0, "Should fail when snapshot-id missing") - - def test_accessibility_with_no_chrome_session(self): - """Test that hook fails gracefully when no Chrome session exists.""" - test_url = 'https://example.com' - snapshot_id = 'test-no-chrome' - - result = subprocess.run( - ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(self.temp_dir), - capture_output=True, - text=True, - timeout=30, - env=get_test_env() - ) - - # Should fail when no Chrome session - self.assertNotEqual(result.returncode, 0, "Should fail when no Chrome session exists") - # Error should mention CDP or Chrome - err_lower = result.stderr.lower() - self.assertTrue( - any(x in err_lower for x in ['chrome', 'cdp', 'cannot find', 'puppeteer']), - f"Should mention Chrome/CDP in error: {result.stderr}" - ) - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/apt/on_Binary__13_apt_install.py b/archivebox/plugins/apt/on_Binary__13_apt_install.py deleted file mode 100644 index 82e343ff..00000000 --- a/archivebox/plugins/apt/on_Binary__13_apt_install.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python3 -""" -Install a binary using apt package manager. 
- -Usage: on_Binary__install_using_apt_provider.py --binary-id= --machine-id= --name= -Output: Binary JSONL record to stdout after installation -""" - -import json -import sys - -import rich_click as click -from abx_pkg import Binary, AptProvider, BinProviderOverrides - -# Fix pydantic forward reference issue -AptProvider.model_rebuild() - - -@click.command() -@click.option('--binary-id', required=True, help="Binary UUID") -@click.option('--machine-id', required=True, help="Machine UUID") -@click.option('--name', required=True, help="Binary name to install") -@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)") -@click.option('--overrides', default=None, help="JSON-encoded overrides dict") -def main(binary_id: str, machine_id: str, name: str, binproviders: str, overrides: str | None): - """Install binary using apt package manager.""" - - # Check if apt provider is allowed - if binproviders != '*' and 'apt' not in binproviders.split(','): - click.echo(f"apt provider not allowed for {name}", err=True) - sys.exit(0) # Not an error, just skip - - # Use abx-pkg AptProvider to install binary - provider = AptProvider() - if not provider.INSTALLER_BIN: - click.echo("apt not available on this system", err=True) - sys.exit(1) - - click.echo(f"Installing {name} via apt...", err=True) - - try: - # Parse overrides if provided - overrides_dict = None - if overrides: - try: - overrides_dict = json.loads(overrides) - # Extract apt-specific overrides - overrides_dict = overrides_dict.get('apt', {}) - click.echo(f"Using apt install overrides: {overrides_dict}", err=True) - except json.JSONDecodeError: - click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True) - - binary = Binary(name=name, binproviders=[provider], overrides={'apt': overrides_dict} if overrides_dict else {}).install() - except Exception as e: - click.echo(f"apt install failed: {e}", err=True) - sys.exit(1) - - if not binary.abspath: - click.echo(f"{name} not 
found after apt install", err=True) - sys.exit(1) - - # Output Binary JSONL record to stdout - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'apt', - 'machine_id': machine_id, - 'binary_id': binary_id, - } - print(json.dumps(record)) - - # Log human-readable info to stderr - click.echo(f"Installed {name} at {binary.abspath}", err=True) - click.echo(f" version: {binary.version}", err=True) - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/apt/templates/icon.html b/archivebox/plugins/apt/templates/icon.html deleted file mode 100644 index e69de29b..00000000 diff --git a/archivebox/plugins/apt/tests/test_apt_provider.py b/archivebox/plugins/apt/tests/test_apt_provider.py deleted file mode 100644 index c8b7934e..00000000 --- a/archivebox/plugins/apt/tests/test_apt_provider.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Tests for the apt binary provider plugin. - -Tests cover: -1. Hook script execution -2. apt package availability detection -3. 
JSONL output format -""" - -import json -import os -import shutil -import subprocess -import sys -import tempfile -from pathlib import Path - -import pytest -from django.test import TestCase - - -# Get the path to the apt provider hook -PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_apt_install.py'), None) - - -def apt_available() -> bool: - """Check if apt is installed.""" - return shutil.which('apt') is not None or shutil.which('apt-get') is not None - - -def is_linux() -> bool: - """Check if running on Linux.""" - import platform - return platform.system().lower() == 'linux' - - -class TestAptProviderHook(TestCase): - """Test the apt binary provider installation hook.""" - - def setUp(self): - """Set up test environment.""" - self.temp_dir = tempfile.mkdtemp() - - def tearDown(self): - """Clean up.""" - shutil.rmtree(self.temp_dir, ignore_errors=True) - - def test_hook_script_exists(self): - """Hook script should exist.""" - self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}") - - def test_hook_skips_when_apt_not_allowed(self): - """Hook should skip when apt not in allowed binproviders.""" - result = subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=wget', - '--binary-id=test-uuid', - '--machine-id=test-machine', - '--binproviders=pip,npm', # apt not allowed - ], - capture_output=True, - text=True, - timeout=30 - ) - - # Should exit cleanly (code 0) when apt not allowed - self.assertIn('apt provider not allowed', result.stderr) - self.assertEqual(result.returncode, 0) - - @pytest.mark.skipif(not is_linux(), reason="apt only available on Linux") - def test_hook_detects_apt(self): - """Hook should detect apt binary when available.""" - assert apt_available(), "apt not installed" - result = subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=nonexistent-pkg-xyz123', - '--binary-id=test-uuid', - '--machine-id=test-machine', - ], - capture_output=True, 
- text=True, - timeout=30 - ) - - # Should not say apt is not available - self.assertNotIn('apt not available', result.stderr) - - def test_hook_handles_overrides(self): - """Hook should accept overrides JSON.""" - overrides = json.dumps({ - 'apt': {'packages': ['custom-package-name']} - }) - - result = subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=test-pkg', - '--binary-id=test-uuid', - '--machine-id=test-machine', - f'--overrides={overrides}', - ], - capture_output=True, - text=True, - timeout=30 - ) - - # Should not crash parsing overrides - self.assertNotIn('Traceback', result.stderr) - - -@pytest.mark.skipif(not is_linux(), reason="apt only available on Linux") -class TestAptProviderSystemBinaries(TestCase): - """Test apt provider with system binaries.""" - - def test_detect_existing_binary(self): - """apt provider should detect already-installed system binaries.""" - assert apt_available(), "apt not installed" - # Check for a binary that's almost certainly installed (like 'ls' or 'bash') - result = subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=bash', - '--binary-id=test-uuid', - '--machine-id=test-machine', - ], - capture_output=True, - text=True, - timeout=60 - ) - - # Parse JSONL output - for line in result.stdout.split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'Binary' and record.get('name') == 'bash': - # Found bash - self.assertTrue(record.get('abspath')) - self.assertTrue(Path(record['abspath']).exists()) - return - except json.JSONDecodeError: - continue - - # apt may not be able to "install" bash (already installed) - # Just verify no crash - self.assertNotIn('Traceback', result.stderr) - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/archivedotorg/config.json b/archivebox/plugins/archivedotorg/config.json deleted file mode 100644 index b517183e..00000000 --- 
a/archivebox/plugins/archivedotorg/config.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "properties": { - "ARCHIVEDOTORG_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_ARCHIVEDOTORG", "USE_ARCHIVEDOTORG", "SUBMIT_ARCHIVEDOTORG"], - "description": "Submit URLs to archive.org Wayback Machine" - }, - "ARCHIVEDOTORG_TIMEOUT": { - "type": "integer", - "default": 60, - "minimum": 10, - "x-fallback": "TIMEOUT", - "description": "Timeout for archive.org submission in seconds" - }, - "ARCHIVEDOTORG_USER_AGENT": { - "type": "string", - "default": "", - "x-fallback": "USER_AGENT", - "description": "User agent string" - } - } -} diff --git a/archivebox/plugins/archivedotorg/on_Snapshot__08_archivedotorg.bg.py b/archivebox/plugins/archivedotorg/on_Snapshot__08_archivedotorg.bg.py deleted file mode 100644 index 11642b24..00000000 --- a/archivebox/plugins/archivedotorg/on_Snapshot__08_archivedotorg.bg.py +++ /dev/null @@ -1,154 +0,0 @@ -#!/usr/bin/env python3 -""" -Submit a URL to archive.org for archiving. - -Usage: on_Snapshot__archivedotorg.bg.py --url= --snapshot-id= -Output: Writes archive.org.txt to $PWD with the archived URL - -Environment variables: - ARCHIVEDOTORG_TIMEOUT: Timeout in seconds (default: 60) - USER_AGENT: User agent string - - # Fallback to ARCHIVING_CONFIG values if ARCHIVEDOTORG_* not set: - TIMEOUT: Fallback timeout - -Note: This extractor uses the 'requests' library which is bundled with ArchiveBox. - It can run standalone if requests is installed: pip install requests -""" - -import json -import os -import sys -from pathlib import Path - -import rich_click as click - - -# Extractor metadata -PLUGIN_NAME = 'archivedotorg' -OUTPUT_DIR = '.' 
-OUTPUT_FILE = 'archive.org.txt' - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - - -def get_env_int(name: str, default: int = 0) -> int: - try: - return int(get_env(name, str(default))) - except ValueError: - return default - - -def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]: - """ - Submit URL to archive.org Wayback Machine. - - Returns: (success, output_path, error_message) - """ - def log(message: str) -> None: - print(f'[archivedotorg] {message}', file=sys.stderr) - - try: - import requests - except ImportError: - return False, None, 'requests library not installed' - - timeout = get_env_int('ARCHIVEDOTORG_TIMEOUT') or get_env_int('TIMEOUT', 60) - user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)') - - submit_url = f'https://web.archive.org/save/{url}' - log(f'Submitting to Wayback Machine (timeout={timeout}s)') - log(f'GET {submit_url}') - - try: - response = requests.get( - submit_url, - timeout=timeout, - headers={'User-Agent': user_agent}, - allow_redirects=True, - ) - log(f'HTTP {response.status_code} final_url={response.url}') - - # Check for successful archive - content_location = response.headers.get('Content-Location', '') - x_archive_orig_url = response.headers.get('X-Archive-Orig-Url', '') - if content_location: - log(f'Content-Location: {content_location}') - if x_archive_orig_url: - log(f'X-Archive-Orig-Url: {x_archive_orig_url}') - - # Build archive URL - if content_location: - archive_url = f'https://web.archive.org{content_location}' - Path(OUTPUT_FILE).write_text(archive_url, encoding='utf-8') - log(f'Saved archive URL -> {archive_url}') - return True, OUTPUT_FILE, '' - elif 'web.archive.org' in response.url: - # We were redirected to an archive page - Path(OUTPUT_FILE).write_text(response.url, encoding='utf-8') - log(f'Redirected to archive page -> {response.url}') - return True, OUTPUT_FILE, '' - else: - # Check for errors in response - 
if 'RobotAccessControlException' in response.text: - # Blocked by robots.txt - save submit URL for manual retry - Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8') - log('Blocked by robots.txt, saved submit URL for manual retry') - return True, OUTPUT_FILE, '' # Consider this a soft success - elif response.status_code >= 400: - return False, None, f'HTTP {response.status_code}' - else: - # Save submit URL anyway - Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8') - log('No archive URL returned, saved submit URL for manual retry') - return True, OUTPUT_FILE, '' - - except requests.Timeout: - return False, None, f'Request timed out after {timeout} seconds' - except requests.RequestException as e: - return False, None, f'{type(e).__name__}: {e}' - except Exception as e: - return False, None, f'{type(e).__name__}: {e}' - - -@click.command() -@click.option('--url', required=True, help='URL to submit to archive.org') -@click.option('--snapshot-id', required=True, help='Snapshot UUID') -def main(url: str, snapshot_id: str): - """Submit a URL to archive.org for archiving.""" - - # Check if feature is enabled - if get_env('ARCHIVEDOTORG_ENABLED', 'True').lower() in ('false', '0', 'no', 'off'): - print('Skipping archive.org submission (ARCHIVEDOTORG_ENABLED=False)', file=sys.stderr) - # Temporary failure (config disabled) - NO JSONL emission - sys.exit(0) - - try: - # Run extraction - success, output, error = submit_to_archivedotorg(url) - - if success: - # Success - emit ArchiveResult with output file - result = { - 'type': 'ArchiveResult', - 'status': 'succeeded', - 'output_str': output or '', - } - print(json.dumps(result)) - sys.exit(0) - else: - # Transient error (network, timeout, HTTP error) - emit NO JSONL - # System will retry later - print(f'ERROR: {error}', file=sys.stderr) - sys.exit(1) - - except Exception as e: - # Unexpected error - also transient, emit NO JSONL - print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr) - sys.exit(1) - - -if 
__name__ == '__main__': - main() diff --git a/archivebox/plugins/archivedotorg/templates/card.html b/archivebox/plugins/archivedotorg/templates/card.html deleted file mode 100644 index 64a3c4d1..00000000 --- a/archivebox/plugins/archivedotorg/templates/card.html +++ /dev/null @@ -1,12 +0,0 @@ -{% load config_tags %} -{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %} -{% if enabled %} - -
- -
-{% endif %} diff --git a/archivebox/plugins/archivedotorg/templates/icon.html b/archivebox/plugins/archivedotorg/templates/icon.html deleted file mode 100644 index e3f48634..00000000 --- a/archivebox/plugins/archivedotorg/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/archivedotorg/tests/test_archivedotorg.py b/archivebox/plugins/archivedotorg/tests/test_archivedotorg.py deleted file mode 100644 index 1e4b4a97..00000000 --- a/archivebox/plugins/archivedotorg/tests/test_archivedotorg.py +++ /dev/null @@ -1,93 +0,0 @@ -""" -Integration tests for archivedotorg plugin - -Tests verify standalone archive.org extractor execution. -""" - -import json -import subprocess -import sys -import tempfile -from pathlib import Path -import pytest - -PLUGIN_DIR = Path(__file__).parent.parent -ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archivedotorg.*'), None) -TEST_URL = 'https://example.com' - -def test_hook_script_exists(): - assert ARCHIVEDOTORG_HOOK.exists() - -def test_submits_to_archivedotorg(): - with tempfile.TemporaryDirectory() as tmpdir: - result = subprocess.run( - [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'], - cwd=tmpdir, capture_output=True, text=True, timeout=60 - ) - - assert result.returncode in (0, 1) - - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - if result.returncode == 0: - # Success - should have ArchiveResult - assert result_json, "Should have ArchiveResult JSONL output on success" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - else: - # Transient error - no JSONL output, just stderr - assert not result_json, "Should NOT emit JSONL on transient error" - assert result.stderr, "Should 
have error message in stderr" - -def test_config_save_archivedotorg_false_skips(): - with tempfile.TemporaryDirectory() as tmpdir: - import os - env = os.environ.copy() - env['ARCHIVEDOTORG_ENABLED'] = 'False' - - result = subprocess.run( - [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'], - cwd=tmpdir, capture_output=True, text=True, env=env, timeout=30 - ) - - assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}" - - # Feature disabled - temporary failure, should NOT emit JSONL - assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr" - - # Should NOT emit any JSONL - jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] - assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}" - -def test_handles_timeout(): - with tempfile.TemporaryDirectory() as tmpdir: - import os - env = os.environ.copy() - env['TIMEOUT'] = '1' - - result = subprocess.run( - [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'], - cwd=tmpdir, capture_output=True, text=True, env=env, timeout=30 - ) - - # Timeout is a transient error - should exit 1 with no JSONL - assert result.returncode in (0, 1), "Should complete without hanging" - - # If it timed out (exit 1), should have no JSONL output - if result.returncode == 1: - jsonl_lines = [line for line in result.stdout.strip().split('\n') - if line.strip().startswith('{')] - assert len(jsonl_lines) == 0, "Should not emit JSONL on timeout (transient error)" - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/brew/on_Binary__12_brew_install.py b/archivebox/plugins/brew/on_Binary__12_brew_install.py deleted file mode 100644 index 928e1bd5..00000000 --- a/archivebox/plugins/brew/on_Binary__12_brew_install.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python3 -""" 
-Install a binary using Homebrew package manager. - -Usage: on_Binary__install_using_brew_provider.py --binary-id= --machine-id= --name= [--custom-cmd=] -Output: Binary JSONL record to stdout after installation - -Environment variables: - MACHINE_ID: Machine UUID (set by orchestrator) -""" - -import json -import os -import sys - -import rich_click as click -from abx_pkg import Binary, BrewProvider, BinProviderOverrides - -# Fix pydantic forward reference issue -BrewProvider.model_rebuild() - - -@click.command() -@click.option('--machine-id', required=True, help="Machine UUID") -@click.option('--binary-id', required=True, help="Dependency UUID") -@click.option('--name', required=True, help="Binary name to install") -@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)") -@click.option('--custom-cmd', default=None, help="Custom install command") -@click.option('--overrides', default=None, help="JSON-encoded overrides dict") -def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_cmd: str | None, overrides: str | None): - """Install binary using Homebrew.""" - - if binproviders != '*' and 'brew' not in binproviders.split(','): - click.echo(f"brew provider not allowed for {name}", err=True) - sys.exit(0) - - # Use abx-pkg BrewProvider to install binary - provider = BrewProvider() - if not provider.INSTALLER_BIN: - click.echo("brew not available on this system", err=True) - sys.exit(1) - - click.echo(f"Installing {name} via brew...", err=True) - - try: - # Parse overrides if provided - overrides_dict = None - if overrides: - try: - overrides_dict = json.loads(overrides) - click.echo(f"Using custom install overrides: {overrides_dict}", err=True) - except json.JSONDecodeError: - click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True) - - binary = Binary(name=name, binproviders=[provider], overrides=overrides_dict or {}).install() - except Exception as e: - click.echo(f"brew install failed: 
{e}", err=True) - sys.exit(1) - - if not binary.abspath: - click.echo(f"{name} not found after brew install", err=True) - sys.exit(1) - - machine_id = os.environ.get('MACHINE_ID', '') - - # Output Binary JSONL record to stdout - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'brew', - 'machine_id': machine_id, - 'binary_id': binary_id, - } - print(json.dumps(record)) - - # Log human-readable info to stderr - click.echo(f"Installed {name} at {binary.abspath}", err=True) - click.echo(f" version: {binary.version}", err=True) - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/brew/templates/icon.html b/archivebox/plugins/brew/templates/icon.html deleted file mode 100644 index e69de29b..00000000 diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js deleted file mode 100755 index e0e42a7e..00000000 --- a/archivebox/plugins/chrome/chrome_utils.js +++ /dev/null @@ -1,1997 +0,0 @@ -#!/usr/bin/env node -/** - * Chrome Extension Management Utilities - * - * Handles downloading, installing, and managing Chrome extensions for browser automation. 
- * Ported from the TypeScript implementation in archivebox.ts - */ - -const fs = require('fs'); -const path = require('path'); -const crypto = require('crypto'); -const http = require('http'); -const net = require('net'); -const { exec, spawn } = require('child_process'); -const { promisify } = require('util'); -const { Readable } = require('stream'); -const { finished } = require('stream/promises'); - -const execAsync = promisify(exec); - -const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)'; - -// ============================================================================ -// Environment helpers -// ============================================================================ - -/** - * Get environment variable with default value. - * @param {string} name - Environment variable name - * @param {string} [defaultValue=''] - Default value if not set - * @returns {string} - Trimmed environment variable value - */ -function getEnv(name, defaultValue = '') { - return (process.env[name] || defaultValue).trim(); -} - -/** - * Get boolean environment variable. - * @param {string} name - Environment variable name - * @param {boolean} [defaultValue=false] - Default value if not set - * @returns {boolean} - Boolean value - */ -function getEnvBool(name, defaultValue = false) { - const val = getEnv(name, '').toLowerCase(); - if (['true', '1', 'yes', 'on'].includes(val)) return true; - if (['false', '0', 'no', 'off'].includes(val)) return false; - return defaultValue; -} - -/** - * Get integer environment variable. - * @param {string} name - Environment variable name - * @param {number} [defaultValue=0] - Default value if not set - * @returns {number} - Integer value - */ -function getEnvInt(name, defaultValue = 0) { - const val = parseInt(getEnv(name, String(defaultValue)), 10); - return isNaN(val) ? defaultValue : val; -} - -/** - * Get array environment variable (JSON array or comma-separated string). 
- * - * Parsing strategy: - * - If value starts with '[', parse as JSON array - * - Otherwise, parse as comma-separated values - * - * This prevents incorrect splitting of arguments that contain internal commas. - * For arguments with commas, use JSON format: - * CHROME_ARGS='["--user-data-dir=/path/with,comma", "--window-size=1440,900"]' - * - * @param {string} name - Environment variable name - * @param {string[]} [defaultValue=[]] - Default value if not set - * @returns {string[]} - Array of strings - */ -function getEnvArray(name, defaultValue = []) { - const val = getEnv(name, ''); - if (!val) return defaultValue; - - // If starts with '[', parse as JSON array - if (val.startsWith('[')) { - try { - const parsed = JSON.parse(val); - if (Array.isArray(parsed)) return parsed; - } catch (e) { - console.error(`[!] Failed to parse ${name} as JSON array: ${e.message}`); - // Fall through to comma-separated parsing - } - } - - // Parse as comma-separated values - return val.split(',').map(s => s.trim()).filter(Boolean); -} - -/** - * Parse resolution string into width/height. - * @param {string} resolution - Resolution string like "1440,2000" - * @returns {{width: number, height: number}} - Parsed dimensions - */ -function parseResolution(resolution) { - const [width, height] = resolution.split(',').map(x => parseInt(x.trim(), 10)); - return { width: width || 1440, height: height || 2000 }; -} - -// ============================================================================ -// PID file management -// ============================================================================ - -/** - * Write PID file with specific mtime for process validation. 
- * @param {string} filePath - Path to PID file - * @param {number} pid - Process ID - * @param {number} startTimeSeconds - Process start time in seconds - */ -function writePidWithMtime(filePath, pid, startTimeSeconds) { - fs.writeFileSync(filePath, String(pid)); - const startTimeMs = startTimeSeconds * 1000; - fs.utimesSync(filePath, new Date(startTimeMs), new Date(startTimeMs)); -} - -/** - * Write a shell script that can re-run the Chrome command. - * @param {string} filePath - Path to script file - * @param {string} binary - Chrome binary path - * @param {string[]} args - Chrome arguments - */ -function writeCmdScript(filePath, binary, args) { - const escape = (arg) => - arg.includes(' ') || arg.includes('"') || arg.includes('$') - ? `"${arg.replace(/"/g, '\\"')}"` - : arg; - fs.writeFileSync( - filePath, - `#!/bin/bash\n${binary} ${args.map(escape).join(' ')}\n` - ); - fs.chmodSync(filePath, 0o755); -} - -// ============================================================================ -// Port management -// ============================================================================ - -/** - * Find a free port on localhost. - * @returns {Promise} - Available port number - */ -function findFreePort() { - return new Promise((resolve, reject) => { - const server = net.createServer(); - server.unref(); - server.on('error', reject); - server.listen(0, () => { - const port = server.address().port; - server.close(() => resolve(port)); - }); - }); -} - -/** - * Wait for Chrome's DevTools port to be ready. 
- * @param {number} port - Debug port number - * @param {number} [timeout=30000] - Timeout in milliseconds - * @returns {Promise} - Chrome version info - */ -function waitForDebugPort(port, timeout = 30000) { - const startTime = Date.now(); - - return new Promise((resolve, reject) => { - const tryConnect = () => { - if (Date.now() - startTime > timeout) { - reject(new Error(`Timeout waiting for Chrome debug port ${port}`)); - return; - } - - const req = http.get(`http://127.0.0.1:${port}/json/version`, (res) => { - let data = ''; - res.on('data', (chunk) => (data += chunk)); - res.on('end', () => { - try { - const info = JSON.parse(data); - resolve(info); - } catch (e) { - setTimeout(tryConnect, 100); - } - }); - }); - - req.on('error', () => { - setTimeout(tryConnect, 100); - }); - - req.setTimeout(1000, () => { - req.destroy(); - setTimeout(tryConnect, 100); - }); - }; - - tryConnect(); - }); -} - -// ============================================================================ -// Zombie process cleanup -// ============================================================================ - -/** - * Kill zombie Chrome processes from stale crawls. - * Recursively scans DATA_DIR for any .../chrome/...pid files from stale crawls. - * Does not assume specific directory structure - works with nested paths. 
- * @param {string} [dataDir] - Data directory (defaults to DATA_DIR env or '.') - * @returns {number} - Number of zombies killed - */ -function killZombieChrome(dataDir = null) { - dataDir = dataDir || getEnv('DATA_DIR', '.'); - const now = Date.now(); - const fiveMinutesAgo = now - 300000; - let killed = 0; - - console.error('[*] Checking for zombie Chrome processes...'); - - if (!fs.existsSync(dataDir)) { - console.error('[+] No data directory found'); - return 0; - } - - /** - * Recursively find all chrome/.pid files in directory tree - * @param {string} dir - Directory to search - * @param {number} depth - Current recursion depth (limit to 10) - * @returns {Array<{pidFile: string, crawlDir: string}>} - Array of PID file info - */ - function findChromePidFiles(dir, depth = 0) { - if (depth > 10) return []; // Prevent infinite recursion - - const results = []; - try { - const entries = fs.readdirSync(dir, { withFileTypes: true }); - - for (const entry of entries) { - if (!entry.isDirectory()) continue; - - const fullPath = path.join(dir, entry.name); - - // Found a chrome directory - check for .pid files - if (entry.name === 'chrome') { - try { - const pidFiles = fs.readdirSync(fullPath).filter(f => f.endsWith('.pid')); - const crawlDir = dir; // Parent of chrome/ is the crawl dir - - for (const pidFileName of pidFiles) { - results.push({ - pidFile: path.join(fullPath, pidFileName), - crawlDir: crawlDir, - }); - } - } catch (e) { - // Skip if can't read chrome dir - } - } else { - // Recurse into subdirectory (skip hidden dirs and node_modules) - if (!entry.name.startsWith('.') && entry.name !== 'node_modules') { - results.push(...findChromePidFiles(fullPath, depth + 1)); - } - } - } - } catch (e) { - // Skip if can't read directory - } - return results; - } - - try { - const chromePids = findChromePidFiles(dataDir); - - for (const {pidFile, crawlDir} of chromePids) { - // Check if crawl was modified recently (still active) - try { - const crawlStats = 
fs.statSync(crawlDir); - if (crawlStats.mtimeMs > fiveMinutesAgo) { - continue; // Crawl is active, skip - } - } catch (e) { - continue; - } - - // Crawl is stale, check PID - try { - const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10); - if (isNaN(pid) || pid <= 0) continue; - - // Check if process exists - try { - process.kill(pid, 0); - } catch (e) { - // Process dead, remove stale PID file - try { fs.unlinkSync(pidFile); } catch (e) {} - continue; - } - - // Process alive and crawl is stale - zombie! - console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${path.basename(crawlDir)}`); - - try { - try { process.kill(-pid, 'SIGKILL'); } catch (e) { process.kill(pid, 'SIGKILL'); } - killed++; - console.error(`[+] Killed zombie (PID ${pid})`); - try { fs.unlinkSync(pidFile); } catch (e) {} - } catch (e) { - console.error(`[!] Failed to kill PID ${pid}: ${e.message}`); - } - } catch (e) { - // Skip invalid PID files - } - } - } catch (e) { - console.error(`[!] Error scanning for Chrome processes: ${e.message}`); - } - - if (killed > 0) { - console.error(`[+] Killed ${killed} zombie process(es)`); - } else { - console.error('[+] No zombies found'); - } - - // Clean up stale SingletonLock files from persona chrome_user_data directories - const personasDir = path.join(dataDir, 'personas'); - if (fs.existsSync(personasDir)) { - try { - const personas = fs.readdirSync(personasDir, { withFileTypes: true }); - for (const persona of personas) { - if (!persona.isDirectory()) continue; - - const userDataDir = path.join(personasDir, persona.name, 'chrome_user_data'); - const singletonLock = path.join(userDataDir, 'SingletonLock'); - - if (fs.existsSync(singletonLock)) { - try { - fs.unlinkSync(singletonLock); - console.error(`[+] Removed stale SingletonLock: ${singletonLock}`); - } catch (e) { - // Ignore - may be in use by active Chrome - } - } - } - } catch (e) { - // Ignore errors scanning personas directory - } - } - - return killed; -} - -// 
============================================================================ -// Chrome launching -// ============================================================================ - -/** - * Launch Chromium with extensions and return connection info. - * - * @param {Object} options - Launch options - * @param {string} [options.binary] - Chrome binary path (auto-detected if not provided) - * @param {string} [options.outputDir='chrome'] - Directory for output files - * @param {string} [options.userDataDir] - Chrome user data directory for persistent sessions - * @param {string} [options.resolution='1440,2000'] - Window resolution - * @param {boolean} [options.headless=true] - Run in headless mode - * @param {boolean} [options.sandbox=true] - Enable Chrome sandbox - * @param {boolean} [options.checkSsl=true] - Check SSL certificates - * @param {string[]} [options.extensionPaths=[]] - Paths to unpacked extensions - * @param {boolean} [options.killZombies=true] - Kill zombie processes first - * @returns {Promise} - {success, cdpUrl, pid, port, process, error} - */ -async function launchChromium(options = {}) { - const { - binary = findChromium(), - outputDir = 'chrome', - userDataDir = getEnv('CHROME_USER_DATA_DIR'), - resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'), - userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', ''), - headless = getEnvBool('CHROME_HEADLESS', true), - sandbox = getEnvBool('CHROME_SANDBOX', true), - checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)), - extensionPaths = [], - killZombies = true, - } = options; - - if (!binary) { - return { success: false, error: 'Chrome binary not found' }; - } - - const downloadsDir = getEnv('CHROME_DOWNLOADS_DIR'); - - // Kill zombies first - if (killZombies) { - killZombieChrome(); - } - - const { width, height } = parseResolution(resolution); - - // Create output directory - if (!fs.existsSync(outputDir)) { - 
fs.mkdirSync(outputDir, { recursive: true }); - } - - // Create user data directory if specified and doesn't exist - if (userDataDir) { - if (!fs.existsSync(userDataDir)) { - fs.mkdirSync(userDataDir, { recursive: true }); - console.error(`[*] Created user data directory: ${userDataDir}`); - } - // Clean up any stale SingletonLock file from previous crashed sessions - const singletonLock = path.join(userDataDir, 'SingletonLock'); - if (fs.existsSync(singletonLock)) { - try { - fs.unlinkSync(singletonLock); - console.error(`[*] Removed stale SingletonLock: ${singletonLock}`); - } catch (e) { - console.error(`[!] Failed to remove SingletonLock: ${e.message}`); - } - } - if (downloadsDir) { - try { - const defaultProfileDir = path.join(userDataDir, 'Default'); - const prefsPath = path.join(defaultProfileDir, 'Preferences'); - fs.mkdirSync(defaultProfileDir, { recursive: true }); - let prefs = {}; - if (fs.existsSync(prefsPath)) { - try { - prefs = JSON.parse(fs.readFileSync(prefsPath, 'utf-8')); - } catch (e) { - prefs = {}; - } - } - prefs.download = prefs.download || {}; - prefs.download.default_directory = downloadsDir; - prefs.download.prompt_for_download = false; - fs.writeFileSync(prefsPath, JSON.stringify(prefs)); - console.error(`[*] Set Chrome download directory: ${downloadsDir}`); - } catch (e) { - console.error(`[!] 
Failed to set Chrome download directory: ${e.message}`); - } - } - } - - // Find a free port - const debugPort = await findFreePort(); - console.error(`[*] Using debug port: ${debugPort}`); - - // Get base Chrome args from config (static flags from CHROME_ARGS env var) - // These come from config.json defaults, merged by get_config() in Python - const baseArgs = getEnvArray('CHROME_ARGS', []); - - // Get extra user-provided args - const extraArgs = getEnvArray('CHROME_ARGS_EXTRA', []); - - // Build dynamic Chrome arguments (these must be computed at runtime) - const inDocker = getEnvBool('IN_DOCKER', false); - const dynamicArgs = [ - // Remote debugging setup - `--remote-debugging-port=${debugPort}`, - '--remote-debugging-address=127.0.0.1', - - // Sandbox settings (disable in Docker) - ...(sandbox ? [] : (inDocker ? ['--no-sandbox', '--disable-setuid-sandbox'] : [])), - - // Docker-specific workarounds - '--disable-dev-shm-usage', - - // Window size - `--window-size=${width},${height}`, - - // User data directory (for persistent sessions with persona) - ...(userDataDir ? [`--user-data-dir=${userDataDir}`] : []), - - // User agent - ...(userAgent ? [`--user-agent=${userAgent}`] : []), - - // Headless mode - ...(headless ? ['--headless=new'] : []), - - // SSL certificate checking - ...(checkSsl ? 
[] : ['--ignore-certificate-errors']), - ]; - - // Combine all args: base (from config) + dynamic (runtime) + extra (user overrides) - // Dynamic args come after base so they can override if needed - const chromiumArgs = [...baseArgs, ...dynamicArgs, ...extraArgs]; - - // Ensure keychain prompts are disabled on macOS - if (!chromiumArgs.includes('--use-mock-keychain')) { - chromiumArgs.push('--use-mock-keychain'); - } - - // Add extension loading flags - if (extensionPaths.length > 0) { - const extPathsArg = extensionPaths.join(','); - chromiumArgs.push(`--load-extension=${extPathsArg}`); - chromiumArgs.push('--enable-unsafe-extension-debugging'); - chromiumArgs.push('--disable-features=DisableLoadExtensionCommandLineSwitch,ExtensionManifestV2Unsupported,ExtensionManifestV2Disabled'); - console.error(`[*] Loading ${extensionPaths.length} extension(s) via --load-extension`); - } - - chromiumArgs.push('about:blank'); - - // Write command script for debugging - writeCmdScript(path.join(outputDir, 'cmd.sh'), binary, chromiumArgs); - - try { - console.error(`[*] Spawning Chromium (headless=${headless})...`); - const chromiumProcess = spawn(binary, chromiumArgs, { - stdio: ['ignore', 'pipe', 'pipe'], - detached: true, - }); - - const chromePid = chromiumProcess.pid; - const chromeStartTime = Date.now() / 1000; - - if (chromePid) { - console.error(`[*] Chromium spawned (PID: ${chromePid})`); - writePidWithMtime(path.join(outputDir, 'chrome.pid'), chromePid, chromeStartTime); - } - - // Pipe Chrome output to stderr - chromiumProcess.stdout.on('data', (data) => { - process.stderr.write(`[chromium:stdout] ${data}`); - }); - chromiumProcess.stderr.on('data', (data) => { - process.stderr.write(`[chromium:stderr] ${data}`); - }); - - // Wait for debug port - console.error(`[*] Waiting for debug port ${debugPort}...`); - const versionInfo = await waitForDebugPort(debugPort, 30000); - const wsUrl = versionInfo.webSocketDebuggerUrl; - console.error(`[+] Chromium ready: ${wsUrl}`); 
- - fs.writeFileSync(path.join(outputDir, 'cdp_url.txt'), wsUrl); - fs.writeFileSync(path.join(outputDir, 'port.txt'), String(debugPort)); - - return { - success: true, - cdpUrl: wsUrl, - pid: chromePid, - port: debugPort, - process: chromiumProcess, - }; - } catch (e) { - return { success: false, error: `${e.name}: ${e.message}` }; - } -} - -/** - * Check if a process is still running. - * @param {number} pid - Process ID to check - * @returns {boolean} - True if process exists - */ -function isProcessAlive(pid) { - try { - process.kill(pid, 0); // Signal 0 checks existence without killing - return true; - } catch (e) { - return false; - } -} - -/** - * Find all Chrome child processes for a given debug port. - * @param {number} port - Debug port number - * @returns {Array} - Array of PIDs - */ -function findChromeProcessesByPort(port) { - const { execSync } = require('child_process'); - const pids = []; - - try { - // Find all Chrome processes using this debug port - const output = execSync( - `ps aux | grep -i "chrome.*--remote-debugging-port=${port}" | grep -v grep | awk '{print $2}'`, - { encoding: 'utf8', timeout: 5000 } - ); - - for (const line of output.split('\n')) { - const pid = parseInt(line.trim(), 10); - if (!isNaN(pid) && pid > 0) { - pids.push(pid); - } - } - } catch (e) { - // Command failed or no processes found - } - - return pids; -} - -/** - * Kill a Chrome process by PID. - * Always sends SIGTERM before SIGKILL, then verifies death. 
- * - * @param {number} pid - Process ID to kill - * @param {string} [outputDir] - Directory containing PID files to clean up - */ -async function killChrome(pid, outputDir = null) { - if (!pid) return; - - console.error(`[*] Killing Chrome process tree (PID ${pid})...`); - - // Get debug port for finding child processes - let debugPort = null; - if (outputDir) { - try { - const portFile = path.join(outputDir, 'port.txt'); - if (fs.existsSync(portFile)) { - debugPort = parseInt(fs.readFileSync(portFile, 'utf8').trim(), 10); - } - } catch (e) {} - } - - // Step 1: SIGTERM to process group (graceful shutdown) - console.error(`[*] Sending SIGTERM to process group -${pid}...`); - try { - process.kill(-pid, 'SIGTERM'); - } catch (e) { - try { - console.error(`[*] Process group kill failed, trying single process...`); - process.kill(pid, 'SIGTERM'); - } catch (e2) { - console.error(`[!] SIGTERM failed: ${e2.message}`); - } - } - - // Step 2: Wait for graceful shutdown - await new Promise(resolve => setTimeout(resolve, 2000)); - - // Step 3: Check if still alive - if (!isProcessAlive(pid)) { - console.error('[+] Chrome process terminated gracefully'); - } else { - // Step 4: Force kill ENTIRE process group with SIGKILL - console.error(`[*] Process still alive, sending SIGKILL to process group -${pid}...`); - try { - process.kill(-pid, 'SIGKILL'); // Kill entire process group - } catch (e) { - console.error(`[!] Process group SIGKILL failed, trying single process: ${e.message}`); - try { - process.kill(pid, 'SIGKILL'); - } catch (e2) { - console.error(`[!] SIGKILL failed: ${e2.message}`); - } - } - - // Step 5: Wait briefly and verify death - await new Promise(resolve => setTimeout(resolve, 1000)); - - if (isProcessAlive(pid)) { - console.error(`[!] WARNING: Process ${pid} is unkillable (likely in UNE state)`); - console.error(`[!] This typically happens when Chrome crashes in kernel syscall`); - console.error(`[!] 
Process will remain as zombie until system reboot`); - console.error(`[!] macOS IOSurface crash creates unkillable processes in UNE state`); - - // Try one more time to kill the entire process group - if (debugPort) { - const relatedPids = findChromeProcessesByPort(debugPort); - if (relatedPids.length > 1) { - console.error(`[*] Found ${relatedPids.length} Chrome processes still running on port ${debugPort}`); - console.error(`[*] Attempting final process group SIGKILL...`); - - // Try to kill each unique process group we find - const processGroups = new Set(); - for (const relatedPid of relatedPids) { - if (relatedPid !== pid) { - processGroups.add(relatedPid); - } - } - - for (const groupPid of processGroups) { - try { - process.kill(-groupPid, 'SIGKILL'); - } catch (e) {} - } - } - } - } else { - console.error('[+] Chrome process group killed successfully'); - } - } - - // Step 8: Clean up PID files - // Note: hook-specific .pid files are cleaned up by run_hook() and Snapshot.cleanup() - if (outputDir) { - try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {} - } - - console.error('[*] Chrome cleanup completed'); -} - -/** - * Install Chromium using @puppeteer/browsers programmatic API. - * Uses puppeteer's default cache location, returns the binary path. 
- * - * @param {Object} options - Install options - * @returns {Promise} - {success, binary, version, error} - */ -async function installChromium(options = {}) { - // Check if CHROME_BINARY is already set and valid - const configuredBinary = getEnv('CHROME_BINARY'); - if (configuredBinary && fs.existsSync(configuredBinary)) { - console.error(`[+] Using configured CHROME_BINARY: ${configuredBinary}`); - return { success: true, binary: configuredBinary, version: null }; - } - - // Try to load @puppeteer/browsers from NODE_MODULES_DIR or system - let puppeteerBrowsers; - try { - if (process.env.NODE_MODULES_DIR) { - module.paths.unshift(process.env.NODE_MODULES_DIR); - } - puppeteerBrowsers = require('@puppeteer/browsers'); - } catch (e) { - console.error(`[!] @puppeteer/browsers not found. Install it first with installPuppeteerCore.`); - return { success: false, error: '@puppeteer/browsers not installed' }; - } - - console.error(`[*] Installing Chromium via @puppeteer/browsers...`); - - try { - const result = await puppeteerBrowsers.install({ - browser: 'chromium', - buildId: 'latest', - }); - - const binary = result.executablePath; - const version = result.buildId; - - if (!binary || !fs.existsSync(binary)) { - console.error(`[!] Chromium binary not found at: ${binary}`); - return { success: false, error: `Chromium binary not found at: ${binary}` }; - } - - console.error(`[+] Chromium installed: ${binary}`); - return { success: true, binary, version }; - } catch (e) { - console.error(`[!] Failed to install Chromium: ${e.message}`); - return { success: false, error: e.message }; - } -} - -/** - * Install puppeteer-core npm package. 
- * - * @param {Object} options - Install options - * @param {string} [options.npmPrefix] - npm prefix directory (default: DATA_DIR/lib//npm or ./node_modules parent) - * @param {number} [options.timeout=60000] - Timeout in milliseconds - * @returns {Promise} - {success, path, error} - */ -async function installPuppeteerCore(options = {}) { - const arch = `${process.arch}-${process.platform}`; - const defaultPrefix = path.join(getEnv('LIB_DIR', getEnv('DATA_DIR', '.')), 'npm'); - const { - npmPrefix = defaultPrefix, - timeout = 60000, - } = options; - - const nodeModulesDir = path.join(npmPrefix, 'node_modules'); - const puppeteerPath = path.join(nodeModulesDir, 'puppeteer-core'); - - // Check if already installed - if (fs.existsSync(puppeteerPath)) { - console.error(`[+] puppeteer-core already installed: ${puppeteerPath}`); - return { success: true, path: puppeteerPath }; - } - - console.error(`[*] Installing puppeteer-core to ${npmPrefix}...`); - - // Create directory - if (!fs.existsSync(npmPrefix)) { - fs.mkdirSync(npmPrefix, { recursive: true }); - } - - try { - const { execSync } = require('child_process'); - execSync( - `npm install --prefix "${npmPrefix}" puppeteer-core`, - { encoding: 'utf8', timeout, stdio: ['pipe', 'pipe', 'pipe'] } - ); - console.error(`[+] puppeteer-core installed successfully`); - return { success: true, path: puppeteerPath }; - } catch (e) { - console.error(`[!] Failed to install puppeteer-core: ${e.message}`); - return { success: false, error: e.message }; - } -} - -// Try to import unzipper, fallback to system unzip if not available -let unzip = null; -try { - const unzipper = require('unzipper'); - unzip = async (sourcePath, destPath) => { - const stream = fs.createReadStream(sourcePath).pipe(unzipper.Extract({ path: destPath })); - return stream.promise(); - }; -} catch (err) { - // Will use system unzip command as fallback -} - -/** - * Compute the extension ID from the unpacked path. 
- * Chrome uses a SHA256 hash of the unpacked extension directory path to compute a dynamic id. - * - * @param {string} unpacked_path - Path to the unpacked extension directory - * @returns {string} - 32-character extension ID - */ -function getExtensionId(unpacked_path) { - let resolved_path = unpacked_path; - try { - resolved_path = fs.realpathSync(unpacked_path); - } catch (err) { - // Use the provided path if realpath fails - resolved_path = unpacked_path; - } - // Chrome uses a SHA256 hash of the unpacked extension directory path - const hash = crypto.createHash('sha256'); - hash.update(Buffer.from(resolved_path, 'utf-8')); - - // Convert first 32 hex chars to characters in the range 'a'-'p' - const detected_extension_id = Array.from(hash.digest('hex')) - .slice(0, 32) - .map(i => String.fromCharCode(parseInt(i, 16) + 'a'.charCodeAt(0))) - .join(''); - - return detected_extension_id; -} - -/** - * Download and install a Chrome extension from the Chrome Web Store. - * - * @param {Object} extension - Extension metadata object - * @param {string} extension.webstore_id - Chrome Web Store extension ID - * @param {string} extension.name - Human-readable extension name - * @param {string} extension.crx_url - URL to download the CRX file - * @param {string} extension.crx_path - Local path to save the CRX file - * @param {string} extension.unpacked_path - Path to extract the extension - * @returns {Promise} - True if installation succeeded - */ -async function installExtension(extension) { - const manifest_path = path.join(extension.unpacked_path, 'manifest.json'); - - // Download CRX file if not already downloaded - if (!fs.existsSync(manifest_path) && !fs.existsSync(extension.crx_path)) { - console.log(`[šŸ› ļø] Downloading missing extension ${extension.name} ${extension.webstore_id} -> ${extension.crx_path}`); - - try { - // Ensure parent directory exists - const crxDir = path.dirname(extension.crx_path); - if (!fs.existsSync(crxDir)) { - fs.mkdirSync(crxDir, { 
recursive: true }); - } - - // Download CRX file from Chrome Web Store - const response = await fetch(extension.crx_url); - - if (!response.ok) { - console.warn(`[āš ļø] Failed to download extension ${extension.name}: HTTP ${response.status}`); - return false; - } - - if (response.body) { - const crx_file = fs.createWriteStream(extension.crx_path); - const crx_stream = Readable.fromWeb(response.body); - await finished(crx_stream.pipe(crx_file)); - } else { - console.warn(`[āš ļø] Failed to download extension ${extension.name}: No response body`); - return false; - } - } catch (err) { - console.error(`[āŒ] Failed to download extension ${extension.name}:`, err); - return false; - } - } - - // Unzip CRX file to unpacked_path (CRX files have extra header bytes but unzip handles it) - await fs.promises.mkdir(extension.unpacked_path, { recursive: true }); - - try { - // Use -q to suppress warnings about extra bytes in CRX header - await execAsync(`/usr/bin/unzip -q -o "${extension.crx_path}" -d "${extension.unpacked_path}"`); - } catch (err1) { - // unzip may return non-zero even on success due to CRX header warning, check if manifest exists - if (!fs.existsSync(manifest_path)) { - if (unzip) { - // Fallback to unzipper library - try { - await unzip(extension.crx_path, extension.unpacked_path); - } catch (err2) { - console.error(`[āŒ] Failed to unzip ${extension.crx_path}:`, err2.message); - return false; - } - } else { - console.error(`[āŒ] Failed to unzip ${extension.crx_path}:`, err1.message); - return false; - } - } - } - - if (!fs.existsSync(manifest_path)) { - console.error(`[āŒ] Failed to install ${extension.crx_path}: could not find manifest.json in unpacked_path`); - return false; - } - - return true; -} - -/** - * Load or install a Chrome extension, computing all metadata. 
- * - * @param {Object} ext - Partial extension metadata (at minimum: webstore_id or unpacked_path) - * @param {string} [ext.webstore_id] - Chrome Web Store extension ID - * @param {string} [ext.name] - Human-readable extension name - * @param {string} [ext.unpacked_path] - Path to unpacked extension - * @param {string} [extensions_dir] - Directory to store extensions - * @returns {Promise} - Complete extension metadata object - */ -async function loadOrInstallExtension(ext, extensions_dir = null) { - if (!(ext.webstore_id || ext.unpacked_path)) { - throw new Error('Extension must have either {webstore_id} or {unpacked_path}'); - } - - // Determine extensions directory - // Use provided dir, or fall back to getExtensionsDir() which handles env vars and defaults - const EXTENSIONS_DIR = extensions_dir || getExtensionsDir(); - - // Set statically computable extension metadata - ext.webstore_id = ext.webstore_id || ext.id; - ext.name = ext.name || ext.webstore_id; - ext.webstore_url = ext.webstore_url || `https://chromewebstore.google.com/detail/${ext.webstore_id}`; - ext.crx_url = ext.crx_url || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`; - ext.crx_path = ext.crx_path || path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}.crx`); - ext.unpacked_path = ext.unpacked_path || path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}`); - - const manifest_path = path.join(ext.unpacked_path, 'manifest.json'); - ext.read_manifest = () => JSON.parse(fs.readFileSync(manifest_path, 'utf-8')); - ext.read_version = () => fs.existsSync(manifest_path) && ext.read_manifest()?.version || null; - - // If extension is not installed, download and unpack it - if (!ext.read_version()) { - await installExtension(ext); - } - - // Autodetect ID from filesystem path (unpacked extensions don't have stable IDs) - ext.id = getExtensionId(ext.unpacked_path); - ext.version = ext.read_version(); - - 
if (!ext.version) { - console.warn(`[āŒ] Unable to detect ID and version of installed extension ${ext.unpacked_path}`); - } else { - console.log(`[āž•] Installed extension ${ext.name} (${ext.version})... ${ext.unpacked_path}`); - } - - return ext; -} - -/** - * Check if a Puppeteer target is an extension background page/service worker. - * - * @param {Object} target - Puppeteer target object - * @returns {Promise} - Object with target_is_bg, extension_id, manifest_version, etc. - */ -async function isTargetExtension(target) { - let target_type; - let target_ctx; - let target_url; - - try { - target_type = target.type(); - target_ctx = (await target.worker()) || (await target.page()) || null; - target_url = target.url() || target_ctx?.url() || null; - } catch (err) { - if (String(err).includes('No target with given id found')) { - // Target closed during check, ignore harmless race condition - target_type = 'closed'; - target_ctx = null; - target_url = 'about:closed'; - } else { - throw err; - } - } - - // Check if this is an extension background page or service worker - const is_chrome_extension = target_url?.startsWith('chrome-extension://'); - const is_background_page = target_type === 'background_page'; - const is_service_worker = target_type === 'service_worker'; - const target_is_bg = is_chrome_extension && (is_background_page || is_service_worker); - - let extension_id = null; - let manifest_version = null; - let manifest = null; - let manifest_name = null; - const target_is_extension = is_chrome_extension || target_is_bg; - - if (target_is_extension) { - try { - extension_id = target_url?.split('://')[1]?.split('/')[0] || null; - - if (target_ctx) { - manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest()); - manifest_version = manifest?.manifest_version || null; - manifest_name = manifest?.name || null; - } - } catch (err) { - // Failed to get extension metadata - } - } - - return { - target_is_extension, - target_is_bg, - target_type, - 
target_ctx, - target_url, - extension_id, - manifest_version, - manifest, - manifest_name, - }; -} - -/** - * Load extension metadata and connection handlers from a browser target. - * - * @param {Array} extensions - Array of extension metadata objects to update - * @param {Object} target - Puppeteer target object - * @returns {Promise} - Updated extension object or null if not an extension - */ -async function loadExtensionFromTarget(extensions, target) { - const { - target_is_bg, - target_is_extension, - target_type, - target_ctx, - target_url, - extension_id, - manifest_version, - } = await isTargetExtension(target); - - if (!(target_is_bg && extension_id && target_ctx)) { - return null; - } - - // Find matching extension in our list - const extension = extensions.find(ext => ext.id === extension_id); - if (!extension) { - console.warn(`[āš ļø] Found loaded extension ${extension_id} that's not in CHROME_EXTENSIONS list`); - return null; - } - - // Load manifest from the extension context - let manifest = null; - try { - manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest()); - } catch (err) { - console.error(`[āŒ] Failed to read manifest for extension ${extension_id}:`, err); - return null; - } - - // Create dispatch methods for communicating with the extension - const new_extension = { - ...extension, - target, - target_type, - target_url, - manifest, - manifest_version, - - // Trigger extension toolbar button click - dispatchAction: async (tab) => { - return await target_ctx.evaluate(async (tab) => { - tab = tab || (await new Promise((resolve) => - chrome.tabs.query({ currentWindow: true, active: true }, ([tab]) => resolve(tab)) - )); - - // Manifest V3: chrome.action - if (chrome.action?.onClicked?.dispatch) { - return await chrome.action.onClicked.dispatch(tab); - } - - // Manifest V2: chrome.browserAction - if (chrome.browserAction?.onClicked?.dispatch) { - return await chrome.browserAction.onClicked.dispatch(tab); - } - - throw new 
Error('Extension action dispatch not available'); - }, tab || null); - }, - - // Send message to extension - dispatchMessage: async (message, options = {}) => { - return await target_ctx.evaluate((msg, opts) => { - return new Promise((resolve) => { - chrome.runtime.sendMessage(msg, opts, (response) => { - resolve(response); - }); - }); - }, message, options); - }, - - // Trigger extension command (keyboard shortcut) - dispatchCommand: async (command) => { - return await target_ctx.evaluate((cmd) => { - return new Promise((resolve) => { - chrome.commands.onCommand.addListener((receivedCommand) => { - if (receivedCommand === cmd) { - resolve({ success: true, command: receivedCommand }); - } - }); - // Note: Actually triggering commands programmatically is not directly supported - // This would need to be done via CDP or keyboard simulation - }); - }, command); - }, - }; - - // Update the extension in the array - Object.assign(extension, new_extension); - - console.log(`[šŸ”Œ] Connected to extension ${extension.name} (${extension.version})`); - - return new_extension; -} - -/** - * Install all extensions in the list if not already installed. - * - * @param {Array} extensions - Array of extension metadata objects - * @param {string} [extensions_dir] - Directory to store extensions - * @returns {Promise} - Array of installed extension objects - */ -async function installAllExtensions(extensions, extensions_dir = null) { - console.log(`[āš™ļø] Installing ${extensions.length} chrome extensions...`); - - for (const extension of extensions) { - await loadOrInstallExtension(extension, extensions_dir); - } - - return extensions; -} - -/** - * Load and connect to all extensions from a running browser. 
- * - * @param {Object} browser - Puppeteer browser instance - * @param {Array} extensions - Array of extension metadata objects - * @returns {Promise} - Array of loaded extension objects with connection handlers - */ -async function loadAllExtensionsFromBrowser(browser, extensions) { - console.log(`[āš™ļø] Loading ${extensions.length} chrome extensions from browser...`); - - // Find loaded extensions at runtime by examining browser targets - for (const target of browser.targets()) { - await loadExtensionFromTarget(extensions, target); - } - - return extensions; -} - -/** - * Load extension manifest.json file - * - * @param {string} unpacked_path - Path to unpacked extension directory - * @returns {object|null} - Parsed manifest object or null if not found/invalid - */ -function loadExtensionManifest(unpacked_path) { - const manifest_path = path.join(unpacked_path, 'manifest.json'); - - if (!fs.existsSync(manifest_path)) { - return null; - } - - try { - const manifest_content = fs.readFileSync(manifest_path, 'utf-8'); - return JSON.parse(manifest_content); - } catch (error) { - // Invalid JSON or read error - return null; - } -} - -/** - * @deprecated Use puppeteer's enableExtensions option instead. - * - * Generate Chrome launch arguments for loading extensions. - * NOTE: This is deprecated. Use puppeteer.launch({ pipe: true, enableExtensions: [paths] }) instead. - * - * @param {Array} extensions - Array of extension metadata objects - * @returns {Array} - Chrome CLI arguments for loading extensions - */ -function getExtensionLaunchArgs(extensions) { - console.warn('[DEPRECATED] getExtensionLaunchArgs is deprecated. 
Use puppeteer enableExtensions option instead.'); - if (!extensions || extensions.length === 0) { - return []; - } - - // Filter out extensions without unpacked_path first - const validExtensions = extensions.filter(ext => ext.unpacked_path); - - const unpacked_paths = validExtensions.map(ext => ext.unpacked_path); - // Use computed id (from path hash) for allowlisting, as that's what Chrome uses for unpacked extensions - // Fall back to webstore_id if computed id not available - const extension_ids = validExtensions.map(ext => ext.id || getExtensionId(ext.unpacked_path)); - - return [ - `--load-extension=${unpacked_paths.join(',')}`, - `--allowlisted-extension-id=${extension_ids.join(',')}`, - '--allow-legacy-extension-manifests', - '--disable-extensions-auto-update', - ]; -} - -/** - * Get extension paths for use with puppeteer's enableExtensions option. - * Following puppeteer best practices: https://pptr.dev/guides/chrome-extensions - * - * @param {Array} extensions - Array of extension metadata objects - * @returns {Array} - Array of extension unpacked paths - */ -function getExtensionPaths(extensions) { - if (!extensions || extensions.length === 0) { - return []; - } - return extensions - .filter(ext => ext.unpacked_path) - .map(ext => ext.unpacked_path); -} - -/** - * Wait for an extension target to be available in the browser. - * Following puppeteer best practices for accessing extension contexts. 
- * - * For Manifest V3 extensions (service workers): - * const worker = await waitForExtensionTarget(browser, extensionId); - * // worker is a WebWorker context - * - * For Manifest V2 extensions (background pages): - * const page = await waitForExtensionTarget(browser, extensionId); - * // page is a Page context - * - * @param {Object} browser - Puppeteer browser instance - * @param {string} extensionId - Extension ID to wait for (computed from path hash) - * @param {number} [timeout=30000] - Timeout in milliseconds - * @returns {Promise} - Worker or Page context for the extension - */ -async function waitForExtensionTarget(browser, extensionId, timeout = 30000) { - // Try to find service worker first (Manifest V3) - try { - const workerTarget = await browser.waitForTarget( - target => target.type() === 'service_worker' && - target.url().includes(`chrome-extension://${extensionId}`), - { timeout } - ); - const worker = await workerTarget.worker(); - if (worker) return worker; - } catch (err) { - // No service worker found, try background page - } - - // Try background page (Manifest V2) - try { - const backgroundTarget = await browser.waitForTarget( - target => target.type() === 'background_page' && - target.url().includes(`chrome-extension://${extensionId}`), - { timeout } - ); - const page = await backgroundTarget.page(); - if (page) return page; - } catch (err) { - // No background page found - } - - // Try any extension page as fallback - const extTarget = await browser.waitForTarget( - target => target.url().startsWith(`chrome-extension://${extensionId}`), - { timeout } - ); - - // Return worker or page depending on target type - if (extTarget.type() === 'service_worker') { - return await extTarget.worker(); - } - return await extTarget.page(); -} - -/** - * Get all loaded extension targets from a browser. 
- * - * @param {Object} browser - Puppeteer browser instance - * @returns {Array} - Array of extension target info objects - */ -function getExtensionTargets(browser) { - return browser.targets() - .filter(target => - target.url().startsWith('chrome-extension://') || - target.type() === 'service_worker' || - target.type() === 'background_page' - ) - .map(target => ({ - type: target.type(), - url: target.url(), - extensionId: target.url().includes('chrome-extension://') - ? target.url().split('chrome-extension://')[1]?.split('/')[0] - : null, - })); -} - -/** - * Find Chromium binary path. - * Checks CHROME_BINARY env var first, then falls back to system locations. - * - * @returns {string|null} - Absolute path to browser binary or null if not found - */ -function findChromium() { - const { execSync } = require('child_process'); - - // Helper to validate a binary by running --version - const validateBinary = (binaryPath) => { - if (!binaryPath || !fs.existsSync(binaryPath)) return false; - try { - execSync(`"${binaryPath}" --version`, { encoding: 'utf8', timeout: 5000, stdio: 'pipe' }); - return true; - } catch (e) { - return false; - } - }; - - // 1. Check CHROME_BINARY env var first - const chromeBinary = getEnv('CHROME_BINARY'); - if (chromeBinary) { - const absPath = path.resolve(chromeBinary); - if (absPath.includes('Google Chrome') || absPath.includes('google-chrome')) { - console.error('[!] Warning: CHROME_BINARY points to Chrome. Chromium is required for extension support.'); - } else if (validateBinary(absPath)) { - return absPath; - } - console.error(`[!] Warning: CHROME_BINARY="${chromeBinary}" is not valid`); - } - - // 2. Warn that no CHROME_BINARY is configured, searching fallbacks - if (!chromeBinary) { - console.error('[!] 
Warning: CHROME_BINARY not set, searching system locations...'); - } - - // Helper to find Chromium in @puppeteer/browsers directory structure - const findInPuppeteerDir = (baseDir) => { - if (!fs.existsSync(baseDir)) return null; - try { - const versions = fs.readdirSync(baseDir); - for (const version of versions.sort().reverse()) { - const versionDir = path.join(baseDir, version); - const candidates = [ - path.join(versionDir, 'chrome-mac-arm64/Chromium.app/Contents/MacOS/Chromium'), - path.join(versionDir, 'chrome-mac/Chromium.app/Contents/MacOS/Chromium'), - path.join(versionDir, 'chrome-mac-x64/Chromium.app/Contents/MacOS/Chromium'), - path.join(versionDir, 'chrome-linux64/chrome'), - path.join(versionDir, 'chrome-linux/chrome'), - ]; - for (const c of candidates) { - if (fs.existsSync(c)) return c; - } - } - } catch (e) {} - return null; - }; - - // 3. Search fallback locations (Chromium only) - const fallbackLocations = [ - // System Chromium - '/Applications/Chromium.app/Contents/MacOS/Chromium', - '/usr/bin/chromium', - '/usr/bin/chromium-browser', - // Puppeteer cache - path.join(process.env.HOME || '', '.cache/puppeteer/chromium'), - path.join(process.env.HOME || '', '.cache/puppeteer'), - ]; - - for (const loc of fallbackLocations) { - // Check if it's a puppeteer cache dir - if (loc.includes('.cache/puppeteer')) { - const binary = findInPuppeteerDir(loc); - if (binary && validateBinary(binary)) { - return binary; - } - } else if (validateBinary(loc)) { - return loc; - } - } - - return null; -} - -/** - * Find Chromium binary path only (never Chrome/Brave/Edge). - * Prefers CHROME_BINARY if set, then Chromium. 
- * - * @returns {string|null} - Absolute path or command name to browser binary - */ -function findAnyChromiumBinary() { - const chromiumBinary = findChromium(); - if (chromiumBinary) return chromiumBinary; - return null; -} - -// ============================================================================ -// Shared Extension Installer Utilities -// ============================================================================ - -/** - * Get the extensions directory path. - * Centralized path calculation used by extension installers and chrome launch. - * - * Path is derived from environment variables in this priority: - * 1. CHROME_EXTENSIONS_DIR (explicit override) - * 2. DATA_DIR/personas/ACTIVE_PERSONA/chrome_extensions (default) - * - * @returns {string} - Absolute path to extensions directory - */ -function getExtensionsDir() { - const dataDir = getEnv('DATA_DIR', '.'); - const persona = getEnv('ACTIVE_PERSONA', 'Default'); - return getEnv('CHROME_EXTENSIONS_DIR') || - path.join(dataDir, 'personas', persona, 'chrome_extensions'); -} - -/** - * Get machine type string for platform-specific paths. - * Matches Python's archivebox.config.paths.get_machine_type() - * - * @returns {string} - Machine type (e.g., 'x86_64-linux', 'arm64-darwin') - */ -function getMachineType() { - if (process.env.MACHINE_TYPE) { - return process.env.MACHINE_TYPE; - } - - let machine = process.arch; - const system = process.platform; - - // Normalize machine type to match Python's convention - if (machine === 'arm64' || machine === 'aarch64') { - machine = 'arm64'; - } else if (machine === 'x64' || machine === 'x86_64' || machine === 'amd64') { - machine = 'x86_64'; - } else if (machine === 'ia32' || machine === 'x86') { - machine = 'x86'; - } - - return `${machine}-${system}`; -} - -/** - * Get LIB_DIR path for platform-specific binaries. 
- * Returns DATA_DIR/lib/MACHINE_TYPE/ - * - * @returns {string} - Absolute path to lib directory - */ -function getLibDir() { - if (process.env.LIB_DIR) { - return path.resolve(process.env.LIB_DIR); - } - const dataDir = getEnv('DATA_DIR', './data'); - const machineType = getMachineType(); - return path.resolve(path.join(dataDir, 'lib', machineType)); -} - -/** - * Get NODE_MODULES_DIR path for npm packages. - * Returns LIB_DIR/npm/node_modules/ - * - * @returns {string} - Absolute path to node_modules directory - */ -function getNodeModulesDir() { - if (process.env.NODE_MODULES_DIR) { - return path.resolve(process.env.NODE_MODULES_DIR); - } - return path.resolve(path.join(getLibDir(), 'npm', 'node_modules')); -} - -/** - * Get all test environment paths as a JSON object. - * This is the single source of truth for path calculations - Python calls this - * to avoid duplicating path logic. - * - * @returns {Object} - Object with all test environment paths - */ -function getTestEnv() { - const dataDir = getEnv('DATA_DIR', './data'); - const machineType = getMachineType(); - const libDir = getLibDir(); - const nodeModulesDir = getNodeModulesDir(); - - return { - DATA_DIR: dataDir, - MACHINE_TYPE: machineType, - LIB_DIR: libDir, - NODE_MODULES_DIR: nodeModulesDir, - NODE_PATH: nodeModulesDir, // Node.js uses NODE_PATH for module resolution - NPM_BIN_DIR: path.join(libDir, 'npm', '.bin'), - CHROME_EXTENSIONS_DIR: getExtensionsDir(), - }; -} - -/** - * Install a Chrome extension with caching support. - * - * This is the main entry point for extension installer hooks. 
It handles: - * - Checking for cached extension metadata - * - Installing the extension if not cached - * - Writing cache file for future runs - * - * @param {Object} extension - Extension metadata object - * @param {string} extension.webstore_id - Chrome Web Store extension ID - * @param {string} extension.name - Human-readable extension name (used for cache file) - * @param {Object} [options] - Options - * @param {string} [options.extensionsDir] - Override extensions directory - * @param {boolean} [options.quiet=false] - Suppress info logging - * @returns {Promise} - Installed extension metadata or null on failure - */ -async function installExtensionWithCache(extension, options = {}) { - const { - extensionsDir = getExtensionsDir(), - quiet = false, - } = options; - - const cacheFile = path.join(extensionsDir, `${extension.name}.extension.json`); - - // Check if extension is already cached and valid - if (fs.existsSync(cacheFile)) { - try { - const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8')); - const manifestPath = path.join(cached.unpacked_path, 'manifest.json'); - - if (fs.existsSync(manifestPath)) { - if (!quiet) { - console.log(`[*] ${extension.name} extension already installed (using cache)`); - } - return cached; - } - } catch (e) { - // Cache file corrupted, re-install - console.warn(`[āš ļø] Extension cache corrupted for ${extension.name}, re-installing...`); - } - } - - // Install extension - if (!quiet) { - console.log(`[*] Installing ${extension.name} extension...`); - } - - const installedExt = await loadOrInstallExtension(extension, extensionsDir); - - if (!installedExt?.version) { - console.error(`[āŒ] Failed to install ${extension.name} extension`); - return null; - } - - // Write cache file - try { - await fs.promises.mkdir(extensionsDir, { recursive: true }); - await fs.promises.writeFile(cacheFile, JSON.stringify(installedExt, null, 2)); - if (!quiet) { - console.log(`[+] Extension metadata written to ${cacheFile}`); - } - } 
catch (e) { - console.warn(`[āš ļø] Failed to write cache file: ${e.message}`); - } - - if (!quiet) { - console.log(`[+] ${extension.name} extension installed`); - } - - return installedExt; -} - -// ============================================================================ -// Snapshot Hook Utilities (for CDP-based plugins like ssl, responses, dns) -// ============================================================================ - -/** - * Parse command line arguments into an object. - * Handles --key=value and --flag formats. - * - * @returns {Object} - Parsed arguments object - */ -function parseArgs() { - const args = {}; - process.argv.slice(2).forEach(arg => { - if (arg.startsWith('--')) { - const [key, ...valueParts] = arg.slice(2).split('='); - args[key.replace(/-/g, '_')] = valueParts.join('=') || true; - } - }); - return args; -} - -/** - * Wait for Chrome session files to be ready. - * Polls for cdp_url.txt and target_id.txt in the chrome session directory. - * - * @param {string} chromeSessionDir - Path to chrome session directory (e.g., '../chrome') - * @param {number} [timeoutMs=60000] - Timeout in milliseconds - * @returns {Promise} - True if files are ready, false if timeout - */ -async function waitForChromeSession(chromeSessionDir, timeoutMs = 60000) { - const cdpFile = path.join(chromeSessionDir, 'cdp_url.txt'); - const targetIdFile = path.join(chromeSessionDir, 'target_id.txt'); - const startTime = Date.now(); - - while (Date.now() - startTime < timeoutMs) { - if (fs.existsSync(cdpFile) && fs.existsSync(targetIdFile)) { - return true; - } - await new Promise(resolve => setTimeout(resolve, 100)); - } - - return false; -} - -/** - * Read CDP WebSocket URL from chrome session directory. 
- * - * @param {string} chromeSessionDir - Path to chrome session directory - * @returns {string|null} - CDP URL or null if not found - */ -function readCdpUrl(chromeSessionDir) { - const cdpFile = path.join(chromeSessionDir, 'cdp_url.txt'); - if (fs.existsSync(cdpFile)) { - return fs.readFileSync(cdpFile, 'utf8').trim(); - } - return null; -} - -/** - * Read target ID from chrome session directory. - * - * @param {string} chromeSessionDir - Path to chrome session directory - * @returns {string|null} - Target ID or null if not found - */ -function readTargetId(chromeSessionDir) { - const targetIdFile = path.join(chromeSessionDir, 'target_id.txt'); - if (fs.existsSync(targetIdFile)) { - return fs.readFileSync(targetIdFile, 'utf8').trim(); - } - return null; -} - -/** - * Connect to Chrome browser and find the target page. - * This is a high-level utility that handles all the connection logic: - * 1. Wait for chrome session files - * 2. Connect to browser via CDP - * 3. Find the target page by ID - * - * @param {Object} options - Connection options - * @param {string} [options.chromeSessionDir='../chrome'] - Path to chrome session directory - * @param {number} [options.timeoutMs=60000] - Timeout for waiting - * @param {Object} [options.puppeteer] - Puppeteer module (must be passed in) - * @returns {Promise} - { browser, page, targetId, cdpUrl } - * @throws {Error} - If connection fails or page not found - */ -async function connectToPage(options = {}) { - const { - chromeSessionDir = '../chrome', - timeoutMs = 60000, - puppeteer, - } = options; - - if (!puppeteer) { - throw new Error('puppeteer module must be passed to connectToPage()'); - } - - // Wait for chrome session to be ready - const sessionReady = await waitForChromeSession(chromeSessionDir, timeoutMs); - if (!sessionReady) { - throw new Error(CHROME_SESSION_REQUIRED_ERROR); - } - - // Read session files - const cdpUrl = readCdpUrl(chromeSessionDir); - if (!cdpUrl) { - throw new 
Error(CHROME_SESSION_REQUIRED_ERROR); - } - - const targetId = readTargetId(chromeSessionDir); - - // Connect to browser - const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl }); - - // Find the target page - const pages = await browser.pages(); - let page = null; - - if (targetId) { - page = pages.find(p => { - const target = p.target(); - return target && target._targetId === targetId; - }); - } - - // Fallback to last page if target not found - if (!page) { - page = pages[pages.length - 1]; - } - - if (!page) { - throw new Error('No page found in browser'); - } - - return { browser, page, targetId, cdpUrl }; -} - -/** - * Wait for page navigation to complete. - * Polls for page_loaded.txt marker file written by chrome_navigate. - * - * @param {string} chromeSessionDir - Path to chrome session directory - * @param {number} [timeoutMs=120000] - Timeout in milliseconds - * @param {number} [postLoadDelayMs=0] - Additional delay after page load marker - * @returns {Promise} - * @throws {Error} - If timeout waiting for navigation - */ -async function waitForPageLoaded(chromeSessionDir, timeoutMs = 120000, postLoadDelayMs = 0) { - const pageLoadedMarker = path.join(chromeSessionDir, 'page_loaded.txt'); - const pollInterval = 100; - let waitTime = 0; - - while (!fs.existsSync(pageLoadedMarker) && waitTime < timeoutMs) { - await new Promise(resolve => setTimeout(resolve, pollInterval)); - waitTime += pollInterval; - } - - if (!fs.existsSync(pageLoadedMarker)) { - throw new Error('Timeout waiting for navigation (chrome_navigate did not complete)'); - } - - // Optional post-load delay for late responses - if (postLoadDelayMs > 0) { - await new Promise(resolve => setTimeout(resolve, postLoadDelayMs)); - } -} - -// Export all functions -module.exports = { - // Environment helpers - getEnv, - getEnvBool, - getEnvInt, - getEnvArray, - parseResolution, - // PID file management - writePidWithMtime, - writeCmdScript, - // Port management - findFreePort, - 
waitForDebugPort, - // Zombie cleanup - killZombieChrome, - // Chrome launching - launchChromium, - killChrome, - // Chromium install - installChromium, - installPuppeteerCore, - // Chromium binary finding - findChromium, - findAnyChromiumBinary, - // Extension utilities - getExtensionId, - loadExtensionManifest, - installExtension, - loadOrInstallExtension, - isTargetExtension, - loadExtensionFromTarget, - installAllExtensions, - loadAllExtensionsFromBrowser, - // New puppeteer best-practices helpers - getExtensionPaths, - waitForExtensionTarget, - getExtensionTargets, - // Shared path utilities (single source of truth for Python/JS) - getMachineType, - getLibDir, - getNodeModulesDir, - getExtensionsDir, - getTestEnv, - // Shared extension installer utilities - installExtensionWithCache, - // Deprecated - use enableExtensions option instead - getExtensionLaunchArgs, - // Snapshot hook utilities (for CDP-based plugins) - parseArgs, - waitForChromeSession, - readCdpUrl, - readTargetId, - connectToPage, - waitForPageLoaded, -}; - -// CLI usage -if (require.main === module) { - const args = process.argv.slice(2); - - if (args.length === 0) { - console.log('Usage: chrome_utils.js [args...]'); - console.log(''); - console.log('Commands:'); - console.log(' findChromium Find Chromium binary'); - console.log(' installChromium Install Chromium via @puppeteer/browsers'); - console.log(' installPuppeteerCore Install puppeteer-core npm package'); - console.log(' launchChromium Launch Chrome with CDP debugging'); - console.log(' killChrome Kill Chrome process by PID'); - console.log(' killZombieChrome Clean up zombie Chrome processes'); - console.log(''); - console.log(' getMachineType Get machine type (e.g., x86_64-linux)'); - console.log(' getLibDir Get LIB_DIR path'); - console.log(' getNodeModulesDir Get NODE_MODULES_DIR path'); - console.log(' getExtensionsDir Get Chrome extensions directory'); - console.log(' getTestEnv Get all paths as JSON (for tests)'); - 
console.log(''); - console.log(' getExtensionId Get extension ID from unpacked path'); - console.log(' loadExtensionManifest Load extension manifest.json'); - console.log(' loadOrInstallExtension Load or install an extension'); - console.log(' installExtensionWithCache Install extension with caching'); - console.log(''); - console.log('Environment variables:'); - console.log(' DATA_DIR Base data directory'); - console.log(' LIB_DIR Library directory (computed if not set)'); - console.log(' MACHINE_TYPE Machine type override'); - console.log(' NODE_MODULES_DIR Node modules directory'); - console.log(' CHROME_BINARY Chrome binary path'); - console.log(' CHROME_EXTENSIONS_DIR Extensions directory'); - process.exit(1); - } - - const [command, ...commandArgs] = args; - - (async () => { - try { - switch (command) { - case 'findChromium': { - const binary = findChromium(); - if (binary) { - console.log(binary); - } else { - console.error('Chromium binary not found'); - process.exit(1); - } - break; - } - - case 'installChromium': { - const result = await installChromium(); - if (result.success) { - console.log(JSON.stringify({ - binary: result.binary, - version: result.version, - })); - } else { - console.error(result.error); - process.exit(1); - } - break; - } - - case 'installPuppeteerCore': { - const [npmPrefix] = commandArgs; - const result = await installPuppeteerCore({ npmPrefix: npmPrefix || undefined }); - if (result.success) { - console.log(JSON.stringify({ path: result.path })); - } else { - console.error(result.error); - process.exit(1); - } - break; - } - - case 'launchChromium': { - const [outputDir, extensionPathsJson] = commandArgs; - const extensionPaths = extensionPathsJson ? 
JSON.parse(extensionPathsJson) : []; - const result = await launchChromium({ - outputDir: outputDir || 'chrome', - extensionPaths, - }); - if (result.success) { - console.log(JSON.stringify({ - cdpUrl: result.cdpUrl, - pid: result.pid, - port: result.port, - })); - } else { - console.error(result.error); - process.exit(1); - } - break; - } - - case 'killChrome': { - const [pidStr, outputDir] = commandArgs; - const pid = parseInt(pidStr, 10); - if (isNaN(pid)) { - console.error('Invalid PID'); - process.exit(1); - } - await killChrome(pid, outputDir); - break; - } - - case 'killZombieChrome': { - const [dataDir] = commandArgs; - const killed = killZombieChrome(dataDir); - console.log(killed); - break; - } - - case 'getExtensionId': { - const [unpacked_path] = commandArgs; - const id = getExtensionId(unpacked_path); - console.log(id); - break; - } - - case 'loadExtensionManifest': { - const [unpacked_path] = commandArgs; - const manifest = loadExtensionManifest(unpacked_path); - console.log(JSON.stringify(manifest)); - break; - } - - case 'getExtensionLaunchArgs': { - const [extensions_json] = commandArgs; - const extensions = JSON.parse(extensions_json); - const launchArgs = getExtensionLaunchArgs(extensions); - console.log(JSON.stringify(launchArgs)); - break; - } - - case 'loadOrInstallExtension': { - const [webstore_id, name, extensions_dir] = commandArgs; - const ext = await loadOrInstallExtension({ webstore_id, name }, extensions_dir); - console.log(JSON.stringify(ext, null, 2)); - break; - } - - case 'getMachineType': { - console.log(getMachineType()); - break; - } - - case 'getLibDir': { - console.log(getLibDir()); - break; - } - - case 'getNodeModulesDir': { - console.log(getNodeModulesDir()); - break; - } - - case 'getExtensionsDir': { - console.log(getExtensionsDir()); - break; - } - - case 'getTestEnv': { - console.log(JSON.stringify(getTestEnv(), null, 2)); - break; - } - - case 'installExtensionWithCache': { - const [webstore_id, name] = commandArgs; - 
if (!webstore_id || !name) { - console.error('Usage: installExtensionWithCache '); - process.exit(1); - } - const ext = await installExtensionWithCache({ webstore_id, name }); - if (ext) { - console.log(JSON.stringify(ext, null, 2)); - } else { - process.exit(1); - } - break; - } - - default: - console.error(`Unknown command: ${command}`); - process.exit(1); - } - } catch (error) { - console.error(`Error: ${error.message}`); - process.exit(1); - } - })(); -} diff --git a/archivebox/plugins/chrome/config.json b/archivebox/plugins/chrome/config.json deleted file mode 100644 index f4d6a4d8..00000000 --- a/archivebox/plugins/chrome/config.json +++ /dev/null @@ -1,157 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "properties": { - "CHROME_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["USE_CHROME"], - "description": "Enable Chromium browser integration for archiving" - }, - "CHROME_BINARY": { - "type": "string", - "default": "chromium", - "x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"], - "description": "Path to Chromium binary" - }, - "CHROME_NODE_BINARY": { - "type": "string", - "default": "node", - "x-fallback": "NODE_BINARY", - "description": "Path to Node.js binary (for Puppeteer)" - }, - "CHROME_TIMEOUT": { - "type": "integer", - "default": 60, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for Chrome operations in seconds" - }, - "CHROME_HEADLESS": { - "type": "boolean", - "default": true, - "description": "Run Chrome in headless mode" - }, - "CHROME_SANDBOX": { - "type": "boolean", - "default": true, - "description": "Enable Chrome sandbox (disable in Docker with --no-sandbox)" - }, - "CHROME_RESOLUTION": { - "type": "string", - "default": "1440,2000", - "pattern": "^\\d+,\\d+$", - "x-fallback": "RESOLUTION", - "description": "Browser viewport resolution (width,height)" - }, - "CHROME_USER_DATA_DIR": { - "type": "string", - "default": 
"", - "description": "Path to Chrome user data directory for persistent sessions (derived from ACTIVE_PERSONA if not set)" - }, - "CHROME_USER_AGENT": { - "type": "string", - "default": "", - "x-fallback": "USER_AGENT", - "description": "User agent string for Chrome" - }, - "CHROME_ARGS": { - "type": "array", - "items": {"type": "string"}, - "default": [ - "--no-first-run", - "--no-default-browser-check", - "--disable-default-apps", - "--disable-sync", - "--disable-infobars", - "--disable-blink-features=AutomationControlled", - "--disable-component-update", - "--disable-domain-reliability", - "--disable-breakpad", - "--disable-client-side-phishing-detection", - "--disable-hang-monitor", - "--disable-speech-synthesis-api", - "--disable-speech-api", - "--disable-print-preview", - "--disable-notifications", - "--disable-desktop-notifications", - "--disable-popup-blocking", - "--disable-prompt-on-repost", - "--disable-external-intent-requests", - "--disable-session-crashed-bubble", - "--disable-search-engine-choice-screen", - "--disable-datasaver-prompt", - "--ash-no-nudges", - "--hide-crash-restore-bubble", - "--suppress-message-center-popups", - "--noerrdialogs", - "--no-pings", - "--silent-debugger-extension-api", - "--deny-permission-prompts", - "--safebrowsing-disable-auto-update", - "--metrics-recording-only", - "--password-store=basic", - "--use-mock-keychain", - "--disable-cookie-encryption", - "--font-render-hinting=none", - "--force-color-profile=srgb", - "--disable-partial-raster", - "--disable-skia-runtime-opts", - "--disable-2d-canvas-clip-aa", - "--enable-webgl", - "--hide-scrollbars", - "--export-tagged-pdf", - "--generate-pdf-document-outline", - "--disable-lazy-loading", - "--disable-renderer-backgrounding", - "--disable-background-networking", - "--disable-background-timer-throttling", - "--disable-backgrounding-occluded-windows", - "--disable-ipc-flooding-protection", - "--disable-extensions-http-throttling", - "--disable-field-trial-config", - 
"--disable-back-forward-cache", - "--autoplay-policy=no-user-gesture-required", - "--disable-gesture-requirement-for-media-playback", - "--lang=en-US,en;q=0.9", - "--log-level=2", - "--enable-logging=stderr" - ], - "x-aliases": ["CHROME_DEFAULT_ARGS"], - "description": "Default Chrome command-line arguments (static flags only, dynamic args like --user-data-dir are added at runtime)" - }, - "CHROME_ARGS_EXTRA": { - "type": "array", - "items": {"type": "string"}, - "default": [], - "x-aliases": ["CHROME_EXTRA_ARGS"], - "description": "Extra arguments to append to Chrome command (for user customization)" - }, - "CHROME_PAGELOAD_TIMEOUT": { - "type": "integer", - "default": 60, - "minimum": 5, - "x-fallback": "CHROME_TIMEOUT", - "description": "Timeout for page navigation/load in seconds" - }, - "CHROME_WAIT_FOR": { - "type": "string", - "default": "networkidle2", - "enum": ["domcontentloaded", "load", "networkidle0", "networkidle2"], - "description": "Page load completion condition (domcontentloaded, load, networkidle0, networkidle2)" - }, - "CHROME_DELAY_AFTER_LOAD": { - "type": "number", - "default": 0, - "minimum": 0, - "description": "Extra delay in seconds after page load completes before archiving (useful for JS-heavy SPAs)" - }, - "CHROME_CHECK_SSL_VALIDITY": { - "type": "boolean", - "default": true, - "x-fallback": "CHECK_SSL_VALIDITY", - "description": "Whether to verify SSL certificates (disable for self-signed certs)" - } - } -} diff --git a/archivebox/plugins/chrome/extract_cookies.js b/archivebox/plugins/chrome/extract_cookies.js deleted file mode 100644 index c23515dc..00000000 --- a/archivebox/plugins/chrome/extract_cookies.js +++ /dev/null @@ -1,254 +0,0 @@ -#!/usr/bin/env node -/** - * Extract cookies from Chrome via CDP and write to Netscape cookies.txt format. - * - * This script launches Chrome with a given user data directory, connects via CDP, - * extracts all cookies, and writes them to a cookies.txt file in Netscape format. 
- * - * Usage: - * CHROME_USER_DATA_DIR=/path/to/profile COOKIES_OUTPUT_FILE=/path/to/cookies.txt node extract_cookies.js - * - * Environment variables: - * CHROME_USER_DATA_DIR: Path to Chrome user data directory (required) - * COOKIES_OUTPUT_FILE: Path to output cookies.txt file (required) - * CHROME_HEADLESS: Run in headless mode (default: true) - * NODE_MODULES_DIR: Path to node_modules for module resolution - */ - -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) { - module.paths.unshift(process.env.NODE_MODULES_DIR); -} - -const fs = require('fs'); -const path = require('path'); -const { - findAnyChromiumBinary, - launchChromium, - killChrome, - getEnv, -} = require('./chrome_utils.js'); - -/** - * Convert a cookie object to Netscape cookies.txt format line. - * - * Format: domain includeSubdomains path secure expiry name value - * - * @param {Object} cookie - CDP cookie object - * @returns {string} - Netscape format cookie line - */ -function cookieToNetscape(cookie) { - // Domain: prefix with . for domain cookies (not host-only) - let domain = cookie.domain; - if (!domain.startsWith('.') && !cookie.hostOnly) { - domain = '.' + domain; - } - - // Include subdomains: TRUE if domain cookie (starts with .) - const includeSubdomains = domain.startsWith('.') ? 'TRUE' : 'FALSE'; - - // Path - const cookiePath = cookie.path || '/'; - - // Secure flag - const secure = cookie.secure ? 'TRUE' : 'FALSE'; - - // Expiry timestamp (0 for session cookies) - let expiry = '0'; - if (cookie.expires && cookie.expires > 0) { - // CDP returns expiry in seconds since epoch - expiry = Math.floor(cookie.expires).toString(); - } - - // Name and value - const name = cookie.name; - const value = cookie.value; - - return `${domain}\t${includeSubdomains}\t${cookiePath}\t${secure}\t${expiry}\t${name}\t${value}`; -} - -/** - * Write cookies to Netscape cookies.txt format file. 
- * - * @param {Array} cookies - Array of CDP cookie objects - * @param {string} outputPath - Path to output file - */ -function writeCookiesFile(cookies, outputPath) { - const lines = [ - '# Netscape HTTP Cookie File', - '# https://curl.se/docs/http-cookies.html', - '# This file was generated by ArchiveBox persona cookie extraction', - '#', - '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue', - '', - ]; - - for (const cookie of cookies) { - lines.push(cookieToNetscape(cookie)); - } - - fs.writeFileSync(outputPath, lines.join('\n') + '\n'); -} - -async function main() { - const userDataDir = getEnv('CHROME_USER_DATA_DIR'); - const outputFile = getEnv('COOKIES_OUTPUT_FILE'); - - if (!userDataDir) { - console.error('ERROR: CHROME_USER_DATA_DIR environment variable is required'); - process.exit(1); - } - - if (!outputFile) { - console.error('ERROR: COOKIES_OUTPUT_FILE environment variable is required'); - process.exit(1); - } - - if (!fs.existsSync(userDataDir)) { - console.error(`ERROR: User data directory does not exist: ${userDataDir}`); - process.exit(1); - } - - const binary = findAnyChromiumBinary(); - if (!binary) { - console.error('ERROR: Chromium-based browser binary not found'); - process.exit(1); - } - - console.error(`[*] Extracting cookies from: ${userDataDir}`); - console.error(`[*] Output file: ${outputFile}`); - console.error(`[*] Using browser: ${binary}`); - - // Create a temporary output directory for Chrome files - const outputDir = fs.mkdtempSync(path.join(require('os').tmpdir(), 'chrome-cookies-')); - - let chromePid = null; - - try { - // Launch Chrome with the user data directory - const result = await launchChromium({ - binary, - outputDir, - userDataDir, - headless: true, - killZombies: false, // Don't kill other Chrome instances - }); - - if (!result.success) { - console.error(`ERROR: Failed to launch Chrome: ${result.error}`); - process.exit(1); - } - - chromePid = result.pid; - const cdpUrl = result.cdpUrl; - 
const port = result.port; - - console.error(`[*] Chrome launched (PID: ${chromePid})`); - console.error(`[*] CDP URL: ${cdpUrl}`); - - // Connect to CDP and get cookies - const http = require('http'); - - // Use CDP directly via HTTP to get all cookies - const getCookies = () => { - return new Promise((resolve, reject) => { - const req = http.request( - { - hostname: '127.0.0.1', - port: port, - path: '/json/list', - method: 'GET', - }, - (res) => { - let data = ''; - res.on('data', (chunk) => (data += chunk)); - res.on('end', () => { - try { - const targets = JSON.parse(data); - // Find a page target - const pageTarget = targets.find(t => t.type === 'page') || targets[0]; - if (!pageTarget) { - reject(new Error('No page target found')); - return; - } - - // Connect via WebSocket and send CDP command - const WebSocket = require('ws'); - const ws = new WebSocket(pageTarget.webSocketDebuggerUrl); - - ws.on('open', () => { - ws.send(JSON.stringify({ - id: 1, - method: 'Network.getAllCookies', - })); - }); - - ws.on('message', (message) => { - const response = JSON.parse(message); - if (response.id === 1) { - ws.close(); - if (response.result && response.result.cookies) { - resolve(response.result.cookies); - } else { - reject(new Error('Failed to get cookies: ' + JSON.stringify(response))); - } - } - }); - - ws.on('error', (err) => { - reject(err); - }); - } catch (e) { - reject(e); - } - }); - } - ); - - req.on('error', reject); - req.end(); - }); - }; - - // Wait a moment for the browser to fully initialize - await new Promise(r => setTimeout(r, 2000)); - - console.error('[*] Fetching cookies via CDP...'); - const cookies = await getCookies(); - - console.error(`[+] Retrieved ${cookies.length} cookies`); - - // Write cookies to file - writeCookiesFile(cookies, outputFile); - console.error(`[+] Wrote cookies to: ${outputFile}`); - - // Clean up - await killChrome(chromePid, outputDir); - chromePid = null; - - // Remove temp directory - fs.rmSync(outputDir, { 
recursive: true, force: true }); - - console.error('[+] Cookie extraction complete'); - process.exit(0); - - } catch (error) { - console.error(`ERROR: ${error.message}`); - - // Clean up on error - if (chromePid) { - await killChrome(chromePid, outputDir); - } - - try { - fs.rmSync(outputDir, { recursive: true, force: true }); - } catch (e) {} - - process.exit(1); - } -} - -main().catch((e) => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/chrome/on_Crawl__70_chrome_install.py b/archivebox/plugins/chrome/on_Crawl__70_chrome_install.py deleted file mode 100755 index af0b8ec7..00000000 --- a/archivebox/plugins/chrome/on_Crawl__70_chrome_install.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env python3 -""" -Emit Chromium Binary dependency for the crawl. - -NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for ---load-extension and --disable-extensions-except flags, which are needed for -loading unpacked extensions in headless mode. -""" - -import json -import os -import sys - - -def main(): - # Check if Chrome is enabled - chrome_enabled = os.environ.get('CHROME_ENABLED', 'true').lower() not in ('false', '0', 'no', 'off') - if not chrome_enabled: - sys.exit(0) - - record = { - 'type': 'Binary', - 'name': 'chromium', - 'binproviders': 'puppeteer,env', - 'overrides': { - 'puppeteer': ['chromium@latest', '--install-deps'], - }, - } - print(json.dumps(record)) - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js deleted file mode 100644 index b5cb9822..00000000 --- a/archivebox/plugins/chrome/on_Crawl__90_chrome_launch.bg.js +++ /dev/null @@ -1,427 +0,0 @@ -#!/usr/bin/env node -/** - * Launch a shared Chromium browser session for the entire crawl. - * - * This runs once per crawl and keeps Chromium alive for all snapshots to share. 
- * Each snapshot creates its own tab via on_Snapshot__10_chrome_tab.bg.js. - * - * NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for - * --load-extension and --disable-extensions-except flags. - * - * Usage: on_Crawl__90_chrome_launch.bg.js --crawl-id= --source-url= - * Output: Writes to current directory (executor creates chrome/ dir): - * - cdp_url.txt: WebSocket URL for CDP connection - * - chrome.pid: Chromium process ID (for cleanup) - * - port.txt: Debug port number - * - extensions.json: Loaded extensions metadata - * - * Environment variables: - * NODE_MODULES_DIR: Path to node_modules directory for module resolution - * CHROME_BINARY: Path to Chromium binary (falls back to auto-detection) - * CHROME_RESOLUTION: Page resolution (default: 1440,2000) - * CHROME_HEADLESS: Run in headless mode (default: true) - * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true) - * CHROME_EXTENSIONS_DIR: Directory containing Chrome extensions - */ - -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) { - module.paths.unshift(process.env.NODE_MODULES_DIR); -} - -const fs = require('fs'); -const path = require('path'); -const http = require('http'); -const puppeteer = require('puppeteer'); -const { - findChromium, - launchChromium, - killChrome, - getEnv, - getEnvBool, - getExtensionId, - writePidWithMtime, - getExtensionsDir, -} = require('./chrome_utils.js'); - -// Extractor metadata -const PLUGIN_NAME = 'chrome_launch'; -const OUTPUT_DIR = '.'; - -// Global state for cleanup -let chromePid = null; -let browserInstance = null; - -function parseCookiesTxt(contents) { - const cookies = []; - let skipped = 0; - - for (const rawLine of contents.split(/\r?\n/)) { - const line = rawLine.trim(); - if (!line) continue; - - let httpOnly = false; - let dataLine = line; - - if (dataLine.startsWith('#HttpOnly_')) { - httpOnly = true; - dataLine = dataLine.slice('#HttpOnly_'.length); - } 
else if (dataLine.startsWith('#')) { - continue; - } - - const parts = dataLine.split('\t'); - if (parts.length < 7) { - skipped += 1; - continue; - } - - const [domainRaw, includeSubdomainsRaw, pathRaw, secureRaw, expiryRaw, name, value] = parts; - if (!name || !domainRaw) { - skipped += 1; - continue; - } - - const includeSubdomains = (includeSubdomainsRaw || '').toUpperCase() === 'TRUE'; - let domain = domainRaw; - if (includeSubdomains && !domain.startsWith('.')) domain = `.${domain}`; - if (!includeSubdomains && domain.startsWith('.')) domain = domain.slice(1); - - const cookie = { - name, - value, - domain, - path: pathRaw || '/', - secure: (secureRaw || '').toUpperCase() === 'TRUE', - httpOnly, - }; - - const expires = parseInt(expiryRaw, 10); - if (!isNaN(expires) && expires > 0) { - cookie.expires = expires; - } - - cookies.push(cookie); - } - - return { cookies, skipped }; -} - -async function importCookiesFromFile(browser, cookiesFile, userDataDir) { - if (!cookiesFile) return; - - if (!fs.existsSync(cookiesFile)) { - console.error(`[!] Cookies file not found: ${cookiesFile}`); - return; - } - - let contents = ''; - try { - contents = fs.readFileSync(cookiesFile, 'utf-8'); - } catch (e) { - console.error(`[!] Failed to read COOKIES_TXT_FILE: ${e.message}`); - return; - } - - const { cookies, skipped } = parseCookiesTxt(contents); - if (cookies.length === 0) { - console.error('[!] No cookies found to import'); - return; - } - - console.error(`[*] Importing ${cookies.length} cookies from ${cookiesFile}...`); - if (skipped) { - console.error(`[*] Skipped ${skipped} malformed cookie line(s)`); - } - if (!userDataDir) { - console.error('[!] 
CHROME_USER_DATA_DIR not set; cookies will not persist beyond this session'); - } - - const page = await browser.newPage(); - const client = await page.target().createCDPSession(); - await client.send('Network.enable'); - - const chunkSize = 200; - let imported = 0; - for (let i = 0; i < cookies.length; i += chunkSize) { - const chunk = cookies.slice(i, i + chunkSize); - try { - await client.send('Network.setCookies', { cookies: chunk }); - imported += chunk.length; - } catch (e) { - console.error(`[!] Failed to import cookies ${i + 1}-${i + chunk.length}: ${e.message}`); - } - } - - await page.close(); - console.error(`[+] Imported ${imported}/${cookies.length} cookies`); -} - -function getPortFromCdpUrl(cdpUrl) { - if (!cdpUrl) return null; - const match = cdpUrl.match(/:(\d+)\/devtools\//); - return match ? match[1] : null; -} - -async function fetchDevtoolsTargets(cdpUrl) { - const port = getPortFromCdpUrl(cdpUrl); - if (!port) return []; - - const urlPath = '/json/list'; - return new Promise((resolve, reject) => { - const req = http.get( - { hostname: '127.0.0.1', port, path: urlPath }, - (res) => { - let data = ''; - res.on('data', (chunk) => (data += chunk)); - res.on('end', () => { - try { - const targets = JSON.parse(data); - resolve(Array.isArray(targets) ? 
targets : []); - } catch (e) { - reject(e); - } - }); - } - ); - req.on('error', reject); - }); -} - -async function discoverExtensionTargets(cdpUrl, installedExtensions) { - const builtinIds = [ - 'nkeimhogjdpnpccoofpliimaahmaaome', - 'fignfifoniblkonapihmkfakmlgkbkcf', - 'ahfgeienlihckogmohjhadlkjgocpleb', - 'mhjfbmdgcfjbbpaeojofohoefgiehjai', - ]; - - let targets = []; - for (let i = 0; i < 10; i += 1) { - try { - targets = await fetchDevtoolsTargets(cdpUrl); - if (targets.length > 0) break; - } catch (e) { - // Ignore and retry - } - await new Promise(r => setTimeout(r, 500)); - } - - const customExtTargets = targets.filter(t => { - const url = t.url || ''; - if (!url.startsWith('chrome-extension://')) return false; - const extId = url.split('://')[1].split('/')[0]; - return !builtinIds.includes(extId); - }); - - console.error(`[+] Found ${customExtTargets.length} custom extension target(s) via /json/list`); - - for (const target of customExtTargets) { - const url = target.url || ''; - const extId = url.split('://')[1].split('/')[0]; - console.error(`[+] Extension target: ${extId} (${target.type || 'unknown'})`); - } - - const runtimeIds = new Set(customExtTargets.map(t => (t.url || '').split('://')[1].split('/')[0])); - for (const ext of installedExtensions) { - if (ext.id) { - ext.loaded = runtimeIds.has(ext.id); - } - } - - if (customExtTargets.length === 0 && installedExtensions.length > 0) { - console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`); - console.error(`[!] 
Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`); - } -} - -// Parse command line arguments -function parseArgs() { - const args = {}; - process.argv.slice(2).forEach((arg) => { - if (arg.startsWith('--')) { - const [key, ...valueParts] = arg.slice(2).split('='); - args[key.replace(/-/g, '_')] = valueParts.join('=') || true; - } - }); - return args; -} - -// Cleanup handler for SIGTERM -async function cleanup() { - console.error('[*] Cleaning up Chrome session...'); - - // Try graceful browser close first - if (browserInstance) { - try { - console.error('[*] Closing browser gracefully...'); - await browserInstance.close(); - browserInstance = null; - console.error('[+] Browser closed gracefully'); - } catch (e) { - console.error(`[!] Graceful close failed: ${e.message}`); - } - } - - // Kill Chrome process - if (chromePid) { - await killChrome(chromePid, OUTPUT_DIR); - } - - process.exit(0); -} - -// Register signal handlers -process.on('SIGTERM', cleanup); -process.on('SIGINT', cleanup); - -async function main() { - const args = parseArgs(); - const crawlId = args.crawl_id; - - try { - const binary = findChromium(); - if (!binary) { - console.error('ERROR: Chromium binary not found'); - console.error('DEPENDENCY_NEEDED=chromium'); - console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew'); - console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest'); - process.exit(1); - } - - // Get Chromium version - let version = ''; - try { - const { execSync } = require('child_process'); - version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }) - .trim() - .slice(0, 64); - } catch (e) {} - - console.error(`[*] Using browser: ${binary}`); - if (version) console.error(`[*] Version: ${version}`); - - // Load installed extensions - const extensionsDir = getExtensionsDir(); - const userDataDir = getEnv('CHROME_USER_DATA_DIR'); - const cookiesFile = getEnv('COOKIES_TXT_FILE') || 
getEnv('COOKIES_FILE'); - - if (userDataDir) { - console.error(`[*] Using user data dir: ${userDataDir}`); - } - if (cookiesFile) { - console.error(`[*] Using cookies file: ${cookiesFile}`); - } - - const installedExtensions = []; - const extensionPaths = []; - if (fs.existsSync(extensionsDir)) { - const files = fs.readdirSync(extensionsDir); - for (const file of files) { - if (file.endsWith('.extension.json')) { - try { - const extPath = path.join(extensionsDir, file); - const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8')); - if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) { - installedExtensions.push(extData); - extensionPaths.push(extData.unpacked_path); - console.error(`[*] Loading extension: ${extData.name || file}`); - } - } catch (e) { - console.warn(`[!] Skipping invalid extension cache: ${file}`); - } - } - } - } - - if (installedExtensions.length > 0) { - console.error(`[+] Found ${installedExtensions.length} extension(s) to load`); - } - - // Ensure extension IDs are available without chrome://extensions - for (const ext of installedExtensions) { - if (!ext.id && ext.unpacked_path) { - try { - ext.id = getExtensionId(ext.unpacked_path); - } catch (e) { - console.error(`[!] 
Failed to compute extension id for ${ext.name}: ${e.message}`); - } - } - } - - // Note: PID file is written by run_hook() with hook-specific name - // Snapshot.cleanup() kills all *.pid processes when done - if (!fs.existsSync(OUTPUT_DIR)) { - fs.mkdirSync(OUTPUT_DIR, { recursive: true }); - } - - // Launch Chromium using consolidated function - // userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set - const result = await launchChromium({ - binary, - outputDir: OUTPUT_DIR, - userDataDir, - extensionPaths, - }); - - if (!result.success) { - console.error(`ERROR: ${result.error}`); - process.exit(1); - } - - chromePid = result.pid; - const cdpUrl = result.cdpUrl; - - // Discover extension targets at launch (no chrome://extensions) - if (extensionPaths.length > 0) { - await new Promise(r => setTimeout(r, 2000)); - console.error('[*] Discovering extension targets via devtools /json/list...'); - await discoverExtensionTargets(cdpUrl, installedExtensions); - } - - // Only connect to CDP when cookies import is needed to reduce crash risk. 
- if (cookiesFile) { - console.error(`[*] Connecting puppeteer to CDP for cookie import...`); - const browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, - defaultViewport: null, - }); - browserInstance = browser; - - // Import cookies into Chrome profile at crawl start - await importCookiesFromFile(browser, cookiesFile, userDataDir); - - try { - browser.disconnect(); - } catch (e) {} - browserInstance = null; - } else { - console.error('[*] Skipping puppeteer CDP connection (no cookies to import)'); - } - - // Write extensions metadata with actual IDs - if (installedExtensions.length > 0) { - fs.writeFileSync( - path.join(OUTPUT_DIR, 'extensions.json'), - JSON.stringify(installedExtensions, null, 2) - ); - } - - console.error(`[+] Chromium session started for crawl ${crawlId}`); - console.error(`[+] CDP URL: ${cdpUrl}`); - console.error(`[+] PID: ${chromePid}`); - - // Stay alive to handle cleanup on SIGTERM - console.log('[*] Chromium launch hook staying alive to handle cleanup...'); - setInterval(() => {}, 1000000); - - } catch (e) { - console.error(`ERROR: ${e.name}: ${e.message}`); - process.exit(1); - } -} - -main().catch((e) => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js b/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js deleted file mode 100755 index 4f3c6594..00000000 --- a/archivebox/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js +++ /dev/null @@ -1,264 +0,0 @@ -#!/usr/bin/env node -/** - * Create a Chrome tab for this snapshot in the shared crawl Chrome session. - * - * Connects to the crawl-level Chrome session (from on_Crawl__90_chrome_launch.bg.js) - * and creates a new tab. This hook does NOT launch its own Chrome instance. 
- * - * Usage: on_Snapshot__10_chrome_tab.bg.js --url= --snapshot-id= --crawl-id= - * Output: Creates chrome/ directory under snapshot output dir with: - * - cdp_url.txt: WebSocket URL for CDP connection - * - chrome.pid: Chrome process ID (from crawl) - * - target_id.txt: Target ID of this snapshot's tab - * - url.txt: The URL to be navigated to - * - * Environment variables: - * CRAWL_OUTPUT_DIR: Crawl output directory (to find crawl's Chrome session) - * CHROME_BINARY: Path to Chromium binary (optional, for version info) - * - * This is a background hook that stays alive until SIGTERM so the tab - * can be closed cleanly at the end of the snapshot run. - */ - -const fs = require('fs'); -const path = require('path'); -const { execSync } = require('child_process'); -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); - -const puppeteer = require('puppeteer'); -const { getEnv, getEnvInt } = require('./chrome_utils.js'); - -// Extractor metadata -const PLUGIN_NAME = 'chrome_tab'; -const OUTPUT_DIR = '.'; // Hook already runs in chrome/ output directory -const CHROME_SESSION_DIR = '.'; -const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)'; - -let finalStatus = 'failed'; -let finalOutput = ''; -let finalError = ''; -let cmdVersion = ''; -let finalized = false; - -// Parse command line arguments -function parseArgs() { - const args = {}; - process.argv.slice(2).forEach(arg => { - if (arg.startsWith('--')) { - const [key, ...valueParts] = arg.slice(2).split('='); - args[key.replace(/-/g, '_')] = valueParts.join('=') || true; - } - }); - return args; -} - -function emitResult(statusOverride) { - if (finalized) return; - finalized = true; - - const status = statusOverride || finalStatus; - const outputStr = status === 'succeeded' - ? 
finalOutput - : (finalError || finalOutput || ''); - - const result = { - type: 'ArchiveResult', - status, - output_str: outputStr, - }; - if (cmdVersion) { - result.cmd_version = cmdVersion; - } - console.log(JSON.stringify(result)); -} - -// Cleanup handler for SIGTERM - close this snapshot's tab -async function cleanup(signal) { - if (signal) { - console.error(`\nReceived ${signal}, closing chrome tab...`); - } - try { - const cdpFile = path.join(OUTPUT_DIR, 'cdp_url.txt'); - const targetIdFile = path.join(OUTPUT_DIR, 'target_id.txt'); - - if (fs.existsSync(cdpFile) && fs.existsSync(targetIdFile)) { - const cdpUrl = fs.readFileSync(cdpFile, 'utf8').trim(); - const targetId = fs.readFileSync(targetIdFile, 'utf8').trim(); - - const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl }); - const pages = await browser.pages(); - const page = pages.find(p => p.target()._targetId === targetId); - - if (page) { - await page.close(); - } - browser.disconnect(); - } - } catch (e) { - // Best effort - } - emitResult(); - process.exit(finalStatus === 'succeeded' ? 
0 : 1); -} - -// Register signal handlers -process.on('SIGTERM', () => cleanup('SIGTERM')); -process.on('SIGINT', () => cleanup('SIGINT')); - -// Try to find the crawl's Chrome session -function getCrawlChromeSession() { - // Use CRAWL_OUTPUT_DIR env var set by get_config() in configset.py - const crawlOutputDir = getEnv('CRAWL_OUTPUT_DIR', ''); - if (!crawlOutputDir) { - throw new Error(CHROME_SESSION_REQUIRED_ERROR); - } - - const crawlChromeDir = path.join(crawlOutputDir, 'chrome'); - const cdpFile = path.join(crawlChromeDir, 'cdp_url.txt'); - const pidFile = path.join(crawlChromeDir, 'chrome.pid'); - - if (!fs.existsSync(cdpFile)) { - throw new Error(CHROME_SESSION_REQUIRED_ERROR); - } - if (!fs.existsSync(pidFile)) { - throw new Error(CHROME_SESSION_REQUIRED_ERROR); - } - - const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim(); - const pid = parseInt(fs.readFileSync(pidFile, 'utf-8').trim(), 10); - if (!cdpUrl) { - throw new Error(CHROME_SESSION_REQUIRED_ERROR); - } - if (!pid || Number.isNaN(pid)) { - throw new Error(CHROME_SESSION_REQUIRED_ERROR); - } - - // Verify the process is still running - try { - process.kill(pid, 0); // Signal 0 = check if process exists - } catch (e) { - throw new Error(CHROME_SESSION_REQUIRED_ERROR); - } - - return { cdpUrl, pid }; -} - -async function waitForCrawlChromeSession(timeoutMs, intervalMs = 250) { - const startTime = Date.now(); - let lastError = null; - - while (Date.now() - startTime < timeoutMs) { - try { - return getCrawlChromeSession(); - } catch (e) { - lastError = e; - } - await new Promise(resolve => setTimeout(resolve, intervalMs)); - } - - if (lastError) { - throw lastError; - } - throw new Error(CHROME_SESSION_REQUIRED_ERROR); -} - -// Create a new tab in an existing Chrome session -async function createTabInExistingChrome(cdpUrl, url, pid) { - console.log(`[*] Connecting to existing Chrome session: ${cdpUrl}`); - - // Connect Puppeteer to the running Chrome - const browser = await puppeteer.connect({ - 
browserWSEndpoint: cdpUrl, - defaultViewport: null, - }); - - // Create a new tab for this snapshot - const page = await browser.newPage(); - - // Get the page target ID - const target = page.target(); - const targetId = target._targetId; - - // Write session info - fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), cdpUrl); - fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(pid)); - fs.writeFileSync(path.join(OUTPUT_DIR, 'target_id.txt'), targetId); - fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url); - - // Disconnect Puppeteer (Chrome and tab stay alive) - browser.disconnect(); - - return { success: true, output: OUTPUT_DIR, cdpUrl, targetId, pid }; -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - const crawlId = args.crawl_id || getEnv('CRAWL_ID', ''); - - if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__10_chrome_tab.bg.js --url= --snapshot-id= [--crawl-id=]'); - process.exit(1); - } - - let status = 'failed'; - let output = ''; - let error = ''; - let version = ''; - - try { - // Get Chrome version - try { - const binary = getEnv('CHROME_BINARY', '').trim(); - if (binary) { - version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64); - } - } catch (e) { - version = ''; - } - - // Try to use existing crawl Chrome session (wait for readiness) - const timeoutSeconds = getEnvInt('CHROME_TAB_TIMEOUT', getEnvInt('CHROME_TIMEOUT', getEnvInt('TIMEOUT', 60))); - const crawlSession = await waitForCrawlChromeSession(timeoutSeconds * 1000); - console.log(`[*] Found existing Chrome session from crawl ${crawlId}`); - const result = await createTabInExistingChrome(crawlSession.cdpUrl, url, crawlSession.pid); - - if (result.success) { - status = 'succeeded'; - output = result.output; - console.log(`[+] Chrome tab ready`); - console.log(`[+] CDP URL: ${result.cdpUrl}`); - console.log(`[+] Page target ID: 
${result.targetId}`); - } else { - status = 'failed'; - error = result.error; - } - } catch (e) { - error = `${e.name}: ${e.message}`; - status = 'failed'; - } - - if (error) { - console.error(`ERROR: ${error}`); - } - - finalStatus = status; - finalOutput = output || ''; - finalError = error || ''; - cmdVersion = version || ''; - - if (status !== 'succeeded') { - emitResult(status); - process.exit(1); - } - - console.log('[*] Chrome tab created, waiting for cleanup signal...'); - await new Promise(() => {}); // Keep alive until SIGTERM -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js b/archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js deleted file mode 100644 index dae2a3db..00000000 --- a/archivebox/plugins/chrome/on_Snapshot__11_chrome_wait.js +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env node -/** - * Wait for Chrome session files to exist (cdp_url.txt + target_id.txt). - * - * This is a foreground hook that blocks until the Chrome tab is ready, - * so downstream hooks can safely connect to CDP. 
- * - * Usage: on_Snapshot__11_chrome_wait.js --url= --snapshot-id= - */ - -const fs = require('fs'); -const path = require('path'); -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); - -const { - getEnvInt, - waitForChromeSession, - readCdpUrl, - readTargetId, -} = require('./chrome_utils.js'); - -const CHROME_SESSION_DIR = '.'; -const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)'; - -function parseArgs() { - const args = {}; - process.argv.slice(2).forEach(arg => { - if (arg.startsWith('--')) { - const [key, ...valueParts] = arg.slice(2).split('='); - args[key.replace(/-/g, '_')] = valueParts.join('=') || true; - } - }); - return args; -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__11_chrome_wait.js --url= --snapshot-id='); - process.exit(1); - } - - const timeoutSeconds = getEnvInt('CHROME_TAB_TIMEOUT', getEnvInt('CHROME_TIMEOUT', getEnvInt('TIMEOUT', 60))); - const timeoutMs = timeoutSeconds * 1000; - - console.error(`[chrome_wait] Waiting for Chrome session (timeout=${timeoutSeconds}s)...`); - - const ready = await waitForChromeSession(CHROME_SESSION_DIR, timeoutMs); - if (!ready) { - const error = CHROME_SESSION_REQUIRED_ERROR; - console.error(`[chrome_wait] ERROR: ${error}`); - console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error })); - process.exit(1); - } - - const cdpUrl = readCdpUrl(CHROME_SESSION_DIR); - const targetId = readTargetId(CHROME_SESSION_DIR); - if (!cdpUrl || !targetId) { - const error = CHROME_SESSION_REQUIRED_ERROR; - console.error(`[chrome_wait] ERROR: ${error}`); - console.log(JSON.stringify({ type: 'ArchiveResult', status: 'failed', output_str: error })); - process.exit(1); - } - - console.error(`[chrome_wait] Chrome session 
ready (cdp_url=${cdpUrl.slice(0, 32)}..., target_id=${targetId}).`); - console.log(JSON.stringify({ type: 'ArchiveResult', status: 'succeeded', output_str: 'chrome session ready' })); - process.exit(0); -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js b/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js deleted file mode 100644 index 33c515ec..00000000 --- a/archivebox/plugins/chrome/on_Snapshot__30_chrome_navigate.js +++ /dev/null @@ -1,225 +0,0 @@ -#!/usr/bin/env node -/** - * Navigate the Chrome browser to the target URL. - * - * This is a simple hook that ONLY navigates - nothing else. - * Pre-load hooks (21-29) should set up their own CDP listeners. - * Post-load hooks (31+) can then read from the loaded page. - * - * Usage: on_Snapshot__30_chrome_navigate.js --url= --snapshot-id= - * Output: Writes page_loaded.txt marker when navigation completes - * - * Environment variables: - * CHROME_PAGELOAD_TIMEOUT: Timeout in seconds (default: 60) - * CHROME_DELAY_AFTER_LOAD: Extra delay after load in seconds (default: 0) - * CHROME_WAIT_FOR: Wait condition (default: networkidle2) - */ - -const fs = require('fs'); -const path = require('path'); -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -const puppeteer = require('puppeteer'); - -const PLUGIN_NAME = 'chrome_navigate'; -const CHROME_SESSION_DIR = '.'; -const OUTPUT_DIR = '.'; -const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)'; - -function parseArgs() { - const args = {}; - process.argv.slice(2).forEach(arg => { - if (arg.startsWith('--')) { - const [key, ...valueParts] = arg.slice(2).split('='); - args[key.replace(/-/g, '_')] = valueParts.join('=') || true; - } - }); - return args; -} - -function getEnv(name, defaultValue = '') { - return 
(process.env[name] || defaultValue).trim(); -} - -function getEnvInt(name, defaultValue = 0) { - const val = parseInt(getEnv(name, String(defaultValue)), 10); - return isNaN(val) ? defaultValue : val; -} - -function getEnvFloat(name, defaultValue = 0) { - const val = parseFloat(getEnv(name, String(defaultValue))); - return isNaN(val) ? defaultValue : val; -} - -async function waitForChromeTabOpen(timeoutMs = 60000) { - const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); - const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt'); - const startTime = Date.now(); - - while (Date.now() - startTime < timeoutMs) { - if (fs.existsSync(cdpFile) && fs.existsSync(targetIdFile)) { - return true; - } - // Wait 100ms before checking again - await new Promise(resolve => setTimeout(resolve, 100)); - } - - return false; -} - -function getCdpUrl() { - const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); - if (!fs.existsSync(cdpFile)) return null; - return fs.readFileSync(cdpFile, 'utf8').trim(); -} - -function getPageId() { - const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt'); - if (!fs.existsSync(targetIdFile)) return null; - return fs.readFileSync(targetIdFile, 'utf8').trim(); -} - -function getWaitCondition() { - const waitFor = getEnv('CHROME_WAIT_FOR', 'networkidle2').toLowerCase(); - const valid = ['domcontentloaded', 'load', 'networkidle0', 'networkidle2']; - return valid.includes(waitFor) ? 
waitFor : 'networkidle2'; -} - -function sleep(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); -} - -async function navigate(url, cdpUrl) { - const timeout = (getEnvInt('CHROME_PAGELOAD_TIMEOUT') || getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000; - const delayAfterLoad = getEnvFloat('CHROME_DELAY_AFTER_LOAD', 0) * 1000; - const waitUntil = getWaitCondition(); - const targetId = getPageId(); - - let browser = null; - const navStartTime = Date.now(); - - try { - browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl }); - - const pages = await browser.pages(); - if (pages.length === 0) { - return { success: false, error: 'No pages found in browser', waitUntil, elapsed: Date.now() - navStartTime }; - } - - // Find page by target ID if available - let page = null; - if (targetId) { - page = pages.find(p => { - const target = p.target(); - return target && target._targetId === targetId; - }); - } - if (!page) { - page = pages[pages.length - 1]; - } - - // Navigate - console.log(`Navigating to ${url} (wait: ${waitUntil}, timeout: ${timeout}ms)`); - const response = await page.goto(url, { waitUntil, timeout }); - - // Optional delay - if (delayAfterLoad > 0) { - console.log(`Waiting ${delayAfterLoad}ms after load...`); - await sleep(delayAfterLoad); - } - - const finalUrl = page.url(); - const status = response ? 
response.status() : null; - const elapsed = Date.now() - navStartTime; - - // Write navigation state as JSON - const navigationState = { - waitUntil, - elapsed, - url, - finalUrl, - status, - timestamp: new Date().toISOString() - }; - fs.writeFileSync(path.join(OUTPUT_DIR, 'navigation.json'), JSON.stringify(navigationState, null, 2)); - - // Write marker files for backwards compatibility - fs.writeFileSync(path.join(OUTPUT_DIR, 'page_loaded.txt'), new Date().toISOString()); - fs.writeFileSync(path.join(OUTPUT_DIR, 'final_url.txt'), finalUrl); - - browser.disconnect(); - - return { success: true, finalUrl, status, waitUntil, elapsed }; - - } catch (e) { - if (browser) browser.disconnect(); - const elapsed = Date.now() - navStartTime; - return { success: false, error: `${e.name}: ${e.message}`, waitUntil, elapsed }; - } -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__30_chrome_navigate.js --url= --snapshot-id='); - process.exit(1); - } - - const startTs = new Date(); - let status = 'failed'; - let output = null; - let error = ''; - - // Wait for chrome tab to be open (up to 60s) - const tabOpen = await waitForChromeTabOpen(60000); - if (!tabOpen) { - console.error(`ERROR: ${CHROME_SESSION_REQUIRED_ERROR}`); - process.exit(1); - } - - const cdpUrl = getCdpUrl(); - if (!cdpUrl) { - console.error(`ERROR: ${CHROME_SESSION_REQUIRED_ERROR}`); - process.exit(1); - } - - const result = await navigate(url, cdpUrl); - - if (result.success) { - status = 'succeeded'; - output = 'navigation.json'; - console.log(`Page loaded: ${result.finalUrl} (HTTP ${result.status}) in ${result.elapsed}ms (waitUntil: ${result.waitUntil})`); - } else { - error = result.error; - // Save navigation state even on failure - const navigationState = { - waitUntil: result.waitUntil, - elapsed: result.elapsed, - url, - error: result.error, - timestamp: new 
Date().toISOString() - }; - fs.writeFileSync(path.join(OUTPUT_DIR, 'navigation.json'), JSON.stringify(navigationState, null, 2)); - } - - const endTs = new Date(); - - if (error) console.error(`ERROR: ${error}`); - - // Output clean JSONL (no RESULT_JSON= prefix) - console.log(JSON.stringify({ - type: 'ArchiveResult', - status, - output_str: output || error || '', - })); - - process.exit(status === 'succeeded' ? 0 : 1); -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/chrome/templates/icon.html b/archivebox/plugins/chrome/templates/icon.html deleted file mode 100644 index 18555344..00000000 --- a/archivebox/plugins/chrome/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/chrome/tests/chrome_test_helpers.py b/archivebox/plugins/chrome/tests/chrome_test_helpers.py deleted file mode 100644 index 3e37ce26..00000000 --- a/archivebox/plugins/chrome/tests/chrome_test_helpers.py +++ /dev/null @@ -1,1002 +0,0 @@ -""" -Shared Chrome test helpers for plugin integration tests. - -This module provides common utilities for Chrome-based plugin tests, reducing -duplication across test files. Functions delegate to chrome_utils.js (the single -source of truth) with Python fallbacks. 
- -Function names match the JS equivalents in snake_case: - JS: getMachineType() -> Python: get_machine_type() - JS: getLibDir() -> Python: get_lib_dir() - JS: getNodeModulesDir() -> Python: get_node_modules_dir() - JS: getExtensionsDir() -> Python: get_extensions_dir() - JS: findChromium() -> Python: find_chromium() - JS: killChrome() -> Python: kill_chrome() - JS: getTestEnv() -> Python: get_test_env() - -Usage: - # Path helpers (delegate to chrome_utils.js): - from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_test_env, # env dict with LIB_DIR, NODE_MODULES_DIR, MACHINE_TYPE - get_machine_type, # e.g., 'x86_64-linux', 'arm64-darwin' - get_lib_dir, # Path to lib dir - get_node_modules_dir, # Path to node_modules - get_extensions_dir, # Path to chrome extensions - find_chromium, # Find Chrome/Chromium binary - kill_chrome, # Kill Chrome process by PID - ) - - # Test file helpers: - from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_plugin_dir, # get_plugin_dir(__file__) -> plugin dir Path - get_hook_script, # Find hook script by glob pattern - PLUGINS_ROOT, # Path to plugins root - LIB_DIR, # Path to lib dir (lazy-loaded) - NODE_MODULES_DIR, # Path to node_modules (lazy-loaded) - ) - - # For Chrome session tests: - from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - chrome_session, # Context manager (Full Chrome + tab setup with automatic cleanup) - cleanup_chrome, # Manual cleanup by PID (rarely needed) - ) - - # For extension tests: - from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - setup_test_env, # Full dir structure + Chrome install - launch_chromium_session, # Launch Chrome, return CDP URL - kill_chromium_session, # Cleanup Chrome - ) - - # Run hooks and parse JSONL: - from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - run_hook, # Run hook, return (returncode, stdout, stderr) - parse_jsonl_output, # Parse JSONL from stdout - ) -""" - -import json -import os -import 
platform -import signal -import subprocess -import sys -import time -from datetime import datetime -from pathlib import Path -from typing import Tuple, Optional, List, Dict, Any -from contextlib import contextmanager - - -# Plugin directory locations -CHROME_PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent - -# Hook script locations -CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__70_chrome_install.py' -CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__90_chrome_launch.bg.js' -CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__10_chrome_tab.bg.js' -CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None) -CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js' -PUPPETEER_BINARY_HOOK = PLUGINS_ROOT / 'puppeteer' / 'on_Binary__12_puppeteer_install.py' -PUPPETEER_CRAWL_HOOK = PLUGINS_ROOT / 'puppeteer' / 'on_Crawl__60_puppeteer_install.py' -NPM_BINARY_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__10_npm_install.py' - - -# ============================================================================= -# Path Helpers - delegates to chrome_utils.js with Python fallback -# Function names match JS: getMachineType -> get_machine_type, etc. -# ============================================================================= - - -def _call_chrome_utils(command: str, *args: str, env: Optional[dict] = None) -> Tuple[int, str, str]: - """Call chrome_utils.js CLI command (internal helper). - - This is the central dispatch for calling the JS utilities from Python. - All path calculations and Chrome operations are centralized in chrome_utils.js - to ensure consistency between Python and JavaScript code. 
- - Args: - command: The CLI command (e.g., 'findChromium', 'getTestEnv') - *args: Additional command arguments - env: Environment dict (default: current env) - - Returns: - Tuple of (returncode, stdout, stderr) - """ - cmd = ['node', str(CHROME_UTILS), command] + list(args) - result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=30, - env=env or os.environ.copy() - ) - return result.returncode, result.stdout, result.stderr - - -def get_plugin_dir(test_file: str) -> Path: - """Get the plugin directory from a test file path. - - Usage: - PLUGIN_DIR = get_plugin_dir(__file__) - - Args: - test_file: The __file__ of the test module (e.g., test_screenshot.py) - - Returns: - Path to the plugin directory (e.g., plugins/screenshot/) - """ - return Path(test_file).parent.parent - - -def get_hook_script(plugin_dir: Path, pattern: str) -> Optional[Path]: - """Find a hook script in a plugin directory by pattern. - - Usage: - HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*') - - Args: - plugin_dir: Path to the plugin directory - pattern: Glob pattern to match - - Returns: - Path to the hook script or None if not found - """ - matches = list(plugin_dir.glob(pattern)) - return matches[0] if matches else None - - -def get_machine_type() -> str: - """Get machine type string (e.g., 'x86_64-linux', 'arm64-darwin'). - - Matches JS: getMachineType() - - Tries chrome_utils.js first, falls back to Python computation. 
- """ - # Try JS first (single source of truth) - returncode, stdout, stderr = _call_chrome_utils('getMachineType') - if returncode == 0 and stdout.strip(): - return stdout.strip() - - # Fallback to Python computation - if os.environ.get('MACHINE_TYPE'): - return os.environ['MACHINE_TYPE'] - - machine = platform.machine().lower() - system = platform.system().lower() - if machine in ('arm64', 'aarch64'): - machine = 'arm64' - elif machine in ('x86_64', 'amd64'): - machine = 'x86_64' - return f"{machine}-{system}" - - -def get_lib_dir() -> Path: - """Get LIB_DIR path for platform-specific binaries. - - Matches JS: getLibDir() - - Tries chrome_utils.js first, falls back to Python computation. - """ - # Try JS first - returncode, stdout, stderr = _call_chrome_utils('getLibDir') - if returncode == 0 and stdout.strip(): - return Path(stdout.strip()) - - # Fallback to Python - if os.environ.get('LIB_DIR'): - return Path(os.environ['LIB_DIR']) - raise Exception('LIB_DIR env var must be set!') - - -def get_node_modules_dir() -> Path: - """Get NODE_MODULES_DIR path for npm packages. - - Matches JS: getNodeModulesDir() - - Tries chrome_utils.js first, falls back to Python computation. - """ - # Try JS first - returncode, stdout, stderr = _call_chrome_utils('getNodeModulesDir') - if returncode == 0 and stdout.strip(): - return Path(stdout.strip()) - - # Fallback to Python - if os.environ.get('NODE_MODULES_DIR'): - return Path(os.environ['NODE_MODULES_DIR']) - lib_dir = get_lib_dir() - return lib_dir / 'npm' / 'node_modules' - - -def get_extensions_dir() -> str: - """Get the Chrome extensions directory path. - - Matches JS: getExtensionsDir() - - Tries chrome_utils.js first, falls back to Python computation. 
- """ - try: - returncode, stdout, stderr = _call_chrome_utils('getExtensionsDir') - if returncode == 0 and stdout.strip(): - return stdout.strip() - except subprocess.TimeoutExpired: - pass # Fall through to default computation - - # Fallback to default computation if JS call fails - data_dir = os.environ.get('DATA_DIR', '.') - persona = os.environ.get('ACTIVE_PERSONA', 'Default') - return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions') - - -def link_puppeteer_cache(lib_dir: Path) -> None: - """Best-effort symlink from system Puppeteer cache into test lib_dir. - - Avoids repeated Chromium downloads across tests by reusing the - default Puppeteer cache directory. - """ - cache_dir = lib_dir / 'puppeteer' - cache_dir.mkdir(parents=True, exist_ok=True) - - candidates = [ - Path.home() / 'Library' / 'Caches' / 'puppeteer', - Path.home() / '.cache' / 'puppeteer', - ] - for src_root in candidates: - if not src_root.exists(): - continue - for item in src_root.iterdir(): - dst = cache_dir / item.name - if dst.exists(): - continue - try: - os.symlink(item, dst, target_is_directory=item.is_dir()) - except Exception: - # Best-effort only; if symlink fails, leave as-is. - pass - - -def find_chromium(data_dir: Optional[str] = None) -> Optional[str]: - """Find the Chromium binary path. - - Matches JS: findChromium() - - Uses chrome_utils.js which checks: - - CHROME_BINARY env var - - @puppeteer/browsers install locations - - System Chromium locations - - Falls back to Chrome (with warning) - - Args: - data_dir: Optional DATA_DIR override - - Returns: - Path to Chromium binary or None if not found - """ - env = os.environ.copy() - if data_dir: - env['DATA_DIR'] = str(data_dir) - returncode, stdout, stderr = _call_chrome_utils('findChromium', env=env) - if returncode == 0 and stdout.strip(): - return stdout.strip() - return None - - -def kill_chrome(pid: int, output_dir: Optional[str] = None) -> bool: - """Kill a Chrome process by PID. 
- - Matches JS: killChrome() - - Uses chrome_utils.js which handles: - - SIGTERM then SIGKILL - - Process group killing - - Zombie process cleanup - - Args: - pid: Process ID to kill - output_dir: Optional chrome output directory for PID file cleanup - - Returns: - True if the kill command succeeded - """ - args = [str(pid)] - if output_dir: - args.append(str(output_dir)) - returncode, stdout, stderr = _call_chrome_utils('killChrome', *args) - return returncode == 0 - - -def get_test_env() -> dict: - """Get environment dict with all paths set correctly for tests. - - Matches JS: getTestEnv() - - Tries chrome_utils.js first for path values, builds env dict. - Use this for all subprocess calls in plugin tests. - """ - env = os.environ.copy() - - # Try to get all paths from JS (single source of truth) - returncode, stdout, stderr = _call_chrome_utils('getTestEnv') - if returncode == 0 and stdout.strip(): - try: - js_env = json.loads(stdout) - env.update(js_env) - return env - except json.JSONDecodeError: - pass - - # Fallback to Python computation - lib_dir = get_lib_dir() - env['LIB_DIR'] = str(lib_dir) - env['NODE_MODULES_DIR'] = str(get_node_modules_dir()) - env['MACHINE_TYPE'] = get_machine_type() - return env - - -# Backward compatibility aliases (deprecated, use new names) -find_chromium_binary = find_chromium -kill_chrome_via_js = kill_chrome -get_machine_type_from_js = get_machine_type -get_test_env_from_js = get_test_env - - -# ============================================================================= -# Module-level constants (lazy-loaded on first access) -# Import these directly: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR -# ============================================================================= - -# These are computed once when first accessed -_LIB_DIR: Optional[Path] = None -_NODE_MODULES_DIR: Optional[Path] = None - - -def _get_lib_dir_cached() -> Path: - global _LIB_DIR - if _LIB_DIR is None: - _LIB_DIR = get_lib_dir() - return 
_LIB_DIR - - -def _get_node_modules_dir_cached() -> Path: - global _NODE_MODULES_DIR - if _NODE_MODULES_DIR is None: - _NODE_MODULES_DIR = get_node_modules_dir() - return _NODE_MODULES_DIR - - -# Module-level constants that can be imported directly -# Usage: from chrome_test_helpers import LIB_DIR, NODE_MODULES_DIR -class _LazyPath: - """Lazy path that computes value on first access.""" - def __init__(self, getter): - self._getter = getter - self._value = None - - def __fspath__(self): - if self._value is None: - self._value = self._getter() - return str(self._value) - - def __truediv__(self, other): - if self._value is None: - self._value = self._getter() - return self._value / other - - def __str__(self): - return self.__fspath__() - - def __repr__(self): - return f"" - - -LIB_DIR = _LazyPath(_get_lib_dir_cached) -NODE_MODULES_DIR = _LazyPath(_get_node_modules_dir_cached) - - -# ============================================================================= -# Hook Execution Helpers -# ============================================================================= - - -def run_hook( - hook_script: Path, - url: str, - snapshot_id: str, - cwd: Optional[Path] = None, - env: Optional[dict] = None, - timeout: int = 60, - extra_args: Optional[List[str]] = None, -) -> Tuple[int, str, str]: - """Run a hook script and return (returncode, stdout, stderr). 
- - Usage: - returncode, stdout, stderr = run_hook( - HOOK_SCRIPT, 'https://example.com', 'test-snap-123', - cwd=tmpdir, env=get_test_env() - ) - - Args: - hook_script: Path to the hook script - url: URL to process - snapshot_id: Snapshot ID - cwd: Working directory (default: current dir) - env: Environment dict (default: get_test_env()) - timeout: Timeout in seconds - extra_args: Additional arguments to pass - - Returns: - Tuple of (returncode, stdout, stderr) - """ - if env is None: - env = get_test_env() - - # Determine interpreter based on file extension - if hook_script.suffix == '.py': - cmd = [sys.executable, str(hook_script)] - elif hook_script.suffix == '.js': - cmd = ['node', str(hook_script)] - else: - cmd = [str(hook_script)] - - cmd.extend([f'--url={url}', f'--snapshot-id={snapshot_id}']) - if extra_args: - cmd.extend(extra_args) - - result = subprocess.run( - cmd, - cwd=str(cwd) if cwd else None, - capture_output=True, - text=True, - env=env, - timeout=timeout - ) - return result.returncode, result.stdout, result.stderr - - -def parse_jsonl_output(stdout: str, record_type: str = 'ArchiveResult') -> Optional[Dict[str, Any]]: - """Parse JSONL output from hook stdout and return the specified record type. 
- - Usage: - result = parse_jsonl_output(stdout) - if result and result['status'] == 'succeeded': - print("Success!") - - Args: - stdout: The stdout from a hook execution - record_type: The 'type' field to look for (default: 'ArchiveResult') - - Returns: - The parsed JSON dict or None if not found - """ - for line in stdout.strip().split('\n'): - line = line.strip() - if not line.startswith('{'): - continue - try: - record = json.loads(line) - if record.get('type') == record_type: - return record - except json.JSONDecodeError: - continue - return None - - -def parse_jsonl_records(stdout: str) -> List[Dict[str, Any]]: - """Parse all JSONL records from stdout.""" - records: List[Dict[str, Any]] = [] - for line in stdout.strip().split('\n'): - line = line.strip() - if not line.startswith('{'): - continue - try: - records.append(json.loads(line)) - except json.JSONDecodeError: - continue - return records - - -def apply_machine_updates(records: List[Dict[str, Any]], env: dict) -> None: - """Apply Machine update records to env dict in-place.""" - for record in records: - if record.get('type') != 'Machine': - continue - config = record.get('config') - if not isinstance(config, dict): - continue - env.update(config) - - -def install_chromium_with_hooks(env: dict, timeout: int = 300) -> str: - """Install Chromium via chrome crawl hook + puppeteer/npm hooks. - - Returns absolute path to Chromium binary. 
- """ - puppeteer_result = subprocess.run( - [sys.executable, str(PUPPETEER_CRAWL_HOOK)], - capture_output=True, - text=True, - timeout=timeout, - env=env, - ) - if puppeteer_result.returncode != 0: - raise RuntimeError(f"Puppeteer crawl hook failed: {puppeteer_result.stderr}") - - puppeteer_record = parse_jsonl_output(puppeteer_result.stdout, record_type='Binary') or {} - if not puppeteer_record or puppeteer_record.get('name') != 'puppeteer': - raise RuntimeError("Puppeteer Binary record not emitted by crawl hook") - - npm_cmd = [ - sys.executable, - str(NPM_BINARY_HOOK), - '--machine-id=test-machine', - '--binary-id=test-puppeteer', - '--name=puppeteer', - f"--binproviders={puppeteer_record.get('binproviders', '*')}", - ] - puppeteer_overrides = puppeteer_record.get('overrides') - if puppeteer_overrides: - npm_cmd.append(f'--overrides={json.dumps(puppeteer_overrides)}') - - npm_result = subprocess.run( - npm_cmd, - capture_output=True, - text=True, - timeout=timeout, - env=env, - ) - if npm_result.returncode != 0: - raise RuntimeError(f"Npm install failed: {npm_result.stderr}") - - apply_machine_updates(parse_jsonl_records(npm_result.stdout), env) - - chrome_result = subprocess.run( - [sys.executable, str(CHROME_INSTALL_HOOK)], - capture_output=True, - text=True, - timeout=timeout, - env=env, - ) - if chrome_result.returncode != 0: - raise RuntimeError(f"Chrome install hook failed: {chrome_result.stderr}") - - chrome_record = parse_jsonl_output(chrome_result.stdout, record_type='Binary') or {} - if not chrome_record or chrome_record.get('name') not in ('chromium', 'chrome'): - raise RuntimeError("Chrome Binary record not emitted by crawl hook") - - chromium_cmd = [ - sys.executable, - str(PUPPETEER_BINARY_HOOK), - '--machine-id=test-machine', - '--binary-id=test-chromium', - f"--name={chrome_record.get('name', 'chromium')}", - f"--binproviders={chrome_record.get('binproviders', '*')}", - ] - chrome_overrides = chrome_record.get('overrides') - if chrome_overrides: 
- chromium_cmd.append(f'--overrides={json.dumps(chrome_overrides)}') - - result = subprocess.run( - chromium_cmd, - capture_output=True, - text=True, - timeout=timeout, - env=env, - ) - if result.returncode != 0: - raise RuntimeError(f"Puppeteer chromium install failed: {result.stderr}") - - records = parse_jsonl_records(result.stdout) - chromium_record = None - for record in records: - if record.get('type') == 'Binary' and record.get('name') in ('chromium', 'chrome'): - chromium_record = record - break - if not chromium_record: - chromium_record = parse_jsonl_output(result.stdout, record_type='Binary') - - chromium_path = chromium_record.get('abspath') - if not chromium_path or not Path(chromium_path).exists(): - raise RuntimeError(f"Chromium binary not found after install: {chromium_path}") - - env['CHROME_BINARY'] = chromium_path - apply_machine_updates(records, env) - return chromium_path - - -def run_hook_and_parse( - hook_script: Path, - url: str, - snapshot_id: str, - cwd: Optional[Path] = None, - env: Optional[dict] = None, - timeout: int = 60, - extra_args: Optional[List[str]] = None, -) -> Tuple[int, Optional[Dict[str, Any]], str]: - """Run a hook and parse its JSONL output. - - Convenience function combining run_hook() and parse_jsonl_output(). - - Returns: - Tuple of (returncode, parsed_result_or_none, stderr) - """ - returncode, stdout, stderr = run_hook( - hook_script, url, snapshot_id, - cwd=cwd, env=env, timeout=timeout, extra_args=extra_args - ) - result = parse_jsonl_output(stdout) - return returncode, result, stderr - - -# ============================================================================= -# Extension Test Helpers -# Used by extension tests (ublock, istilldontcareaboutcookies, twocaptcha) -# ============================================================================= - - -def setup_test_env(tmpdir: Path) -> dict: - """Set up isolated data/lib directory structure for extension tests. 
- - Creates structure matching real ArchiveBox data dir: - /data/ - lib/ - arm64-darwin/ (or x86_64-linux, etc.) - npm/ - .bin/ - node_modules/ - personas/ - Default/ - chrome_extensions/ - users/ - testuser/ - crawls/ - snapshots/ - - Calls chrome install hook + puppeteer/npm hooks for Chromium installation. - Returns env dict with DATA_DIR, LIB_DIR, NPM_BIN_DIR, NODE_MODULES_DIR, CHROME_BINARY, etc. - - Args: - tmpdir: Base temporary directory for the test - - Returns: - Environment dict with all paths set. - """ - - # Determine machine type (matches archivebox.config.paths.get_machine_type()) - machine = platform.machine().lower() - system = platform.system().lower() - if machine in ('arm64', 'aarch64'): - machine = 'arm64' - elif machine in ('x86_64', 'amd64'): - machine = 'x86_64' - machine_type = f"{machine}-{system}" - - # Create proper directory structure matching real ArchiveBox layout - data_dir = tmpdir / 'data' - lib_dir = data_dir / 'lib' / machine_type - npm_dir = lib_dir / 'npm' - npm_bin_dir = npm_dir / '.bin' - node_modules_dir = npm_dir / 'node_modules' - - # Extensions go under personas/Default/ - chrome_extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions' - - # User data goes under users/{username}/ - date_str = datetime.now().strftime('%Y%m%d') - users_dir = data_dir / 'users' / 'testuser' - crawls_dir = users_dir / 'crawls' / date_str - snapshots_dir = users_dir / 'snapshots' / date_str - - # Create all directories - node_modules_dir.mkdir(parents=True, exist_ok=True) - npm_bin_dir.mkdir(parents=True, exist_ok=True) - chrome_extensions_dir.mkdir(parents=True, exist_ok=True) - crawls_dir.mkdir(parents=True, exist_ok=True) - snapshots_dir.mkdir(parents=True, exist_ok=True) - - # Build complete env dict - env = os.environ.copy() - env.update({ - 'DATA_DIR': str(data_dir), - 'LIB_DIR': str(lib_dir), - 'MACHINE_TYPE': machine_type, - 'NPM_BIN_DIR': str(npm_bin_dir), - 'NODE_MODULES_DIR': str(node_modules_dir), - 
'CHROME_EXTENSIONS_DIR': str(chrome_extensions_dir), - 'CRAWLS_DIR': str(crawls_dir), - 'SNAPSHOTS_DIR': str(snapshots_dir), - }) - - # Only set headless if not already in environment (allow override for debugging) - if 'CHROME_HEADLESS' not in os.environ: - env['CHROME_HEADLESS'] = 'true' - - try: - install_chromium_with_hooks(env) - except RuntimeError as e: - raise RuntimeError(str(e)) - return env - - -def launch_chromium_session(env: dict, chrome_dir: Path, crawl_id: str) -> Tuple[subprocess.Popen, str]: - """Launch Chromium and return (process, cdp_url). - - This launches Chrome using the chrome launch hook and waits for the CDP URL - to become available. Use this for extension tests that need direct CDP access. - - Args: - env: Environment dict (from setup_test_env) - chrome_dir: Directory for Chrome to write its files (cdp_url.txt, chrome.pid, etc.) - crawl_id: ID for the crawl - - Returns: - Tuple of (chrome_launch_process, cdp_url) - - Raises: - RuntimeError: If Chrome fails to launch or CDP URL not available after 20s - """ - chrome_dir.mkdir(parents=True, exist_ok=True) - - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - # Wait for Chromium to launch and CDP URL to be available - cdp_url = None - for i in range(20): - if chrome_launch_process.poll() is not None: - stdout, stderr = chrome_launch_process.communicate() - raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}") - cdp_file = chrome_dir / 'cdp_url.txt' - if cdp_file.exists(): - cdp_url = cdp_file.read_text().strip() - break - time.sleep(1) - - if not cdp_url: - chrome_launch_process.kill() - raise RuntimeError("Chromium CDP URL not found after 20s") - - return chrome_launch_process, cdp_url - - -def kill_chromium_session(chrome_launch_process: subprocess.Popen, chrome_dir: Path) -> None: - """Clean up 
Chromium process launched by launch_chromium_session. - - Uses chrome_utils.js killChrome for proper process group handling. - - Args: - chrome_launch_process: The Popen object from launch_chromium_session - chrome_dir: The chrome directory containing chrome.pid - """ - # First try to terminate the launch process gracefully - try: - chrome_launch_process.send_signal(signal.SIGTERM) - chrome_launch_process.wait(timeout=5) - except Exception: - pass - - # Read PID and use JS to kill with proper cleanup - chrome_pid_file = chrome_dir / 'chrome.pid' - if chrome_pid_file.exists(): - try: - chrome_pid = int(chrome_pid_file.read_text().strip()) - kill_chrome(chrome_pid, str(chrome_dir)) - except (ValueError, FileNotFoundError): - pass - - -@contextmanager -def chromium_session(env: dict, chrome_dir: Path, crawl_id: str): - """Context manager for Chromium sessions with automatic cleanup. - - Usage: - with chromium_session(env, chrome_dir, 'test-crawl') as (process, cdp_url): - # Use cdp_url to connect with puppeteer - pass - # Chromium automatically cleaned up - - Args: - env: Environment dict (from setup_test_env) - chrome_dir: Directory for Chrome files - crawl_id: ID for the crawl - - Yields: - Tuple of (chrome_launch_process, cdp_url) - """ - chrome_launch_process = None - try: - chrome_launch_process, cdp_url = launch_chromium_session(env, chrome_dir, crawl_id) - yield chrome_launch_process, cdp_url - finally: - if chrome_launch_process: - kill_chromium_session(chrome_launch_process, chrome_dir) - - -# ============================================================================= -# Tab-based Test Helpers -# Used by tab-based tests (infiniscroll, modalcloser) -# ============================================================================= - - -def cleanup_chrome(chrome_launch_process: subprocess.Popen, chrome_pid: int, chrome_dir: Optional[Path] = None) -> None: - """Clean up Chrome processes using chrome_utils.js killChrome. 
- - Uses the centralized kill logic from chrome_utils.js which handles: - - SIGTERM then SIGKILL - - Process group killing - - Zombie process cleanup - - Args: - chrome_launch_process: The Popen object for the chrome launch hook - chrome_pid: The PID of the Chrome process - chrome_dir: Optional path to chrome output directory - """ - # First try to terminate the launch process gracefully - try: - chrome_launch_process.send_signal(signal.SIGTERM) - chrome_launch_process.wait(timeout=5) - except Exception: - pass - - # Use JS to kill Chrome with proper process group handling - kill_chrome(chrome_pid, str(chrome_dir) if chrome_dir else None) - - -@contextmanager -def chrome_session( - tmpdir: Path, - crawl_id: str = 'test-crawl', - snapshot_id: str = 'test-snapshot', - test_url: str = 'about:blank', - navigate: bool = True, - timeout: int = 15, -): - """Context manager for Chrome sessions with automatic cleanup. - - Creates the directory structure, launches Chrome, creates a tab, - and optionally navigates to the test URL. Automatically cleans up - Chrome on exit. 
- - Usage: - with chrome_session(tmpdir, test_url='https://example.com') as (process, pid, chrome_dir, env): - # Run tests with chrome session - pass - # Chrome automatically cleaned up - - Args: - tmpdir: Temporary directory for test files - crawl_id: ID to use for the crawl - snapshot_id: ID to use for the snapshot - test_url: URL to navigate to (if navigate=True) - navigate: Whether to navigate to the URL after creating tab - timeout: Seconds to wait for Chrome to start - - Yields: - Tuple of (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env) - - Raises: - RuntimeError: If Chrome fails to start or tab creation fails - """ - chrome_launch_process = None - chrome_pid = None - try: - # Create proper directory structure in tmpdir - machine = platform.machine().lower() - system = platform.system().lower() - if machine in ('arm64', 'aarch64'): - machine = 'arm64' - elif machine in ('x86_64', 'amd64'): - machine = 'x86_64' - machine_type = f"{machine}-{system}" - - data_dir = Path(tmpdir) / 'data' - lib_dir = data_dir / 'lib' / machine_type - npm_dir = lib_dir / 'npm' - node_modules_dir = npm_dir / 'node_modules' - puppeteer_cache_dir = lib_dir / 'puppeteer' - - # Create lib structure for puppeteer installation - node_modules_dir.mkdir(parents=True, exist_ok=True) - - # Create crawl and snapshot directories - crawl_dir = Path(tmpdir) / 'crawl' - crawl_dir.mkdir(exist_ok=True) - chrome_dir = crawl_dir / 'chrome' - chrome_dir.mkdir(exist_ok=True) - - # Build env with tmpdir-specific paths - env = os.environ.copy() - env.update({ - 'DATA_DIR': str(data_dir), - 'LIB_DIR': str(lib_dir), - 'MACHINE_TYPE': machine_type, - 'NODE_MODULES_DIR': str(node_modules_dir), - 'NODE_PATH': str(node_modules_dir), - 'NPM_BIN_DIR': str(npm_dir / '.bin'), - 'CHROME_HEADLESS': 'true', - 'PUPPETEER_CACHE_DIR': str(puppeteer_cache_dir), - }) - - # Reuse system Puppeteer cache to avoid redundant Chromium downloads - link_puppeteer_cache(lib_dir) - - # Install Chromium via npm + 
puppeteer hooks using normal Binary flow - install_chromium_with_hooks(env) - - # Launch Chrome at crawl level - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - # Wait for Chrome to launch - for i in range(timeout): - if chrome_launch_process.poll() is not None: - stdout, stderr = chrome_launch_process.communicate() - raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}") - if (chrome_dir / 'cdp_url.txt').exists(): - break - time.sleep(1) - - if not (chrome_dir / 'cdp_url.txt').exists(): - raise RuntimeError(f"Chrome CDP URL not found after {timeout}s") - - chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) - - # Create snapshot directory structure - snapshot_dir = Path(tmpdir) / 'snapshot' - snapshot_dir.mkdir(exist_ok=True) - snapshot_chrome_dir = snapshot_dir / 'chrome' - snapshot_chrome_dir.mkdir(exist_ok=True) - - # Create tab - tab_env = env.copy() - tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) - try: - result = subprocess.run( - ['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=60, - env=tab_env - ) - if result.returncode != 0: - cleanup_chrome(chrome_launch_process, chrome_pid) - raise RuntimeError(f"Tab creation failed: {result.stderr}") - except subprocess.TimeoutExpired: - cleanup_chrome(chrome_launch_process, chrome_pid) - raise RuntimeError("Tab creation timed out after 60s") - - # Navigate to URL if requested - if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank': - try: - result = subprocess.run( - ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) - if 
result.returncode != 0: - cleanup_chrome(chrome_launch_process, chrome_pid) - raise RuntimeError(f"Navigation failed: {result.stderr}") - except subprocess.TimeoutExpired: - cleanup_chrome(chrome_launch_process, chrome_pid) - raise RuntimeError("Navigation timed out after 120s") - - yield chrome_launch_process, chrome_pid, snapshot_chrome_dir, env - finally: - if chrome_launch_process and chrome_pid: - cleanup_chrome(chrome_launch_process, chrome_pid) diff --git a/archivebox/plugins/chrome/tests/test_chrome.py b/archivebox/plugins/chrome/tests/test_chrome.py deleted file mode 100644 index 33d328c9..00000000 --- a/archivebox/plugins/chrome/tests/test_chrome.py +++ /dev/null @@ -1,722 +0,0 @@ -""" -Integration tests for chrome plugin - -Tests verify: -1. Chromium install via @puppeteer/browsers -2. Verify deps with abx-pkg -3. Chrome hooks exist -4. Chromium launches at crawl level -5. Tab creation at snapshot level -6. Tab navigation works -7. Tab cleanup on SIGTERM -8. Chromium cleanup on crawl end - -NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for ---load-extension and --disable-extensions-except flags, which are needed for -loading unpacked extensions in headless mode. 
-""" - -import json -import os -import signal -import subprocess -import sys -import time -from pathlib import Path -import pytest -import tempfile -import shutil -import platform - -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_test_env, - find_chromium_binary, - install_chromium_with_hooks, - CHROME_PLUGIN_DIR as PLUGIN_DIR, - CHROME_LAUNCH_HOOK, - CHROME_TAB_HOOK, - CHROME_NAVIGATE_HOOK, -) - -def _get_cookies_via_cdp(port: int, env: dict) -> list[dict]: - node_script = r""" -const http = require('http'); -const WebSocket = require('ws'); -const port = process.env.CDP_PORT; - -function getTargets() { - return new Promise((resolve, reject) => { - const req = http.get(`http://127.0.0.1:${port}/json/list`, (res) => { - let data = ''; - res.on('data', (chunk) => (data += chunk)); - res.on('end', () => { - try { - resolve(JSON.parse(data)); - } catch (e) { - reject(e); - } - }); - }); - req.on('error', reject); - }); -} - -(async () => { - const targets = await getTargets(); - const pageTarget = targets.find(t => t.type === 'page') || targets[0]; - if (!pageTarget) { - console.error('No page target found'); - process.exit(2); - } - - const ws = new WebSocket(pageTarget.webSocketDebuggerUrl); - const timer = setTimeout(() => { - console.error('Timeout waiting for cookies'); - process.exit(3); - }, 10000); - - ws.on('open', () => { - ws.send(JSON.stringify({ id: 1, method: 'Network.getAllCookies' })); - }); - - ws.on('message', (data) => { - const msg = JSON.parse(data); - if (msg.id === 1) { - clearTimeout(timer); - ws.close(); - if (!msg.result || !msg.result.cookies) { - console.error('No cookies in response'); - process.exit(4); - } - process.stdout.write(JSON.stringify(msg.result.cookies)); - process.exit(0); - } - }); - - ws.on('error', (err) => { - console.error(String(err)); - process.exit(5); - }); -})().catch((err) => { - console.error(String(err)); - process.exit(1); -}); -""" - - result = subprocess.run( - ['node', '-e', 
node_script], - capture_output=True, - text=True, - timeout=30, - env=env | {'CDP_PORT': str(port)}, - ) - assert result.returncode == 0, f"Failed to read cookies via CDP: {result.stderr}\nStdout: {result.stdout}" - return json.loads(result.stdout or '[]') - - -@pytest.fixture(scope="session", autouse=True) -def ensure_chromium_and_puppeteer_installed(tmp_path_factory): - """Ensure Chromium and puppeteer are installed before running tests.""" - if not os.environ.get('DATA_DIR'): - test_data_dir = tmp_path_factory.mktemp('chrome_test_data') - os.environ['DATA_DIR'] = str(test_data_dir) - env = get_test_env() - - try: - chromium_binary = install_chromium_with_hooks(env) - except RuntimeError as e: - raise RuntimeError(str(e)) - - if not chromium_binary: - raise RuntimeError("Chromium not found after install") - - os.environ['CHROME_BINARY'] = chromium_binary - for key in ('NODE_MODULES_DIR', 'NODE_PATH', 'PATH'): - if env.get(key): - os.environ[key] = env[key] - - -def test_hook_scripts_exist(): - """Verify chrome hooks exist.""" - assert CHROME_LAUNCH_HOOK.exists(), f"Hook not found: {CHROME_LAUNCH_HOOK}" - assert CHROME_TAB_HOOK.exists(), f"Hook not found: {CHROME_TAB_HOOK}" - assert CHROME_NAVIGATE_HOOK.exists(), f"Hook not found: {CHROME_NAVIGATE_HOOK}" - - -def test_verify_chromium_available(): - """Verify Chromium is available via CHROME_BINARY env var.""" - chromium_binary = os.environ.get('CHROME_BINARY') or find_chromium_binary() - - assert chromium_binary, "Chromium binary should be available (set by fixture or found)" - assert Path(chromium_binary).exists(), f"Chromium binary should exist at {chromium_binary}" - - # Verify it's actually Chromium by checking version - result = subprocess.run( - [chromium_binary, '--version'], - capture_output=True, - text=True, - timeout=10 - ) - assert result.returncode == 0, f"Failed to get Chromium version: {result.stderr}" - assert 'Chromium' in result.stdout or 'Chrome' in result.stdout, f"Unexpected version output: 
{result.stdout}" - - -def test_chrome_launch_and_tab_creation(): - """Integration test: Launch Chrome at crawl level and create tab at snapshot level.""" - with tempfile.TemporaryDirectory() as tmpdir: - crawl_dir = Path(tmpdir) / 'crawl' - crawl_dir.mkdir() - chrome_dir = crawl_dir / 'chrome' - chrome_dir.mkdir() - - # Get test environment with NODE_MODULES_DIR set - env = get_test_env() - env['CHROME_HEADLESS'] = 'true' - - # Launch Chrome at crawl level (background process) - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-123'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - # Wait for Chrome to launch (check process isn't dead and files exist) - for i in range(15): # Wait up to 15 seconds for Chrome to start - if chrome_launch_process.poll() is not None: - stdout, stderr = chrome_launch_process.communicate() - pytest.fail(f"Chrome launch process exited early:\nStdout: {stdout}\nStderr: {stderr}") - if (chrome_dir / 'cdp_url.txt').exists(): - break - time.sleep(1) - - # Verify Chrome launch outputs - if it failed, get the error from the process - if not (chrome_dir / 'cdp_url.txt').exists(): - # Try to get output from the process - try: - stdout, stderr = chrome_launch_process.communicate(timeout=1) - except subprocess.TimeoutExpired: - # Process still running, try to read available output - stdout = stderr = "(process still running)" - - # Check what files exist - if chrome_dir.exists(): - files = list(chrome_dir.iterdir()) - # Check if Chrome process is still alive - if (chrome_dir / 'chrome.pid').exists(): - chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) - try: - os.kill(chrome_pid, 0) - chrome_alive = "yes" - except OSError: - chrome_alive = "no" - pytest.fail(f"cdp_url.txt missing after 15s. Chrome dir files: {files}. 
Chrome process {chrome_pid} alive: {chrome_alive}\nLaunch stdout: {stdout}\nLaunch stderr: {stderr}") - else: - pytest.fail(f"cdp_url.txt missing. Chrome dir exists with files: {files}\nLaunch stdout: {stdout}\nLaunch stderr: {stderr}") - else: - pytest.fail(f"Chrome dir {chrome_dir} doesn't exist\nLaunch stdout: {stdout}\nLaunch stderr: {stderr}") - - assert (chrome_dir / 'cdp_url.txt').exists(), "cdp_url.txt should exist" - assert (chrome_dir / 'chrome.pid').exists(), "chrome.pid should exist" - assert (chrome_dir / 'port.txt').exists(), "port.txt should exist" - - cdp_url = (chrome_dir / 'cdp_url.txt').read_text().strip() - chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) - - assert cdp_url.startswith('ws://'), f"CDP URL should be WebSocket URL: {cdp_url}" - assert chrome_pid > 0, "Chrome PID should be valid" - - # Verify Chrome process is running - try: - os.kill(chrome_pid, 0) - except OSError: - pytest.fail(f"Chrome process {chrome_pid} is not running") - - # Create snapshot directory and tab - snapshot_dir = Path(tmpdir) / 'snapshot1' - snapshot_dir.mkdir() - snapshot_chrome_dir = snapshot_dir / 'chrome' - snapshot_chrome_dir.mkdir() - - # Launch tab at snapshot level - env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) - result = subprocess.run( - ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-123', '--crawl-id=test-crawl-123'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=60, - env=env - ) - - assert result.returncode == 0, f"Tab creation failed: {result.stderr}\nStdout: {result.stdout}" - - # Verify tab creation outputs - assert (snapshot_chrome_dir / 'cdp_url.txt').exists(), "Snapshot cdp_url.txt should exist" - assert (snapshot_chrome_dir / 'target_id.txt').exists(), "target_id.txt should exist" - assert (snapshot_chrome_dir / 'url.txt').exists(), "url.txt should exist" - - target_id = (snapshot_chrome_dir / 'target_id.txt').read_text().strip() - assert len(target_id) > 0, "Target 
ID should not be empty" - - # Cleanup: Kill Chrome and launch process - try: - chrome_launch_process.send_signal(signal.SIGTERM) - chrome_launch_process.wait(timeout=5) - except: - pass - try: - os.kill(chrome_pid, signal.SIGKILL) - except OSError: - pass - - -def test_cookies_imported_on_launch(): - """Integration test: COOKIES_TXT_FILE is imported at crawl start.""" - with tempfile.TemporaryDirectory() as tmpdir: - crawl_dir = Path(tmpdir) / 'crawl' - crawl_dir.mkdir() - chrome_dir = crawl_dir / 'chrome' - chrome_dir.mkdir() - - cookies_file = Path(tmpdir) / 'cookies.txt' - cookies_file.write_text( - '\n'.join([ - '# Netscape HTTP Cookie File', - '# https://curl.se/docs/http-cookies.html', - '# This file was generated by a test', - '', - 'example.com\tTRUE\t/\tFALSE\t2147483647\tabx_test_cookie\thello', - '', - ]) - ) - - profile_dir = Path(tmpdir) / 'profile' - env = get_test_env() - env.update({ - 'CHROME_HEADLESS': 'true', - 'CHROME_USER_DATA_DIR': str(profile_dir), - 'COOKIES_TXT_FILE': str(cookies_file), - }) - - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-cookies'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - for _ in range(15): - if (chrome_dir / 'port.txt').exists(): - break - time.sleep(1) - - assert (chrome_dir / 'port.txt').exists(), "port.txt should exist" - chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) - port = int((chrome_dir / 'port.txt').read_text().strip()) - - cookie_found = False - for _ in range(15): - cookies = _get_cookies_via_cdp(port, env) - cookie_found = any( - c.get('name') == 'abx_test_cookie' and c.get('value') == 'hello' - for c in cookies - ) - if cookie_found: - break - time.sleep(1) - - assert cookie_found, "Imported cookie should be present in Chrome session" - - # Cleanup - try: - chrome_launch_process.send_signal(signal.SIGTERM) - chrome_launch_process.wait(timeout=5) - except: - pass - try: 
- os.kill(chrome_pid, signal.SIGKILL) - except OSError: - pass - - -def test_chrome_navigation(): - """Integration test: Navigate to a URL.""" - with tempfile.TemporaryDirectory() as tmpdir: - crawl_dir = Path(tmpdir) / 'crawl' - crawl_dir.mkdir() - chrome_dir = crawl_dir / 'chrome' - chrome_dir.mkdir() - - # Launch Chrome (background process) - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-nav'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=get_test_env() | {'CHROME_HEADLESS': 'true'} - ) - - # Wait for Chrome to launch - time.sleep(3) - - chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) - - # Create snapshot and tab - snapshot_dir = Path(tmpdir) / 'snapshot1' - snapshot_dir.mkdir() - snapshot_chrome_dir = snapshot_dir / 'chrome' - snapshot_chrome_dir.mkdir() - - result = subprocess.run( - ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123', '--crawl-id=test-crawl-nav'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=60, - env=get_test_env() | {'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'} - ) - assert result.returncode == 0, f"Tab creation failed: {result.stderr}" - - # Navigate to URL - result = subprocess.run( - ['node', str(CHROME_NAVIGATE_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=120, - env=get_test_env() | {'CHROME_PAGELOAD_TIMEOUT': '30', 'CHROME_WAIT_FOR': 'load'} - ) - - assert result.returncode == 0, f"Navigation failed: {result.stderr}\nStdout: {result.stdout}" - - # Verify navigation outputs - assert (snapshot_chrome_dir / 'navigation.json').exists(), "navigation.json should exist" - assert (snapshot_chrome_dir / 'page_loaded.txt').exists(), "page_loaded.txt should exist" - - nav_data = json.loads((snapshot_chrome_dir / 
'navigation.json').read_text()) - assert nav_data.get('status') in [200, 301, 302], f"Should get valid HTTP status: {nav_data}" - assert nav_data.get('finalUrl'), "Should have final URL" - - # Cleanup - try: - chrome_launch_process.send_signal(signal.SIGTERM) - chrome_launch_process.wait(timeout=5) - except: - pass - try: - os.kill(chrome_pid, signal.SIGKILL) - except OSError: - pass - - -def test_tab_cleanup_on_sigterm(): - """Integration test: Tab cleanup when receiving SIGTERM.""" - with tempfile.TemporaryDirectory() as tmpdir: - crawl_dir = Path(tmpdir) / 'crawl' - crawl_dir.mkdir() - chrome_dir = crawl_dir / 'chrome' - chrome_dir.mkdir() - - # Launch Chrome (background process) - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cleanup'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=get_test_env() | {'CHROME_HEADLESS': 'true'} - ) - - # Wait for Chrome to launch - time.sleep(3) - - chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) - - # Create snapshot and tab - run in background - snapshot_dir = Path(tmpdir) / 'snapshot1' - snapshot_dir.mkdir() - snapshot_chrome_dir = snapshot_dir / 'chrome' - snapshot_chrome_dir.mkdir() - - tab_process = subprocess.Popen( - ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-cleanup', '--crawl-id=test-cleanup'], - cwd=str(snapshot_chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=get_test_env() | {'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'} - ) - - # Wait for tab to be created - time.sleep(3) - - # Send SIGTERM to tab process - tab_process.send_signal(signal.SIGTERM) - stdout, stderr = tab_process.communicate(timeout=10) - - assert tab_process.returncode == 0, f"Tab process should exit cleanly: {stderr}" - - # Chrome should still be running - try: - os.kill(chrome_pid, 0) - except OSError: - pytest.fail("Chrome should still be running 
after tab cleanup") - - # Cleanup - try: - chrome_launch_process.send_signal(signal.SIGTERM) - chrome_launch_process.wait(timeout=5) - except: - pass - try: - os.kill(chrome_pid, signal.SIGKILL) - except OSError: - pass - - -def test_multiple_snapshots_share_chrome(): - """Integration test: Multiple snapshots share one Chrome instance.""" - with tempfile.TemporaryDirectory() as tmpdir: - crawl_dir = Path(tmpdir) / 'crawl' - crawl_dir.mkdir() - chrome_dir = crawl_dir / 'chrome' - chrome_dir.mkdir() - - # Launch Chrome at crawl level - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-multi-crawl'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=get_test_env() | {'CHROME_HEADLESS': 'true'} - ) - - # Wait for Chrome to launch - for i in range(15): - if (chrome_dir / 'cdp_url.txt').exists(): - break - time.sleep(1) - - chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) - crawl_cdp_url = (chrome_dir / 'cdp_url.txt').read_text().strip() - - # Create multiple snapshots that share this Chrome - snapshot_dirs = [] - target_ids = [] - - for snap_num in range(3): - snapshot_dir = Path(tmpdir) / f'snapshot{snap_num}' - snapshot_dir.mkdir() - snapshot_chrome_dir = snapshot_dir / 'chrome' - snapshot_chrome_dir.mkdir() - snapshot_dirs.append(snapshot_chrome_dir) - - # Create tab for this snapshot - result = subprocess.run( - ['node', str(CHROME_TAB_HOOK), f'--url=https://example.com/{snap_num}', f'--snapshot-id=snap-{snap_num}', '--crawl-id=test-multi-crawl'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=60, - env=get_test_env() | {'CRAWL_OUTPUT_DIR': str(crawl_dir), 'CHROME_HEADLESS': 'true'} - ) - - assert result.returncode == 0, f"Tab {snap_num} creation failed: {result.stderr}" - - # Verify each snapshot has its own target_id but same Chrome PID - assert (snapshot_chrome_dir / 'target_id.txt').exists() - assert (snapshot_chrome_dir / 
'cdp_url.txt').exists() - assert (snapshot_chrome_dir / 'chrome.pid').exists() - - target_id = (snapshot_chrome_dir / 'target_id.txt').read_text().strip() - snapshot_cdp_url = (snapshot_chrome_dir / 'cdp_url.txt').read_text().strip() - snapshot_pid = int((snapshot_chrome_dir / 'chrome.pid').read_text().strip()) - - target_ids.append(target_id) - - # All snapshots should share same Chrome - assert snapshot_pid == chrome_pid, f"Snapshot {snap_num} should use crawl Chrome PID" - assert snapshot_cdp_url == crawl_cdp_url, f"Snapshot {snap_num} should use crawl CDP URL" - - # All target IDs should be unique (different tabs) - assert len(set(target_ids)) == 3, f"All snapshots should have unique tabs: {target_ids}" - - # Chrome should still be running with all 3 tabs - try: - os.kill(chrome_pid, 0) - except OSError: - pytest.fail("Chrome should still be running after creating 3 tabs") - - # Cleanup - try: - chrome_launch_process.send_signal(signal.SIGTERM) - chrome_launch_process.wait(timeout=5) - except: - pass - try: - os.kill(chrome_pid, signal.SIGKILL) - except OSError: - pass - - -def test_chrome_cleanup_on_crawl_end(): - """Integration test: Chrome cleanup at end of crawl.""" - with tempfile.TemporaryDirectory() as tmpdir: - crawl_dir = Path(tmpdir) / 'crawl' - crawl_dir.mkdir() - chrome_dir = crawl_dir / 'chrome' - chrome_dir.mkdir() - - # Launch Chrome in background - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-end'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=get_test_env() | {'CHROME_HEADLESS': 'true'} - ) - - # Wait for Chrome to launch - time.sleep(3) - - # Verify Chrome is running - assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist" - chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) - - try: - os.kill(chrome_pid, 0) - except OSError: - pytest.fail("Chrome should be running") - - # Send SIGTERM to chrome launch 
process - chrome_launch_process.send_signal(signal.SIGTERM) - stdout, stderr = chrome_launch_process.communicate(timeout=10) - - # Wait for cleanup - time.sleep(3) - - # Verify Chrome process is killed - try: - os.kill(chrome_pid, 0) - pytest.fail("Chrome should be killed after SIGTERM") - except OSError: - # Expected - Chrome should be dead - pass - - -def test_zombie_prevention_hook_killed(): - """Integration test: Chrome is killed even if hook process is SIGKILL'd.""" - with tempfile.TemporaryDirectory() as tmpdir: - crawl_dir = Path(tmpdir) / 'crawl' - crawl_dir.mkdir() - chrome_dir = crawl_dir / 'chrome' - chrome_dir.mkdir() - - # Launch Chrome - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-zombie'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=get_test_env() | {'CHROME_HEADLESS': 'true'} - ) - - # Wait for Chrome to launch - for i in range(15): - if (chrome_dir / 'chrome.pid').exists(): - break - time.sleep(1) - - assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist" - - chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) - hook_pid = chrome_launch_process.pid # Use the Popen process PID instead of hook.pid file - - # Verify both Chrome and hook are running - try: - os.kill(chrome_pid, 0) - os.kill(hook_pid, 0) - except OSError: - pytest.fail("Both Chrome and hook should be running") - - # Simulate hook getting SIGKILL'd (can't cleanup) - os.kill(hook_pid, signal.SIGKILL) - time.sleep(1) - - # Chrome should still be running (orphaned) - try: - os.kill(chrome_pid, 0) - except OSError: - pytest.fail("Chrome should still be running after hook SIGKILL") - - # Simulate Crawl.cleanup() using the actual cleanup logic - def is_process_alive(pid): - """Check if a process exists.""" - try: - os.kill(pid, 0) - return True - except (OSError, ProcessLookupError): - return False - - for pid_file in chrome_dir.glob('**/*.pid'): - try: - pid 
= int(pid_file.read_text().strip()) - - # Step 1: SIGTERM for graceful shutdown - try: - try: - os.killpg(pid, signal.SIGTERM) - except (OSError, ProcessLookupError): - os.kill(pid, signal.SIGTERM) - except ProcessLookupError: - pid_file.unlink(missing_ok=True) - continue - - # Step 2: Wait for graceful shutdown - time.sleep(2) - - # Step 3: Check if still alive - if not is_process_alive(pid): - pid_file.unlink(missing_ok=True) - continue - - # Step 4: Force kill ENTIRE process group with SIGKILL - try: - try: - # Always kill entire process group with SIGKILL - os.killpg(pid, signal.SIGKILL) - except (OSError, ProcessLookupError): - os.kill(pid, signal.SIGKILL) - except ProcessLookupError: - pid_file.unlink(missing_ok=True) - continue - - # Step 5: Wait and verify death - time.sleep(1) - - if not is_process_alive(pid): - pid_file.unlink(missing_ok=True) - - except (ValueError, OSError): - pass - - # Chrome should now be dead - try: - os.kill(chrome_pid, 0) - pytest.fail("Chrome should be killed after cleanup") - except OSError: - # Expected - Chrome is dead - pass - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/chrome/tests/test_chrome_test_helpers.py b/archivebox/plugins/chrome/tests/test_chrome_test_helpers.py deleted file mode 100644 index 703ea037..00000000 --- a/archivebox/plugins/chrome/tests/test_chrome_test_helpers.py +++ /dev/null @@ -1,260 +0,0 @@ -""" -Tests for chrome_test_helpers.py functions. - -These tests verify the Python helper functions used across Chrome plugin tests. 
-""" - -import os -import pytest -import tempfile -from pathlib import Path - -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_test_env, - get_machine_type, - get_lib_dir, - get_node_modules_dir, - get_extensions_dir, - find_chromium_binary, - get_plugin_dir, - get_hook_script, - parse_jsonl_output, -) - - -def test_get_machine_type(): - """Test get_machine_type() returns valid format.""" - machine_type = get_machine_type() - assert isinstance(machine_type, str) - assert '-' in machine_type, "Machine type should be in format: arch-os" - # Should be one of the expected formats - assert any(x in machine_type for x in ['arm64', 'x86_64']), "Should contain valid architecture" - assert any(x in machine_type for x in ['darwin', 'linux', 'win32']), "Should contain valid OS" - - -def test_get_lib_dir_with_env_var(): - """Test get_lib_dir() respects LIB_DIR env var.""" - with tempfile.TemporaryDirectory() as tmpdir: - custom_lib = Path(tmpdir) / 'custom_lib' - custom_lib.mkdir() - - old_lib_dir = os.environ.get('LIB_DIR') - try: - os.environ['LIB_DIR'] = str(custom_lib) - lib_dir = get_lib_dir() - assert lib_dir == custom_lib - finally: - if old_lib_dir: - os.environ['LIB_DIR'] = old_lib_dir - else: - os.environ.pop('LIB_DIR', None) - - -def test_get_node_modules_dir_with_env_var(): - """Test get_node_modules_dir() respects NODE_MODULES_DIR env var.""" - with tempfile.TemporaryDirectory() as tmpdir: - custom_nm = Path(tmpdir) / 'node_modules' - custom_nm.mkdir() - - old_nm_dir = os.environ.get('NODE_MODULES_DIR') - try: - os.environ['NODE_MODULES_DIR'] = str(custom_nm) - nm_dir = get_node_modules_dir() - assert nm_dir == custom_nm - finally: - if old_nm_dir: - os.environ['NODE_MODULES_DIR'] = old_nm_dir - else: - os.environ.pop('NODE_MODULES_DIR', None) - - -def test_get_extensions_dir_default(): - """Test get_extensions_dir() returns expected path format.""" - ext_dir = get_extensions_dir() - assert isinstance(ext_dir, str) - assert 'personas' in 
ext_dir - assert 'chrome_extensions' in ext_dir - - -def test_get_extensions_dir_with_custom_persona(): - """Test get_extensions_dir() respects ACTIVE_PERSONA env var.""" - old_persona = os.environ.get('ACTIVE_PERSONA') - old_data_dir = os.environ.get('DATA_DIR') - try: - os.environ['ACTIVE_PERSONA'] = 'TestPersona' - os.environ['DATA_DIR'] = '/tmp/test' - ext_dir = get_extensions_dir() - assert 'TestPersona' in ext_dir - assert '/tmp/test' in ext_dir - finally: - if old_persona: - os.environ['ACTIVE_PERSONA'] = old_persona - else: - os.environ.pop('ACTIVE_PERSONA', None) - if old_data_dir: - os.environ['DATA_DIR'] = old_data_dir - else: - os.environ.pop('DATA_DIR', None) - - -def test_get_test_env_returns_dict(): - """Test get_test_env() returns properly formatted environment dict.""" - env = get_test_env() - assert isinstance(env, dict) - - # Should include key paths - assert 'MACHINE_TYPE' in env - assert 'LIB_DIR' in env - assert 'NODE_MODULES_DIR' in env - assert 'NODE_PATH' in env # Critical for module resolution - assert 'NPM_BIN_DIR' in env - assert 'CHROME_EXTENSIONS_DIR' in env - - # Verify NODE_PATH equals NODE_MODULES_DIR (for Node.js module resolution) - assert env['NODE_PATH'] == env['NODE_MODULES_DIR'] - - -def test_get_test_env_paths_are_absolute(): - """Test that get_test_env() returns absolute paths.""" - env = get_test_env() - - # All path-like values should be absolute - assert Path(env['LIB_DIR']).is_absolute() - assert Path(env['NODE_MODULES_DIR']).is_absolute() - assert Path(env['NODE_PATH']).is_absolute() - - -def test_find_chromium_binary(): - """Test find_chromium_binary() returns a path or None.""" - binary = find_chromium_binary() - if binary: - assert isinstance(binary, str) - # Should be an absolute path if found - assert os.path.isabs(binary) - - -def test_get_plugin_dir(): - """Test get_plugin_dir() finds correct plugin directory.""" - # Use this test file's path - test_file = __file__ - plugin_dir = get_plugin_dir(test_file) - - 
assert plugin_dir.exists() - assert plugin_dir.is_dir() - # Should be the chrome plugin directory - assert plugin_dir.name == 'chrome' - assert (plugin_dir.parent.name == 'plugins') - - -def test_get_hook_script_finds_existing_hook(): - """Test get_hook_script() can find an existing hook.""" - from archivebox.plugins.chrome.tests.chrome_test_helpers import CHROME_PLUGIN_DIR - - # Try to find the chrome launch hook - hook = get_hook_script(CHROME_PLUGIN_DIR, 'on_Crawl__*_chrome_launch.*') - - if hook: # May not exist in all test environments - assert hook.exists() - assert hook.is_file() - assert 'chrome_launch' in hook.name - - -def test_get_hook_script_returns_none_for_missing(): - """Test get_hook_script() returns None for non-existent hooks.""" - from archivebox.plugins.chrome.tests.chrome_test_helpers import CHROME_PLUGIN_DIR - - hook = get_hook_script(CHROME_PLUGIN_DIR, 'nonexistent_hook_*_pattern.*') - assert hook is None - - -def test_parse_jsonl_output_valid(): - """Test parse_jsonl_output() parses valid JSONL.""" - jsonl_output = '''{"type": "ArchiveResult", "status": "succeeded", "output": "test1"} -{"type": "ArchiveResult", "status": "failed", "error": "test2"} -''' - - # Returns first match only - result = parse_jsonl_output(jsonl_output) - assert result is not None - assert result['type'] == 'ArchiveResult' - assert result['status'] == 'succeeded' - assert result['output'] == 'test1' - - -def test_parse_jsonl_output_with_non_json_lines(): - """Test parse_jsonl_output() skips non-JSON lines.""" - mixed_output = '''Some non-JSON output -{"type": "ArchiveResult", "status": "succeeded"} -More non-JSON -{"type": "ArchiveResult", "status": "failed"} -''' - - result = parse_jsonl_output(mixed_output) - assert result is not None - assert result['type'] == 'ArchiveResult' - assert result['status'] == 'succeeded' - - -def test_parse_jsonl_output_empty(): - """Test parse_jsonl_output() handles empty input.""" - result = parse_jsonl_output('') - assert result is 
None - - -def test_parse_jsonl_output_filters_by_type(): - """Test parse_jsonl_output() can filter by record type.""" - jsonl_output = '''{"type": "LogEntry", "data": "log1"} -{"type": "ArchiveResult", "data": "result1"} -{"type": "ArchiveResult", "data": "result2"} -''' - - # Should return first ArchiveResult, not LogEntry - result = parse_jsonl_output(jsonl_output, record_type='ArchiveResult') - assert result is not None - assert result['type'] == 'ArchiveResult' - assert result['data'] == 'result1' # First ArchiveResult - - -def test_parse_jsonl_output_filters_custom_type(): - """Test parse_jsonl_output() can filter by custom record type.""" - jsonl_output = '''{"type": "ArchiveResult", "data": "result1"} -{"type": "LogEntry", "data": "log1"} -{"type": "ArchiveResult", "data": "result2"} -''' - - result = parse_jsonl_output(jsonl_output, record_type='LogEntry') - assert result is not None - assert result['type'] == 'LogEntry' - assert result['data'] == 'log1' - - -def test_machine_type_consistency(): - """Test that machine type is consistent across calls.""" - mt1 = get_machine_type() - mt2 = get_machine_type() - assert mt1 == mt2, "Machine type should be stable across calls" - - -def test_lib_dir_is_directory(): - """Test that lib_dir points to an actual directory when DATA_DIR is set.""" - with tempfile.TemporaryDirectory() as tmpdir: - old_data_dir = os.environ.get('DATA_DIR') - try: - os.environ['DATA_DIR'] = tmpdir - # Create the expected directory structure - machine_type = get_machine_type() - lib_dir = Path(tmpdir) / 'lib' / machine_type - lib_dir.mkdir(parents=True, exist_ok=True) - - result = get_lib_dir() - # Should return a Path object - assert isinstance(result, Path) - finally: - if old_data_dir: - os.environ['DATA_DIR'] = old_data_dir - else: - os.environ.pop('DATA_DIR', None) - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/consolelog/config.json b/archivebox/plugins/consolelog/config.json deleted 
file mode 100644 index f03ae547..00000000 --- a/archivebox/plugins/consolelog/config.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "CONSOLELOG_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_CONSOLELOG", "USE_CONSOLELOG"], - "description": "Enable console log capture" - }, - "CONSOLELOG_TIMEOUT": { - "type": "integer", - "default": 30, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for console log capture in seconds" - } - } -} diff --git a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js b/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js deleted file mode 100755 index 92351c05..00000000 --- a/archivebox/plugins/consolelog/on_Snapshot__21_consolelog.bg.js +++ /dev/null @@ -1,201 +0,0 @@ -#!/usr/bin/env node -/** - * Capture console output from a page. - * - * This hook sets up CDP listeners BEFORE chrome_navigate loads the page, - * then waits for navigation to complete. The listeners stay active through - * navigation and capture all console output. 
- * - * Usage: on_Snapshot__21_consolelog.js --url= --snapshot-id= - * Output: Writes console.jsonl - */ - -const fs = require('fs'); -const path = require('path'); - -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); - -const puppeteer = require('puppeteer-core'); - -// Import shared utilities from chrome_utils.js -const { - getEnvBool, - getEnvInt, - parseArgs, - connectToPage, - waitForPageLoaded, -} = require('../chrome/chrome_utils.js'); - -const PLUGIN_NAME = 'consolelog'; -const OUTPUT_DIR = '.'; -const OUTPUT_FILE = 'console.jsonl'; -const CHROME_SESSION_DIR = '../chrome'; - -let browser = null; -let page = null; -let logCount = 0; -let errorCount = 0; -let requestFailCount = 0; -let shuttingDown = false; - -async function serializeArgs(args) { - const serialized = []; - for (const arg of args) { - try { - const json = await arg.jsonValue(); - serialized.push(json); - } catch (e) { - try { - serialized.push(String(arg)); - } catch (e2) { - serialized.push('[Unserializable]'); - } - } - } - return serialized; -} - -async function setupListeners() { - const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); - const timeout = getEnvInt('CONSOLELOG_TIMEOUT', 30) * 1000; - - fs.writeFileSync(outputPath, ''); // Clear existing - - // Connect to Chrome page using shared utility - const { browser, page } = await connectToPage({ - chromeSessionDir: CHROME_SESSION_DIR, - timeoutMs: timeout, - puppeteer, - }); - - // Set up listeners that write directly to file - page.on('console', async (msg) => { - try { - const logEntry = { - timestamp: new Date().toISOString(), - type: msg.type(), - text: msg.text(), - args: await serializeArgs(msg.args()), - location: msg.location(), - }; - fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n'); - logCount += 1; - } catch (e) { - // Ignore errors - } - }); - - page.on('pageerror', (error) => { - try { - const logEntry = { - 
timestamp: new Date().toISOString(), - type: 'error', - text: error.message, - stack: error.stack || '', - }; - fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n'); - errorCount += 1; - } catch (e) { - // Ignore - } - }); - - page.on('requestfailed', (request) => { - try { - const failure = request.failure(); - const logEntry = { - timestamp: new Date().toISOString(), - type: 'request_failed', - text: `Request failed: ${request.url()}`, - error: failure ? failure.errorText : 'Unknown error', - url: request.url(), - }; - fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n'); - requestFailCount += 1; - } catch (e) { - // Ignore - } - }); - - return { browser, page }; -} - -function emitResult(status = 'succeeded') { - if (shuttingDown) return; - shuttingDown = true; - - const counts = `${logCount} console, ${errorCount} errors, ${requestFailCount} failed requests`; - console.log(JSON.stringify({ - type: 'ArchiveResult', - status, - output_str: `${OUTPUT_FILE} (${counts})`, - })); -} - -async function handleShutdown(signal) { - console.error(`\nReceived ${signal}, emitting final results...`); - emitResult('succeeded'); - if (browser) { - try { - browser.disconnect(); - } catch (e) {} - } - process.exit(0); -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__21_consolelog.js --url= --snapshot-id='); - process.exit(1); - } - - if (!getEnvBool('CONSOLELOG_ENABLED', true)) { - console.error('Skipping (CONSOLELOG_ENABLED=False)'); - console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'CONSOLELOG_ENABLED=False'})); - process.exit(0); - } - - try { - // Set up listeners BEFORE navigation - const connection = await setupListeners(); - browser = connection.browser; - page = connection.page; - - // Register signal handlers for graceful shutdown - process.on('SIGTERM', () => 
handleShutdown('SIGTERM')); - process.on('SIGINT', () => handleShutdown('SIGINT')); - - // Wait for chrome_navigate to complete (non-fatal) - try { - const timeout = getEnvInt('CONSOLELOG_TIMEOUT', 30) * 1000; - await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500); - } catch (e) { - console.error(`WARN: ${e.message}`); - } - - // console.error('Consolelog active, waiting for cleanup signal...'); - await new Promise(() => {}); // Keep alive until SIGTERM - return; - - } catch (e) { - const error = `${e.name}: ${e.message}`; - console.error(`ERROR: ${error}`); - - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'failed', - output_str: error, - })); - process.exit(1); - } -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/consolelog/templates/icon.html b/archivebox/plugins/consolelog/templates/icon.html deleted file mode 100644 index c68b8db5..00000000 --- a/archivebox/plugins/consolelog/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/consolelog/tests/test_consolelog.py b/archivebox/plugins/consolelog/tests/test_consolelog.py deleted file mode 100644 index ab851d15..00000000 --- a/archivebox/plugins/consolelog/tests/test_consolelog.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -Tests for the consolelog plugin. - -Tests the real consolelog hook with an actual URL to verify -console output capture. 
-""" - -import json -import shutil -import subprocess -import sys -import tempfile -import time -from pathlib import Path - -from django.test import TestCase - -# Import chrome test helpers -sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests')) -from chrome_test_helpers import ( - chrome_session, - CHROME_NAVIGATE_HOOK, - get_plugin_dir, - get_hook_script, -) - - -# Get the path to the consolelog hook -PLUGIN_DIR = get_plugin_dir(__file__) -CONSOLELOG_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_consolelog.*') - - -class TestConsolelogPlugin(TestCase): - """Test the consolelog plugin.""" - - def test_consolelog_hook_exists(self): - """Consolelog hook script should exist.""" - self.assertIsNotNone(CONSOLELOG_HOOK, "Consolelog hook not found in plugin directory") - self.assertTrue(CONSOLELOG_HOOK.exists(), f"Hook not found: {CONSOLELOG_HOOK}") - - -class TestConsolelogWithChrome(TestCase): - """Integration tests for consolelog plugin with Chrome.""" - - def setUp(self): - """Set up test environment.""" - self.temp_dir = Path(tempfile.mkdtemp()) - - def tearDown(self): - """Clean up.""" - shutil.rmtree(self.temp_dir, ignore_errors=True) - - def test_consolelog_captures_output(self): - """Consolelog hook should capture console output from page.""" - test_url = 'data:text/html,' - snapshot_id = 'test-consolelog-snapshot' - - with chrome_session( - self.temp_dir, - crawl_id='test-consolelog-crawl', - snapshot_id=snapshot_id, - test_url=test_url, - navigate=False, - timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): - console_dir = snapshot_chrome_dir.parent / 'consolelog' - console_dir.mkdir(exist_ok=True) - - # Run consolelog hook with the active Chrome session (background hook) - result = subprocess.Popen( - ['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(console_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - nav_result = 
subprocess.run( - ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) - self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}") - - # Check for output file - console_output = console_dir / 'console.jsonl' - - # Allow it to run briefly, then terminate (background hook) - for _ in range(10): - if console_output.exists() and console_output.stat().st_size > 0: - break - time.sleep(1) - if result.poll() is None: - result.terminate() - try: - stdout, stderr = result.communicate(timeout=5) - except subprocess.TimeoutExpired: - result.kill() - stdout, stderr = result.communicate() - else: - stdout, stderr = result.communicate() - - # At minimum, verify no crash - self.assertNotIn('Traceback', stderr) - - # If output file exists, verify it's valid JSONL and has output - if console_output.exists(): - with open(console_output) as f: - content = f.read().strip() - self.assertTrue(content, "Console output should not be empty") - for line in content.split('\n'): - if line.strip(): - try: - record = json.loads(line) - # Verify structure - self.assertIn('timestamp', record) - self.assertIn('type', record) - except json.JSONDecodeError: - pass # Some lines may be incomplete - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/custom/on_Binary__14_custom_install.py b/archivebox/plugins/custom/on_Binary__14_custom_install.py deleted file mode 100644 index 47eea07f..00000000 --- a/archivebox/plugins/custom/on_Binary__14_custom_install.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env python3 -""" -Install a binary using a custom bash command. - -This provider runs arbitrary shell commands to install binaries -that don't fit into standard package managers. 
- -Usage: on_Binary__install_using_custom_bash.py --binary-id= --machine-id= --name= --custom-cmd= -Output: Binary JSONL record to stdout after installation - -Environment variables: - MACHINE_ID: Machine UUID (set by orchestrator) -""" - -import json -import os -import subprocess -import sys - -import rich_click as click -from abx_pkg import Binary, EnvProvider - - -@click.command() -@click.option('--binary-id', required=True, help="Binary UUID") -@click.option('--machine-id', required=True, help="Machine UUID") -@click.option('--name', required=True, help="Binary name to install") -@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)") -@click.option('--custom-cmd', required=True, help="Custom bash command to run") -def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_cmd: str): - """Install binary using custom bash command.""" - - if binproviders != '*' and 'custom' not in binproviders.split(','): - click.echo(f"custom provider not allowed for {name}", err=True) - sys.exit(0) - - if not custom_cmd: - click.echo("custom provider requires --custom-cmd", err=True) - sys.exit(1) - - click.echo(f"Installing {name} via custom command: {custom_cmd}", err=True) - - try: - result = subprocess.run( - custom_cmd, - shell=True, - timeout=600, # 10 minute timeout for custom installs - ) - if result.returncode != 0: - click.echo(f"Custom install failed (exit={result.returncode})", err=True) - sys.exit(1) - except subprocess.TimeoutExpired: - click.echo("Custom install timed out", err=True) - sys.exit(1) - - # Use abx-pkg to load the binary and get its info - provider = EnvProvider() - try: - binary = Binary(name=name, binproviders=[provider]).load() - except Exception: - try: - binary = Binary( - name=name, - binproviders=[provider], - overrides={'env': {'version': '0.0.1'}}, - ).load() - except Exception as e: - click.echo(f"{name} not found after custom install: {e}", err=True) - sys.exit(1) - - if not 
binary.abspath: - click.echo(f"{name} not found after custom install", err=True) - sys.exit(1) - - machine_id = os.environ.get('MACHINE_ID', '') - - # Output Binary JSONL record to stdout - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'custom', - 'machine_id': machine_id, - 'binary_id': binary_id, - } - print(json.dumps(record)) - - # Log human-readable info to stderr - click.echo(f"Installed {name} at {binary.abspath}", err=True) - click.echo(f" version: {binary.version}", err=True) - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/custom/templates/icon.html b/archivebox/plugins/custom/templates/icon.html deleted file mode 100644 index e69de29b..00000000 diff --git a/archivebox/plugins/custom/tests/test_custom_provider.py b/archivebox/plugins/custom/tests/test_custom_provider.py deleted file mode 100644 index 22a2cb1d..00000000 --- a/archivebox/plugins/custom/tests/test_custom_provider.py +++ /dev/null @@ -1,149 +0,0 @@ -""" -Tests for the custom binary provider plugin. - -Tests the custom bash binary installer with safe commands. 
-""" - -import json -import os -import subprocess -import sys -import tempfile -from pathlib import Path - -import pytest -from django.test import TestCase - - -# Get the path to the custom provider hook -PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_custom_install.py'), None) - - -class TestCustomProviderHook(TestCase): - """Test the custom binary provider hook.""" - - def setUp(self): - """Set up test environment.""" - self.temp_dir = tempfile.mkdtemp() - - def tearDown(self): - """Clean up.""" - import shutil - shutil.rmtree(self.temp_dir, ignore_errors=True) - - def test_hook_script_exists(self): - """Hook script should exist.""" - self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}") - - def test_hook_skips_when_custom_not_allowed(self): - """Hook should skip when custom not in allowed binproviders.""" - env = os.environ.copy() - env['DATA_DIR'] = self.temp_dir - - result = subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=echo', - '--binary-id=test-uuid', - '--machine-id=test-machine', - '--binproviders=pip,apt', # custom not allowed - '--custom-cmd=echo hello', - ], - capture_output=True, - text=True, - timeout=30, - env=env - ) - - # Should exit cleanly (code 0) when custom not allowed - self.assertEqual(result.returncode, 0) - self.assertIn('custom provider not allowed', result.stderr) - - def test_hook_runs_custom_command_and_finds_binary(self): - """Hook should run custom command and find the binary in PATH.""" - env = os.environ.copy() - env['DATA_DIR'] = self.temp_dir - - # Use a simple echo command that doesn't actually install anything - # Then check for 'echo' which is already in PATH - result = subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=echo', - '--binary-id=test-uuid', - '--machine-id=test-machine', - '--custom-cmd=echo "custom install simulation"', - ], - capture_output=True, - text=True, - timeout=30, - env=env - ) - - # 
Should succeed since echo is in PATH - self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}") - - # Parse JSONL output - for line in result.stdout.split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'Binary' and record.get('name') == 'echo': - self.assertEqual(record['binprovider'], 'custom') - self.assertTrue(record['abspath']) - return - except json.JSONDecodeError: - continue - - self.fail("No Binary JSONL record found in output") - - def test_hook_fails_for_missing_binary_after_command(self): - """Hook should fail if binary not found after running custom command.""" - env = os.environ.copy() - env['DATA_DIR'] = self.temp_dir - - result = subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=nonexistent_binary_xyz123', - '--binary-id=test-uuid', - '--machine-id=test-machine', - '--custom-cmd=echo "failed install"', # Doesn't actually install - ], - capture_output=True, - text=True, - timeout=30, - env=env - ) - - # Should fail since binary not found after command - self.assertEqual(result.returncode, 1) - self.assertIn('not found', result.stderr.lower()) - - def test_hook_fails_for_failing_command(self): - """Hook should fail if custom command returns non-zero exit code.""" - env = os.environ.copy() - env['DATA_DIR'] = self.temp_dir - - result = subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=echo', - '--binary-id=test-uuid', - '--machine-id=test-machine', - '--custom-cmd=exit 1', # Command that fails - ], - capture_output=True, - text=True, - timeout=30, - env=env - ) - - # Should fail with exit code 1 - self.assertEqual(result.returncode, 1) - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/dns/config.json b/archivebox/plugins/dns/config.json deleted file mode 100644 index 2a69a4c8..00000000 --- a/archivebox/plugins/dns/config.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "$schema": 
"http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "DNS_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_DNS", "USE_DNS"], - "description": "Enable DNS traffic recording during page load" - }, - "DNS_TIMEOUT": { - "type": "integer", - "default": 30, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for DNS recording in seconds" - } - } -} diff --git a/archivebox/plugins/dns/on_Snapshot__22_dns.bg.js b/archivebox/plugins/dns/on_Snapshot__22_dns.bg.js deleted file mode 100755 index 105f13d8..00000000 --- a/archivebox/plugins/dns/on_Snapshot__22_dns.bg.js +++ /dev/null @@ -1,265 +0,0 @@ -#!/usr/bin/env node -/** - * Record all DNS traffic (hostname -> IP resolutions) during page load. - * - * This hook sets up CDP listeners BEFORE chrome_navigate loads the page, - * then waits for navigation to complete. The listeners capture all DNS - * resolutions by extracting hostname/IP pairs from network responses. 
- * - * Usage: on_Snapshot__22_dns.js --url= --snapshot-id= - * Output: Writes dns.jsonl with one line per DNS resolution record - */ - -const fs = require('fs'); -const path = require('path'); - -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); - -const puppeteer = require('puppeteer-core'); - -// Import shared utilities from chrome_utils.js -const { - getEnvBool, - getEnvInt, - parseArgs, - connectToPage, - waitForPageLoaded, -} = require('../chrome/chrome_utils.js'); - -const PLUGIN_NAME = 'dns'; -const OUTPUT_DIR = '.'; -const OUTPUT_FILE = 'dns.jsonl'; -const CHROME_SESSION_DIR = '../chrome'; - -let browser = null; -let page = null; -let recordCount = 0; -let shuttingDown = false; - -function extractHostname(url) { - try { - const urlObj = new URL(url); - return urlObj.hostname; - } catch (e) { - return null; - } -} - -async function setupListener(targetUrl) { - const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); - const timeout = getEnvInt('DNS_TIMEOUT', 30) * 1000; - - // Initialize output file - fs.writeFileSync(outputPath, ''); - - // Track seen hostname -> IP mappings to avoid duplicates per request - const seenResolutions = new Map(); - // Track request IDs to their URLs for correlation - const requestUrls = new Map(); - - // Connect to Chrome page using shared utility - const { browser, page } = await connectToPage({ - chromeSessionDir: CHROME_SESSION_DIR, - timeoutMs: timeout, - puppeteer, - }); - - // Get CDP session for low-level network events - const client = await page.target().createCDPSession(); - - // Enable network domain to receive events - await client.send('Network.enable'); - - // Listen for request events to track URLs - client.on('Network.requestWillBeSent', (params) => { - requestUrls.set(params.requestId, params.request.url); - }); - - // Listen for response events which contain remoteIPAddress (the resolved IP) - 
client.on('Network.responseReceived', (params) => { - try { - const response = params.response; - const url = response.url; - const remoteIPAddress = response.remoteIPAddress; - const remotePort = response.remotePort; - - if (!url || !remoteIPAddress) { - return; - } - - const hostname = extractHostname(url); - if (!hostname) { - return; - } - - // Skip if IP address is same as hostname (already an IP) - if (hostname === remoteIPAddress) { - return; - } - - // Create a unique key for this resolution - const resolutionKey = `${hostname}:${remoteIPAddress}`; - - // Skip if we've already recorded this resolution - if (seenResolutions.has(resolutionKey)) { - return; - } - seenResolutions.set(resolutionKey, true); - - // Determine record type (A for IPv4, AAAA for IPv6) - const isIPv6 = remoteIPAddress.includes(':'); - const recordType = isIPv6 ? 'AAAA' : 'A'; - - // Create DNS record - const timestamp = new Date().toISOString(); - const dnsRecord = { - ts: timestamp, - hostname: hostname, - ip: remoteIPAddress, - port: remotePort || null, - type: recordType, - protocol: url.startsWith('https://') ? 
'https' : 'http', - url: url, - requestId: params.requestId, - }; - - // Append to output file - fs.appendFileSync(outputPath, JSON.stringify(dnsRecord) + '\n'); - recordCount += 1; - - } catch (e) { - // Ignore errors - } - }); - - // Listen for failed requests too - they still involve DNS - client.on('Network.loadingFailed', (params) => { - try { - const requestId = params.requestId; - const url = requestUrls.get(requestId); - - if (!url) { - return; - } - - const hostname = extractHostname(url); - if (!hostname) { - return; - } - - // Check if this is a DNS-related failure - const errorText = params.errorText || ''; - if (errorText.includes('net::ERR_NAME_NOT_RESOLVED') || - errorText.includes('net::ERR_NAME_RESOLUTION_FAILED')) { - - // Create a unique key for this failed resolution - const resolutionKey = `${hostname}:NXDOMAIN`; - - // Skip if we've already recorded this NXDOMAIN - if (seenResolutions.has(resolutionKey)) { - return; - } - seenResolutions.set(resolutionKey, true); - - const timestamp = new Date().toISOString(); - const dnsRecord = { - ts: timestamp, - hostname: hostname, - ip: null, - port: null, - type: 'NXDOMAIN', - protocol: url.startsWith('https://') ? 
'https' : 'http', - url: url, - requestId: requestId, - error: errorText, - }; - - fs.appendFileSync(outputPath, JSON.stringify(dnsRecord) + '\n'); - recordCount += 1; - } - } catch (e) { - // Ignore errors - } - }); - - return { browser, page, client }; -} - -function emitResult(status = 'succeeded') { - if (shuttingDown) return; - shuttingDown = true; - - console.log(JSON.stringify({ - type: 'ArchiveResult', - status, - output_str: `${OUTPUT_FILE} (${recordCount} DNS records)`, - })); -} - -async function handleShutdown(signal) { - console.error(`\nReceived ${signal}, emitting final results...`); - emitResult('succeeded'); - if (browser) { - try { - browser.disconnect(); - } catch (e) {} - } - process.exit(0); -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__22_dns.js --url= --snapshot-id='); - process.exit(1); - } - - if (!getEnvBool('DNS_ENABLED', true)) { - console.error('Skipping (DNS_ENABLED=False)'); - console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'DNS_ENABLED=False'})); - process.exit(0); - } - - try { - // Set up listener BEFORE navigation - const connection = await setupListener(url); - browser = connection.browser; - page = connection.page; - - // Register signal handlers for graceful shutdown - process.on('SIGTERM', () => handleShutdown('SIGTERM')); - process.on('SIGINT', () => handleShutdown('SIGINT')); - - // Wait for chrome_navigate to complete (non-fatal) - try { - const timeout = getEnvInt('DNS_TIMEOUT', 30) * 1000; - await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500); - } catch (e) { - console.error(`WARN: ${e.message}`); - } - - // console.error('DNS listener active, waiting for cleanup signal...'); - await new Promise(() => {}); // Keep alive until SIGTERM - return; - - } catch (e) { - const error = `${e.name}: ${e.message}`; - console.error(`ERROR: ${error}`); 
- - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'failed', - output_str: error, - })); - process.exit(1); - } -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/dns/templates/icon.html b/archivebox/plugins/dns/templates/icon.html deleted file mode 100644 index 1a558d40..00000000 --- a/archivebox/plugins/dns/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/dns/tests/test_dns.py b/archivebox/plugins/dns/tests/test_dns.py deleted file mode 100644 index ac10a478..00000000 --- a/archivebox/plugins/dns/tests/test_dns.py +++ /dev/null @@ -1,126 +0,0 @@ -""" -Tests for the DNS plugin. - -Tests the real DNS hook with an actual URL to verify -DNS resolution capture. -""" - -import json -import shutil -import subprocess -import sys -import tempfile -import time -from pathlib import Path - -from django.test import TestCase - -# Import chrome test helpers -sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests')) -from chrome_test_helpers import ( - chrome_session, - CHROME_NAVIGATE_HOOK, - get_plugin_dir, - get_hook_script, -) - - -# Get the path to the DNS hook -PLUGIN_DIR = get_plugin_dir(__file__) -DNS_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dns.*') - - -class TestDNSPlugin(TestCase): - """Test the DNS plugin.""" - - def test_dns_hook_exists(self): - """DNS hook script should exist.""" - self.assertIsNotNone(DNS_HOOK, "DNS hook not found in plugin directory") - self.assertTrue(DNS_HOOK.exists(), f"Hook not found: {DNS_HOOK}") - - -class TestDNSWithChrome(TestCase): - """Integration tests for DNS plugin with Chrome.""" - - def setUp(self): - """Set up test environment.""" - self.temp_dir = Path(tempfile.mkdtemp()) - - def tearDown(self): - """Clean up.""" - shutil.rmtree(self.temp_dir, ignore_errors=True) - - def test_dns_records_captured(self): - """DNS hook should capture DNS records from a real URL.""" - 
test_url = 'https://example.com' - snapshot_id = 'test-dns-snapshot' - - with chrome_session( - self.temp_dir, - crawl_id='test-dns-crawl', - snapshot_id=snapshot_id, - test_url=test_url, - navigate=False, - timeout=30, - ) as (_process, _pid, snapshot_chrome_dir, env): - dns_dir = snapshot_chrome_dir.parent / 'dns' - dns_dir.mkdir(exist_ok=True) - - result = subprocess.Popen( - ['node', str(DNS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(dns_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - nav_result = subprocess.run( - ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) - self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}") - - dns_output = dns_dir / 'dns.jsonl' - for _ in range(30): - if dns_output.exists() and dns_output.stat().st_size > 0: - break - time.sleep(1) - - if result.poll() is None: - result.terminate() - try: - stdout, stderr = result.communicate(timeout=5) - except subprocess.TimeoutExpired: - result.kill() - stdout, stderr = result.communicate() - else: - stdout, stderr = result.communicate() - - self.assertNotIn('Traceback', stderr) - - self.assertTrue(dns_output.exists(), "dns.jsonl not created") - content = dns_output.read_text().strip() - self.assertTrue(content, "DNS output should not be empty") - - records = [] - for line in content.split('\n'): - line = line.strip() - if not line: - continue - try: - records.append(json.loads(line)) - except json.JSONDecodeError: - pass - - self.assertTrue(records, "No DNS records parsed") - has_ip_record = any(r.get('hostname') and r.get('ip') for r in records) - self.assertTrue(has_ip_record, f"No DNS record with hostname + ip: {records}") - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/dom/config.json 
b/archivebox/plugins/dom/config.json deleted file mode 100644 index 7863e873..00000000 --- a/archivebox/plugins/dom/config.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "DOM_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_DOM", "USE_DOM"], - "description": "Enable DOM capture" - }, - "DOM_TIMEOUT": { - "type": "integer", - "default": 60, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for DOM capture in seconds" - } - } -} diff --git a/archivebox/plugins/dom/on_Snapshot__53_dom.js b/archivebox/plugins/dom/on_Snapshot__53_dom.js deleted file mode 100644 index db8a2420..00000000 --- a/archivebox/plugins/dom/on_Snapshot__53_dom.js +++ /dev/null @@ -1,184 +0,0 @@ -#!/usr/bin/env node -/** - * Dump the DOM of a URL using Chrome/Puppeteer. - * - * Requires a Chrome session (from chrome plugin) and connects to it via CDP. 
- * - * Usage: on_Snapshot__53_dom.js --url= --snapshot-id= - * Output: Writes dom/output.html - * - * Environment variables: - * DOM_ENABLED: Enable DOM extraction (default: true) - */ - -const fs = require('fs'); -const path = require('path'); -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); - -const { - getEnvBool, - parseArgs, - readCdpUrl, -} = require('../chrome/chrome_utils.js'); - -// Check if DOM is enabled BEFORE requiring puppeteer -if (!getEnvBool('DOM_ENABLED', true)) { - console.error('Skipping DOM (DOM_ENABLED=False)'); - // Temporary failure (config disabled) - NO JSONL emission - process.exit(0); -} - -// Now safe to require puppeteer -const puppeteer = require('puppeteer-core'); - -// Extractor metadata -const PLUGIN_NAME = 'dom'; -const OUTPUT_DIR = '.'; -const OUTPUT_FILE = 'output.html'; -const CHROME_SESSION_DIR = '../chrome'; - -// Check if staticfile extractor already downloaded this URL -const STATICFILE_DIR = '../staticfile'; -function hasStaticFileOutput() { - if (!fs.existsSync(STATICFILE_DIR)) return false; - const stdoutPath = path.join(STATICFILE_DIR, 'stdout.log'); - if (!fs.existsSync(stdoutPath)) return false; - const stdout = fs.readFileSync(stdoutPath, 'utf8'); - for (const line of stdout.split('\n')) { - const trimmed = line.trim(); - if (!trimmed.startsWith('{')) continue; - try { - const record = JSON.parse(trimmed); - if (record.type === 'ArchiveResult' && record.status === 'succeeded') { - return true; - } - } catch (e) {} - } - return false; -} - -// Wait for chrome tab to be fully loaded -async function waitForChromeTabLoaded(timeoutMs = 60000) { - const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json'); - const startTime = Date.now(); - - while (Date.now() - startTime < timeoutMs) { - if (fs.existsSync(navigationFile)) { - return true; - } - // Wait 100ms before checking again - await new Promise(resolve => 
setTimeout(resolve, 100)); - } - - return false; -} - -async function dumpDom(url) { - // Output directory is current directory (hook already runs in output dir) - const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); - - let browser = null; - let page = null; - - try { - // Connect to existing Chrome session (required) - const cdpUrl = readCdpUrl(CHROME_SESSION_DIR); - if (!cdpUrl) { - return { success: false, error: 'No Chrome session found (chrome plugin must run first)' }; - } - - browser = await puppeteer.connect({ - browserWSEndpoint: cdpUrl, - defaultViewport: null, - }); - - // Get existing pages or create new one - const pages = await browser.pages(); - page = pages.find(p => p.url().startsWith('http')) || pages[0]; - - if (!page) { - page = await browser.newPage(); - } - - // Get the full DOM content - const domContent = await page.content(); - - if (domContent && domContent.length > 100) { - fs.writeFileSync(outputPath, domContent, 'utf8'); - return { success: true, output: outputPath }; - } else { - return { success: false, error: 'DOM content too short or empty' }; - } - - } catch (e) { - return { success: false, error: `${e.name}: ${e.message}` }; - } finally { - if (browser) { - browser.disconnect(); - } - } -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__53_dom.js --url= --snapshot-id='); - process.exit(1); - } - - try { - // Check if staticfile extractor already handled this (permanent skip) - if (hasStaticFileOutput()) { - console.error(`Skipping DOM - staticfile extractor already downloaded this`); - // Permanent skip - emit ArchiveResult with status='skipped' - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'skipped', - output_str: 'staticfile already handled', - })); - process.exit(0); - } - - const cdpUrl = readCdpUrl(CHROME_SESSION_DIR); - if (!cdpUrl) { - throw new Error('No Chrome 
session found (chrome plugin must run first)'); - } - - // Wait for page to be fully loaded - const pageLoaded = await waitForChromeTabLoaded(60000); - if (!pageLoaded) { - throw new Error('Page not loaded after 60s (chrome_navigate must complete first)'); - } - - const result = await dumpDom(url); - - if (result.success) { - // Success - emit ArchiveResult - const size = fs.statSync(result.output).size; - console.error(`DOM saved (${size} bytes)`); - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'succeeded', - output_str: result.output, - })); - process.exit(0); - } else { - // Transient error - emit NO JSONL - console.error(`ERROR: ${result.error}`); - process.exit(1); - } - } catch (e) { - // Transient error - emit NO JSONL - console.error(`ERROR: ${e.name}: ${e.message}`); - process.exit(1); - } -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/dom/templates/card.html b/archivebox/plugins/dom/templates/card.html deleted file mode 100644 index 88f126df..00000000 --- a/archivebox/plugins/dom/templates/card.html +++ /dev/null @@ -1,8 +0,0 @@ - -
- -
diff --git a/archivebox/plugins/dom/templates/icon.html b/archivebox/plugins/dom/templates/icon.html deleted file mode 100644 index 56efac8d..00000000 --- a/archivebox/plugins/dom/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/dom/tests/test_dom.py b/archivebox/plugins/dom/tests/test_dom.py deleted file mode 100644 index 2d98d873..00000000 --- a/archivebox/plugins/dom/tests/test_dom.py +++ /dev/null @@ -1,185 +0,0 @@ -""" -Integration tests for dom plugin - -Tests verify: -1. Hook script exists -2. Dependencies installed via chrome validation hooks -3. Verify deps with abx-pkg -4. DOM extraction works on https://example.com -5. JSONL output is correct -6. Filesystem output contains actual page content -7. Config options work -""" - -import json -import os -import subprocess -import sys -import tempfile -from pathlib import Path - -import pytest - -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_test_env, - get_plugin_dir, - get_hook_script, - run_hook_and_parse, - LIB_DIR, - NODE_MODULES_DIR, - PLUGINS_ROOT, - chrome_session, -) - - -PLUGIN_DIR = get_plugin_dir(__file__) -DOM_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_dom.*') -NPM_PROVIDER_HOOK = get_hook_script(PLUGINS_ROOT / 'npm', 'on_Binary__install_using_npm_provider.py') -TEST_URL = 'https://example.com' - - -def test_hook_script_exists(): - """Verify on_Snapshot hook exists.""" - assert DOM_HOOK.exists(), f"Hook not found: {DOM_HOOK}" - - -def test_verify_deps_with_abx_pkg(): - """Verify dependencies are available via abx-pkg after hook installation.""" - from abx_pkg import Binary, EnvProvider, BinProviderOverrides - - EnvProvider.model_rebuild() - - # Verify node is available - node_binary = Binary(name='node', binproviders=[EnvProvider()]) - node_loaded = node_binary.load() - assert node_loaded and node_loaded.abspath, "Node.js required for dom plugin" - - -def test_extracts_dom_from_example_com(): - """Test full workflow: extract DOM from 
real example.com via hook.""" - # Prerequisites checked by earlier test - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - with chrome_session(tmpdir, test_url=TEST_URL) as (_process, _pid, snapshot_chrome_dir, env): - dom_dir = snapshot_chrome_dir.parent / 'dom' - dom_dir.mkdir(exist_ok=True) - - # Run DOM extraction hook - result = subprocess.run( - ['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'], - cwd=dom_dir, - capture_output=True, - text=True, - timeout=120, - env=env - ) - - assert result.returncode == 0, f"Extraction failed: {result.stderr}" - - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - # Verify filesystem output (hook writes directly to working dir) - dom_file = dom_dir / 'output.html' - assert dom_file.exists(), f"output.html not created. 
Files: {list(tmpdir.iterdir())}" - - # Verify HTML content contains REAL example.com text - html_content = dom_file.read_text(errors='ignore') - assert len(html_content) > 200, f"HTML content too short: {len(html_content)} bytes" - assert ' tag" - assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML" - assert ('this domain' in html_content.lower() or - 'illustrative examples' in html_content.lower()), \ - "Missing example.com description text" - - -def test_config_save_dom_false_skips(): - """Test that DOM_ENABLED=False exits without emitting JSONL.""" - import os - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - env = os.environ.copy() - env['DOM_ENABLED'] = 'False' - - result = subprocess.run( - ['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}" - - # Feature disabled - temporary failure, should NOT emit JSONL - assert 'Skipping DOM' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr" - - # Should NOT emit any JSONL - jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] - assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}" - - -def test_staticfile_present_skips(): - """Test that dom skips when staticfile already downloaded.""" - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Create directory structure like real ArchiveBox: - # tmpdir/ - # staticfile/ <- staticfile extractor output - # dom/ <- dom extractor runs here, looks for ../staticfile - staticfile_dir = tmpdir / 'staticfile' - staticfile_dir.mkdir() - (staticfile_dir / 'stdout.log').write_text('{"type":"ArchiveResult","status":"succeeded","output_str":"index.html"}\n') - - dom_dir = tmpdir / 'dom' - dom_dir.mkdir() - 
- result = subprocess.run( - ['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=teststatic'], - cwd=dom_dir, # Run from dom subdirectory - capture_output=True, - text=True, - timeout=30 - , - env=get_test_env()) - - assert result.returncode == 0, "Should exit 0 when permanently skipping" - - # Permanent skip - should emit ArchiveResult with status='skipped' - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should emit ArchiveResult JSONL for permanent skip" - assert result_json['status'] == 'skipped', f"Should have status='skipped': {result_json}" - assert 'staticfile' in result_json.get('output_str', '').lower(), "Should mention staticfile in output_str" - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/env/on_Binary__15_env_install.py b/archivebox/plugins/env/on_Binary__15_env_install.py deleted file mode 100644 index 35b3a9ca..00000000 --- a/archivebox/plugins/env/on_Binary__15_env_install.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python3 -""" -Check if a binary is already available in the system PATH. - -This is the simplest "provider" - it doesn't install anything, -it just discovers binaries that are already installed. 
- -Usage: on_Binary__install_using_env_provider.py --binary-id= --machine-id= --name= -Output: Binary JSONL record to stdout if binary found in PATH - -Environment variables: - MACHINE_ID: Machine UUID (set by orchestrator) -""" - -import json -import os -import sys - -import rich_click as click -from abx_pkg import Binary, EnvProvider - - -@click.command() -@click.option('--machine-id', required=True, help="Machine UUID") -@click.option('--binary-id', required=True, help="Dependency UUID") -@click.option('--name', required=True, help="Binary name to find") -@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)") -@click.option('--overrides', default=None, help="JSON-encoded overrides dict (unused)") -def main(binary_id: str, machine_id: str, name: str, binproviders: str, overrides: str | None): - """Check if binary is available in PATH and record it.""" - - # Check if env provider is allowed - if binproviders != '*' and 'env' not in binproviders.split(','): - click.echo(f"env provider not allowed for {name}", err=True) - sys.exit(0) # Not an error, just skip - - # Use abx-pkg EnvProvider to find binary - provider = EnvProvider() - try: - binary = Binary(name=name, binproviders=[provider]).load() - except Exception as e: - click.echo(f"{name} not found in PATH: {e}", err=True) - sys.exit(1) - - if not binary.abspath: - click.echo(f"{name} not found in PATH", err=True) - sys.exit(1) - - machine_id = os.environ.get('MACHINE_ID', '') - - # Output Binary JSONL record to stdout - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'env', - 'machine_id': machine_id, - 'binary_id': binary_id, - } - print(json.dumps(record)) - - # Log human-readable info to stderr - click.echo(f"Found {name} at {binary.abspath}", err=True) - click.echo(f" version: {binary.version}", err=True) - - sys.exit(0) - - -if 
__name__ == '__main__': - main() diff --git a/archivebox/plugins/env/templates/icon.html b/archivebox/plugins/env/templates/icon.html deleted file mode 100644 index e69de29b..00000000 diff --git a/archivebox/plugins/env/tests/test_env_provider.py b/archivebox/plugins/env/tests/test_env_provider.py deleted file mode 100644 index 2bffcfca..00000000 --- a/archivebox/plugins/env/tests/test_env_provider.py +++ /dev/null @@ -1,159 +0,0 @@ -""" -Tests for the env binary provider plugin. - -Tests the real env provider hook with actual system binaries. -""" - -import json -import os -import subprocess -import sys -import tempfile -from pathlib import Path - -import pytest -from django.test import TestCase - - -# Get the path to the env provider hook -PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_env_install.py'), None) - - -class TestEnvProviderHook(TestCase): - """Test the env binary provider hook.""" - - def setUp(self): - """Set up test environment.""" - self.temp_dir = tempfile.mkdtemp() - - def tearDown(self): - """Clean up.""" - import shutil - shutil.rmtree(self.temp_dir, ignore_errors=True) - - def test_hook_script_exists(self): - """Hook script should exist.""" - self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}") - - def test_hook_finds_python(self): - """Hook should find python3 binary in PATH.""" - env = os.environ.copy() - env['DATA_DIR'] = self.temp_dir - - result = subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=python3', - '--binary-id=test-uuid', - '--machine-id=test-machine', - ], - capture_output=True, - text=True, - timeout=30, - env=env - ) - - # Should succeed and output JSONL - self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}") - - # Parse JSONL output - for line in result.stdout.split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'Binary' and 
record.get('name') == 'python3': - self.assertEqual(record['binprovider'], 'env') - self.assertTrue(record['abspath']) - self.assertTrue(Path(record['abspath']).exists()) - return - except json.JSONDecodeError: - continue - - self.fail("No Binary JSONL record found in output") - - def test_hook_finds_bash(self): - """Hook should find bash binary in PATH.""" - env = os.environ.copy() - env['DATA_DIR'] = self.temp_dir - - result = subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=bash', - '--binary-id=test-uuid', - '--machine-id=test-machine', - ], - capture_output=True, - text=True, - timeout=30, - env=env - ) - - # Should succeed and output JSONL - self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}") - - # Parse JSONL output - for line in result.stdout.split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'Binary' and record.get('name') == 'bash': - self.assertEqual(record['binprovider'], 'env') - self.assertTrue(record['abspath']) - return - except json.JSONDecodeError: - continue - - self.fail("No Binary JSONL record found in output") - - def test_hook_fails_for_missing_binary(self): - """Hook should fail for binary not in PATH.""" - env = os.environ.copy() - env['DATA_DIR'] = self.temp_dir - - result = subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=nonexistent_binary_xyz123', - '--binary-id=test-uuid', - '--machine-id=test-machine', - ], - capture_output=True, - text=True, - timeout=30, - env=env - ) - - # Should fail with exit code 1 - self.assertEqual(result.returncode, 1) - self.assertIn('not found', result.stderr.lower()) - - def test_hook_skips_when_env_not_allowed(self): - """Hook should skip when env not in allowed binproviders.""" - env = os.environ.copy() - env['DATA_DIR'] = self.temp_dir - - result = subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=python3', - '--binary-id=test-uuid', - 
'--machine-id=test-machine', - '--binproviders=pip,apt', # env not allowed - ], - capture_output=True, - text=True, - timeout=30, - env=env - ) - - # Should exit cleanly (code 0) when env not allowed - self.assertEqual(result.returncode, 0) - self.assertIn('env provider not allowed', result.stderr) - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/favicon/config.json b/archivebox/plugins/favicon/config.json deleted file mode 100644 index 4c67e18f..00000000 --- a/archivebox/plugins/favicon/config.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "properties": { - "FAVICON_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_FAVICON", "USE_FAVICON"], - "description": "Enable favicon downloading" - }, - "FAVICON_TIMEOUT": { - "type": "integer", - "default": 30, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for favicon fetch in seconds" - }, - "FAVICON_USER_AGENT": { - "type": "string", - "default": "", - "x-fallback": "USER_AGENT", - "description": "User agent string" - } - } -} diff --git a/archivebox/plugins/favicon/on_Snapshot__11_favicon.bg.py b/archivebox/plugins/favicon/on_Snapshot__11_favicon.bg.py deleted file mode 100644 index fc4604f4..00000000 --- a/archivebox/plugins/favicon/on_Snapshot__11_favicon.bg.py +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/env python3 -""" -Extract favicon from a URL. - -Usage: on_Snapshot__favicon.bg.py --url= --snapshot-id= -Output: Writes favicon.ico to $PWD - -Environment variables: - FAVICON_TIMEOUT: Timeout in seconds (default: 30) - USER_AGENT: User agent string - - # Fallback to ARCHIVING_CONFIG values if FAVICON_* not set: - TIMEOUT: Fallback timeout - -Note: This extractor uses the 'requests' library which is bundled with ArchiveBox. 
- It can run standalone if requests is installed: pip install requests -""" - -import json -import os -import re -import sys -from pathlib import Path -from urllib.parse import urljoin, urlparse - -import rich_click as click - - -# Extractor metadata -PLUGIN_NAME = 'favicon' -OUTPUT_DIR = '.' -OUTPUT_FILE = 'favicon.ico' - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - - -def get_env_int(name: str, default: int = 0) -> int: - try: - return int(get_env(name, str(default))) - except ValueError: - return default - - -def get_favicon(url: str) -> tuple[bool, str | None, str]: - """ - Fetch favicon from URL. - - Returns: (success, output_path, error_message) - """ - try: - import requests - except ImportError: - return False, None, 'requests library not installed' - - timeout = get_env_int('FAVICON_TIMEOUT') or get_env_int('TIMEOUT', 30) - user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)') - headers = {'User-Agent': user_agent} - - # Build list of possible favicon URLs - parsed = urlparse(url) - base_url = f"{parsed.scheme}://{parsed.netloc}" - - favicon_urls = [ - urljoin(base_url, '/favicon.ico'), - urljoin(base_url, '/favicon.png'), - urljoin(base_url, '/apple-touch-icon.png'), - ] - - # Try to extract favicon URL from HTML link tags - try: - response = requests.get(url, timeout=timeout, headers=headers) - if response.ok: - # Look for - for match in re.finditer( - r']+rel=["\'](?:shortcut )?icon["\'][^>]+href=["\']([^"\']+)["\']', - response.text, - re.I - ): - favicon_urls.insert(0, urljoin(url, match.group(1))) - - # Also check reverse order: href before rel - for match in re.finditer( - r']+href=["\']([^"\']+)["\'][^>]+rel=["\'](?:shortcut )?icon["\']', - response.text, - re.I - ): - favicon_urls.insert(0, urljoin(url, match.group(1))) - except Exception: - pass # Continue with default favicon URLs - - # Try each URL until we find one that works - for favicon_url in favicon_urls: - 
try: - response = requests.get(favicon_url, timeout=15, headers=headers) - if response.ok and len(response.content) > 0: - Path(OUTPUT_FILE).write_bytes(response.content) - return True, OUTPUT_FILE, '' - except Exception: - continue - - # Try Google's favicon service as fallback - try: - google_url = f'https://www.google.com/s2/favicons?domain={parsed.netloc}' - response = requests.get(google_url, timeout=15, headers=headers) - if response.ok and len(response.content) > 0: - Path(OUTPUT_FILE).write_bytes(response.content) - return True, OUTPUT_FILE, '' - except Exception: - pass - - return False, None, 'No favicon found' - - -@click.command() -@click.option('--url', required=True, help='URL to extract favicon from') -@click.option('--snapshot-id', required=True, help='Snapshot UUID') -def main(url: str, snapshot_id: str): - """Extract favicon from a URL.""" - - output = None - status = 'failed' - error = '' - - try: - # Run extraction - success, output, error = get_favicon(url) - if success: - status = 'succeeded' - else: - status = 'failed' - - except Exception as e: - error = f'{type(e).__name__}: {e}' - status = 'failed' - - if error: - print(f'ERROR: {error}', file=sys.stderr) - - # Output clean JSONL (no RESULT_JSON= prefix) - result = { - 'type': 'ArchiveResult', - 'status': status, - 'output_str': output or error or '', - } - print(json.dumps(result)) - - sys.exit(0 if status == 'succeeded' else 1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/favicon/templates/card.html b/archivebox/plugins/favicon/templates/card.html deleted file mode 100644 index c5df1617..00000000 --- a/archivebox/plugins/favicon/templates/card.html +++ /dev/null @@ -1,9 +0,0 @@ - -
- {% if output_path %} - Favicon - {% endif %} -
diff --git a/archivebox/plugins/favicon/templates/icon.html b/archivebox/plugins/favicon/templates/icon.html deleted file mode 100644 index 7ba648b3..00000000 --- a/archivebox/plugins/favicon/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/favicon/tests/test_favicon.py b/archivebox/plugins/favicon/tests/test_favicon.py deleted file mode 100644 index 4434d1a8..00000000 --- a/archivebox/plugins/favicon/tests/test_favicon.py +++ /dev/null @@ -1,293 +0,0 @@ -""" -Integration tests for favicon plugin - -Tests verify: -1. Plugin script exists -2. requests library is available -3. Favicon extraction works for real example.com -4. Output file is actual image data -5. Tries multiple favicon URLs -6. Falls back to Google's favicon service -7. Config options work (TIMEOUT, USER_AGENT) -8. Handles failures gracefully -""" - -import json -import subprocess -import sys -import tempfile -from pathlib import Path - -import pytest - -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_plugin_dir, - get_hook_script, - parse_jsonl_output, -) - - -PLUGIN_DIR = get_plugin_dir(__file__) -FAVICON_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_favicon.*') -TEST_URL = 'https://example.com' - - -def test_hook_script_exists(): - """Verify hook script exists.""" - assert FAVICON_HOOK.exists(), f"Hook script not found: {FAVICON_HOOK}" - - -def test_requests_library_available(): - """Test that requests library is available.""" - result = subprocess.run( - [sys.executable, '-c', 'import requests; print(requests.__version__)'], - capture_output=True, - text=True - ) - - if result.returncode != 0: - pass - - assert len(result.stdout.strip()) > 0, "Should report requests version" - - -def test_extracts_favicon_from_example_com(): - """Test full workflow: extract favicon from real example.com. 
- - Note: example.com doesn't have a favicon and Google's service may also fail, - so we test that the extraction completes and reports appropriate status. - """ - - # Check requests is available - check_result = subprocess.run( - [sys.executable, '-c', 'import requests'], - capture_output=True - ) - if check_result.returncode != 0: - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Run favicon extraction - result = subprocess.run( - [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - ) - - # May succeed (if Google service works) or fail (if no favicon) - assert result.returncode in (0, 1), "Should complete extraction attempt" - - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - - # If it succeeded, verify the favicon file - if result_json['status'] == 'succeeded': - favicon_file = tmpdir / 'favicon.ico' - assert favicon_file.exists(), "favicon.ico not created" - - # Verify file is not empty and contains actual image data - file_size = favicon_file.stat().st_size - assert file_size > 0, "Favicon file should not be empty" - assert file_size < 1024 * 1024, f"Favicon file suspiciously large: {file_size} bytes" - - # Check for common image magic bytes - favicon_data = favicon_file.read_bytes() - # ICO, PNG, GIF, JPEG, or WebP - is_image = ( - favicon_data[:4] == b'\x00\x00\x01\x00' or # ICO - favicon_data[:8] == b'\x89PNG\r\n\x1a\n' or # PNG - favicon_data[:3] == b'GIF' or # GIF - favicon_data[:2] == b'\xff\xd8' or # JPEG - favicon_data[8:12] == b'WEBP' # WebP - ) - assert is_image, "Favicon file should be a valid image 
format" - else: - # Failed as expected - assert result_json['status'] == 'failed', f"Should report failure: {result_json}" - - -def test_config_timeout_honored(): - """Test that TIMEOUT config is respected.""" - - check_result = subprocess.run( - [sys.executable, '-c', 'import requests'], - capture_output=True - ) - if check_result.returncode != 0: - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Set very short timeout (but example.com should still succeed) - import os - env = os.environ.copy() - env['TIMEOUT'] = '5' - - result = subprocess.run( - [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - # Should complete (success or fail, but not hang) - assert result.returncode in (0, 1), "Should complete without hanging" - - -def test_config_user_agent(): - """Test that USER_AGENT config is used.""" - - check_result = subprocess.run( - [sys.executable, '-c', 'import requests'], - capture_output=True - ) - if check_result.returncode != 0: - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Set custom user agent - import os - env = os.environ.copy() - env['USER_AGENT'] = 'TestBot/1.0' - - result = subprocess.run( - [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=60 - ) - - # Should succeed (example.com doesn't block) - if result.returncode == 0: - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - if result_json: - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - -def test_handles_https_urls(): - 
"""Test that HTTPS URLs work correctly.""" - - check_result = subprocess.run( - [sys.executable, '-c', 'import requests'], - capture_output=True - ) - if check_result.returncode != 0: - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - result = subprocess.run( - [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.org', '--snapshot-id', 'testhttps'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - ) - - if result.returncode == 0: - favicon_file = tmpdir / 'favicon.ico' - if favicon_file.exists(): - assert favicon_file.stat().st_size > 0 - - -def test_handles_missing_favicon_gracefully(): - """Test that favicon plugin handles sites without favicons gracefully. - - Note: The plugin falls back to Google's favicon service, which generates - a generic icon even if the site doesn't have one, so extraction usually succeeds. - """ - - check_result = subprocess.run( - [sys.executable, '-c', 'import requests'], - capture_output=True - ) - if check_result.returncode != 0: - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Try a URL that likely doesn't have a favicon - result = subprocess.run( - [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.com/nonexistent', '--snapshot-id', 'test404'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - ) - - # May succeed (Google fallback) or fail gracefully - assert result.returncode in (0, 1), "Should complete (may succeed or fail)" - - if result.returncode != 0: - combined = result.stdout + result.stderr - assert 'No favicon found' in combined or 'ERROR=' in combined - - -def test_reports_missing_requests_library(): - """Test that script reports error when requests library is missing.""" - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Run with PYTHONPATH cleared to simulate missing requests - import os - env = os.environ.copy() - # Keep only minimal PATH, clear PYTHONPATH - 
env['PYTHONPATH'] = '/nonexistent' - - result = subprocess.run( - [sys.executable, '-S', str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env - ) - - # Should fail and report missing requests - if result.returncode != 0: - combined = result.stdout + result.stderr - # May report missing requests or other import errors - assert 'requests' in combined.lower() or 'import' in combined.lower() or 'ERROR=' in combined - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/forumdl/config.json b/archivebox/plugins/forumdl/config.json deleted file mode 100644 index 9e9ea10a..00000000 --- a/archivebox/plugins/forumdl/config.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "properties": { - "FORUMDL_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_FORUMDL", "USE_FORUMDL"], - "description": "Enable forum downloading with forum-dl" - }, - "FORUMDL_BINARY": { - "type": "string", - "default": "forum-dl", - "description": "Path to forum-dl binary" - }, - "FORUMDL_TIMEOUT": { - "type": "integer", - "default": 3600, - "minimum": 30, - "x-fallback": "TIMEOUT", - "description": "Timeout for forum downloads in seconds" - }, - "FORUMDL_OUTPUT_FORMAT": { - "type": "string", - "default": "jsonl", - "enum": ["jsonl", "warc", "mbox", "maildir", "mh", "mmdf", "babyl"], - "description": "Output format for forum downloads" - }, - "FORUMDL_CHECK_SSL_VALIDITY": { - "type": "boolean", - "default": true, - "x-fallback": "CHECK_SSL_VALIDITY", - "description": "Whether to verify SSL certificates" - }, - "FORUMDL_ARGS": { - "type": "array", - "items": {"type": "string"}, - "default": [], - "x-aliases": ["FORUMDL_DEFAULT_ARGS"], - "description": "Default forum-dl arguments" - }, - "FORUMDL_ARGS_EXTRA": { - "type": "array", - "items": {"type": "string"}, - "default": 
[], - "x-aliases": ["FORUMDL_EXTRA_ARGS"], - "description": "Extra arguments to append to forum-dl command" - } - } -} diff --git a/archivebox/plugins/forumdl/forum-dl-wrapper.py b/archivebox/plugins/forumdl/forum-dl-wrapper.py deleted file mode 100755 index 2b53ca99..00000000 --- a/archivebox/plugins/forumdl/forum-dl-wrapper.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python3 -""" -Wrapper for forum-dl that applies Pydantic v2 compatibility patches. - -This wrapper fixes forum-dl 0.3.0's incompatibility with Pydantic v2 by monkey-patching -the JsonlWriter class to use model_dump_json() instead of the deprecated json(models_as_dict=False). -""" - -import sys - -# Apply Pydantic v2 compatibility patch BEFORE importing forum_dl -try: - from forum_dl.writers.jsonl import JsonlWriter - from pydantic import BaseModel - - # Check if we're using Pydantic v2 - if hasattr(BaseModel, 'model_dump_json'): - def _patched_serialize_entry(self, entry): - """Use Pydantic v2's model_dump_json() instead of deprecated json(models_as_dict=False)""" - return entry.model_dump_json() - - JsonlWriter._serialize_entry = _patched_serialize_entry -except (ImportError, AttributeError): - # forum-dl not installed or already compatible - no patch needed - pass - -# Now import and run forum-dl's main function -from forum_dl import main - -if __name__ == '__main__': - sys.exit(main()) diff --git a/archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py b/archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py deleted file mode 100755 index b30ca715..00000000 --- a/archivebox/plugins/forumdl/on_Crawl__25_forumdl_install.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python3 -""" -Emit forum-dl Binary dependency for the crawl. 
-""" - -import json -import os -import sys - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def output_binary(name: str, binproviders: str, overrides: dict | None = None): - """Output Binary JSONL record for a dependency.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, - 'machine_id': machine_id, - } - if overrides: - record['overrides'] = overrides - print(json.dumps(record)) - - -def main(): - forumdl_enabled = get_env_bool('FORUMDL_ENABLED', True) - - if not forumdl_enabled: - sys.exit(0) - - output_binary( - name='forum-dl', - binproviders='pip,env', - overrides={ - 'pip': { - 'packages': [ - '--no-deps', - '--prefer-binary', - 'forum-dl', - 'chardet==5.2.0', - 'pydantic', - 'pydantic-core', - 'typing-extensions', - 'annotated-types', - 'typing-inspection', - 'beautifulsoup4', - 'soupsieve', - 'lxml', - 'requests', - 'urllib3', - 'certifi', - 'idna', - 'charset-normalizer', - 'tenacity', - 'python-dateutil', - 'six', - 'html2text', - 'warcio', - ] - } - }, - ) - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/forumdl/on_Snapshot__04_forumdl.bg.py b/archivebox/plugins/forumdl/on_Snapshot__04_forumdl.bg.py deleted file mode 100755 index d19e7e16..00000000 --- a/archivebox/plugins/forumdl/on_Snapshot__04_forumdl.bg.py +++ /dev/null @@ -1,266 +0,0 @@ -#!/usr/bin/env python3 -""" -Download forum content from a URL using forum-dl. 
- -Usage: on_Snapshot__04_forumdl.bg.py --url= --snapshot-id= -Output: Downloads forum content to $PWD/ - -Environment variables: - FORUMDL_ENABLED: Enable forum downloading (default: True) - FORUMDL_BINARY: Path to forum-dl binary (default: forum-dl) - FORUMDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) - FORUMDL_OUTPUT_FORMAT: Output format (default: jsonl) - FORUMDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY) - FORUMDL_ARGS: Default forum-dl arguments (JSON array) - FORUMDL_ARGS_EXTRA: Extra arguments to append (JSON array) -""" - -import json -import os -import shutil -import subprocess -import sys -import threading -from pathlib import Path - -import rich_click as click - - -# Monkey patch forum-dl for Pydantic v2 compatibility -# forum-dl 0.3.0 uses deprecated json(models_as_dict=False) which doesn't work in Pydantic v2 -try: - from forum_dl.writers.jsonl import JsonlWriter - from pydantic import BaseModel - - # Check if we're using Pydantic v2 (has model_dump_json) - if hasattr(BaseModel, 'model_dump_json'): - # Patch JsonlWriter to use Pydantic v2 API - original_serialize = JsonlWriter._serialize_entry - - def _patched_serialize_entry(self, entry): - # Use Pydantic v2's model_dump_json() instead of deprecated json(models_as_dict=False) - return entry.model_dump_json() - - JsonlWriter._serialize_entry = _patched_serialize_entry -except (ImportError, AttributeError): - # forum-dl not installed or already compatible - pass - - -# Extractor metadata -PLUGIN_NAME = 'forumdl' -BIN_NAME = 'forum-dl' -BIN_PROVIDERS = 'pip,env' -OUTPUT_DIR = '.' 
- - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def get_env_int(name: str, default: int = 0) -> int: - try: - return int(get_env(name, str(default))) - except ValueError: - return default - - -def get_env_array(name: str, default: list[str] | None = None) -> list[str]: - """Parse a JSON array from environment variable.""" - val = get_env(name, '') - if not val: - return default if default is not None else [] - try: - result = json.loads(val) - if isinstance(result, list): - return [str(item) for item in result] - return default if default is not None else [] - except json.JSONDecodeError: - return default if default is not None else [] - - -def get_binary_shebang(binary_path: str) -> str | None: - """Return interpreter from shebang line if present (e.g., /path/to/python).""" - try: - with open(binary_path, 'r', encoding='utf-8') as f: - first_line = f.readline().strip() - if first_line.startswith('#!'): - return first_line[2:].strip().split(' ')[0] - except Exception: - pass - return None - - -def resolve_binary_path(binary: str) -> str | None: - """Resolve binary to an absolute path if possible.""" - if not binary: - return None - if Path(binary).is_file(): - return binary - return shutil.which(binary) - - - -def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]: - """ - Download forum using forum-dl. 
- - Returns: (success, output_path, error_message) - """ - # Get config from env (with FORUMDL_ prefix, x-fallback handled by config loader) - timeout = get_env_int('FORUMDL_TIMEOUT') or get_env_int('TIMEOUT', 3600) - check_ssl = get_env_bool('FORUMDL_CHECK_SSL_VALIDITY', True) if get_env('FORUMDL_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True) - forumdl_args = get_env_array('FORUMDL_ARGS', []) - forumdl_args_extra = get_env_array('FORUMDL_ARGS_EXTRA', []) - output_format = get_env('FORUMDL_OUTPUT_FORMAT', 'jsonl') - - # Output directory is current directory (hook already runs in output dir) - output_dir = Path(OUTPUT_DIR) - - # Build output filename based on format - if output_format == 'warc': - output_file = output_dir / 'forum.warc.gz' - elif output_format == 'jsonl': - output_file = output_dir / 'forum.jsonl' - elif output_format == 'maildir': - output_file = output_dir / 'forum' # maildir is a directory - elif output_format in ('mbox', 'mh', 'mmdf', 'babyl'): - output_file = output_dir / f'forum.{output_format}' - else: - output_file = output_dir / f'forum.{output_format}' - - # Use our Pydantic v2 compatible wrapper if available, otherwise fall back to binary - wrapper_path = Path(__file__).parent / 'forum-dl-wrapper.py' - resolved_binary = resolve_binary_path(binary) or binary - if wrapper_path.exists(): - forumdl_python = get_binary_shebang(resolved_binary) or sys.executable - cmd = [forumdl_python, str(wrapper_path), *forumdl_args, '-f', output_format, '-o', str(output_file)] - else: - cmd = [resolved_binary, *forumdl_args, '-f', output_format, '-o', str(output_file)] - - if not check_ssl: - cmd.append('--no-check-certificate') - - if forumdl_args_extra: - cmd.extend(forumdl_args_extra) - - cmd.append(url) - - try: - print(f'[forumdl] Starting download (timeout={timeout}s)', file=sys.stderr) - output_lines: list[str] = [] - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, 
- ) - - def _read_output() -> None: - if not process.stdout: - return - for line in process.stdout: - output_lines.append(line) - sys.stderr.write(line) - - reader = threading.Thread(target=_read_output, daemon=True) - reader.start() - - try: - process.wait(timeout=timeout) - except subprocess.TimeoutExpired: - process.kill() - reader.join(timeout=1) - return False, None, f'Timed out after {timeout} seconds' - - reader.join(timeout=1) - combined_output = ''.join(output_lines) - - # Check if output file was created - if output_file.exists() and output_file.stat().st_size > 0: - return True, str(output_file), '' - else: - stderr = combined_output - - # These are NOT errors - page simply has no downloadable forum content - stderr_lower = stderr.lower() - if 'unsupported url' in stderr_lower: - return True, None, '' # Not a forum site - success, no output - if 'no content' in stderr_lower: - return True, None, '' # No forum found - success, no output - if 'extractornotfounderror' in stderr_lower: - return True, None, '' # No forum extractor for this URL - success, no output - if process.returncode == 0: - return True, None, '' # forum-dl exited cleanly, just no forum - success - - # These ARE errors - something went wrong - if '404' in stderr: - return False, None, '404 Not Found' - if '403' in stderr: - return False, None, '403 Forbidden' - if 'unable to extract' in stderr_lower: - return False, None, 'Unable to extract forum info' - - return False, None, f'forum-dl error: {stderr}' - - except subprocess.TimeoutExpired: - return False, None, f'Timed out after {timeout} seconds' - except Exception as e: - return False, None, f'{type(e).__name__}: {e}' - - -@click.command() -@click.option('--url', required=True, help='URL to download forum from') -@click.option('--snapshot-id', required=True, help='Snapshot UUID') -def main(url: str, snapshot_id: str): - """Download forum content from a URL using forum-dl.""" - - output = None - status = 'failed' - error = '' - - try: - 
# Check if forum-dl is enabled - if not get_env_bool('FORUMDL_ENABLED', True): - print('Skipping forum-dl (FORUMDL_ENABLED=False)', file=sys.stderr) - # Temporary failure (config disabled) - NO JSONL emission - sys.exit(0) - - # Get binary from environment - binary = get_env('FORUMDL_BINARY', 'forum-dl') - - # Run extraction - success, output, error = save_forum(url, binary) - - if success: - # Success - emit ArchiveResult - result = { - 'type': 'ArchiveResult', - 'status': 'succeeded', - 'output_str': output or '' - } - print(json.dumps(result)) - sys.exit(0) - else: - # Transient error - emit NO JSONL - print(f'ERROR: {error}', file=sys.stderr) - sys.exit(1) - - except Exception as e: - # Transient error - emit NO JSONL - print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/forumdl/templates/card.html b/archivebox/plugins/forumdl/templates/card.html deleted file mode 100644 index 24000949..00000000 --- a/archivebox/plugins/forumdl/templates/card.html +++ /dev/null @@ -1,7 +0,0 @@ - -
-
- 💬 - Forum -
-
diff --git a/archivebox/plugins/forumdl/templates/full.html b/archivebox/plugins/forumdl/templates/full.html deleted file mode 100644 index 85413866..00000000 --- a/archivebox/plugins/forumdl/templates/full.html +++ /dev/null @@ -1,147 +0,0 @@ - - - - - - - Forum Thread - - - -
-
💬
-

Forum Thread

-
-
-
Loading posts...
-
- - - diff --git a/archivebox/plugins/forumdl/templates/icon.html b/archivebox/plugins/forumdl/templates/icon.html deleted file mode 100644 index 01cace0d..00000000 --- a/archivebox/plugins/forumdl/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/forumdl/tests/test_forumdl.py b/archivebox/plugins/forumdl/tests/test_forumdl.py deleted file mode 100644 index 18a692c9..00000000 --- a/archivebox/plugins/forumdl/tests/test_forumdl.py +++ /dev/null @@ -1,317 +0,0 @@ -""" -Integration tests for forumdl plugin - -Tests verify: - pass -1. Hook script exists -2. Dependencies installed via validation hooks -3. Verify deps with abx-pkg -4. Forum extraction works on forum URLs -5. JSONL output is correct -6. Config options work -7. Handles non-forum URLs gracefully -""" - -import json -import os -import subprocess -import sys -import tempfile -import time -import uuid -from pathlib import Path -import pytest - -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -FORUMDL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_forumdl.*'), None) -TEST_URL = 'https://example.com' - -# Module-level cache for binary path -_forumdl_binary_path = None -_forumdl_lib_root = None - -def get_forumdl_binary_path(): - """Get the installed forum-dl binary path from cache or by running installation.""" - global _forumdl_binary_path - if _forumdl_binary_path: - return _forumdl_binary_path - - # Try to find forum-dl binary using abx-pkg - from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides - - try: - binary = Binary( - name='forum-dl', - binproviders=[PipProvider(), EnvProvider()] - ).load() - - if binary and binary.abspath: - _forumdl_binary_path = str(binary.abspath) - return _forumdl_binary_path - except Exception: - pass - - # If not found, try to install via pip using the crawl hook overrides - pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__11_pip_install.py' - crawl_hook = PLUGIN_DIR / 'on_Crawl__25_forumdl_install.py' - 
if pip_hook.exists(): - binary_id = str(uuid.uuid4()) - machine_id = str(uuid.uuid4()) - overrides = None - - if crawl_hook.exists(): - crawl_result = subprocess.run( - [sys.executable, str(crawl_hook)], - capture_output=True, - text=True, - timeout=30, - ) - for crawl_line in crawl_result.stdout.strip().split('\n'): - if crawl_line.strip().startswith('{'): - try: - crawl_record = json.loads(crawl_line) - if crawl_record.get('type') == 'Binary' and crawl_record.get('name') == 'forum-dl': - overrides = crawl_record.get('overrides') - break - except json.JSONDecodeError: - continue - - # Create a persistent temp LIB_DIR for the pip provider - import platform - global _forumdl_lib_root - if not _forumdl_lib_root: - _forumdl_lib_root = tempfile.mkdtemp(prefix='forumdl-lib-') - machine = platform.machine().lower() - system = platform.system().lower() - if machine in ('arm64', 'aarch64'): - machine = 'arm64' - elif machine in ('x86_64', 'amd64'): - machine = 'x86_64' - machine_type = f"{machine}-{system}" - lib_dir = Path(_forumdl_lib_root) / 'lib' / machine_type - lib_dir.mkdir(parents=True, exist_ok=True) - env = os.environ.copy() - env['LIB_DIR'] = str(lib_dir) - env['DATA_DIR'] = str(Path(_forumdl_lib_root) / 'data') - - cmd = [ - sys.executable, str(pip_hook), - '--binary-id', binary_id, - '--machine-id', machine_id, - '--name', 'forum-dl' - ] - if overrides: - cmd.append(f'--overrides={json.dumps(overrides)}') - - install_result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=300, - env=env, - ) - - # Parse Binary from pip installation - for install_line in install_result.stdout.strip().split('\n'): - if install_line.strip(): - try: - install_record = json.loads(install_line) - if install_record.get('type') == 'Binary' and install_record.get('name') == 'forum-dl': - _forumdl_binary_path = install_record.get('abspath') - return _forumdl_binary_path - except json.JSONDecodeError: - pass - - return None - - -def test_hook_script_exists(): - 
"""Verify on_Snapshot hook exists.""" - assert FORUMDL_HOOK.exists(), f"Hook not found: {FORUMDL_HOOK}" - - -def test_verify_deps_with_abx_pkg(): - """Verify forum-dl is installed by calling the REAL installation hooks.""" - binary_path = get_forumdl_binary_path() - if not binary_path: - assert False, ( - "forum-dl installation failed. Install hook should install forum-dl automatically. " - "Note: forum-dl has a dependency on cchardet which may not compile on Python 3.14+ " - "due to removed longintrepr.h header." - ) - assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}" - - -def test_handles_non_forum_url(): - """Test that forum-dl extractor handles non-forum URLs gracefully via hook.""" - import os - - binary_path = get_forumdl_binary_path() - if not binary_path: - pass - assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}" - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - env = os.environ.copy() - env['FORUMDL_BINARY'] = binary_path - - # Run forum-dl extraction hook on non-forum URL - result = subprocess.run( - [sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=60 - ) - - # Should exit 0 even for non-forum URL (graceful handling) - assert result.returncode == 0, f"Should handle non-forum URL gracefully: {result.stderr}" - - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed even for non-forum URL: {result_json}" - - -def test_config_save_forumdl_false_skips(): - """Test that 
FORUMDL_ENABLED=False exits without emitting JSONL.""" - import os - - with tempfile.TemporaryDirectory() as tmpdir: - env = os.environ.copy() - env['FORUMDL_ENABLED'] = 'False' - - result = subprocess.run( - [sys.executable, str(FORUMDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}" - - # Feature disabled - temporary failure, should NOT emit JSONL - assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr" - - # Should NOT emit any JSONL - jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] - assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}" - - -def test_config_timeout(): - """Test that FORUMDL_TIMEOUT config is respected.""" - import os - - binary_path = get_forumdl_binary_path() - if not binary_path: - pass - assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}" - - with tempfile.TemporaryDirectory() as tmpdir: - env = os.environ.copy() - env['FORUMDL_BINARY'] = binary_path - env['FORUMDL_TIMEOUT'] = '5' - - start_time = time.time() - result = subprocess.run( - [sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=10 # Should complete in 5s, use 10s as safety margin - ) - elapsed_time = time.time() - start_time - - assert result.returncode == 0, f"Should complete without hanging: {result.stderr}" - # Allow 1 second overhead for subprocess startup and Python interpreter - assert elapsed_time <= 6.0, f"Should complete within 6 seconds (5s timeout + 1s overhead), took {elapsed_time:.2f}s" - - -def test_real_forum_url(): - """Test that forum-dl extracts content from a real HackerNews thread with jsonl output. 
- - Uses our Pydantic v2 compatible wrapper to fix forum-dl 0.3.0's incompatibility. - """ - import os - - binary_path = get_forumdl_binary_path() - assert binary_path, "forum-dl binary not available" - assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}" - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Use HackerNews - one of the most reliable forum-dl extractors - forum_url = 'https://news.ycombinator.com/item?id=1' - - env = os.environ.copy() - env['FORUMDL_BINARY'] = binary_path - env['FORUMDL_TIMEOUT'] = '60' - env['FORUMDL_OUTPUT_FORMAT'] = 'jsonl' # Use jsonl format - # HTML output could be added via: env['FORUMDL_ARGS_EXTRA'] = json.dumps(['--files-output', './files']) - - start_time = time.time() - result = subprocess.run( - [sys.executable, str(FORUMDL_HOOK), '--url', forum_url, '--snapshot-id', 'testforum'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=90 - ) - elapsed_time = time.time() - start_time - - # Should succeed with our Pydantic v2 wrapper - assert result.returncode == 0, f"Should extract forum successfully: {result.stderr}" - - # Parse JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - # Check that forum files were downloaded - output_files = list(tmpdir.glob('**/*')) - forum_files = [f for f in output_files if f.is_file()] - - assert len(forum_files) > 0, f"Should have downloaded at least one forum file. 
Files: {output_files}" - - # Verify the JSONL file has content - jsonl_file = tmpdir / 'forum.jsonl' - assert jsonl_file.exists(), "Should have created forum.jsonl" - assert jsonl_file.stat().st_size > 0, "forum.jsonl should not be empty" - - print(f"Successfully extracted {len(forum_files)} file(s) in {elapsed_time:.2f}s") - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/gallerydl/config.json b/archivebox/plugins/gallerydl/config.json deleted file mode 100644 index 522a4b22..00000000 --- a/archivebox/plugins/gallerydl/config.json +++ /dev/null @@ -1,54 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "properties": { - "GALLERYDL_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_GALLERYDL", "USE_GALLERYDL"], - "description": "Enable gallery downloading with gallery-dl" - }, - "GALLERYDL_BINARY": { - "type": "string", - "default": "gallery-dl", - "description": "Path to gallery-dl binary" - }, - "GALLERYDL_TIMEOUT": { - "type": "integer", - "default": 3600, - "minimum": 30, - "x-fallback": "TIMEOUT", - "description": "Timeout for gallery downloads in seconds" - }, - "GALLERYDL_COOKIES_FILE": { - "type": "string", - "default": "", - "x-fallback": "COOKIES_FILE", - "description": "Path to cookies file" - }, - "GALLERYDL_CHECK_SSL_VALIDITY": { - "type": "boolean", - "default": true, - "x-fallback": "CHECK_SSL_VALIDITY", - "description": "Whether to verify SSL certificates" - }, - "GALLERYDL_ARGS": { - "type": "array", - "items": {"type": "string"}, - "default": [ - "--write-metadata", - "--write-info-json" - ], - "x-aliases": ["GALLERYDL_DEFAULT_ARGS"], - "description": "Default gallery-dl arguments" - }, - "GALLERYDL_ARGS_EXTRA": { - "type": "array", - "items": {"type": "string"}, - "default": [], - "x-aliases": ["GALLERYDL_EXTRA_ARGS"], - "description": "Extra arguments to append to gallery-dl command" - } - } -} diff --git 
a/archivebox/plugins/gallerydl/on_Crawl__20_gallerydl_install.py b/archivebox/plugins/gallerydl/on_Crawl__20_gallerydl_install.py deleted file mode 100755 index 06d95f4d..00000000 --- a/archivebox/plugins/gallerydl/on_Crawl__20_gallerydl_install.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -""" -Emit gallery-dl Binary dependency for the crawl. -""" - -import json -import os -import sys - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def output_binary(name: str, binproviders: str): - """Output Binary JSONL record for a dependency.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def main(): - gallerydl_enabled = get_env_bool('GALLERYDL_ENABLED', True) - - if not gallerydl_enabled: - sys.exit(0) - - output_binary(name='gallery-dl', binproviders='pip,brew,apt,env') - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/gallerydl/on_Snapshot__03_gallerydl.bg.py b/archivebox/plugins/gallerydl/on_Snapshot__03_gallerydl.bg.py deleted file mode 100755 index fc5d951c..00000000 --- a/archivebox/plugins/gallerydl/on_Snapshot__03_gallerydl.bg.py +++ /dev/null @@ -1,261 +0,0 @@ -#!/usr/bin/env python3 -""" -Download image galleries from a URL using gallery-dl. 
- -Usage: on_Snapshot__03_gallerydl.bg.py --url= --snapshot-id= -Output: Downloads gallery images to $PWD/gallerydl/ - -Environment variables: - GALLERYDL_ENABLED: Enable gallery-dl gallery extraction (default: True) - GALLERYDL_BINARY: Path to gallery-dl binary (default: gallery-dl) - GALLERYDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) - GALLERYDL_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE) - GALLERYDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY) - GALLERYDL_ARGS: Default gallery-dl arguments (JSON array) - GALLERYDL_ARGS_EXTRA: Extra arguments to append (JSON array) -""" - -import json -import os -import subprocess -import sys -import threading -from pathlib import Path - -import rich_click as click - - -# Extractor metadata -PLUGIN_NAME = 'gallerydl' -BIN_NAME = 'gallery-dl' -BIN_PROVIDERS = 'pip,env' -OUTPUT_DIR = '.' - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def get_env_int(name: str, default: int = 0) -> int: - try: - return int(get_env(name, str(default))) - except ValueError: - return default - - -def get_env_array(name: str, default: list[str] | None = None) -> list[str]: - """Parse a JSON array from environment variable.""" - val = get_env(name, '') - if not val: - return default if default is not None else [] - try: - result = json.loads(val) - if isinstance(result, list): - return [str(item) for item in result] - return default if default is not None else [] - except json.JSONDecodeError: - return default if default is not None else [] - - -STATICFILE_DIR = '../staticfile' - -def has_staticfile_output() -> bool: - """Check if staticfile extractor already downloaded this URL.""" - staticfile_dir = 
Path(STATICFILE_DIR) - if not staticfile_dir.exists(): - return False - stdout_log = staticfile_dir / 'stdout.log' - if not stdout_log.exists(): - return False - for line in stdout_log.read_text(errors='ignore').splitlines(): - line = line.strip() - if not line.startswith('{'): - continue - try: - record = json.loads(line) - except json.JSONDecodeError: - continue - if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded': - return True - return False - - -def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: - """ - Download gallery using gallery-dl. - - Returns: (success, output_path, error_message) - """ - # Get config from env (with GALLERYDL_ prefix, x-fallback handled by config loader) - timeout = get_env_int('GALLERYDL_TIMEOUT') or get_env_int('TIMEOUT', 3600) - check_ssl = get_env_bool('GALLERYDL_CHECK_SSL_VALIDITY', True) if get_env('GALLERYDL_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True) - gallerydl_args = get_env_array('GALLERYDL_ARGS', []) - gallerydl_args_extra = get_env_array('GALLERYDL_ARGS_EXTRA', []) - cookies_file = get_env('GALLERYDL_COOKIES_FILE') or get_env('COOKIES_FILE', '') - - # Output directory is current directory (hook already runs in output dir) - output_dir = Path(OUTPUT_DIR) - - # Build command - # Use -D for exact directory (flat structure) instead of -d (nested structure) - cmd = [ - binary, - *gallerydl_args, - '-D', str(output_dir), - ] - - if not check_ssl: - cmd.append('--no-check-certificate') - - if cookies_file and Path(cookies_file).exists(): - cmd.extend(['-C', cookies_file]) - - if gallerydl_args_extra: - cmd.extend(gallerydl_args_extra) - - cmd.append(url) - - try: - print(f'[gallerydl] Starting download (timeout={timeout}s)', file=sys.stderr) - output_lines: list[str] = [] - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - ) - - def _read_output() -> None: - if not process.stdout: - return - 
for line in process.stdout: - output_lines.append(line) - sys.stderr.write(line) - - reader = threading.Thread(target=_read_output, daemon=True) - reader.start() - - try: - process.wait(timeout=timeout) - except subprocess.TimeoutExpired: - process.kill() - reader.join(timeout=1) - return False, None, f'Timed out after {timeout} seconds' - - reader.join(timeout=1) - combined_output = ''.join(output_lines) - - # Check if any gallery files were downloaded (search recursively) - gallery_extensions = ( - '.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg', - '.mp4', '.webm', '.mkv', '.avi', '.mov', '.flv', - '.json', '.txt', '.zip', - ) - - downloaded_files = [ - f for f in output_dir.rglob('*') - if f.is_file() and f.suffix.lower() in gallery_extensions - ] - - if downloaded_files: - # Return first image file, or first file if no images - image_files = [ - f for f in downloaded_files - if f.suffix.lower() in ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp') - ] - output = str(image_files[0]) if image_files else str(downloaded_files[0]) - return True, output, '' - else: - stderr = combined_output - - # These are NOT errors - page simply has no downloadable gallery - # Return success with no output (legitimate "nothing to download") - stderr_lower = stderr.lower() - if 'unsupported url' in stderr_lower: - return True, None, '' # Not a gallery site - success, no output - if 'no results' in stderr_lower: - return True, None, '' # No gallery found - success, no output - if process.returncode == 0: - return True, None, '' # gallery-dl exited cleanly, just no gallery - success - - # These ARE errors - something went wrong - if '404' in stderr: - return False, None, '404 Not Found' - if '403' in stderr: - return False, None, '403 Forbidden' - if 'unable to extract' in stderr_lower: - return False, None, 'Unable to extract gallery info' - - return False, None, f'gallery-dl error: {stderr}' - - except subprocess.TimeoutExpired: - return False, None, f'Timed out after 
{timeout} seconds' - except Exception as e: - return False, None, f'{type(e).__name__}: {e}' - - -@click.command() -@click.option('--url', required=True, help='URL to download gallery from') -@click.option('--snapshot-id', required=True, help='Snapshot UUID') -def main(url: str, snapshot_id: str): - """Download image gallery from a URL using gallery-dl.""" - - output = None - status = 'failed' - error = '' - - try: - # Check if gallery-dl is enabled - if not get_env_bool('GALLERYDL_ENABLED', True): - print('Skipping gallery-dl (GALLERYDL_ENABLED=False)', file=sys.stderr) - # Temporary failure (config disabled) - NO JSONL emission - sys.exit(0) - - # Check if staticfile extractor already handled this (permanent skip) - if has_staticfile_output(): - print(f'Skipping gallery-dl - staticfile extractor already downloaded this', file=sys.stderr) - print(json.dumps({ - 'type': 'ArchiveResult', - 'status': 'skipped', - 'output_str': 'staticfile already handled', - })) - sys.exit(0) - - # Get binary from environment - binary = get_env('GALLERYDL_BINARY', 'gallery-dl') - - # Run extraction - success, output, error = save_gallery(url, binary) - - if success: - # Success - emit ArchiveResult - result = { - 'type': 'ArchiveResult', - 'status': 'succeeded', - 'output_str': output or '' - } - print(json.dumps(result)) - sys.exit(0) - else: - # Transient error - emit NO JSONL - print(f'ERROR: {error}', file=sys.stderr) - sys.exit(1) - - except Exception as e: - # Transient error - emit NO JSONL - print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/gallerydl/templates/card.html b/archivebox/plugins/gallerydl/templates/card.html deleted file mode 100644 index 32ea0fe0..00000000 --- a/archivebox/plugins/gallerydl/templates/card.html +++ /dev/null @@ -1,11 +0,0 @@ - -
- Gallery thumbnail -
- 🖼️ - Gallery -
-
diff --git a/archivebox/plugins/gallerydl/templates/full.html b/archivebox/plugins/gallerydl/templates/full.html deleted file mode 100644 index bf06ceb4..00000000 --- a/archivebox/plugins/gallerydl/templates/full.html +++ /dev/null @@ -1,28 +0,0 @@ - - - - - - - Gallery - - - - Gallery image - - diff --git a/archivebox/plugins/gallerydl/templates/icon.html b/archivebox/plugins/gallerydl/templates/icon.html deleted file mode 100644 index a8ef89e7..00000000 --- a/archivebox/plugins/gallerydl/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/gallerydl/tests/test_gallerydl.py b/archivebox/plugins/gallerydl/tests/test_gallerydl.py deleted file mode 100644 index 7feedb1e..00000000 --- a/archivebox/plugins/gallerydl/tests/test_gallerydl.py +++ /dev/null @@ -1,190 +0,0 @@ -""" -Integration tests for gallerydl plugin - -Tests verify: - pass -1. Hook script exists -2. Dependencies installed via validation hooks -3. Verify deps with abx-pkg -4. Gallery extraction works on gallery URLs -5. JSONL output is correct -6. Config options work -7. 
Handles non-gallery URLs gracefully -""" - -import json -import subprocess -import sys -import tempfile -import time -from pathlib import Path -import pytest - -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -GALLERYDL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_gallerydl.*'), None) -TEST_URL = 'https://example.com' - -def test_hook_script_exists(): - """Verify on_Snapshot hook exists.""" - assert GALLERYDL_HOOK.exists(), f"Hook not found: {GALLERYDL_HOOK}" - - -def test_verify_deps_with_abx_pkg(): - """Verify gallery-dl is available via abx-pkg.""" - from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides - - missing_binaries = [] - - # Verify gallery-dl is available - gallerydl_binary = Binary(name='gallery-dl', binproviders=[PipProvider(), EnvProvider()]) - gallerydl_loaded = gallerydl_binary.load() - if not (gallerydl_loaded and gallerydl_loaded.abspath): - missing_binaries.append('gallery-dl') - - if missing_binaries: - pass - - -def test_handles_non_gallery_url(): - """Test that gallery-dl extractor handles non-gallery URLs gracefully via hook.""" - # Prerequisites checked by earlier test - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Run gallery-dl extraction hook on non-gallery URL - result = subprocess.run( - [sys.executable, str(GALLERYDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - ) - - # Should exit 0 even for non-gallery URL - assert result.returncode == 0, f"Should handle non-gallery URL gracefully: {result.stderr}" - - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - 
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - -def test_config_save_gallery_dl_false_skips(): - """Test that GALLERYDL_ENABLED=False exits without emitting JSONL.""" - import os - - with tempfile.TemporaryDirectory() as tmpdir: - env = os.environ.copy() - env['GALLERYDL_ENABLED'] = 'False' - - result = subprocess.run( - [sys.executable, str(GALLERYDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}" - - # Feature disabled - temporary failure, should NOT emit JSONL - assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr" - - # Should NOT emit any JSONL - jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] - assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}" - - -def test_config_timeout(): - """Test that GALLERY_DL_TIMEOUT config is respected.""" - import os - - with tempfile.TemporaryDirectory() as tmpdir: - env = os.environ.copy() - env['GALLERY_DL_TIMEOUT'] = '5' - - start_time = time.time() - result = subprocess.run( - [sys.executable, str(GALLERYDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=10 # Should complete in 5s, use 10s as safety margin - ) - elapsed_time = time.time() - start_time - - assert result.returncode == 0, f"Should complete without hanging: {result.stderr}" - # Allow 1 second overhead for subprocess startup and Python interpreter - assert elapsed_time <= 6.0, f"Should complete within 6 seconds (5s timeout + 1s overhead), took {elapsed_time:.2f}s" - - -def test_real_gallery_url(): - """Test that gallery-dl can extract images from a real Flickr gallery URL.""" - import os - - with 
tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Use a real Flickr photo page - gallery_url = 'https://www.flickr.com/photos/gregorydolivet/55002388567/in/explore-2025-12-25/' - - env = os.environ.copy() - env['GALLERY_DL_TIMEOUT'] = '60' # Give it time to download - - start_time = time.time() - result = subprocess.run( - [sys.executable, str(GALLERYDL_HOOK), '--url', gallery_url, '--snapshot-id', 'testflickr'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=90 - ) - elapsed_time = time.time() - start_time - - # Should succeed - assert result.returncode == 0, f"Should extract gallery successfully: {result.stderr}" - - # Parse JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - # Check that some files were downloaded - output_files = list(tmpdir.glob('**/*')) - image_files = [f for f in output_files if f.is_file() and f.suffix.lower() in ('.jpg', '.jpeg', '.png', '.gif', '.webp')] - - assert len(image_files) > 0, f"Should have downloaded at least one image. 
Files: {output_files}" - - print(f"Successfully extracted {len(image_files)} image(s) in {elapsed_time:.2f}s") - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/git/config.json b/archivebox/plugins/git/config.json deleted file mode 100644 index da0a3b02..00000000 --- a/archivebox/plugins/git/config.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "properties": { - "GIT_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_GIT", "USE_GIT"], - "description": "Enable git repository cloning" - }, - "GIT_BINARY": { - "type": "string", - "default": "git", - "description": "Path to git binary" - }, - "GIT_TIMEOUT": { - "type": "integer", - "default": 120, - "minimum": 10, - "x-fallback": "TIMEOUT", - "description": "Timeout for git operations in seconds" - }, - "GIT_DOMAINS": { - "type": "string", - "default": "github.com,gitlab.com,bitbucket.org,gist.github.com,codeberg.org,gitea.com,git.sr.ht", - "description": "Comma-separated list of domains to treat as git repositories" - }, - "GIT_ARGS": { - "type": "array", - "items": {"type": "string"}, - "default": ["clone", "--depth=1", "--recursive"], - "x-aliases": ["GIT_DEFAULT_ARGS"], - "description": "Default git arguments" - }, - "GIT_ARGS_EXTRA": { - "type": "array", - "items": {"type": "string"}, - "default": [], - "x-aliases": ["GIT_EXTRA_ARGS"], - "description": "Extra arguments to append to git command" - } - } -} diff --git a/archivebox/plugins/git/on_Crawl__05_git_install.py b/archivebox/plugins/git/on_Crawl__05_git_install.py deleted file mode 100755 index e090d546..00000000 --- a/archivebox/plugins/git/on_Crawl__05_git_install.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -""" -Emit git Binary dependency for the crawl. 
-""" - -import json -import os -import sys - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def output_binary(name: str, binproviders: str): - """Output Binary JSONL record for a dependency.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def main(): - git_enabled = get_env_bool('GIT_ENABLED', True) - - if not git_enabled: - sys.exit(0) - - output_binary(name='git', binproviders='apt,brew,env') - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/git/on_Snapshot__05_git.bg.py b/archivebox/plugins/git/on_Snapshot__05_git.bg.py deleted file mode 100644 index c124ddbe..00000000 --- a/archivebox/plugins/git/on_Snapshot__05_git.bg.py +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python3 -""" -Clone a git repository from a URL. - -Usage: on_Snapshot__05_git.bg.py --url= --snapshot-id= -Output: Clones repository to $PWD/repo - -Environment variables: - GIT_BINARY: Path to git binary - GIT_TIMEOUT: Timeout in seconds (default: 120) - GIT_ARGS: Default git arguments (JSON array, default: ["clone", "--depth=1", "--recursive"]) - GIT_ARGS_EXTRA: Extra arguments to append (JSON array, default: []) - - # Fallback to ARCHIVING_CONFIG values if GIT_* not set: - TIMEOUT: Fallback timeout -""" - -import json -import os -import subprocess -import sys -from pathlib import Path - -import rich_click as click - - -# Extractor metadata -PLUGIN_NAME = 'git' -BIN_NAME = 'git' -BIN_PROVIDERS = 'apt,brew,env' -OUTPUT_DIR = '.' 
- - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - - -def get_env_int(name: str, default: int = 0) -> int: - try: - return int(get_env(name, str(default))) - except ValueError: - return default - - -def get_env_array(name: str, default: list[str] | None = None) -> list[str]: - """Parse a JSON array from environment variable.""" - val = get_env(name, '') - if not val: - return default if default is not None else [] - try: - result = json.loads(val) - if isinstance(result, list): - return [str(item) for item in result] - return default if default is not None else [] - except json.JSONDecodeError: - return default if default is not None else [] - - -def is_git_url(url: str) -> bool: - """Check if URL looks like a git repository.""" - git_patterns = [ - '.git', - 'github.com', - 'gitlab.com', - 'bitbucket.org', - 'git://', - 'ssh://git@', - ] - return any(p in url.lower() for p in git_patterns) - - -def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]: - """ - Clone git repository. 
- - Returns: (success, output_path, error_message) - """ - timeout = get_env_int('GIT_TIMEOUT') or get_env_int('TIMEOUT', 120) - git_args = get_env_array('GIT_ARGS', ["clone", "--depth=1", "--recursive"]) - git_args_extra = get_env_array('GIT_ARGS_EXTRA', []) - - cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR] - - try: - result = subprocess.run(cmd, timeout=timeout) - - if result.returncode == 0 and Path(OUTPUT_DIR).is_dir(): - return True, OUTPUT_DIR, '' - else: - return False, None, f'git clone failed (exit={result.returncode})' - - except subprocess.TimeoutExpired: - return False, None, f'Timed out after {timeout} seconds' - except Exception as e: - return False, None, f'{type(e).__name__}: {e}' - - -@click.command() -@click.option('--url', required=True, help='Git repository URL') -@click.option('--snapshot-id', required=True, help='Snapshot UUID') -def main(url: str, snapshot_id: str): - """Clone a git repository from a URL.""" - - output = None - status = 'failed' - error = '' - - try: - # Check if URL looks like a git repo - if not is_git_url(url): - print(f'Skipping git clone for non-git URL: {url}', file=sys.stderr) - print(json.dumps({ - 'type': 'ArchiveResult', - 'status': 'skipped', - 'output_str': 'Not a git URL', - })) - sys.exit(0) - - # Get binary from environment - binary = get_env('GIT_BINARY', 'git') - - # Run extraction - success, output, error = clone_git(url, binary) - status = 'succeeded' if success else 'failed' - - except Exception as e: - error = f'{type(e).__name__}: {e}' - status = 'failed' - - if error: - print(f'ERROR: {error}', file=sys.stderr) - - # Output clean JSONL (no RESULT_JSON= prefix) - result = { - 'type': 'ArchiveResult', - 'status': status, - 'output_str': output or error or '', - } - print(json.dumps(result)) - - sys.exit(0 if status == 'succeeded' else 1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/git/templates/card.html b/archivebox/plugins/git/templates/card.html deleted file 
mode 100644 index 3148d5b9..00000000 --- a/archivebox/plugins/git/templates/card.html +++ /dev/null @@ -1,5 +0,0 @@ - -
- šŸ“‚ - Git Repository -
diff --git a/archivebox/plugins/git/templates/icon.html b/archivebox/plugins/git/templates/icon.html deleted file mode 100644 index e16f0231..00000000 --- a/archivebox/plugins/git/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/git/tests/test_git.py b/archivebox/plugins/git/tests/test_git.py deleted file mode 100644 index c7449495..00000000 --- a/archivebox/plugins/git/tests/test_git.py +++ /dev/null @@ -1,130 +0,0 @@ -""" -Integration tests for git plugin - -Tests verify: - pass -1. Validate hook checks for git binary -2. Verify deps with abx-pkg -3. Standalone git extractor execution -""" - -import json -import shutil -import subprocess -import sys -import tempfile -import time -from pathlib import Path -import pytest - -PLUGIN_DIR = Path(__file__).parent.parent -GIT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_git.*'), None) -TEST_URL = 'https://github.com/ArchiveBox/abx-pkg.git' - -def test_hook_script_exists(): - assert GIT_HOOK.exists() - -def test_verify_deps_with_abx_pkg(): - """Verify git is available via abx-pkg.""" - from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides - - git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()]) - git_loaded = git_binary.load() - - assert git_loaded and git_loaded.abspath, "git is required for git plugin tests" - -def test_reports_missing_git(): - with tempfile.TemporaryDirectory() as tmpdir: - env = {'PATH': '/nonexistent'} - result = subprocess.run( - [sys.executable, str(GIT_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'], - cwd=tmpdir, capture_output=True, text=True, env=env - ) - if result.returncode != 0: - combined = result.stdout + result.stderr - assert 'DEPENDENCY_NEEDED' in combined or 'git' in combined.lower() or 'ERROR=' in combined - -def test_handles_non_git_url(): - assert shutil.which('git'), "git binary not available" - - with tempfile.TemporaryDirectory() as tmpdir: - result = subprocess.run( 
- [sys.executable, str(GIT_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'], - cwd=tmpdir, capture_output=True, text=True, timeout=30 - ) - # Should fail or skip for non-git URL - assert result.returncode in (0, 1) - - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - if result_json: - # Should report failure or skip for non-git URL - assert result_json['status'] in ['failed', 'skipped'], f"Should fail or skip: {result_json}" - - -def test_real_git_repo(): - """Test that git can clone a real GitHub repository.""" - import os - - assert shutil.which('git'), "git binary not available" - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Use a real but small GitHub repository - git_url = 'https://github.com/ArchiveBox/abx-pkg' - - env = os.environ.copy() - env['GIT_TIMEOUT'] = '120' # Give it time to clone - - start_time = time.time() - result = subprocess.run( - [sys.executable, str(GIT_HOOK), '--url', git_url, '--snapshot-id', 'testgit'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=180 - ) - elapsed_time = time.time() - start_time - - # Should succeed - assert result.returncode == 0, f"Should clone repository successfully: {result.stderr}" - - # Parse JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, f"Should have ArchiveResult JSONL output. 
stdout: {result.stdout}" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - # Check that the git repo was cloned - git_dirs = list(tmpdir.glob('**/.git')) - assert len(git_dirs) > 0, f"Should have cloned a git repository. Contents: {list(tmpdir.rglob('*'))}" - - print(f"Successfully cloned repository in {elapsed_time:.2f}s") - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/hashes/config.json b/archivebox/plugins/hashes/config.json deleted file mode 100644 index b57db14a..00000000 --- a/archivebox/plugins/hashes/config.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "properties": { - "HASHES_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_HASHES", "USE_HASHES"], - "description": "Enable merkle tree hash generation" - }, - "HASHES_TIMEOUT": { - "type": "integer", - "default": 30, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for merkle tree generation in seconds" - } - } -} diff --git a/archivebox/plugins/hashes/on_Snapshot__93_hashes.py b/archivebox/plugins/hashes/on_Snapshot__93_hashes.py deleted file mode 100755 index 2738d85f..00000000 --- a/archivebox/plugins/hashes/on_Snapshot__93_hashes.py +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python3 -""" -Create a hashed Merkle tree of all archived outputs. - -This plugin runs after all extractors complete (priority 93) and generates -a cryptographic Merkle hash tree of all files in the snapshot directory. 
- -Output: hashes.json containing root_hash, tree structure, file list, metadata - -Usage: on_Snapshot__93_hashes.py --url= --snapshot-id= - -Environment variables: - SAVE_HASHES: Enable hash merkle tree generation (default: true) - DATA_DIR: ArchiveBox data directory - ARCHIVE_DIR: Archive output directory -""" - -import os -import sys -import json -import hashlib -from pathlib import Path -from datetime import datetime, timezone -from typing import Dict, List, Optional, Tuple, Any - -import click - - -def sha256_file(filepath: Path) -> str: - """Compute SHA256 hash of a file.""" - h = hashlib.sha256() - try: - with open(filepath, 'rb') as f: - while chunk := f.read(65536): - h.update(chunk) - return h.hexdigest() - except (OSError, PermissionError): - return '0' * 64 - - -def sha256_data(data: bytes) -> str: - """Compute SHA256 hash of raw data.""" - return hashlib.sha256(data).hexdigest() - - -def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]: - """Recursively collect all files in snapshot directory.""" - exclude_dirs = exclude_dirs or ['hashes', '.git', '__pycache__'] - files = [] - - for root, dirs, filenames in os.walk(snapshot_dir): - dirs[:] = [d for d in dirs if d not in exclude_dirs] - - for filename in filenames: - filepath = Path(root) / filename - rel_path = filepath.relative_to(snapshot_dir) - - if filepath.is_symlink(): - continue - - file_hash = sha256_file(filepath) - file_size = filepath.stat().st_size if filepath.exists() else 0 - files.append((rel_path, file_hash, file_size)) - - files.sort(key=lambda x: str(x[0])) - return files - - -def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]: - """Build a Merkle tree from a list of leaf hashes.""" - if not file_hashes: - return sha256_data(b''), [[]] - - tree_levels = [file_hashes.copy()] - - while len(tree_levels[-1]) > 1: - current_level = tree_levels[-1] - next_level = [] - - for i in range(0, len(current_level), 
2): - left = current_level[i] - if i + 1 < len(current_level): - right = current_level[i + 1] - combined = left + right - else: - combined = left + left - - parent_hash = sha256_data(combined.encode('utf-8')) - next_level.append(parent_hash) - - tree_levels.append(next_level) - - root_hash = tree_levels[-1][0] - return root_hash, tree_levels - - -def create_hashes(snapshot_dir: Path) -> Dict[str, Any]: - """Create a complete Merkle hash tree of all files in snapshot directory.""" - files = collect_files(snapshot_dir) - file_hashes = [file_hash for _, file_hash, _ in files] - root_hash, tree_levels = build_merkle_tree(file_hashes) - total_size = sum(size for _, _, size in files) - - file_list = [ - {'path': str(path), 'hash': file_hash, 'size': size} - for path, file_hash, size in files - ] - - return { - 'root_hash': root_hash, - 'tree_levels': tree_levels, - 'files': file_list, - 'metadata': { - 'timestamp': datetime.now(timezone.utc).isoformat(), - 'file_count': len(files), - 'total_size': total_size, - 'tree_depth': len(tree_levels), - }, - } - - -@click.command() -@click.option('--url', required=True, help='URL being archived') -@click.option('--snapshot-id', required=True, help='Snapshot UUID') -def main(url: str, snapshot_id: str): - """Generate Merkle tree of all archived outputs.""" - status = 'failed' - output = None - error = '' - root_hash = None - file_count = 0 - - try: - # Check if enabled - save_hashes = os.getenv('HASHES_ENABLED', 'true').lower() in ('true', '1', 'yes', 'on') - - if not save_hashes: - status = 'skipped' - click.echo(json.dumps({'status': status, 'output': 'HASHES_ENABLED=false'})) - sys.exit(0) - - # Working directory is the extractor output dir (e.g., /hashes/) - # Parent is the snapshot directory - output_dir = Path.cwd() - snapshot_dir = output_dir.parent - - if not snapshot_dir.exists(): - raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}') - - # Ensure output directory exists - 
output_dir.mkdir(exist_ok=True) - output_path = output_dir / 'hashes.json' - - # Generate Merkle tree - merkle_data = create_hashes(snapshot_dir) - - # Write output - with open(output_path, 'w', encoding='utf-8') as f: - json.dump(merkle_data, f, indent=2) - - status = 'succeeded' - output = 'hashes.json' - root_hash = merkle_data['root_hash'] - file_count = merkle_data['metadata']['file_count'] - - except Exception as e: - error = f'{type(e).__name__}: {e}' - status = 'failed' - click.echo(f'Error: {error}', err=True) - - # Print JSON result for hook runner - result = { - 'status': status, - 'output': output, - 'error': error or None, - 'root_hash': root_hash, - 'file_count': file_count, - } - click.echo(json.dumps(result)) - - sys.exit(0 if status in ('succeeded', 'skipped') else 1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/hashes/templates/icon.html b/archivebox/plugins/hashes/templates/icon.html deleted file mode 100644 index 211930f0..00000000 --- a/archivebox/plugins/hashes/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/hashes/tests/test_hashes.py b/archivebox/plugins/hashes/tests/test_hashes.py deleted file mode 100644 index 0eb7d7f1..00000000 --- a/archivebox/plugins/hashes/tests/test_hashes.py +++ /dev/null @@ -1,157 +0,0 @@ -""" -Tests for the hashes plugin. - -Tests the real merkle tree generation with actual files. 
-""" - -import json -import os -import subprocess -import sys -import tempfile -from pathlib import Path - -import pytest -from django.test import TestCase - - -# Get the path to the hashes hook -PLUGIN_DIR = Path(__file__).parent.parent -HASHES_HOOK = PLUGIN_DIR / 'on_Snapshot__93_hashes.py' - - -class TestHashesPlugin(TestCase): - """Test the hashes plugin.""" - - def test_hashes_hook_exists(self): - """Hashes hook script should exist.""" - self.assertTrue(HASHES_HOOK.exists(), f"Hook not found: {HASHES_HOOK}") - - def test_hashes_generates_tree_for_files(self): - """Hashes hook should generate merkle tree for files in snapshot directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - # Create a mock snapshot directory structure - snapshot_dir = Path(temp_dir) / 'snapshot' - snapshot_dir.mkdir() - - # Create output directory for hashes - output_dir = snapshot_dir / 'hashes' - output_dir.mkdir() - - # Create some test files - (snapshot_dir / 'index.html').write_text('Test') - (snapshot_dir / 'screenshot.png').write_bytes(b'\x89PNG\r\n\x1a\n' + b'\x00' * 100) - - subdir = snapshot_dir / 'media' - subdir.mkdir() - (subdir / 'video.mp4').write_bytes(b'\x00\x00\x00\x18ftypmp42') - - # Run the hook from the output directory - env = os.environ.copy() - env['HASHES_ENABLED'] = 'true' - - result = subprocess.run( - [ - sys.executable, str(HASHES_HOOK), - '--url=https://example.com', - '--snapshot-id=test-snapshot', - ], - capture_output=True, - text=True, - cwd=str(output_dir), # Hook expects to run from output dir - env=env, - timeout=30 - ) - - # Should succeed - self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}") - - # Check output file exists - output_file = output_dir / 'hashes.json' - self.assertTrue(output_file.exists(), "hashes.json not created") - - # Parse and verify output - with open(output_file) as f: - data = json.load(f) - - self.assertIn('root_hash', data) - self.assertIn('files', data) - self.assertIn('metadata', data) - - # 
Should have indexed our test files - file_paths = [f['path'] for f in data['files']] - self.assertIn('index.html', file_paths) - self.assertIn('screenshot.png', file_paths) - - # Verify metadata - self.assertGreater(data['metadata']['file_count'], 0) - self.assertGreater(data['metadata']['total_size'], 0) - - def test_hashes_skips_when_disabled(self): - """Hashes hook should skip when HASHES_ENABLED=false.""" - with tempfile.TemporaryDirectory() as temp_dir: - snapshot_dir = Path(temp_dir) / 'snapshot' - snapshot_dir.mkdir() - output_dir = snapshot_dir / 'hashes' - output_dir.mkdir() - - env = os.environ.copy() - env['HASHES_ENABLED'] = 'false' - - result = subprocess.run( - [ - sys.executable, str(HASHES_HOOK), - '--url=https://example.com', - '--snapshot-id=test-snapshot', - ], - capture_output=True, - text=True, - cwd=str(output_dir), - env=env, - timeout=30 - ) - - # Should succeed (exit 0) but skip - self.assertEqual(result.returncode, 0) - self.assertIn('skipped', result.stdout) - - def test_hashes_handles_empty_directory(self): - """Hashes hook should handle empty snapshot directory.""" - with tempfile.TemporaryDirectory() as temp_dir: - snapshot_dir = Path(temp_dir) / 'snapshot' - snapshot_dir.mkdir() - output_dir = snapshot_dir / 'hashes' - output_dir.mkdir() - - env = os.environ.copy() - env['HASHES_ENABLED'] = 'true' - - result = subprocess.run( - [ - sys.executable, str(HASHES_HOOK), - '--url=https://example.com', - '--snapshot-id=test-snapshot', - ], - capture_output=True, - text=True, - cwd=str(output_dir), - env=env, - timeout=30 - ) - - # Should succeed even with empty directory - self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}") - - # Check output file exists - output_file = output_dir / 'hashes.json' - self.assertTrue(output_file.exists()) - - with open(output_file) as f: - data = json.load(f) - - # Should have empty file list - self.assertEqual(data['metadata']['file_count'], 0) - - -if __name__ == '__main__': - 
pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/headers/config.json b/archivebox/plugins/headers/config.json deleted file mode 100644 index a0068f6e..00000000 --- a/archivebox/plugins/headers/config.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "HEADERS_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_HEADERS", "USE_HEADERS"], - "description": "Enable HTTP headers capture" - }, - "HEADERS_TIMEOUT": { - "type": "integer", - "default": 30, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for headers capture in seconds" - } - } -} diff --git a/archivebox/plugins/headers/on_Snapshot__27_headers.bg.js b/archivebox/plugins/headers/on_Snapshot__27_headers.bg.js deleted file mode 100644 index 7ca72994..00000000 --- a/archivebox/plugins/headers/on_Snapshot__27_headers.bg.js +++ /dev/null @@ -1,247 +0,0 @@ -#!/usr/bin/env node -/** - * Capture original request + response headers for the main navigation. - * - * This hook sets up CDP listeners BEFORE chrome_navigate loads the page, - * then waits for navigation to complete. It records the first top-level - * request headers and the corresponding response headers (with :status). 
- * - * Usage: on_Snapshot__27_headers.bg.js --url= --snapshot-id= - * Output: Writes headers.json - */ - -const fs = require('fs'); -const path = require('path'); - -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); - -const puppeteer = require('puppeteer-core'); - -// Import shared utilities from chrome_utils.js -const { - getEnvBool, - getEnvInt, - parseArgs, - connectToPage, - waitForPageLoaded, -} = require('../chrome/chrome_utils.js'); - -const PLUGIN_NAME = 'headers'; -const OUTPUT_DIR = '.'; -const OUTPUT_FILE = 'headers.json'; -const CHROME_SESSION_DIR = '../chrome'; -const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)'; - -let browser = null; -let page = null; -let client = null; -let shuttingDown = false; -let headersWritten = false; - -let requestId = null; -let requestUrl = null; -let requestHeaders = null; -let responseHeaders = null; -let responseStatus = null; -let responseStatusText = null; -let responseUrl = null; -let originalUrl = null; - -function getFinalUrl() { - const finalUrlFile = path.join(CHROME_SESSION_DIR, 'final_url.txt'); - if (fs.existsSync(finalUrlFile)) { - return fs.readFileSync(finalUrlFile, 'utf8').trim(); - } - return page ? page.url() : null; -} - -function writeHeadersFile() { - if (headersWritten) return; - if (!responseHeaders) return; - - const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); - const responseHeadersWithStatus = { - ...(responseHeaders || {}), - }; - - if (responseStatus !== null && responseStatus !== undefined && - responseHeadersWithStatus[':status'] === undefined) { - responseHeadersWithStatus[':status'] = String(responseStatus); - } - - const record = { - url: requestUrl || originalUrl, - final_url: getFinalUrl(), - status: responseStatus !== undefined ? 
responseStatus : null, - request_headers: requestHeaders || {}, - response_headers: responseHeadersWithStatus, - headers: responseHeadersWithStatus, // backwards compatibility - }; - - if (responseStatusText) { - record.statusText = responseStatusText; - } - if (responseUrl) { - record.response_url = responseUrl; - } - - fs.writeFileSync(outputPath, JSON.stringify(record, null, 2)); - headersWritten = true; -} - -async function setupListener(url) { - const timeout = getEnvInt('HEADERS_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000; - const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); - const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt'); - const pidFile = path.join(CHROME_SESSION_DIR, 'chrome.pid'); - - if (!fs.existsSync(cdpFile) || !fs.existsSync(targetIdFile) || !fs.existsSync(pidFile)) { - throw new Error(CHROME_SESSION_REQUIRED_ERROR); - } - try { - const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10); - if (!pid || Number.isNaN(pid)) throw new Error('Invalid pid'); - process.kill(pid, 0); - } catch (e) { - throw new Error(CHROME_SESSION_REQUIRED_ERROR); - } - - const { browser, page } = await connectToPage({ - chromeSessionDir: CHROME_SESSION_DIR, - timeoutMs: timeout, - puppeteer, - }); - - client = await page.target().createCDPSession(); - await client.send('Network.enable'); - - client.on('Network.requestWillBeSent', (params) => { - try { - if (requestId && !responseHeaders && params.redirectResponse && params.requestId === requestId) { - responseHeaders = params.redirectResponse.headers || {}; - responseStatus = params.redirectResponse.status || null; - responseStatusText = params.redirectResponse.statusText || null; - responseUrl = params.redirectResponse.url || null; - writeHeadersFile(); - } - - if (requestId) return; - if (params.type && params.type !== 'Document') return; - if (!params.request || !params.request.url) return; - if (!params.request.url.startsWith('http')) return; - - requestId = params.requestId; - 
requestUrl = params.request.url; - requestHeaders = params.request.headers || {}; - } catch (e) { - // Ignore errors - } - }); - - client.on('Network.responseReceived', (params) => { - try { - if (!requestId || params.requestId !== requestId || responseHeaders) return; - const response = params.response || {}; - responseHeaders = response.headers || {}; - responseStatus = response.status || null; - responseStatusText = response.statusText || null; - responseUrl = response.url || null; - writeHeadersFile(); - } catch (e) { - // Ignore errors - } - }); - - return { browser, page }; -} - -function emitResult(status = 'succeeded', outputStr = OUTPUT_FILE) { - if (shuttingDown) return; - shuttingDown = true; - - console.log(JSON.stringify({ - type: 'ArchiveResult', - status, - output_str: outputStr, - })); -} - -async function handleShutdown(signal) { - console.error(`\nReceived ${signal}, emitting final results...`); - if (!headersWritten) { - writeHeadersFile(); - } - if (headersWritten) { - emitResult('succeeded', OUTPUT_FILE); - } else { - emitResult('failed', 'No headers captured'); - } - - if (browser) { - try { - browser.disconnect(); - } catch (e) {} - } - process.exit(headersWritten ? 
0 : 1); -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__27_headers.bg.js --url= --snapshot-id='); - process.exit(1); - } - - originalUrl = url; - - if (!getEnvBool('HEADERS_ENABLED', true)) { - console.error('Skipping (HEADERS_ENABLED=False)'); - console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'HEADERS_ENABLED=False'})); - process.exit(0); - } - - try { - // Set up listeners BEFORE navigation - const connection = await setupListener(url); - browser = connection.browser; - page = connection.page; - - // Register signal handlers for graceful shutdown - process.on('SIGTERM', () => handleShutdown('SIGTERM')); - process.on('SIGINT', () => handleShutdown('SIGINT')); - - // Wait for chrome_navigate to complete (non-fatal) - try { - const timeout = getEnvInt('HEADERS_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000; - await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 200); - } catch (e) { - console.error(`WARN: ${e.message}`); - } - - // Keep alive until SIGTERM - await new Promise(() => {}); - return; - - } catch (e) { - const errorMessage = (e && e.message) - ? 
`${e.name || 'Error'}: ${e.message}` - : String(e || 'Unknown error'); - console.error(`ERROR: ${errorMessage}`); - - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'failed', - output_str: errorMessage, - })); - process.exit(1); - } -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/headers/templates/icon.html b/archivebox/plugins/headers/templates/icon.html deleted file mode 100644 index f693e709..00000000 --- a/archivebox/plugins/headers/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/headers/tests/test_headers.py b/archivebox/plugins/headers/tests/test_headers.py deleted file mode 100644 index 09ec86fb..00000000 --- a/archivebox/plugins/headers/tests/test_headers.py +++ /dev/null @@ -1,409 +0,0 @@ -""" -Integration tests for headers plugin - -Tests verify: - pass -1. Plugin script exists and is executable -2. Node.js is available -3. Headers extraction works for real example.com -4. Output JSON contains actual HTTP headers -5. 
Config options work (TIMEOUT, USER_AGENT) -""" - -import json -import shutil -import subprocess -import tempfile -import time -from pathlib import Path - -import pytest - -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - CHROME_NAVIGATE_HOOK, - get_test_env, - chrome_session, -) - -PLUGIN_DIR = Path(__file__).parent.parent -HEADERS_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_headers.*'), None) -TEST_URL = 'https://example.com' - -def normalize_root_url(url: str) -> str: - return url.rstrip('/') - -def run_headers_capture(headers_dir, snapshot_chrome_dir, env, url, snapshot_id): - hook_proc = subprocess.Popen( - ['node', str(HEADERS_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'], - cwd=headers_dir, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env, - ) - - nav_result = subprocess.run( - ['node', str(CHROME_NAVIGATE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'], - cwd=snapshot_chrome_dir, - capture_output=True, - text=True, - timeout=120, - env=env, - ) - - headers_file = headers_dir / 'headers.json' - for _ in range(60): - if headers_file.exists() and headers_file.stat().st_size > 0: - break - time.sleep(1) - - if hook_proc.poll() is None: - hook_proc.terminate() - try: - stdout, stderr = hook_proc.communicate(timeout=5) - except subprocess.TimeoutExpired: - hook_proc.kill() - stdout, stderr = hook_proc.communicate() - else: - stdout, stderr = hook_proc.communicate() - - return hook_proc.returncode, stdout, stderr, nav_result, headers_file - - -def test_hook_script_exists(): - """Verify hook script exists.""" - assert HEADERS_HOOK.exists(), f"Hook script not found: {HEADERS_HOOK}" - - -def test_node_is_available(): - """Test that Node.js is available on the system.""" - result = subprocess.run( - ['which', 'node'], - capture_output=True, - text=True - ) - - if result.returncode != 0: - pass - - binary_path = result.stdout.strip() - assert Path(binary_path).exists(), f"Binary should exist at {binary_path}" 
- - # Test that node is executable and get version - result = subprocess.run( - ['node', '--version'], - capture_output=True, - text=True, - timeout=10 - , - env=get_test_env()) - assert result.returncode == 0, f"node not executable: {result.stderr}" - assert result.stdout.startswith('v'), f"Unexpected node version format: {result.stdout}" - - -def test_extracts_headers_from_example_com(): - """Test full workflow: extract headers from real example.com.""" - - # Check node is available - if not shutil.which('node'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env): - headers_dir = snapshot_chrome_dir.parent / 'headers' - headers_dir.mkdir(exist_ok=True) - - result = run_headers_capture( - headers_dir, - snapshot_chrome_dir, - env, - TEST_URL, - 'test789', - ) - - hook_code, stdout, stderr, nav_result, headers_file = result - assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" - assert hook_code == 0, f"Extraction failed: {stderr}" - - # Parse clean JSONL output - result_json = None - for line in stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - # Verify output file exists (hook writes to current directory) - assert headers_file.exists(), "headers.json not created" - - # Verify headers JSON contains REAL example.com response - headers_data = json.loads(headers_file.read_text()) - - assert 'url' in headers_data, "Should have url field" - assert normalize_root_url(headers_data['url']) == normalize_root_url(TEST_URL), f"URL should be {TEST_URL}" - - assert 'status' in 
headers_data, "Should have status field" - assert headers_data['status'] in [200, 301, 302], \ - f"Should have valid HTTP status, got {headers_data['status']}" - - assert 'request_headers' in headers_data, "Should have request_headers field" - assert isinstance(headers_data['request_headers'], dict), "Request headers should be a dict" - - assert 'response_headers' in headers_data, "Should have response_headers field" - assert isinstance(headers_data['response_headers'], dict), "Response headers should be a dict" - assert len(headers_data['response_headers']) > 0, "Response headers dict should not be empty" - - assert 'headers' in headers_data, "Should have headers field" - assert isinstance(headers_data['headers'], dict), "Headers should be a dict" - - # Verify common HTTP headers are present - headers_lower = {k.lower(): v for k, v in headers_data['response_headers'].items()} - assert 'content-type' in headers_lower or 'content-length' in headers_lower, \ - "Should have at least one common HTTP header" - - assert headers_data['response_headers'].get(':status') == str(headers_data['status']), \ - "Response headers should include :status pseudo header" - - -def test_headers_output_structure(): - """Test that headers plugin produces correctly structured output.""" - - if not shutil.which('node'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env): - headers_dir = snapshot_chrome_dir.parent / 'headers' - headers_dir.mkdir(exist_ok=True) - - result = run_headers_capture( - headers_dir, - snapshot_chrome_dir, - env, - TEST_URL, - 'testformat', - ) - - hook_code, stdout, stderr, nav_result, headers_file = result - assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" - assert hook_code == 0, f"Extraction failed: {stderr}" - - # Parse clean JSONL output - result_json = None - for line in 
stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - # Verify output structure - assert headers_file.exists(), "Output headers.json not created" - - output_data = json.loads(headers_file.read_text()) - - # Verify all required fields are present - assert 'url' in output_data, "Output should have url field" - assert 'status' in output_data, "Output should have status field" - assert 'request_headers' in output_data, "Output should have request_headers field" - assert 'response_headers' in output_data, "Output should have response_headers field" - assert 'headers' in output_data, "Output should have headers field" - - # Verify data types - assert isinstance(output_data['status'], int), "Status should be integer" - assert isinstance(output_data['request_headers'], dict), "Request headers should be dict" - assert isinstance(output_data['response_headers'], dict), "Response headers should be dict" - assert isinstance(output_data['headers'], dict), "Headers should be dict" - - # Verify example.com returns expected headers - assert normalize_root_url(output_data['url']) == normalize_root_url(TEST_URL) - assert output_data['status'] in [200, 301, 302] - - -def test_fails_without_chrome_session(): - """Test that headers plugin fails when chrome session is missing.""" - - if not shutil.which('node'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Run headers extraction - result = subprocess.run( - ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - , - env=get_test_env()) - - assert result.returncode != 0, "Should fail without 
chrome session" - assert 'No Chrome session found (chrome plugin must run first)' in (result.stdout + result.stderr) - - -def test_config_timeout_honored(): - """Test that TIMEOUT config is respected.""" - - if not shutil.which('node'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Set very short timeout (but example.com should still succeed) - import os - env_override = os.environ.copy() - env_override['TIMEOUT'] = '5' - - with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env): - headers_dir = snapshot_chrome_dir.parent / 'headers' - headers_dir.mkdir(exist_ok=True) - env.update(env_override) - - result = run_headers_capture( - headers_dir, - snapshot_chrome_dir, - env, - TEST_URL, - 'testtimeout', - ) - - # Should complete (success or fail, but not hang) - hook_code, _stdout, _stderr, nav_result, _headers_file = result - assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" - assert hook_code in (0, 1), "Should complete without hanging" - - -def test_config_user_agent(): - """Test that USER_AGENT config is used.""" - - if not shutil.which('node'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Set custom user agent - import os - env_override = os.environ.copy() - env_override['USER_AGENT'] = 'TestBot/1.0' - - with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env): - headers_dir = snapshot_chrome_dir.parent / 'headers' - headers_dir.mkdir(exist_ok=True) - env.update(env_override) - - result = run_headers_capture( - headers_dir, - snapshot_chrome_dir, - env, - TEST_URL, - 'testua', - ) - - # Should succeed (example.com doesn't block) - hook_code, stdout, _stderr, nav_result, _headers_file = result - assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" - if hook_code == 0: - # Parse clean JSONL output - result_json = None - for line 
in stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - -def test_handles_https_urls(): - """Test that HTTPS URLs work correctly.""" - - if not shutil.which('node'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - with chrome_session(tmpdir, test_url='https://example.org', navigate=False) as (_process, _pid, snapshot_chrome_dir, env): - headers_dir = snapshot_chrome_dir.parent / 'headers' - headers_dir.mkdir(exist_ok=True) - result = run_headers_capture( - headers_dir, - snapshot_chrome_dir, - env, - 'https://example.org', - 'testhttps', - ) - - hook_code, _stdout, _stderr, nav_result, headers_file = result - assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" - if hook_code == 0: - if headers_file.exists(): - output_data = json.loads(headers_file.read_text()) - assert normalize_root_url(output_data['url']) == normalize_root_url('https://example.org') - assert output_data['status'] in [200, 301, 302] - - -def test_handles_404_gracefully(): - """Test that headers plugin handles 404s gracefully.""" - - if not shutil.which('node'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - with chrome_session(tmpdir, test_url='https://example.com/nonexistent-page-404', navigate=False) as (_process, _pid, snapshot_chrome_dir, env): - headers_dir = snapshot_chrome_dir.parent / 'headers' - headers_dir.mkdir(exist_ok=True) - result = run_headers_capture( - headers_dir, - snapshot_chrome_dir, - env, - 'https://example.com/nonexistent-page-404', - 'test404', - ) - - # May succeed or fail depending on server behavior - # If it succeeds, verify 404 status is captured 
- hook_code, _stdout, _stderr, nav_result, headers_file = result - assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" - if hook_code == 0: - if headers_file.exists(): - output_data = json.loads(headers_file.read_text()) - assert output_data['status'] == 404, "Should capture 404 status" - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/htmltotext/config.json b/archivebox/plugins/htmltotext/config.json deleted file mode 100644 index 7f9e644a..00000000 --- a/archivebox/plugins/htmltotext/config.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "properties": { - "HTMLTOTEXT_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_HTMLTOTEXT", "USE_HTMLTOTEXT"], - "description": "Enable HTML to text conversion" - }, - "HTMLTOTEXT_TIMEOUT": { - "type": "integer", - "default": 30, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for HTML to text conversion in seconds" - } - } -} diff --git a/archivebox/plugins/htmltotext/on_Snapshot__58_htmltotext.py b/archivebox/plugins/htmltotext/on_Snapshot__58_htmltotext.py deleted file mode 100644 index 30134446..00000000 --- a/archivebox/plugins/htmltotext/on_Snapshot__58_htmltotext.py +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert HTML to plain text for search indexing. - -This extractor reads HTML from other extractors (wget, singlefile, dom) -and converts it to plain text for full-text search. - -Usage: on_Snapshot__htmltotext.py --url= --snapshot-id= -Output: Writes htmltotext.txt to $PWD - -Environment variables: - TIMEOUT: Timeout in seconds (not used, but kept for consistency) - -Note: This extractor does not require any external binaries. - It uses Python's built-in html.parser module. 
-""" - -import json -import os -import re -import sys -from html.parser import HTMLParser -from pathlib import Path - -import rich_click as click - - -# Extractor metadata -PLUGIN_NAME = 'htmltotext' -OUTPUT_DIR = '.' -OUTPUT_FILE = 'htmltotext.txt' - - -class HTMLTextExtractor(HTMLParser): - """Extract text content from HTML, ignoring scripts/styles.""" - - def __init__(self): - super().__init__() - self.result = [] - self.skip_tags = {'script', 'style', 'head', 'meta', 'link', 'noscript'} - self.current_tag = None - - def handle_starttag(self, tag, attrs): - self.current_tag = tag.lower() - - def handle_endtag(self, tag): - self.current_tag = None - - def handle_data(self, data): - if self.current_tag not in self.skip_tags: - text = data.strip() - if text: - self.result.append(text) - - def get_text(self) -> str: - return ' '.join(self.result) - - -def html_to_text(html: str) -> str: - """Convert HTML to plain text.""" - parser = HTMLTextExtractor() - try: - parser.feed(html) - return parser.get_text() - except Exception: - # Fallback: strip HTML tags with regex - text = re.sub(r']*>.*?', '', html, flags=re.DOTALL | re.IGNORECASE) - text = re.sub(r']*>.*?', '', text, flags=re.DOTALL | re.IGNORECASE) - text = re.sub(r'<[^>]+>', ' ', text) - text = re.sub(r'\s+', ' ', text) - return text.strip() - - -def find_html_source() -> str | None: - """Find HTML content from other extractors in the snapshot directory.""" - # Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories - search_patterns = [ - 'singlefile/singlefile.html', - '*_singlefile/singlefile.html', - 'singlefile/*.html', - '*_singlefile/*.html', - 'dom/output.html', - '*_dom/output.html', - 'dom/*.html', - '*_dom/*.html', - 'wget/**/*.html', - '*_wget/**/*.html', - 'wget/**/*.htm', - '*_wget/**/*.htm', - ] - - for base in (Path.cwd(), Path.cwd().parent): - for pattern in search_patterns: - matches = list(base.glob(pattern)) - for match in matches: - if match.is_file() and 
match.stat().st_size > 0: - try: - return match.read_text(errors='ignore') - except Exception: - continue - - return None - - -def extract_htmltotext(url: str) -> tuple[bool, str | None, str]: - """ - Extract plain text from HTML sources. - - Returns: (success, output_path, error_message) - """ - # Find HTML source from other extractors - html_content = find_html_source() - if not html_content: - return False, None, 'No HTML source found (run singlefile, dom, or wget first)' - - # Convert HTML to text - text = html_to_text(html_content) - - if not text or len(text) < 10: - return False, None, 'No meaningful text extracted from HTML' - - # Output directory is current directory (hook already runs in output dir) - output_dir = Path(OUTPUT_DIR) - output_path = output_dir / OUTPUT_FILE - output_path.write_text(text, encoding='utf-8') - - return True, str(output_path), '' - - -@click.command() -@click.option('--url', required=True, help='URL that was archived') -@click.option('--snapshot-id', required=True, help='Snapshot UUID') -def main(url: str, snapshot_id: str): - """Convert HTML to plain text for search indexing.""" - - try: - # Run extraction - success, output, error = extract_htmltotext(url) - - if success: - # Success - emit ArchiveResult - result = { - 'type': 'ArchiveResult', - 'status': 'succeeded', - 'output_str': output or '' - } - print(json.dumps(result)) - sys.exit(0) - else: - # Transient error - emit NO JSONL - print(f'ERROR: {error}', file=sys.stderr) - sys.exit(1) - - except Exception as e: - # Transient error - emit NO JSONL - print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/htmltotext/templates/icon.html b/archivebox/plugins/htmltotext/templates/icon.html deleted file mode 100644 index d1c8c78d..00000000 --- a/archivebox/plugins/htmltotext/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git 
a/archivebox/plugins/htmltotext/tests/test_htmltotext.py b/archivebox/plugins/htmltotext/tests/test_htmltotext.py deleted file mode 100644 index 7d59fdd1..00000000 --- a/archivebox/plugins/htmltotext/tests/test_htmltotext.py +++ /dev/null @@ -1,84 +0,0 @@ -""" -Integration tests for htmltotext plugin - -Tests verify standalone htmltotext extractor execution. -""" - -import json -import subprocess -import sys -import tempfile -from pathlib import Path -import pytest - -PLUGIN_DIR = Path(__file__).parent.parent -HTMLTOTEXT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_htmltotext.*'), None) -TEST_URL = 'https://example.com' - -def test_hook_script_exists(): - assert HTMLTOTEXT_HOOK.exists() - -def test_extracts_text_from_html(): - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - # Create HTML source - (tmpdir / 'singlefile').mkdir() - (tmpdir / 'singlefile' / 'singlefile.html').write_text('

Example Domain

This domain is for examples.

') - - result = subprocess.run( - [sys.executable, str(HTMLTOTEXT_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'], - cwd=tmpdir, capture_output=True, text=True, timeout=30 - ) - - assert result.returncode == 0, f"Extraction failed: {result.stderr}" - - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - # Verify output file (hook writes to current directory) - output_file = tmpdir / 'htmltotext.txt' - assert output_file.exists(), f"htmltotext.txt not created. Files: {list(tmpdir.iterdir())}" - content = output_file.read_text() - assert len(content) > 0, "Content should not be empty" - assert 'Example Domain' in content, "Should contain text from HTML" - -def test_fails_gracefully_without_html(): - with tempfile.TemporaryDirectory() as tmpdir: - result = subprocess.run( - [sys.executable, str(HTMLTOTEXT_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'], - cwd=tmpdir, capture_output=True, text=True, timeout=30 - ) - - # Should exit with non-zero or emit failure JSONL - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - if result_json: - # Should report failure or skip since no HTML source - assert result_json['status'] in ['failed', 'skipped'], f"Should fail or skip without HTML: {result_json}" - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/infiniscroll/config.json 
b/archivebox/plugins/infiniscroll/config.json deleted file mode 100644 index 5954ff11..00000000 --- a/archivebox/plugins/infiniscroll/config.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "INFINISCROLL_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_INFINISCROLL", "USE_INFINISCROLL"], - "description": "Enable infinite scroll page expansion" - }, - "INFINISCROLL_TIMEOUT": { - "type": "integer", - "default": 120, - "minimum": 10, - "x-fallback": "TIMEOUT", - "description": "Maximum timeout for scrolling in seconds" - }, - "INFINISCROLL_SCROLL_DELAY": { - "type": "integer", - "default": 2000, - "minimum": 500, - "description": "Delay between scrolls in milliseconds" - }, - "INFINISCROLL_SCROLL_DISTANCE": { - "type": "integer", - "default": 1600, - "minimum": 100, - "description": "Distance to scroll per step in pixels" - }, - "INFINISCROLL_SCROLL_LIMIT": { - "type": "integer", - "default": 10, - "minimum": 1, - "maximum": 100, - "description": "Maximum number of scroll steps" - }, - "INFINISCROLL_MIN_HEIGHT": { - "type": "integer", - "default": 16000, - "minimum": 1000, - "description": "Minimum page height to scroll to in pixels" - }, - "INFINISCROLL_EXPAND_DETAILS": { - "type": "boolean", - "default": true, - "description": "Expand
elements and click 'load more' buttons for comments" - } - } -} diff --git a/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js b/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js deleted file mode 100755 index 8275d61c..00000000 --- a/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js +++ /dev/null @@ -1,427 +0,0 @@ -#!/usr/bin/env node -/** - * Scroll the page down to trigger infinite scroll / lazy loading. - * - * Scrolls down 1 page at a time, up to INFINISCROLL_SCROLL_LIMIT times, - * ensuring at least INFINISCROLL_MIN_HEIGHT (default 16,000px) is reached. - * Stops early if no new content loads after a scroll. - * - * Optionally expands
elements and clicks "load more" buttons. - * - * Usage: on_Snapshot__45_infiniscroll.js --url= --snapshot-id= - * Output: JSONL with scroll stats (no files created) - * - * Environment variables: - * INFINISCROLL_ENABLED: Enable/disable (default: true) - * INFINISCROLL_TIMEOUT: Max timeout in seconds (default: 120) - * INFINISCROLL_SCROLL_DELAY: Delay between scrolls in ms (default: 2000) - * INFINISCROLL_SCROLL_DISTANCE: Pixels per scroll (default: 1600) - * INFINISCROLL_SCROLL_LIMIT: Max scroll iterations (default: 10) - * INFINISCROLL_MIN_HEIGHT: Min page height to reach in px (default: 16000) - * INFINISCROLL_EXPAND_DETAILS: Expand
and comments (default: true) - */ - -const fs = require('fs'); -const path = require('path'); -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); - -const { - getEnv, - getEnvBool, - getEnvInt, -} = require('../chrome/chrome_utils.js'); - -// Check if infiniscroll is enabled BEFORE requiring puppeteer -if (!getEnvBool('INFINISCROLL_ENABLED', true)) { - console.error('Skipping infiniscroll (INFINISCROLL_ENABLED=False)'); - process.exit(0); -} - -const puppeteer = require('puppeteer-core'); - -const PLUGIN_NAME = 'infiniscroll'; -const CHROME_SESSION_DIR = '../chrome'; -const CHROME_SESSION_REQUIRED_ERROR = 'No Chrome session found (chrome plugin must run first)'; - -function parseArgs() { - const args = {}; - process.argv.slice(2).forEach(arg => { - if (arg.startsWith('--')) { - const [key, ...valueParts] = arg.slice(2).split('='); - args[key.replace(/-/g, '_')] = valueParts.join('=') || true; - } - }); - return args; -} - -function getCdpUrl() { - const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); - if (fs.existsSync(cdpFile)) { - return fs.readFileSync(cdpFile, 'utf8').trim(); - } - return null; -} - -function getPageId() { - const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt'); - if (fs.existsSync(targetIdFile)) { - return fs.readFileSync(targetIdFile, 'utf8').trim(); - } - return null; -} - -async function waitForChromeTabLoaded(timeoutMs = 60000) { - const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json'); - const startTime = Date.now(); - - while (Date.now() - startTime < timeoutMs) { - if (fs.existsSync(navigationFile)) { - return true; - } - await new Promise(resolve => setTimeout(resolve, 100)); - } - return false; -} - -function sleep(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); -} - -/** - * Expand
elements and click "load more" buttons for comments. - * Based on archivebox.ts expandComments function. - */ -async function expandDetails(page, options = {}) { - const { - timeout = 30000, - limit = 500, - delay = 500, - } = options; - - const startTime = Date.now(); - - // First, expand all
elements - const detailsExpanded = await page.evaluate(() => { - let count = 0; - // Generic
elements - document.querySelectorAll('details:not([open])').forEach(el => { - el.open = true; - count++; - }); - // Github README details sections - document.querySelectorAll('article details:not([open])').forEach(el => { - el.open = true; - count++; - }); - // Github issue discussion hidden comments - document.querySelectorAll('div.js-discussion details:not(.details-overlay):not([open])').forEach(el => { - el.open = true; - count++; - }); - // HedgeDoc/Markdown details sections - document.querySelectorAll('.markdown-body details:not([open])').forEach(el => { - el.open = true; - count++; - }); - return count; - }); - - if (detailsExpanded > 0) { - console.error(`Expanded ${detailsExpanded}
elements`); - } - - // Then click "load more" buttons for comments - const numExpanded = await page.evaluate(async ({ timeout, limit, delay }) => { - // Helper to find elements by XPath - function getElementsByXPath(xpath) { - const results = []; - const xpathResult = document.evaluate( - xpath, - document, - null, - XPathResult.ORDERED_NODE_ITERATOR_TYPE, - null - ); - let node; - while ((node = xpathResult.iterateNext()) != null) { - results.push(node); - } - return results; - } - - const wait = (ms) => new Promise(res => setTimeout(res, ms)); - - // Find all "load more" type buttons/links - const getLoadMoreLinks = () => [ - // Reddit (new) - ...document.querySelectorAll('faceplate-partial[loading=action]'), - // Reddit (old) - show more replies - ...document.querySelectorAll('a[onclick^="return morechildren"]'), - // Reddit (old) - show hidden replies - ...document.querySelectorAll('a[onclick^="return togglecomment"]'), - // Twitter/X - show more replies - ...getElementsByXPath("//*[text()='Show more replies']"), - ...getElementsByXPath("//*[text()='Show replies']"), - // Generic "load more" / "show more" buttons - ...getElementsByXPath("//*[contains(text(),'Load more')]"), - ...getElementsByXPath("//*[contains(text(),'Show more')]"), - // Hacker News - ...document.querySelectorAll('a.morelink'), - ]; - - let expanded = 0; - let loadMoreLinks = getLoadMoreLinks(); - const startTime = Date.now(); - - while (loadMoreLinks.length > 0) { - for (const link of loadMoreLinks) { - // Skip certain elements - if (link.slot === 'children') continue; - - try { - link.scrollIntoView({ behavior: 'smooth' }); - link.click(); - expanded++; - await wait(delay); - } catch (e) { - // Ignore click errors - } - - // Check limits - if (expanded >= limit) return expanded; - if (Date.now() - startTime >= timeout) return expanded; - } - - // Check for new load more links after clicking - await wait(delay); - loadMoreLinks = getLoadMoreLinks(); - } - - return expanded; - }, { timeout, 
limit, delay }); - - if (numExpanded > 0) { - console.error(`Clicked ${numExpanded} "load more" buttons`); - } - - return { - detailsExpanded, - commentsExpanded: numExpanded, - total: detailsExpanded + numExpanded, - }; -} - -async function scrollDown(page, options = {}) { - const { - timeout = 120000, - scrollDelay = 2000, - scrollDistance = 1600, - scrollLimit = 10, - minHeight = 16000, - } = options; - - const startTime = Date.now(); - - // Get page height using multiple methods (some pages use different scroll containers) - const getPageHeight = () => page.evaluate(() => { - return Math.max( - document.body.scrollHeight || 0, - document.body.offsetHeight || 0, - document.documentElement.scrollHeight || 0, - document.documentElement.offsetHeight || 0 - ); - }); - - const startingHeight = await getPageHeight(); - let lastHeight = startingHeight; - let scrollCount = 0; - let scrollPosition = 0; - - console.error(`Initial page height: ${startingHeight}px`); - - // Scroll to top first - await page.evaluate(() => { - window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); - }); - await sleep(500); - - while (scrollCount < scrollLimit) { - // Check timeout - const elapsed = Date.now() - startTime; - if (elapsed >= timeout) { - console.error(`Timeout reached after ${scrollCount} scrolls`); - break; - } - - scrollPosition = (scrollCount + 1) * scrollDistance; - console.error(`Scrolling down ${scrollCount + 1}x ${scrollDistance}px... 
(${scrollPosition}/${lastHeight})`); - - await page.evaluate((yOffset) => { - window.scrollTo({ top: yOffset, left: 0, behavior: 'smooth' }); - }, scrollPosition); - - scrollCount++; - await sleep(scrollDelay); - - // Check if new content was added (infinite scroll detection) - const newHeight = await getPageHeight(); - const addedPx = newHeight - lastHeight; - - if (addedPx > 0) { - console.error(`Detected infini-scrolling: ${lastHeight}+${addedPx} => ${newHeight}`); - } else if (scrollPosition >= newHeight + scrollDistance) { - // Reached the bottom - if (scrollCount > 2) { - console.error(`Reached bottom of page at ${newHeight}px`); - break; - } - } - - lastHeight = newHeight; - - // Check if we've reached minimum height and can stop - if (lastHeight >= minHeight && scrollPosition >= lastHeight) { - console.error(`Reached minimum height target (${minHeight}px)`); - break; - } - } - - // Scroll to absolute bottom - if (scrollPosition < lastHeight) { - await page.evaluate(() => { - window.scrollTo({ top: document.documentElement.scrollHeight, left: 0, behavior: 'smooth' }); - }); - await sleep(scrollDelay); - } - - // Scroll back to top - console.error(`Reached bottom of page at ${lastHeight}px, scrolling back to top...`); - await page.evaluate(() => { - window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); - }); - await sleep(scrollDelay); - - const totalElapsed = Date.now() - startTime; - - return { - scrollCount, - finalHeight: lastHeight, - startingHeight, - elapsedMs: totalElapsed, - }; -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__45_infiniscroll.js --url= --snapshot-id='); - process.exit(1); - } - - const timeout = getEnvInt('INFINISCROLL_TIMEOUT', 120) * 1000; - const scrollDelay = getEnvInt('INFINISCROLL_SCROLL_DELAY', 2000); - const scrollDistance = getEnvInt('INFINISCROLL_SCROLL_DISTANCE', 1600); - const 
scrollLimit = getEnvInt('INFINISCROLL_SCROLL_LIMIT', 10); - const minHeight = getEnvInt('INFINISCROLL_MIN_HEIGHT', 16000); - const expandDetailsEnabled = getEnvBool('INFINISCROLL_EXPAND_DETAILS', true); - - const cdpUrl = getCdpUrl(); - if (!cdpUrl) { - console.error(CHROME_SESSION_REQUIRED_ERROR); - process.exit(1); - } - - // Wait for page to be loaded - const pageLoaded = await waitForChromeTabLoaded(60000); - if (!pageLoaded) { - console.error('ERROR: Page not loaded after 60s (chrome_navigate must complete first)'); - process.exit(1); - } - - let browser = null; - try { - browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl }); - - const pages = await browser.pages(); - if (pages.length === 0) { - throw new Error('No pages found in browser'); - } - - // Find the right page by target ID - const targetId = getPageId(); - let page = null; - if (targetId) { - page = pages.find(p => { - const target = p.target(); - return target && target._targetId === targetId; - }); - } - if (!page) { - page = pages[pages.length - 1]; - } - - console.error(`Starting infinite scroll on ${url}`); - - // Expand
and comments before scrolling (if enabled) - let expandResult = { total: 0, detailsExpanded: 0, commentsExpanded: 0 }; - if (expandDetailsEnabled) { - console.error('Expanding
and comments...'); - expandResult = await expandDetails(page, { - timeout: Math.min(timeout / 4, 30000), - limit: 500, - delay: scrollDelay / 4, - }); - } - - const result = await scrollDown(page, { - timeout, - scrollDelay, - scrollDistance, - scrollLimit, - minHeight, - }); - - // Expand again after scrolling (new content may have loaded) - if (expandDetailsEnabled) { - const expandResult2 = await expandDetails(page, { - timeout: Math.min(timeout / 4, 30000), - limit: 500, - delay: scrollDelay / 4, - }); - expandResult.total += expandResult2.total; - expandResult.detailsExpanded += expandResult2.detailsExpanded; - expandResult.commentsExpanded += expandResult2.commentsExpanded; - } - - browser.disconnect(); - - const elapsedSec = (result.elapsedMs / 1000).toFixed(1); - const finalHeightStr = result.finalHeight.toLocaleString(); - const addedHeight = result.finalHeight - result.startingHeight; - const addedStr = addedHeight > 0 ? `+${addedHeight.toLocaleString()}px new content` : 'no new content'; - const expandStr = expandResult.total > 0 ? 
`, expanded ${expandResult.total}` : ''; - const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}${expandStr}) over ${elapsedSec}s`; - - console.error(`Success: ${outputStr}`); - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'succeeded', - output_str: outputStr, - })); - process.exit(0); - - } catch (e) { - if (browser) browser.disconnect(); - console.error(`ERROR: ${e.name}: ${e.message}`); - process.exit(1); - } -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/infiniscroll/templates/icon.html b/archivebox/plugins/infiniscroll/templates/icon.html deleted file mode 100644 index 7de95bf4..00000000 --- a/archivebox/plugins/infiniscroll/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py deleted file mode 100644 index a2c1cb58..00000000 --- a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py +++ /dev/null @@ -1,245 +0,0 @@ -""" -Integration tests for infiniscroll plugin - -Tests verify: -1. Hook script exists -2. Dependencies installed via chrome validation hooks -3. Verify deps with abx-pkg -4. INFINISCROLL_ENABLED=False skips without JSONL -5. Fails gracefully when no chrome session exists -6. Full integration test: scrolls page and outputs stats -7. 
Config options work (scroll limit, min height) -""" - -import json -import os -import re -import subprocess -import time -import tempfile -from pathlib import Path - -import pytest - -# Import shared Chrome test helpers -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_test_env, - chrome_session, -) - - -PLUGIN_DIR = Path(__file__).parent.parent -INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None) -TEST_URL = 'https://www.singsing.movie/' - - -def test_hook_script_exists(): - """Verify on_Snapshot hook exists.""" - assert INFINISCROLL_HOOK is not None, "Infiniscroll hook not found" - assert INFINISCROLL_HOOK.exists(), f"Hook not found: {INFINISCROLL_HOOK}" - - -def test_verify_deps_with_abx_pkg(): - """Verify dependencies are available via abx-pkg after hook installation.""" - from abx_pkg import Binary, EnvProvider, BinProviderOverrides - - EnvProvider.model_rebuild() - - # Verify node is available - node_binary = Binary(name='node', binproviders=[EnvProvider()]) - node_loaded = node_binary.load() - assert node_loaded and node_loaded.abspath, "Node.js required for infiniscroll plugin" - - -def test_config_infiniscroll_disabled_skips(): - """Test that INFINISCROLL_ENABLED=False exits without emitting JSONL.""" - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - env = get_test_env() - env['INFINISCROLL_ENABLED'] = 'False' - - result = subprocess.run( - ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}" - assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr" - - # Should NOT emit any JSONL - jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] - assert len(jsonl_lines) == 0, f"Should not emit JSONL when 
feature disabled, got: {jsonl_lines}" - - -def test_fails_gracefully_without_chrome_session(): - """Test that hook fails gracefully when no chrome session exists.""" - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - infiniscroll_dir = tmpdir / 'snapshot' / 'infiniscroll' - infiniscroll_dir.mkdir(parents=True, exist_ok=True) - - result = subprocess.run( - ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'], - cwd=infiniscroll_dir, - capture_output=True, - text=True, - env=get_test_env(), - timeout=30 - ) - - # Should fail (exit 1) when no chrome session - assert result.returncode != 0, "Should fail when no chrome session exists" - # Error could be about chrome/CDP not found, or puppeteer module missing - err_lower = result.stderr.lower() - assert any(x in err_lower for x in ['chrome', 'cdp', 'puppeteer', 'module']), \ - f"Should mention chrome/CDP/puppeteer in error: {result.stderr}" - - -def test_scrolls_page_and_outputs_stats(): - """Integration test: scroll page and verify JSONL output format.""" - with tempfile.TemporaryDirectory() as tmpdir: - with chrome_session( - Path(tmpdir), - crawl_id='test-infiniscroll', - snapshot_id='snap-infiniscroll', - test_url=TEST_URL, - ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env): - # Create infiniscroll output directory (sibling to chrome) - infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll' - infiniscroll_dir.mkdir() - - # Run infiniscroll hook - env['INFINISCROLL_SCROLL_LIMIT'] = '3' # Limit scrolls for faster test - env['INFINISCROLL_SCROLL_DELAY'] = '500' # Faster scrolling - env['INFINISCROLL_MIN_HEIGHT'] = '1000' # Lower threshold for test - - result = subprocess.run( - ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'], - cwd=str(infiniscroll_dir), - capture_output=True, - text=True, - timeout=60, - env=env - ) - - assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}\nStdout: 
{result.stdout}" - - # Parse JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {result.stdout}" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - # Verify output_str format: "scrolled to X,XXXpx (+Y,YYYpx new content) over Z.Zs" - output_str = result_json.get('output_str', '') - assert output_str.startswith('scrolled to'), f"output_str should start with 'scrolled to': {output_str}" - assert 'px' in output_str, f"output_str should contain pixel count: {output_str}" - assert re.search(r'over \d+(\.\d+)?s', output_str), f"output_str should contain duration: {output_str}" - - # Verify no files created in output directory - output_files = list(infiniscroll_dir.iterdir()) - assert len(output_files) == 0, f"Should not create any files, but found: {output_files}" - - -def test_config_scroll_limit_honored(): - """Test that INFINISCROLL_SCROLL_LIMIT config is respected.""" - with tempfile.TemporaryDirectory() as tmpdir: - with chrome_session( - Path(tmpdir), - crawl_id='test-scroll-limit', - snapshot_id='snap-limit', - test_url=TEST_URL, - ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env): - - infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll' - infiniscroll_dir.mkdir() - - # Set scroll limit to 2 (use env from setup_chrome_session) - env['INFINISCROLL_SCROLL_LIMIT'] = '2' - env['INFINISCROLL_SCROLL_DELAY'] = '500' - env['INFINISCROLL_MIN_HEIGHT'] = '100000' # High threshold so limit kicks in - - result = subprocess.run( - ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'], - cwd=str(infiniscroll_dir), - capture_output=True, - text=True, - timeout=60, - env=env - ) - - 
assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}" - - # Parse output and verify scroll count - result_json = None - for line in result.stdout.strip().split('\n'): - if line.strip().startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json is not None, "Should have JSONL output" - output_str = result_json.get('output_str', '') - - # Verify output format and that it completed (scroll limit enforced internally) - assert output_str.startswith('scrolled to'), f"Should have valid output_str: {output_str}" - assert result_json['status'] == 'succeeded', f"Should succeed with scroll limit: {result_json}" - - - -def test_config_timeout_honored(): - """Test that INFINISCROLL_TIMEOUT config is respected.""" - with tempfile.TemporaryDirectory() as tmpdir: - with chrome_session( - Path(tmpdir), - crawl_id='test-timeout', - snapshot_id='snap-timeout', - test_url=TEST_URL, - ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env): - - infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll' - infiniscroll_dir.mkdir() - - # Set very short timeout (use env from setup_chrome_session) - env['INFINISCROLL_TIMEOUT'] = '3' # 3 seconds - env['INFINISCROLL_SCROLL_DELAY'] = '2000' # 2s delay - timeout should trigger - env['INFINISCROLL_SCROLL_LIMIT'] = '100' # High limit - env['INFINISCROLL_MIN_HEIGHT'] = '100000' - - start_time = time.time() - result = subprocess.run( - ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'], - cwd=str(infiniscroll_dir), - capture_output=True, - text=True, - timeout=30, - env=env - ) - elapsed = time.time() - start_time - - # Should complete within reasonable time (timeout + buffer) - assert elapsed < 15, f"Should respect timeout, took {elapsed:.1f}s" - assert result.returncode == 0, f"Should complete even with timeout: {result.stderr}" - - - -if __name__ == 
'__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/istilldontcareaboutcookies/config.json b/archivebox/plugins/istilldontcareaboutcookies/config.json deleted file mode 100644 index 44c488b0..00000000 --- a/archivebox/plugins/istilldontcareaboutcookies/config.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "ISTILLDONTCAREABOUTCOOKIES_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["USE_ISTILLDONTCAREABOUTCOOKIES"], - "description": "Enable I Still Don't Care About Cookies browser extension" - } - } -} diff --git a/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__81_install_istilldontcareaboutcookies_extension.js b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__81_install_istilldontcareaboutcookies_extension.js deleted file mode 100755 index ab29cdac..00000000 --- a/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__81_install_istilldontcareaboutcookies_extension.js +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env node -/** - * I Still Don't Care About Cookies Extension Plugin - * - * Installs and configures the "I still don't care about cookies" Chrome extension - * for automatic cookie consent banner dismissal during page archiving. 
- * - * Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm - * - * Priority: 81 - Must install before Chrome session starts at Crawl level - * Hook: on_Crawl (runs once per crawl, not per snapshot) - * - * This extension automatically: - * - Dismisses cookie consent popups - * - Removes cookie banners - * - Accepts necessary cookies to proceed with browsing - * - Works on thousands of websites out of the box - */ - -const path = require('path'); -const fs = require('fs'); - -// Import extension utilities -const extensionUtils = require('../chrome/chrome_utils.js'); - -// Extension metadata -const EXTENSION = { - webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm', - name: 'istilldontcareaboutcookies', -}; - -// Get extensions directory from environment or use default -const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR || - path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions'); - -/** - * Install the I Still Don't Care About Cookies extension - */ -async function installCookiesExtension() { - console.log('[*] Installing I Still Don\'t Care About Cookies extension...'); - - // Install the extension - const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR); - - if (!extension) { - console.error('[āŒ] Failed to install I Still Don\'t Care About Cookies extension'); - return null; - } - - console.log('[+] I Still Don\'t Care About Cookies extension installed'); - console.log('[+] Cookie banners will be automatically dismissed during archiving'); - - return extension; -} - -/** - * Note: This extension works out of the box with no configuration needed. - * It automatically detects and dismisses cookie banners on page load. 
- */ - -/** - * Main entry point - install extension before archiving - */ -async function main() { - // Check if extension is already cached - const cacheFile = path.join(EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json'); - - if (fs.existsSync(cacheFile)) { - try { - const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8')); - const manifestPath = path.join(cached.unpacked_path, 'manifest.json'); - - if (fs.existsSync(manifestPath)) { - console.log('[*] I Still Don\'t Care About Cookies extension already installed (using cache)'); - return cached; - } - } catch (e) { - // Cache file corrupted, re-install - console.warn('[āš ļø] Extension cache corrupted, re-installing...'); - } - } - - // Install extension - const extension = await installCookiesExtension(); - - // Export extension metadata for chrome plugin to load - if (extension) { - // Write extension info to a cache file that chrome plugin can read - await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true }); - await fs.promises.writeFile( - cacheFile, - JSON.stringify(extension, null, 2) - ); - console.log(`[+] Extension metadata written to ${cacheFile}`); - } - - return extension; -} - -// Export functions for use by other plugins -module.exports = { - EXTENSION, - installCookiesExtension, -}; - -// Run if executed directly -if (require.main === module) { - main().then(() => { - console.log('[āœ“] I Still Don\'t Care About Cookies extension setup complete'); - process.exit(0); - }).catch(err => { - console.error('[āŒ] I Still Don\'t Care About Cookies extension setup failed:', err); - process.exit(1); - }); -} diff --git a/archivebox/plugins/istilldontcareaboutcookies/templates/icon.html b/archivebox/plugins/istilldontcareaboutcookies/templates/icon.html deleted file mode 100644 index e69de29b..00000000 diff --git a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py 
b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py deleted file mode 100644 index 1371b5c7..00000000 --- a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py +++ /dev/null @@ -1,641 +0,0 @@ -""" -Unit tests for istilldontcareaboutcookies plugin - -Tests invoke the plugin hook as an external process and verify outputs/side effects. -""" - -import json -import os -import signal -import subprocess -import tempfile -import time -from pathlib import Path - -import pytest - -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - setup_test_env, - get_test_env, - launch_chromium_session, - kill_chromium_session, - CHROME_LAUNCH_HOOK, - PLUGINS_ROOT, -) - - -PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_istilldontcareaboutcookies_extension.*'), None) - - -def test_install_script_exists(): - """Verify install script exists""" - assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}" - - -def test_extension_metadata(): - """Test that extension has correct metadata""" - with tempfile.TemporaryDirectory() as tmpdir: - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions") - - result = subprocess.run( - ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"], - capture_output=True, - text=True, - env=env - ) - - assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}" - - metadata = json.loads(result.stdout) - assert metadata["webstore_id"] == "edibdbjcniadpccecjdfdjjppcpchdlm" - assert metadata["name"] == "istilldontcareaboutcookies" - - -def test_install_creates_cache(): - """Test that install creates extension cache""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - 
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - - result = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=60 - ) - - # Check output mentions installation - assert "Installing" in result.stdout or "installed" in result.stdout or "istilldontcareaboutcookies" in result.stdout - - # Check cache file was created - cache_file = ext_dir / "istilldontcareaboutcookies.extension.json" - assert cache_file.exists(), "Cache file should be created" - - # Verify cache content - cache_data = json.loads(cache_file.read_text()) - assert cache_data["webstore_id"] == "edibdbjcniadpccecjdfdjjppcpchdlm" - assert cache_data["name"] == "istilldontcareaboutcookies" - - -def test_install_uses_existing_cache(): - """Test that install uses existing cache when available""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - # Create fake cache - fake_extension_dir = ext_dir / "edibdbjcniadpccecjdfdjjppcpchdlm__istilldontcareaboutcookies" - fake_extension_dir.mkdir(parents=True) - - manifest = {"version": "1.1.8", "name": "I still don't care about cookies"} - (fake_extension_dir / "manifest.json").write_text(json.dumps(manifest)) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - - result = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - # Should use cache or install successfully - assert result.returncode == 0 - - -def test_no_configuration_required(): - """Test that extension works without any configuration""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - # No special env vars needed - works out of the box - - result = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - 
timeout=60 - ) - - # Should not require any API keys or configuration - assert "API" not in (result.stdout + result.stderr) or result.returncode == 0 - - -TEST_URL = 'https://www.filmin.es/' - - -def test_extension_loads_in_chromium(): - """Verify extension loads in Chromium by visiting its options page. - - Uses Chromium with --load-extension to load the extension, then navigates - to chrome-extension:///options.html and checks that the extension name - appears in the page content. - """ - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Set up isolated env with proper directory structure - env = setup_test_env(tmpdir) - env.setdefault('CHROME_HEADLESS', 'true') - - ext_dir = Path(env['CHROME_EXTENSIONS_DIR']) - - # Step 1: Install the extension - result = subprocess.run( - ['node', str(INSTALL_SCRIPT)], - cwd=str(tmpdir), - capture_output=True, - text=True, - env=env, - timeout=60 - ) - assert result.returncode == 0, f"Extension install failed: {result.stderr}" - - # Verify extension cache was created - cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json' - assert cache_file.exists(), "Extension cache not created" - ext_data = json.loads(cache_file.read_text()) - print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}") - - # Step 2: Launch Chromium using the chrome hook (loads extensions automatically) - crawl_id = 'test-cookies' - crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id - crawl_dir.mkdir(parents=True, exist_ok=True) - chrome_dir = crawl_dir / 'chrome' - chrome_dir.mkdir(parents=True, exist_ok=True) - env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) - - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], - cwd=str(chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - # Wait for Chromium to launch and CDP URL to be available - cdp_url = None - for i in range(20): - if chrome_launch_process.poll() is not 
None: - stdout, stderr = chrome_launch_process.communicate() - raise RuntimeError(f"Chromium launch failed:\nStdout: {stdout}\nStderr: {stderr}") - cdp_file = chrome_dir / 'cdp_url.txt' - if cdp_file.exists(): - cdp_url = cdp_file.read_text().strip() - break - time.sleep(1) - - assert cdp_url, "Chromium CDP URL not found after 20s" - print(f"Chromium launched with CDP URL: {cdp_url}") - - # Check that extensions were loaded - extensions_file = chrome_dir / 'extensions.json' - if extensions_file.exists(): - loaded_exts = json.loads(extensions_file.read_text()) - print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}") - - try: - # Step 3: Connect to Chromium and verify extension loaded via options page - test_script = f''' -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -const puppeteer = require('puppeteer-core'); - -(async () => {{ - const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }}); - - // Wait for extension to initialize - await new Promise(r => setTimeout(r, 2000)); - - // Find extension targets to get the extension ID - const targets = browser.targets(); - const extTargets = targets.filter(t => - t.url().startsWith('chrome-extension://') || - t.type() === 'service_worker' || - t.type() === 'background_page' - ); - - // Filter out Chrome's built-in extensions - const builtinIds = ['nkeimhogjdpnpccoofpliimaahmaaome', 'fignfifoniblkonapihmkfakmlgkbkcf', - 'ahfgeienlihckogmohjhadlkjgocpleb', 'mhjfbmdgcfjbbpaeojofohoefgiehjai']; - const customExtTargets = extTargets.filter(t => {{ - const url = t.url(); - if (!url.startsWith('chrome-extension://')) return false; - const extId = url.split('://')[1].split('/')[0]; - return !builtinIds.includes(extId); - }}); - - console.error('Custom extension targets found:', customExtTargets.length); - customExtTargets.forEach(t => console.error(' -', t.type(), t.url())); - - if (customExtTargets.length === 0) {{ - console.log(JSON.stringify({{ loaded: 
false, error: 'No custom extension targets found' }})); - browser.disconnect(); - return; - }} - - // Get the extension ID from the first custom extension target - const extUrl = customExtTargets[0].url(); - const extId = extUrl.split('://')[1].split('/')[0]; - console.error('Extension ID:', extId); - - // Try to navigate to the extension's options.html page - const page = await browser.newPage(); - const optionsUrl = 'chrome-extension://' + extId + '/options.html'; - console.error('Navigating to options page:', optionsUrl); - - try {{ - await page.goto(optionsUrl, {{ waitUntil: 'domcontentloaded', timeout: 10000 }}); - const pageContent = await page.content(); - const pageTitle = await page.title(); - - // Check if extension name appears in the page - const hasExtensionName = pageContent.toLowerCase().includes('cookie') || - pageContent.toLowerCase().includes('idontcareaboutcookies') || - pageTitle.toLowerCase().includes('cookie'); - - console.log(JSON.stringify({{ - loaded: true, - extensionId: extId, - optionsPageLoaded: true, - pageTitle: pageTitle, - hasExtensionName: hasExtensionName, - contentLength: pageContent.length - }})); - }} catch (e) {{ - // options.html may not exist, but extension is still loaded - console.log(JSON.stringify({{ - loaded: true, - extensionId: extId, - optionsPageLoaded: false, - error: e.message - }})); - }} - - browser.disconnect(); -}})(); -''' - script_path = tmpdir / 'test_extension.js' - script_path.write_text(test_script) - - result = subprocess.run( - ['node', str(script_path)], - cwd=str(tmpdir), - capture_output=True, - text=True, - env=env, - timeout=90 - ) - - print(f"stderr: {result.stderr}") - print(f"stdout: {result.stdout}") - - assert result.returncode == 0, f"Test failed: {result.stderr}" - - output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')] - assert output_lines, f"No JSON output: {result.stdout}" - - test_result = json.loads(output_lines[-1]) - assert test_result.get('loaded'), \ - 
f"Extension should be loaded in Chromium. Result: {test_result}" - print(f"Extension loaded successfully: {test_result}") - - finally: - # Clean up Chromium - try: - chrome_launch_process.send_signal(signal.SIGTERM) - chrome_launch_process.wait(timeout=5) - except: - pass - chrome_pid_file = chrome_dir / 'chrome.pid' - if chrome_pid_file.exists(): - try: - chrome_pid = int(chrome_pid_file.read_text().strip()) - os.kill(chrome_pid, signal.SIGKILL) - except (OSError, ValueError): - pass - - -def check_cookie_consent_visibility(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict: - """Check if cookie consent elements are visible on a page. - - Returns dict with: - - visible: bool - whether any cookie consent element is visible - - selector: str - which selector matched (if visible) - - elements_found: list - all cookie-related elements found in DOM - - html_snippet: str - snippet of the page HTML for debugging - """ - test_script = f''' -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -const puppeteer = require('puppeteer-core'); - -(async () => {{ - const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }}); - - const page = await browser.newPage(); - await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); - await page.setViewport({{ width: 1440, height: 900 }}); - - console.error('Navigating to {test_url}...'); - await page.goto('{test_url}', {{ waitUntil: 'networkidle2', timeout: 30000 }}); - - // Wait for page to fully render and any cookie scripts to run - await new Promise(r => setTimeout(r, 3000)); - - // Check cookie consent visibility using multiple common selectors - const result = await page.evaluate(() => {{ - // Common cookie consent selectors used by various consent management platforms - const selectors = [ - // CookieYes - '.cky-consent-container', '.cky-popup-center', '.cky-overlay', 
'.cky-modal', - // OneTrust - '#onetrust-consent-sdk', '#onetrust-banner-sdk', '.onetrust-pc-dark-filter', - // Cookiebot - '#CybotCookiebotDialog', '#CybotCookiebotDialogBodyUnderlay', - // Generic cookie banners - '[class*="cookie-consent"]', '[class*="cookie-banner"]', '[class*="cookie-notice"]', - '[class*="cookie-popup"]', '[class*="cookie-modal"]', '[class*="cookie-dialog"]', - '[id*="cookie-consent"]', '[id*="cookie-banner"]', '[id*="cookie-notice"]', - '[id*="cookieconsent"]', '[id*="cookie-law"]', - // GDPR banners - '[class*="gdpr"]', '[id*="gdpr"]', - // Consent banners - '[class*="consent-banner"]', '[class*="consent-modal"]', '[class*="consent-popup"]', - // Privacy banners - '[class*="privacy-banner"]', '[class*="privacy-notice"]', - // Common frameworks - '.cc-window', '.cc-banner', '#cc-main', // Cookie Consent by Insites - '.qc-cmp2-container', // Quantcast - '.sp-message-container', // SourcePoint - ]; - - const elementsFound = []; - let visibleElement = null; - - for (const sel of selectors) {{ - try {{ - const elements = document.querySelectorAll(sel); - for (const el of elements) {{ - const style = window.getComputedStyle(el); - const rect = el.getBoundingClientRect(); - const isVisible = style.display !== 'none' && - style.visibility !== 'hidden' && - style.opacity !== '0' && - rect.width > 0 && rect.height > 0; - - elementsFound.push({{ - selector: sel, - visible: isVisible, - display: style.display, - visibility: style.visibility, - opacity: style.opacity, - width: rect.width, - height: rect.height - }}); - - if (isVisible && !visibleElement) {{ - visibleElement = {{ selector: sel, width: rect.width, height: rect.height }}; - }} - }} - }} catch (e) {{ - // Invalid selector, skip - }} - }} - - // Also grab a snippet of the HTML to help debug - const bodyHtml = document.body.innerHTML.slice(0, 2000); - const hasCookieKeyword = bodyHtml.toLowerCase().includes('cookie') || - bodyHtml.toLowerCase().includes('consent') || - 
bodyHtml.toLowerCase().includes('gdpr'); - - return {{ - visible: visibleElement !== null, - selector: visibleElement ? visibleElement.selector : null, - elements_found: elementsFound, - has_cookie_keyword_in_html: hasCookieKeyword, - html_snippet: bodyHtml.slice(0, 500) - }}; - }}); - - console.error('Cookie consent check result:', JSON.stringify({{ - visible: result.visible, - selector: result.selector, - elements_found_count: result.elements_found.length - }})); - - browser.disconnect(); - console.log(JSON.stringify(result)); -}})(); -''' - script_path = script_dir / 'check_cookies.js' - script_path.write_text(test_script) - - result = subprocess.run( - ['node', str(script_path)], - cwd=str(script_dir), - capture_output=True, - text=True, - env=env, - timeout=90 - ) - - if result.returncode != 0: - raise RuntimeError(f"Cookie check script failed: {result.stderr}") - - output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')] - if not output_lines: - raise RuntimeError(f"No JSON output from cookie check: {result.stdout}\nstderr: {result.stderr}") - - return json.loads(output_lines[-1]) - - -def test_hides_cookie_consent_on_filmin(): - """Live test: verify extension hides cookie consent popup on filmin.es. - - This test runs TWO browser sessions: - 1. WITHOUT extension - verifies cookie consent IS visible (baseline) - 2. WITH extension - verifies cookie consent is HIDDEN - - This ensures we're actually testing the extension's effect, not just - that a page happens to not have cookie consent. 
- """ - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Set up isolated env with proper directory structure - env_base = setup_test_env(tmpdir) - env_base['CHROME_HEADLESS'] = 'true' - - ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR']) - - # ============================================================ - # STEP 1: BASELINE - Run WITHOUT extension, verify cookie consent IS visible - # ============================================================ - print("\n" + "="*60) - print("STEP 1: BASELINE TEST (no extension)") - print("="*60) - - data_dir = Path(env_base['DATA_DIR']) - - env_no_ext = env_base.copy() - env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions') - (data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True) - - # Launch baseline Chromium in crawls directory - baseline_crawl_id = 'baseline-no-ext' - baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id - baseline_crawl_dir.mkdir(parents=True, exist_ok=True) - baseline_chrome_dir = baseline_crawl_dir / 'chrome' - env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir) - baseline_process = None - - try: - baseline_process, baseline_cdp_url = launch_chromium_session( - env_no_ext, baseline_chrome_dir, baseline_crawl_id - ) - print(f"Baseline Chromium launched: {baseline_cdp_url}") - - # Wait a moment for browser to be ready - time.sleep(2) - - baseline_result = check_cookie_consent_visibility( - baseline_cdp_url, TEST_URL, env_no_ext, tmpdir - ) - - print(f"Baseline result: visible={baseline_result['visible']}, " - f"elements_found={len(baseline_result['elements_found'])}") - - if baseline_result['elements_found']: - print("Elements found in baseline:") - for el in baseline_result['elements_found'][:5]: # Show first 5 - print(f" - {el['selector']}: visible={el['visible']}, " - f"display={el['display']}, size={el['width']}x{el['height']}") - - finally: - if baseline_process: - 
kill_chromium_session(baseline_process, baseline_chrome_dir) - - # Verify baseline shows cookie consent - if not baseline_result['visible']: - # If no cookie consent visible in baseline, we can't test the extension - # This could happen if: - # - The site changed and no longer shows cookie consent - # - Cookie consent is region-specific - # - Our selectors don't match this site - print("\nWARNING: No cookie consent visible in baseline!") - print(f"HTML has cookie keywords: {baseline_result.get('has_cookie_keyword_in_html')}") - print(f"HTML snippet: {baseline_result.get('html_snippet', '')[:200]}") - - pytest.fail( - f"Cannot test extension: no cookie consent visible in baseline on {TEST_URL}. " - f"Elements found: {len(baseline_result['elements_found'])}. " - f"The site may have changed or cookie consent may be region-specific." - ) - - print(f"\nāœ“ Baseline confirmed: Cookie consent IS visible (selector: {baseline_result['selector']})") - - # ============================================================ - # STEP 2: Install the extension - # ============================================================ - print("\n" + "="*60) - print("STEP 2: INSTALLING EXTENSION") - print("="*60) - - env_with_ext = env_base.copy() - env_with_ext['CHROME_EXTENSIONS_DIR'] = str(ext_dir) - - result = subprocess.run( - ['node', str(INSTALL_SCRIPT)], - cwd=str(tmpdir), - capture_output=True, - text=True, - env=env_with_ext, - timeout=60 - ) - assert result.returncode == 0, f"Extension install failed: {result.stderr}" - - cache_file = ext_dir / 'istilldontcareaboutcookies.extension.json' - assert cache_file.exists(), "Extension cache not created" - ext_data = json.loads(cache_file.read_text()) - print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}") - - # ============================================================ - # STEP 3: Run WITH extension, verify cookie consent is HIDDEN - # ============================================================ - print("\n" + 
"="*60) - print("STEP 3: TEST WITH EXTENSION") - print("="*60) - - # Launch extension test Chromium in crawls directory - ext_crawl_id = 'test-with-ext' - ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id - ext_crawl_dir.mkdir(parents=True, exist_ok=True) - ext_chrome_dir = ext_crawl_dir / 'chrome' - env_with_ext['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir) - ext_process = None - - try: - ext_process, ext_cdp_url = launch_chromium_session( - env_with_ext, ext_chrome_dir, ext_crawl_id - ) - print(f"Extension Chromium launched: {ext_cdp_url}") - - # Check that extension was loaded - extensions_file = ext_chrome_dir / 'extensions.json' - if extensions_file.exists(): - loaded_exts = json.loads(extensions_file.read_text()) - print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}") - - # Wait for extension to initialize - time.sleep(3) - - ext_result = check_cookie_consent_visibility( - ext_cdp_url, TEST_URL, env_with_ext, tmpdir - ) - - print(f"Extension result: visible={ext_result['visible']}, " - f"elements_found={len(ext_result['elements_found'])}") - - if ext_result['elements_found']: - print("Elements found with extension:") - for el in ext_result['elements_found'][:5]: - print(f" - {el['selector']}: visible={el['visible']}, " - f"display={el['display']}, size={el['width']}x{el['height']}") - - finally: - if ext_process: - kill_chromium_session(ext_process, ext_chrome_dir) - - # ============================================================ - # STEP 4: Compare results - # ============================================================ - print("\n" + "="*60) - print("STEP 4: COMPARISON") - print("="*60) - print(f"Baseline (no extension): cookie consent visible = {baseline_result['visible']}") - print(f"With extension: cookie consent visible = {ext_result['visible']}") - - assert baseline_result['visible'], \ - "Baseline should show cookie consent (this shouldn't happen, we checked above)" - - assert not ext_result['visible'], \ - f"Cookie consent should 
be HIDDEN by extension.\n" \ - f"Baseline showed consent at: {baseline_result['selector']}\n" \ - f"But with extension, consent is still visible.\n" \ - f"Elements still visible: {[e for e in ext_result['elements_found'] if e['visible']]}" - - print("\nāœ“ SUCCESS: Extension correctly hides cookie consent!") - print(f" - Baseline showed consent at: {baseline_result['selector']}") - print(f" - Extension successfully hid it") diff --git a/archivebox/plugins/mercury/config.json b/archivebox/plugins/mercury/config.json deleted file mode 100644 index 039c38a7..00000000 --- a/archivebox/plugins/mercury/config.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "properties": { - "MERCURY_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_MERCURY", "USE_MERCURY"], - "description": "Enable Mercury text extraction" - }, - "MERCURY_BINARY": { - "type": "string", - "default": "postlight-parser", - "x-aliases": ["POSTLIGHT_PARSER_BINARY"], - "description": "Path to Mercury/Postlight parser binary" - }, - "MERCURY_TIMEOUT": { - "type": "integer", - "default": 30, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for Mercury in seconds" - }, - "MERCURY_ARGS": { - "type": "array", - "items": {"type": "string"}, - "default": [], - "x-aliases": ["MERCURY_DEFAULT_ARGS"], - "description": "Default Mercury parser arguments" - }, - "MERCURY_ARGS_EXTRA": { - "type": "array", - "items": {"type": "string"}, - "default": [], - "x-aliases": ["MERCURY_EXTRA_ARGS"], - "description": "Extra arguments to append to Mercury parser command" - } - } -} diff --git a/archivebox/plugins/mercury/on_Crawl__40_mercury_install.py b/archivebox/plugins/mercury/on_Crawl__40_mercury_install.py deleted file mode 100755 index 7ec64d8b..00000000 --- a/archivebox/plugins/mercury/on_Crawl__40_mercury_install.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 -""" -Emit 
postlight-parser Binary dependency for the crawl. -""" - -import json -import os -import sys - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def output_binary(name: str, binproviders: str): - """Output Binary JSONL record for a dependency.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, - 'overrides': { - 'npm': { - 'packages': ['@postlight/parser'], - } - }, - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def main(): - mercury_enabled = get_env_bool('MERCURY_ENABLED', True) - - if not mercury_enabled: - sys.exit(0) - - output_binary(name='postlight-parser', binproviders='npm,env') - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/mercury/on_Snapshot__57_mercury.py b/archivebox/plugins/mercury/on_Snapshot__57_mercury.py deleted file mode 100644 index 1af0bdb6..00000000 --- a/archivebox/plugins/mercury/on_Snapshot__57_mercury.py +++ /dev/null @@ -1,200 +0,0 @@ -#!/usr/bin/env python3 -""" -Extract article content using Postlight's Mercury Parser. 
- -Usage: on_Snapshot__mercury.py --url= --snapshot-id= -Output: Creates mercury/ directory with content.html, content.txt, article.json - -Environment variables: - MERCURY_BINARY: Path to postlight-parser binary - MERCURY_TIMEOUT: Timeout in seconds (default: 60) - MERCURY_ARGS: Default Mercury arguments (JSON array) - MERCURY_ARGS_EXTRA: Extra arguments to append (JSON array) - TIMEOUT: Fallback timeout - -Note: Requires postlight-parser: npm install -g @postlight/parser -""" - -import html -import json -import os -import subprocess -import sys -from pathlib import Path -from urllib.parse import urlparse - -import rich_click as click - - -# Extractor metadata -PLUGIN_NAME = 'mercury' -BIN_NAME = 'postlight-parser' -BIN_PROVIDERS = 'npm,env' -OUTPUT_DIR = '.' - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def get_env_int(name: str, default: int = 0) -> int: - try: - return int(get_env(name, str(default))) - except ValueError: - return default - - -def get_env_array(name: str, default: list[str] | None = None) -> list[str]: - """Parse a JSON array from environment variable.""" - val = get_env(name, '') - if not val: - return default if default is not None else [] - try: - result = json.loads(val) - if isinstance(result, list): - return [str(item) for item in result] - return default if default is not None else [] - except json.JSONDecodeError: - return default if default is not None else [] - - -def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]: - """ - Extract article using Mercury Parser. 
- - Returns: (success, output_path, error_message) - """ - timeout = get_env_int('MERCURY_TIMEOUT') or get_env_int('TIMEOUT', 60) - mercury_args = get_env_array('MERCURY_ARGS', []) - mercury_args_extra = get_env_array('MERCURY_ARGS_EXTRA', []) - - # Output directory is current directory (hook already runs in output dir) - output_dir = Path(OUTPUT_DIR) - - try: - # Get text version - cmd_text = [binary, *mercury_args, *mercury_args_extra, url, '--format=text'] - result_text = subprocess.run(cmd_text, stdout=subprocess.PIPE, timeout=timeout, text=True) - if result_text.stdout: - sys.stderr.write(result_text.stdout) - sys.stderr.flush() - - if result_text.returncode != 0: - return False, None, f'postlight-parser failed (exit={result_text.returncode})' - - try: - text_json = json.loads(result_text.stdout) - except json.JSONDecodeError: - return False, None, 'postlight-parser returned invalid JSON' - - if text_json.get('failed'): - return False, None, 'Mercury was not able to extract article' - - # Save text content - text_content = text_json.get('content', '') - (output_dir / 'content.txt').write_text(text_content, encoding='utf-8') - - # Get HTML version - cmd_html = [binary, *mercury_args, *mercury_args_extra, url, '--format=html'] - result_html = subprocess.run(cmd_html, stdout=subprocess.PIPE, timeout=timeout, text=True) - if result_html.stdout: - sys.stderr.write(result_html.stdout) - sys.stderr.flush() - - try: - html_json = json.loads(result_html.stdout) - except json.JSONDecodeError: - html_json = {} - - # Save HTML content and metadata - html_content = html_json.pop('content', '') - # Some sources return HTML-escaped markup inside the content blob. - # If it looks heavily escaped, unescape once so it renders properly. 
- if html_content: - escaped_count = html_content.count('<') + html_content.count('>') - tag_count = html_content.count('<') - if escaped_count and escaped_count > tag_count * 2: - html_content = html.unescape(html_content) - (output_dir / 'content.html').write_text(html_content, encoding='utf-8') - - # Save article metadata - metadata = {k: v for k, v in text_json.items() if k != 'content'} - (output_dir / 'article.json').write_text(json.dumps(metadata, indent=2), encoding='utf-8') - - # Link images/ to responses capture (if available) - try: - hostname = urlparse(url).hostname or '' - if hostname: - responses_images = (output_dir / '..' / 'responses' / 'image' / hostname / 'images').resolve() - link_path = output_dir / 'images' - if responses_images.exists() and responses_images.is_dir(): - if link_path.exists() or link_path.is_symlink(): - if link_path.is_symlink() or link_path.is_file(): - link_path.unlink() - else: - # Don't remove real directories - responses_images = None - if responses_images: - rel_target = os.path.relpath(str(responses_images), str(output_dir)) - link_path.symlink_to(rel_target) - except Exception: - pass - - return True, 'content.html', '' - - except subprocess.TimeoutExpired: - return False, None, f'Timed out after {timeout} seconds' - except Exception as e: - return False, None, f'{type(e).__name__}: {e}' - - -@click.command() -@click.option('--url', required=True, help='URL to extract article from') -@click.option('--snapshot-id', required=True, help='Snapshot UUID') -def main(url: str, snapshot_id: str): - """Extract article content using Postlight's Mercury Parser.""" - - try: - # Check if mercury extraction is enabled - if not get_env_bool('MERCURY_ENABLED', True): - print('Skipping mercury (MERCURY_ENABLED=False)', file=sys.stderr) - # Temporary failure (config disabled) - NO JSONL emission - sys.exit(0) - - # Get binary from environment - binary = get_env('MERCURY_BINARY', 'postlight-parser') - - # Run extraction - success, 
output, error = extract_mercury(url, binary) - - if success: - # Success - emit ArchiveResult - result = { - 'type': 'ArchiveResult', - 'status': 'succeeded', - 'output_str': output or '' - } - print(json.dumps(result)) - sys.exit(0) - else: - # Transient error - emit NO JSONL - print(f'ERROR: {error}', file=sys.stderr) - sys.exit(1) - - except Exception as e: - # Transient error - emit NO JSONL - print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/mercury/templates/card.html b/archivebox/plugins/mercury/templates/card.html deleted file mode 100644 index cf7cdb40..00000000 --- a/archivebox/plugins/mercury/templates/card.html +++ /dev/null @@ -1,8 +0,0 @@ - -
- -
diff --git a/archivebox/plugins/mercury/templates/icon.html b/archivebox/plugins/mercury/templates/icon.html deleted file mode 100644 index bd17e0cf..00000000 --- a/archivebox/plugins/mercury/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/mercury/tests/test_mercury.py b/archivebox/plugins/mercury/tests/test_mercury.py deleted file mode 100644 index 242eb5db..00000000 --- a/archivebox/plugins/mercury/tests/test_mercury.py +++ /dev/null @@ -1,163 +0,0 @@ -""" -Integration tests for mercury plugin - -Tests verify: -1. Hook script exists -2. Dependencies installed via validation hooks -3. Verify deps with abx-pkg -4. Mercury extraction works on https://example.com -5. JSONL output is correct -6. Filesystem output contains extracted content -7. Config options work -""" - -import json -import subprocess -import sys -import tempfile -from pathlib import Path -import pytest - -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_plugin_dir, - get_hook_script, - PLUGINS_ROOT, -) - - -PLUGIN_DIR = get_plugin_dir(__file__) -MERCURY_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_mercury.*') -TEST_URL = 'https://example.com' - -def test_hook_script_exists(): - """Verify on_Snapshot hook exists.""" - assert MERCURY_HOOK.exists(), f"Hook not found: {MERCURY_HOOK}" - - -def test_verify_deps_with_abx_pkg(): - """Verify postlight-parser is available via abx-pkg.""" - from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides - - # Verify postlight-parser is available - mercury_binary = Binary( - name='postlight-parser', - binproviders=[NpmProvider(), EnvProvider()], - overrides={'npm': {'packages': ['@postlight/parser']}} - ) - mercury_loaded = mercury_binary.load() - - # If validate hook found it (exit 0), this should succeed - # If validate hook didn't find it (exit 1), this may fail unless binprovider installed it - if mercury_loaded and mercury_loaded.abspath: - assert True, "postlight-parser is available" 
- else: - pass - -def test_extracts_with_mercury_parser(): - """Test full workflow: extract with postlight-parser from real HTML via hook.""" - # Prerequisites checked by earlier test - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Create HTML source that mercury can parse - (tmpdir / 'singlefile').mkdir() - (tmpdir / 'singlefile' / 'singlefile.html').write_text( - 'Test Article' - '

Example Article

This is test content for mercury parser.

' - '' - ) - - # Run mercury extraction hook - result = subprocess.run( - [sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - ) - - assert result.returncode == 0, f"Extraction failed: {result.stderr}" - - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - # Verify filesystem output (hook writes to current directory) - output_file = tmpdir / 'content.html' - assert output_file.exists(), "content.html not created" - - content = output_file.read_text() - assert len(content) > 0, "Output should not be empty" - -def test_config_save_mercury_false_skips(): - """Test that MERCURY_ENABLED=False exits without emitting JSONL.""" - import os - - with tempfile.TemporaryDirectory() as tmpdir: - env = os.environ.copy() - env['MERCURY_ENABLED'] = 'False' - - result = subprocess.run( - [sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}" - - # Feature disabled - temporary failure, should NOT emit JSONL - assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr" - - # Should NOT emit any JSONL - jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] - assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}" - - -def test_fails_gracefully_without_html(): - """Test that 
mercury works even without HTML source (fetches URL directly).""" - with tempfile.TemporaryDirectory() as tmpdir: - result = subprocess.run( - [sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=30 - ) - - # Mercury fetches URL directly with postlight-parser, doesn't need HTML source - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - # Mercury should succeed or fail based on network, not based on HTML source - assert result_json, "Should emit ArchiveResult" - assert result_json['status'] in ['succeeded', 'failed'], f"Should succeed or fail: {result_json}" - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/modalcloser/config.json b/archivebox/plugins/modalcloser/config.json deleted file mode 100644 index 7e746087..00000000 --- a/archivebox/plugins/modalcloser/config.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "MODALCLOSER_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["CLOSE_MODALS", "AUTO_CLOSE_MODALS"], - "description": "Enable automatic modal and dialog closing" - }, - "MODALCLOSER_TIMEOUT": { - "type": "integer", - "default": 1250, - "minimum": 100, - "description": "Delay before auto-closing dialogs (ms)" - }, - "MODALCLOSER_POLL_INTERVAL": { - "type": "integer", - "default": 500, - "minimum": 100, - "description": "How often to check for CSS modals (ms)" - } - } -} diff --git a/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js b/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js 
deleted file mode 100644 index 7f9e664b..00000000 --- a/archivebox/plugins/modalcloser/on_Snapshot__15_modalcloser.bg.js +++ /dev/null @@ -1,333 +0,0 @@ -#!/usr/bin/env node -/** - * Auto-close browser dialogs and CSS modals. - * - * Runs as a background script that sets up listeners BEFORE navigation, - * so it catches modals that appear on page load. - * - * Handles: - * - Browser dialogs (alert, confirm, prompt, beforeunload) - * - Framework modals (Bootstrap, Tailwind, shadcn, Angular Material, jQuery UI, SweetAlert) - * - Cookie consent banners, newsletter popups, age gates - * - * Usage: on_Snapshot__15_modalcloser.bg.js --url= --snapshot-id= - * Output: JSONL with modal close stats (no files created) - * Termination: Send SIGTERM to exit cleanly - * - * Environment variables: - * MODALCLOSER_ENABLED: Enable/disable (default: true) - * MODALCLOSER_TIMEOUT: Delay before auto-closing dialogs in ms (default: 1250) - * MODALCLOSER_POLL_INTERVAL: How often to check for CSS modals in ms (default: 500) - */ - -const fs = require('fs'); -const path = require('path'); - -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); - -// Import shared utilities from chrome_utils.js -const { - getEnvBool, - getEnvInt, - parseArgs, - readCdpUrl, - readTargetId, -} = require('../chrome/chrome_utils.js'); - -// Check if modalcloser is enabled BEFORE requiring puppeteer -if (!getEnvBool('MODALCLOSER_ENABLED', true)) { - console.error('Skipping modalcloser (MODALCLOSER_ENABLED=False)'); - process.exit(0); -} - -const puppeteer = require('puppeteer-core'); - -const PLUGIN_NAME = 'modalcloser'; -const CHROME_SESSION_DIR = '../chrome'; - -function sleep(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); -} - -/** - * Close CSS modals using framework-specific dismiss methods. - * Returns the number of modals closed. 
- */ -async function closeModals(page) { - return page.evaluate(() => { - let closed = 0; - - // Bootstrap 4/5 - use Bootstrap's modal API - if (typeof bootstrap !== 'undefined' && bootstrap.Modal) { - document.querySelectorAll('.modal.show').forEach(el => { - try { - const modal = bootstrap.Modal.getInstance(el); - if (modal) { modal.hide(); closed++; } - } catch (e) {} - }); - } - - // Bootstrap 3 / jQuery - use jQuery modal API - if (typeof jQuery !== 'undefined' && jQuery.fn && jQuery.fn.modal) { - try { - const $modals = jQuery('.modal.in, .modal.show'); - if ($modals.length > 0) { - $modals.modal('hide'); - closed += $modals.length; - } - } catch (e) {} - } - - // shadcn/Radix UI - fire escape key to dismiss - document.querySelectorAll('[data-radix-dialog-overlay], [data-state="open"][role="dialog"]').forEach(el => { - try { - el.dispatchEvent(new KeyboardEvent('keydown', { key: 'Escape', bubbles: true, cancelable: true })); - closed++; - } catch (e) {} - }); - - // Angular Material - click backdrop to dismiss - document.querySelectorAll('.cdk-overlay-backdrop').forEach(el => { - try { - el.click(); - closed++; - } catch (e) {} - }); - - // Tailwind / Headless UI - dispatch escape key - document.querySelectorAll('[role="dialog"][aria-modal="true"]').forEach(el => { - try { - el.dispatchEvent(new KeyboardEvent('keydown', { key: 'Escape', bubbles: true, cancelable: true })); - closed++; - } catch (e) {} - }); - - // jQuery UI Dialog - if (typeof jQuery !== 'undefined' && jQuery.ui && jQuery.ui.dialog) { - try { - const $dialogs = jQuery('.ui-dialog-content'); - if ($dialogs.length > 0) { - $dialogs.dialog('close'); - closed += $dialogs.length; - } - } catch (e) {} - } - - // SweetAlert2 - if (typeof Swal !== 'undefined' && Swal.close) { - try { Swal.close(); closed++; } catch (e) {} - } - - // SweetAlert 1 - if (typeof swal !== 'undefined' && swal.close) { - try { swal.close(); closed++; } catch (e) {} - } - - // Generic fallback - hide unrecognized modals with 
CSS - const genericSelectors = [ - // CookieYes (cky) - '.cky-consent-container', '.cky-popup-center', '.cky-overlay', '.cky-modal', '#ckyPreferenceCenter', - // OneTrust - '#onetrust-consent-sdk', '#onetrust-banner-sdk', '.onetrust-pc-dark-filter', '#onetrust-pc-sdk', - // CookieBot - '#CybotCookiebotDialog', '#CybotCookiebotDialogBodyUnderlay', '#CookiebotWidget', - // Quantcast / CMP - '.qc-cmp-ui-container', '#qc-cmp2-container', '.qc-cmp2-summary-buttons', - // TrustArc / TrustE - '#truste-consent-track', '.truste-banner', '#truste-consent-content', - // Osano - '.osano-cm-window', '.osano-cm-dialog', - // Klaro - '.klaro .cookie-modal', '.klaro .cookie-notice', - // Tarteaucitron - '#tarteaucitronRoot', '#tarteaucitronAlertBig', - // Complianz (WordPress) - '.cmplz-cookiebanner', '#cmplz-cookiebanner-container', - // GDPR Cookie Consent (WordPress) - '#gdpr-cookie-consent-bar', '.gdpr-cookie-consent-popup', - // Cookie Notice (WordPress) - '#cookie-notice', '.cookie-notice-container', - // EU Cookie Law - '.eupopup', '#eu-cookie-law', - // Didomi - '#didomi-popup', '#didomi-host', '.didomi-popup-container', - // Usercentrics - '#usercentrics-root', '.uc-banner', - // Axeptio - '#axeptio_overlay', '#axeptio_btn', - // iubenda - '#iubenda-cs-banner', '.iubenda-cs-container', - // Termly - '.termly-consent-banner', '#termly-code-snippet-support', - // Borlabs Cookie (WordPress) - '#BorlabsCookieBox', '.BorlabsCookie', - // CookieFirst - '.cookiefirst-root', '#cookiefirst-root', - // CookieScript - '#cookiescript_injected', '.cookiescript_injected_wrapper', - // Civic Cookie Control - '#ccc', '#ccc-overlay', - // Generic patterns - '#cookie-consent', '.cookie-banner', '.cookie-notice', - '#cookieConsent', '.cookie-consent', '.cookies-banner', - '[class*="cookie"][class*="banner"]', '[class*="cookie"][class*="notice"]', - '[class*="cookie"][class*="popup"]', '[class*="cookie"][class*="modal"]', - '[class*="consent"][class*="banner"]', 
'[class*="consent"][class*="popup"]', - '[class*="gdpr"]', '[class*="privacy"][class*="banner"]', - // Modal overlays and backdrops - '.modal-overlay:not([style*="display: none"])', - '.modal-backdrop:not([style*="display: none"])', - '.overlay-visible', - // Popup overlays - '.popup-overlay', '.newsletter-popup', '.age-gate', - '.subscribe-popup', '.subscription-modal', - // Generic modal patterns - '[class*="modal"][class*="open"]:not(.modal-open)', - '[class*="modal"][class*="show"][class*="overlay"]', - '[class*="modal"][class*="visible"]', - '[class*="dialog"][class*="open"]', - '[class*="overlay"][class*="visible"]', - // Interstitials - '.interstitial', '.interstitial-wrapper', - '[class*="interstitial"]', - ]; - - genericSelectors.forEach(selector => { - try { - document.querySelectorAll(selector).forEach(el => { - // Skip if already hidden - const style = window.getComputedStyle(el); - if (style.display === 'none' || style.visibility === 'hidden') return; - - el.style.display = 'none'; - el.style.visibility = 'hidden'; - el.style.opacity = '0'; - el.style.pointerEvents = 'none'; - closed++; - }); - } catch (e) {} - }); - - // Remove body scroll lock (common pattern when modals are open) - try { - document.body.style.overflow = ''; - document.body.style.position = ''; - document.body.classList.remove('modal-open', 'overflow-hidden', 'no-scroll', 'scroll-locked'); - document.documentElement.style.overflow = ''; - document.documentElement.classList.remove('overflow-hidden', 'no-scroll'); - } catch (e) {} - - return closed; - }); -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__15_modalcloser.bg.js --url= --snapshot-id='); - process.exit(1); - } - - const dialogTimeout = getEnvInt('MODALCLOSER_TIMEOUT', 1250); - const pollInterval = getEnvInt('MODALCLOSER_POLL_INTERVAL', 500); - - const cdpUrl = 
readCdpUrl(CHROME_SESSION_DIR); - if (!cdpUrl) { - console.error('No Chrome session found (chrome plugin must run first)'); - process.exit(1); - } - - let browser = null; - let dialogsClosed = 0; - let cssModalsClosed = 0; - let running = true; - - // Handle SIGTERM for clean exit - process.on('SIGTERM', () => { - running = false; - const total = dialogsClosed + cssModalsClosed; - console.error(`Modalcloser exiting: closed ${dialogsClosed} dialogs, ${cssModalsClosed} CSS modals`); - - const outputStr = total > 0 - ? `closed ${total} modals (${dialogsClosed} dialogs, ${cssModalsClosed} CSS)` - : 'no modals detected'; - - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'succeeded', - output_str: outputStr, - })); - - if (browser) browser.disconnect(); - process.exit(0); - }); - - try { - browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl }); - - const pages = await browser.pages(); - if (pages.length === 0) { - throw new Error('No pages found in browser'); - } - - // Find the right page by target ID - const targetId = readTargetId(CHROME_SESSION_DIR); - let page = null; - if (targetId) { - page = pages.find(p => { - const target = p.target(); - return target && target._targetId === targetId; - }); - } - if (!page) { - page = pages[pages.length - 1]; - } - - // console.error(`Modalcloser listening on ${url}`); - - // Set up dialog handler (for JS alert/confirm/prompt/beforeunload) - page.on('dialog', async (dialog) => { - const type = dialog.type(); - const message = dialog.message().substring(0, 100); - console.error(`Auto-closing dialog: ${type} - "${message}"`); - - // Small delay before accepting (some pages expect a brief pause) - await sleep(dialogTimeout); - try { - await dialog.accept(); - dialogsClosed++; - } catch (e) { - // Dialog may have been dismissed by page - } - }); - - // Poll for CSS modals - while (running) { - try { - const closed = await closeModals(page); - if (closed > 0) { - console.error(`Closed ${closed} CSS 
modals`); - cssModalsClosed += closed; - } - } catch (e) { - // Page may have navigated or been closed - if (!running) break; - } - await sleep(pollInterval); - } - - } catch (e) { - if (browser) browser.disconnect(); - console.error(`ERROR: ${e.name}: ${e.message}`); - process.exit(1); - } -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/modalcloser/templates/icon.html b/archivebox/plugins/modalcloser/templates/icon.html deleted file mode 100644 index e58b588b..00000000 --- a/archivebox/plugins/modalcloser/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/modalcloser/tests/test_modalcloser.py b/archivebox/plugins/modalcloser/tests/test_modalcloser.py deleted file mode 100644 index 53c62479..00000000 --- a/archivebox/plugins/modalcloser/tests/test_modalcloser.py +++ /dev/null @@ -1,454 +0,0 @@ -""" -Integration tests for modalcloser plugin - -Tests verify: -1. Hook script exists -2. Dependencies installed via chrome validation hooks -3. Verify deps with abx-pkg -4. MODALCLOSER_ENABLED=False skips without JSONL -5. Fails gracefully when no chrome session exists -6. Background script runs and handles SIGTERM correctly -7. Config options work (timeout, poll interval) -8. 
Live test: hides cookie consent on filmin.es -""" - -import json -import os -import signal -import subprocess -import time -import tempfile -from pathlib import Path - -import pytest - -# Import shared Chrome test helpers -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_test_env, - chrome_session, -) - - -PLUGIN_DIR = Path(__file__).parent.parent -MODALCLOSER_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_modalcloser.*'), None) -TEST_URL = 'https://www.singsing.movie/' -COOKIE_CONSENT_TEST_URL = 'https://www.filmin.es/' - - -def test_hook_script_exists(): - """Verify on_Snapshot hook exists.""" - assert MODALCLOSER_HOOK is not None, "Modalcloser hook not found" - assert MODALCLOSER_HOOK.exists(), f"Hook not found: {MODALCLOSER_HOOK}" - - -def test_verify_deps_with_abx_pkg(): - """Verify dependencies are available via abx-pkg after hook installation.""" - from abx_pkg import Binary, EnvProvider - - EnvProvider.model_rebuild() - - # Verify node is available - node_binary = Binary(name='node', binproviders=[EnvProvider()]) - node_loaded = node_binary.load() - assert node_loaded and node_loaded.abspath, "Node.js required for modalcloser plugin" - - -def test_config_modalcloser_disabled_skips(): - """Test that MODALCLOSER_ENABLED=False exits without emitting JSONL.""" - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - env = get_test_env() - env['MODALCLOSER_ENABLED'] = 'False' - - result = subprocess.run( - ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}" - assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr" - - # Should NOT emit any JSONL - jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] - assert len(jsonl_lines) == 0, f"Should 
not emit JSONL when feature disabled, got: {jsonl_lines}" - - -def test_fails_gracefully_without_chrome_session(): - """Test that hook fails gracefully when no chrome session exists.""" - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - modalcloser_dir = tmpdir / 'snapshot' / 'modalcloser' - modalcloser_dir.mkdir(parents=True, exist_ok=True) - - result = subprocess.run( - ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'], - cwd=modalcloser_dir, - capture_output=True, - text=True, - env=get_test_env(), - timeout=30 - ) - - # Should fail (exit 1) when no chrome session - assert result.returncode != 0, "Should fail when no chrome session exists" - # Error could be about chrome/CDP not found, or puppeteer module missing - err_lower = result.stderr.lower() - assert any(x in err_lower for x in ['chrome', 'cdp', 'puppeteer', 'module']), \ - f"Should mention chrome/CDP/puppeteer in error: {result.stderr}" - - -def test_background_script_handles_sigterm(): - """Test that background script runs and handles SIGTERM correctly.""" - with tempfile.TemporaryDirectory() as tmpdir: - modalcloser_process = None - try: - with chrome_session( - Path(tmpdir), - crawl_id='test-modalcloser', - snapshot_id='snap-modalcloser', - test_url=TEST_URL, - ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env): - # Create modalcloser output directory (sibling to chrome) - modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser' - modalcloser_dir.mkdir() - - # Run modalcloser as background process (use env from setup_chrome_session) - env['MODALCLOSER_POLL_INTERVAL'] = '200' # Faster polling for test - - modalcloser_process = subprocess.Popen( - ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-modalcloser'], - cwd=str(modalcloser_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - # Let it run for a bit - time.sleep(2) - - # Verify it's still running (background 
script) - assert modalcloser_process.poll() is None, "Modalcloser should still be running as background process" - - # Send SIGTERM - modalcloser_process.send_signal(signal.SIGTERM) - stdout, stderr = modalcloser_process.communicate(timeout=5) - - assert modalcloser_process.returncode == 0, f"Should exit 0 on SIGTERM: {stderr}" - - # Parse JSONL output - result_json = None - for line in stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {stdout}" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - # Verify output_str format - output_str = result_json.get('output_str', '') - assert 'modal' in output_str.lower() or 'dialog' in output_str.lower(), \ - f"output_str should mention modals/dialogs: {output_str}" - - # Verify no files created in output directory - output_files = list(modalcloser_dir.iterdir()) - assert len(output_files) == 0, f"Should not create any files, but found: {output_files}" - - finally: - if modalcloser_process and modalcloser_process.poll() is None: - modalcloser_process.kill() - - -def test_dialog_handler_logs_dialogs(): - """Test that dialog handler is set up correctly.""" - with tempfile.TemporaryDirectory() as tmpdir: - modalcloser_process = None - try: - with chrome_session( - Path(tmpdir), - crawl_id='test-dialog', - snapshot_id='snap-dialog', - test_url=TEST_URL, - ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env): - - modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser' - modalcloser_dir.mkdir() - - # Use env from setup_chrome_session - env['MODALCLOSER_TIMEOUT'] = '100' # Fast timeout for test - env['MODALCLOSER_POLL_INTERVAL'] = '200' - - modalcloser_process = subprocess.Popen( - ['node', str(MODALCLOSER_HOOK), 
f'--url={TEST_URL}', '--snapshot-id=snap-dialog'], - cwd=str(modalcloser_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - # Let it run briefly - time.sleep(1.5) - - # Verify it's running - assert modalcloser_process.poll() is None, "Should be running" - - # Check stderr for "listening" message - # Note: Can't read stderr while process is running without blocking, - # so we just verify it exits cleanly - modalcloser_process.send_signal(signal.SIGTERM) - stdout, stderr = modalcloser_process.communicate(timeout=5) - - assert 'listening' in stderr.lower() or 'modalcloser' in stderr.lower(), \ - f"Should log startup message: {stderr}" - assert modalcloser_process.returncode == 0, f"Should exit cleanly: {stderr}" - - finally: - if modalcloser_process and modalcloser_process.poll() is None: - modalcloser_process.kill() - - -def test_config_poll_interval(): - """Test that MODALCLOSER_POLL_INTERVAL config is respected.""" - with tempfile.TemporaryDirectory() as tmpdir: - chrome_launch_process = None - chrome_pid = None - modalcloser_process = None - try: - with chrome_session( - Path(tmpdir), - crawl_id='test-poll', - snapshot_id='snap-poll', - test_url=TEST_URL, - ) as (chrome_launch_process, chrome_pid, snapshot_chrome_dir, env): - - modalcloser_dir = snapshot_chrome_dir.parent / 'modalcloser' - modalcloser_dir.mkdir() - - # Set very short poll interval (use env from setup_chrome_session) - env['MODALCLOSER_POLL_INTERVAL'] = '100' # 100ms - - modalcloser_process = subprocess.Popen( - ['node', str(MODALCLOSER_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-poll'], - cwd=str(modalcloser_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - # Run for short time - time.sleep(1) - - # Should still be running - assert modalcloser_process.poll() is None, "Should still be running" - - # Clean exit - modalcloser_process.send_signal(signal.SIGTERM) - stdout, stderr = 
modalcloser_process.communicate(timeout=5) - - assert modalcloser_process.returncode == 0, f"Should exit 0: {stderr}" - - # Verify JSONL output exists - result_json = None - for line in stdout.strip().split('\n'): - if line.strip().startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json is not None, "Should have JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - finally: - if modalcloser_process and modalcloser_process.poll() is None: - modalcloser_process.kill() - - -def test_hides_cookie_consent_on_filmin(): - """Live test: verify modalcloser hides cookie consent popup on filmin.es.""" - # Create a test script that uses puppeteer directly - test_script = ''' -const puppeteer = require('puppeteer-core'); - -async function closeModals(page) { - return page.evaluate(() => { - let closed = 0; - - // Bootstrap 4/5 - if (typeof bootstrap !== 'undefined' && bootstrap.Modal) { - document.querySelectorAll('.modal.show').forEach(el => { - try { - const modal = bootstrap.Modal.getInstance(el); - if (modal) { modal.hide(); closed++; } - } catch (e) {} - }); - } - - // Bootstrap 3 / jQuery - if (typeof jQuery !== 'undefined' && jQuery.fn && jQuery.fn.modal) { - try { - const $modals = jQuery('.modal.in, .modal.show'); - if ($modals.length > 0) { - $modals.modal('hide'); - closed += $modals.length; - } - } catch (e) {} - } - - // Generic selectors including cookie consent - const genericSelectors = [ - // CookieYes (cky) specific selectors - '.cky-consent-container', - '.cky-popup-center', - '.cky-overlay', - '.cky-modal', - '#ckyPreferenceCenter', - // Generic cookie consent - '#cookie-consent', '.cookie-banner', '.cookie-notice', - '#cookieConsent', '.cookie-consent', '.cookies-banner', - '[class*="cookie"][class*="banner"]', - '[class*="cookie"][class*="notice"]', - '[class*="consent"]', - 
'[class*="gdpr"]', - '.modal-overlay', '.modal-backdrop', - '.popup-overlay', '.newsletter-popup', - ]; - - genericSelectors.forEach(selector => { - try { - document.querySelectorAll(selector).forEach(el => { - const style = window.getComputedStyle(el); - if (style.display === 'none' || style.visibility === 'hidden') return; - el.style.display = 'none'; - el.style.visibility = 'hidden'; - el.style.opacity = '0'; - el.style.pointerEvents = 'none'; - closed++; - }); - } catch (e) {} - }); - - document.body.style.overflow = ''; - document.body.classList.remove('modal-open', 'overflow-hidden', 'no-scroll'); - - return closed; - }); -} - -async function main() { - const browser = await puppeteer.launch({ - headless: 'new', - executablePath: process.env.CHROME_BINARY || '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled'] - }); - - const page = await browser.newPage(); - // Set real user agent to bypass headless detection - await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); - await page.setViewport({ width: 1440, height: 900 }); - - console.error('Navigating to filmin.es...'); - await page.goto('https://www.filmin.es/', { waitUntil: 'networkidle2', timeout: 30000 }); - - // Wait for cookie consent to appear - await new Promise(r => setTimeout(r, 3000)); - - // Check BEFORE - const before = await page.evaluate(() => { - const el = document.querySelector('.cky-consent-container'); - if (!el) return { found: false }; - const style = window.getComputedStyle(el); - return { found: true, display: style.display, visibility: style.visibility }; - }); - - console.error('Before:', JSON.stringify(before)); - - // Run modal closer - const closed = await closeModals(page); - console.error('Closed:', closed, 'modals'); - - // Check AFTER - const after = await page.evaluate(() => { 
- const el = document.querySelector('.cky-consent-container'); - if (!el) return { found: false }; - const style = window.getComputedStyle(el); - return { found: true, display: style.display, visibility: style.visibility }; - }); - - console.error('After:', JSON.stringify(after)); - - await browser.close(); - - // Output result as JSON for Python to parse - const result = { - before_found: before.found, - before_visible: before.found && before.display !== 'none' && before.visibility !== 'hidden', - after_hidden: !after.found || after.display === 'none' || after.visibility === 'hidden', - modals_closed: closed - }; - console.log(JSON.stringify(result)); -} - -main().catch(e => { - console.error('Error:', e.message); - process.exit(1); -}); -''' - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - script_path = tmpdir / 'test_cookie_consent.js' - script_path.write_text(test_script) - - env = get_test_env() - - result = subprocess.run( - ['node', str(script_path)], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=60 - ) - - print(f"stderr: {result.stderr}") - print(f"stdout: {result.stdout}") - - assert result.returncode == 0, f"Test script failed: {result.stderr}" - - # Parse the JSON output - output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')] - assert len(output_lines) > 0, f"No JSON output from test script. stdout: {result.stdout}" - - test_result = json.loads(output_lines[-1]) - - # The cookie consent should have been found initially (or page changed) - # After running closeModals, it should be hidden - if test_result['before_found']: - assert test_result['after_hidden'], \ - f"Cookie consent should be hidden after modalcloser. Result: {test_result}" - assert test_result['modals_closed'] > 0, \ - f"Should have closed at least one modal. 
Result: {test_result}" - else: - # Page may have changed, just verify no errors - print("Cookie consent element not found (page may have changed)") - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/npm/on_Binary__10_npm_install.py b/archivebox/plugins/npm/on_Binary__10_npm_install.py deleted file mode 100644 index f0b43893..00000000 --- a/archivebox/plugins/npm/on_Binary__10_npm_install.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -""" -Install a binary using npm package manager. - -Usage: on_Binary__install_using_npm_provider.py --binary-id= --machine-id= --name= [--custom-cmd=] -Output: Binary JSONL record to stdout after installation - -Environment variables: - MACHINE_ID: Machine UUID (set by orchestrator) - LIB_DIR: Library directory including machine type (e.g., data/lib/arm64-darwin) (required) -""" - -import json -import os -import sys -from pathlib import Path - -import rich_click as click -from abx_pkg import Binary, NpmProvider, BinProviderOverrides - -# Fix pydantic forward reference issue -NpmProvider.model_rebuild() - - -@click.command() -@click.option('--machine-id', required=True, help="Machine UUID") -@click.option('--binary-id', required=True, help="Dependency UUID") -@click.option('--name', required=True, help="Binary name to install") -@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)") -@click.option('--custom-cmd', default=None, help="Custom install command") -@click.option('--overrides', default=None, help="JSON-encoded overrides dict") -def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_cmd: str | None, overrides: str | None): - """Install binary using npm.""" - - if binproviders != '*' and 'npm' not in binproviders.split(','): - click.echo(f"npm provider not allowed for {name}", err=True) - sys.exit(0) - - # Get LIB_DIR from environment (required) - # Note: LIB_DIR already includes machine type (e.g., 
data/lib/arm64-darwin) - lib_dir = os.environ.get('LIB_DIR') - - if not lib_dir: - click.echo("ERROR: LIB_DIR environment variable not set", err=True) - sys.exit(1) - - # Structure: lib/arm64-darwin/npm (npm will create node_modules inside this) - npm_prefix = Path(lib_dir) / 'npm' - npm_prefix.mkdir(parents=True, exist_ok=True) - - # Use abx-pkg NpmProvider to install binary with custom prefix - provider = NpmProvider(npm_prefix=npm_prefix) - if not provider.INSTALLER_BIN: - click.echo("npm not available on this system", err=True) - sys.exit(1) - - click.echo(f"Installing {name} via npm to {npm_prefix}...", err=True) - - try: - # Parse overrides if provided - overrides_dict = None - if overrides: - try: - overrides_dict = json.loads(overrides) - click.echo(f"Using custom install overrides: {overrides_dict}", err=True) - except json.JSONDecodeError: - click.echo(f"Warning: Failed to parse overrides JSON: {overrides}", err=True) - - binary = Binary(name=name, binproviders=[provider], overrides=overrides_dict or {}).install() - except Exception as e: - click.echo(f"npm install failed: {e}", err=True) - sys.exit(1) - - if not binary.abspath: - click.echo(f"{name} not found after npm install", err=True) - sys.exit(1) - - machine_id = os.environ.get('MACHINE_ID', '') - - # Output Binary JSONL record to stdout - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'npm', - 'machine_id': machine_id, - 'binary_id': binary_id, - } - print(json.dumps(record)) - - # Emit PATH update for npm bin dirs (node_modules/.bin preferred) - npm_bin_dirs = [ - str(npm_prefix / 'node_modules' / '.bin'), - str(npm_prefix / 'bin'), - ] - current_path = os.environ.get('PATH', '') - path_dirs = current_path.split(':') if current_path else [] - new_path = current_path - - for npm_bin_dir in npm_bin_dirs: - if npm_bin_dir and npm_bin_dir not in path_dirs: - 
new_path = f"{npm_bin_dir}:{new_path}" if new_path else npm_bin_dir - path_dirs.insert(0, npm_bin_dir) - - print(json.dumps({ - 'type': 'Machine', - 'config': { - 'PATH': new_path, - }, - })) - - # Also emit NODE_MODULES_DIR for JS module resolution - node_modules_dir = str(npm_prefix / 'node_modules') - print(json.dumps({ - 'type': 'Machine', - 'config': { - 'NODE_MODULES_DIR': node_modules_dir, - }, - })) - - # Log human-readable info to stderr - click.echo(f"Installed {name} at {binary.abspath}", err=True) - click.echo(f" version: {binary.version}", err=True) - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/npm/on_Crawl__00_npm_install.py b/archivebox/plugins/npm/on_Crawl__00_npm_install.py deleted file mode 100644 index 5660dd01..00000000 --- a/archivebox/plugins/npm/on_Crawl__00_npm_install.py +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python3 -""" -Emit node/npm Binary dependencies for the crawl. - -This hook runs early in the Crawl lifecycle so node/npm are installed -before any npm-based extractors (e.g., puppeteer) run. 
-""" - -import json -import os -import sys - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - - -def output_binary(name: str, binproviders: str, overrides: dict | None = None) -> None: - machine_id = os.environ.get('MACHINE_ID', '') - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, - 'machine_id': machine_id, - } - if overrides: - record['overrides'] = overrides - print(json.dumps(record)) - - -def main() -> None: - output_binary( - name='node', - binproviders='apt,brew,env', - overrides={'apt': {'packages': ['nodejs']}}, - ) - - output_binary( - name='npm', - binproviders='apt,brew,env', - overrides={ - 'apt': {'packages': ['nodejs', 'npm']}, - 'brew': {'packages': ['node']}, - }, - ) - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/npm/templates/icon.html b/archivebox/plugins/npm/templates/icon.html deleted file mode 100644 index e69de29b..00000000 diff --git a/archivebox/plugins/npm/tests/test_npm_provider.py b/archivebox/plugins/npm/tests/test_npm_provider.py deleted file mode 100644 index 9f00d9d7..00000000 --- a/archivebox/plugins/npm/tests/test_npm_provider.py +++ /dev/null @@ -1,144 +0,0 @@ -""" -Tests for the npm binary provider plugin. - -Tests cover: -1. Hook script execution -2. npm package installation -3. PATH and NODE_MODULES_DIR updates -4. 
JSONL output format -""" - -import json -import os -import shutil -import subprocess -import sys -import tempfile -from pathlib import Path - -import pytest -from django.test import TestCase - - -# Get the path to the npm provider hook -PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_HOOK = next(PLUGIN_DIR.glob('on_Binary__*_npm_install.py'), None) - - -def npm_available() -> bool: - """Check if npm is installed.""" - return shutil.which('npm') is not None - - -class TestNpmProviderHook(TestCase): - """Test the npm binary provider installation hook.""" - - def setUp(self): - """Set up test environment.""" - self.temp_dir = tempfile.mkdtemp() - self.lib_dir = Path(self.temp_dir) / 'lib' / 'x86_64-linux' - self.lib_dir.mkdir(parents=True) - - def tearDown(self): - """Clean up.""" - shutil.rmtree(self.temp_dir, ignore_errors=True) - - def test_hook_script_exists(self): - """Hook script should exist.""" - self.assertTrue(INSTALL_HOOK and INSTALL_HOOK.exists(), f"Hook not found: {INSTALL_HOOK}") - - def test_hook_requires_lib_dir(self): - """Hook should fail when LIB_DIR is not set.""" - env = os.environ.copy() - env.pop('LIB_DIR', None) # Remove LIB_DIR - - result = subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=some-package', - '--binary-id=test-uuid', - '--machine-id=test-machine', - ], - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - self.assertIn('LIB_DIR environment variable not set', result.stderr) - self.assertEqual(result.returncode, 1) - - def test_hook_skips_when_npm_not_allowed(self): - """Hook should skip when npm not in allowed binproviders.""" - env = os.environ.copy() - env['LIB_DIR'] = str(self.lib_dir) - - result = subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=some-package', - '--binary-id=test-uuid', - '--machine-id=test-machine', - '--binproviders=pip,apt', # npm not allowed - ], - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - # Should exit cleanly (code 0) when npm 
not allowed - self.assertIn('npm provider not allowed', result.stderr) - self.assertEqual(result.returncode, 0) - - def test_hook_creates_npm_prefix(self): - """Hook should create npm prefix directory.""" - assert npm_available(), "npm not installed" - env = os.environ.copy() - env['LIB_DIR'] = str(self.lib_dir) - - # Even if installation fails, the npm prefix should be created - subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=nonexistent-xyz123', - '--binary-id=test-uuid', - '--machine-id=test-machine', - ], - capture_output=True, - text=True, - env=env, - timeout=60 - ) - - npm_prefix = self.lib_dir / 'npm' - self.assertTrue(npm_prefix.exists()) - - def test_hook_handles_overrides(self): - """Hook should accept overrides JSON.""" - env = os.environ.copy() - env['LIB_DIR'] = str(self.lib_dir) - - overrides = json.dumps({'npm': {'packages': ['custom-pkg']}}) - - # Just verify it doesn't crash with overrides - result = subprocess.run( - [ - sys.executable, str(INSTALL_HOOK), - '--name=test-pkg', - '--binary-id=test-uuid', - '--machine-id=test-machine', - f'--overrides={overrides}', - ], - capture_output=True, - text=True, - env=env, - timeout=60 - ) - - # May fail to install, but should not crash parsing overrides - self.assertNotIn('Failed to parse overrides JSON', result.stderr) - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/papersdl/config.json b/archivebox/plugins/papersdl/config.json deleted file mode 100644 index 2c6eb342..00000000 --- a/archivebox/plugins/papersdl/config.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "properties": { - "PAPERSDL_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_PAPERSDL", "USE_PAPERSDL"], - "description": "Enable paper downloading with papers-dl" - }, - "PAPERSDL_BINARY": { - "type": "string", - "default": "papers-dl", - "description": 
"Path to papers-dl binary" - }, - "PAPERSDL_TIMEOUT": { - "type": "integer", - "default": 300, - "minimum": 30, - "x-fallback": "TIMEOUT", - "description": "Timeout for paper downloads in seconds" - }, - "PAPERSDL_ARGS": { - "type": "array", - "items": {"type": "string"}, - "default": ["fetch"], - "x-aliases": ["PAPERSDL_DEFAULT_ARGS"], - "description": "Default papers-dl arguments" - }, - "PAPERSDL_ARGS_EXTRA": { - "type": "array", - "items": {"type": "string"}, - "default": [], - "x-aliases": ["PAPERSDL_EXTRA_ARGS"], - "description": "Extra arguments to append to papers-dl command" - } - } -} diff --git a/archivebox/plugins/papersdl/on_Crawl__30_papersdl_install.py b/archivebox/plugins/papersdl/on_Crawl__30_papersdl_install.py deleted file mode 100755 index 050aa23b..00000000 --- a/archivebox/plugins/papersdl/on_Crawl__30_papersdl_install.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -""" -Emit papers-dl Binary dependency for the crawl. -""" - -import json -import os -import sys - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def output_binary(name: str, binproviders: str): - """Output Binary JSONL record for a dependency.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def main(): - papersdl_enabled = get_env_bool('PAPERSDL_ENABLED', True) - - if not papersdl_enabled: - sys.exit(0) - - output_binary(name='papers-dl', binproviders='pip,env') - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py 
b/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py deleted file mode 100755 index 60015050..00000000 --- a/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py +++ /dev/null @@ -1,220 +0,0 @@ -#!/usr/bin/env python3 -""" -Download scientific papers from a URL using papers-dl. - -Usage: on_Snapshot__papersdl.py --url= --snapshot-id= -Output: Downloads paper PDFs to $PWD/ - -Environment variables: - PAPERSDL_BINARY: Path to papers-dl binary - PAPERSDL_TIMEOUT: Timeout in seconds (default: 300 for paper downloads) - PAPERSDL_ARGS: Default papers-dl arguments (JSON array, default: ["fetch"]) - PAPERSDL_ARGS_EXTRA: Extra arguments to append (JSON array) - - # papers-dl feature toggles - SAVE_PAPERSDL: Enable papers-dl paper extraction (default: True) - - # Fallback to ARCHIVING_CONFIG values if PAPERSDL_* not set: - TIMEOUT: Fallback timeout -""" - -import json -import os -import re -import subprocess -import sys -import threading -from pathlib import Path - -import rich_click as click - - -# Extractor metadata -PLUGIN_NAME = 'papersdl' -BIN_NAME = 'papers-dl' -BIN_PROVIDERS = 'pip,env' -OUTPUT_DIR = '.' 
- - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def get_env_int(name: str, default: int = 0) -> int: - try: - return int(get_env(name, str(default))) - except ValueError: - return default - - -def get_env_array(name: str, default: list[str] | None = None) -> list[str]: - """Parse a JSON array from environment variable.""" - val = get_env(name, '') - if not val: - return default if default is not None else [] - try: - result = json.loads(val) - if isinstance(result, list): - return [str(item) for item in result] - return default if default is not None else [] - except json.JSONDecodeError: - return default if default is not None else [] - - -def extract_doi_from_url(url: str) -> str | None: - """Extract DOI from common paper URLs.""" - # Match DOI pattern in URL - doi_pattern = r'10\.\d{4,}/[^\s]+' - match = re.search(doi_pattern, url) - if match: - return match.group(0) - return None - - -def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]: - """ - Download paper using papers-dl. 
- - Returns: (success, output_path, error_message) - """ - # Get config from env - timeout = get_env_int('TIMEOUT', 300) - papersdl_args = get_env_array('PAPERSDL_ARGS', []) - papersdl_args_extra = get_env_array('PAPERSDL_ARGS_EXTRA', []) - - # Output directory is current directory (hook already runs in output dir) - output_dir = Path(OUTPUT_DIR) - - # Try to extract DOI from URL - doi = extract_doi_from_url(url) - if not doi: - # If no DOI found, papers-dl might handle the URL directly - identifier = url - else: - identifier = doi - - # Build command - papers-dl -o - cmd = [binary, *papersdl_args, identifier, '-o', str(output_dir)] - - if papersdl_args_extra: - cmd.extend(papersdl_args_extra) - - try: - print(f'[papersdl] Starting download (timeout={timeout}s)', file=sys.stderr) - output_lines: list[str] = [] - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - ) - - def _read_output() -> None: - if not process.stdout: - return - for line in process.stdout: - output_lines.append(line) - sys.stderr.write(line) - - reader = threading.Thread(target=_read_output, daemon=True) - reader.start() - - try: - process.wait(timeout=timeout) - except subprocess.TimeoutExpired: - process.kill() - reader.join(timeout=1) - return False, None, f'Timed out after {timeout} seconds' - - reader.join(timeout=1) - combined_output = ''.join(output_lines) - - # Check if any PDF files were downloaded - pdf_files = list(output_dir.glob('*.pdf')) - - if pdf_files: - # Return first PDF file - return True, str(pdf_files[0]), '' - else: - stderr = combined_output - stdout = combined_output - - # These are NOT errors - page simply has no downloadable paper - stderr_lower = stderr.lower() - stdout_lower = stdout.lower() - if 'not found' in stderr_lower or 'not found' in stdout_lower: - return True, None, '' # Paper not available - success, no output - if 'no results' in stderr_lower or 'no results' in stdout_lower: - return True, 
None, '' # No paper found - success, no output - if process.returncode == 0: - return True, None, '' # papers-dl exited cleanly, just no paper - success - - # These ARE errors - something went wrong - if '404' in stderr or '404' in stdout: - return False, None, '404 Not Found' - if '403' in stderr or '403' in stdout: - return False, None, '403 Forbidden' - - return False, None, f'papers-dl error: {stderr[:200] or stdout[:200]}' - - except subprocess.TimeoutExpired: - return False, None, f'Timed out after {timeout} seconds' - except Exception as e: - return False, None, f'{type(e).__name__}: {e}' - - -@click.command() -@click.option('--url', required=True, help='URL to download paper from') -@click.option('--snapshot-id', required=True, help='Snapshot UUID') -def main(url: str, snapshot_id: str): - """Download scientific paper from a URL using papers-dl.""" - - output = None - status = 'failed' - error = '' - - try: - # Check if papers-dl is enabled - if not get_env_bool('PAPERSDL_ENABLED', True): - print('Skipping papers-dl (PAPERSDL_ENABLED=False)', file=sys.stderr) - # Temporary failure (config disabled) - NO JSONL emission - sys.exit(0) - - # Get binary from environment - binary = get_env('PAPERSDL_BINARY', 'papers-dl') - - # Run extraction - success, output, error = save_paper(url, binary) - - if success: - # Success - emit ArchiveResult - result = { - 'type': 'ArchiveResult', - 'status': 'succeeded', - 'output_str': output or '' - } - print(json.dumps(result)) - sys.exit(0) - else: - # Transient error - emit NO JSONL - print(f'ERROR: {error}', file=sys.stderr) - sys.exit(1) - - except Exception as e: - # Transient error - emit NO JSONL - print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/papersdl/templates/card.html b/archivebox/plugins/papersdl/templates/card.html deleted file mode 100644 index abe6f09a..00000000 --- a/archivebox/plugins/papersdl/templates/card.html 
+++ /dev/null @@ -1,7 +0,0 @@ - -
-
- 📄 - Paper -
-
diff --git a/archivebox/plugins/papersdl/templates/full.html b/archivebox/plugins/papersdl/templates/full.html deleted file mode 100644 index f2cee0c8..00000000 --- a/archivebox/plugins/papersdl/templates/full.html +++ /dev/null @@ -1,71 +0,0 @@ - - - - - - - Scientific Paper - - - -
-
📄
-

Scientific Paper

-
-
- -
- Download PDF - - diff --git a/archivebox/plugins/papersdl/templates/icon.html b/archivebox/plugins/papersdl/templates/icon.html deleted file mode 100644 index 94afb781..00000000 --- a/archivebox/plugins/papersdl/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/papersdl/tests/test_papersdl.py b/archivebox/plugins/papersdl/tests/test_papersdl.py deleted file mode 100644 index d26ef9cb..00000000 --- a/archivebox/plugins/papersdl/tests/test_papersdl.py +++ /dev/null @@ -1,190 +0,0 @@ -""" -Integration tests for papersdl plugin - -Tests verify: -1. Hook script exists -2. Dependencies installed via validation hooks -3. Verify deps with abx-pkg -4. Paper extraction works on paper URLs -5. JSONL output is correct -6. Config options work -7. Handles non-paper URLs gracefully -""" - -import json -import subprocess -import sys -import tempfile -import uuid -from pathlib import Path -import pytest - -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -PAPERSDL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_papersdl.*'), None) -TEST_URL = 'https://example.com' - -# Module-level cache for binary path -_papersdl_binary_path = None - -def get_papersdl_binary_path(): - """Get the installed papers-dl binary path from cache or by running installation.""" - global _papersdl_binary_path - if _papersdl_binary_path: - return _papersdl_binary_path - - # Try to find papers-dl binary using abx-pkg - from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides - - try: - binary = Binary( - name='papers-dl', - binproviders=[PipProvider(), EnvProvider()] - ).load() - - if binary and binary.abspath: - _papersdl_binary_path = str(binary.abspath) - return _papersdl_binary_path - except Exception: - pass - - # If not found, try to install via pip - pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__install_using_pip_provider.py' - if pip_hook.exists(): - binary_id = str(uuid.uuid4()) - machine_id = str(uuid.uuid4()) - - cmd = [ - 
sys.executable, str(pip_hook), - '--binary-id', binary_id, - '--machine-id', machine_id, - '--name', 'papers-dl' - ] - - install_result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=300 - ) - - # Parse Binary from pip installation - for install_line in install_result.stdout.strip().split('\n'): - if install_line.strip(): - try: - install_record = json.loads(install_line) - if install_record.get('type') == 'Binary' and install_record.get('name') == 'papers-dl': - _papersdl_binary_path = install_record.get('abspath') - return _papersdl_binary_path - except json.JSONDecodeError: - pass - - return None - -def test_hook_script_exists(): - """Verify on_Snapshot hook exists.""" - assert PAPERSDL_HOOK.exists(), f"Hook not found: {PAPERSDL_HOOK}" - - -def test_verify_deps_with_abx_pkg(): - """Verify papers-dl is installed by calling the REAL installation hooks.""" - binary_path = get_papersdl_binary_path() - assert binary_path, "papers-dl must be installed successfully via install hook and pip provider" - assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}" - - -def test_handles_non_paper_url(): - """Test that papers-dl extractor handles non-paper URLs gracefully via hook.""" - import os - - binary_path = get_papersdl_binary_path() - assert binary_path, "Binary must be installed for this test" - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - env = os.environ.copy() - env['PAPERSDL_BINARY'] = binary_path - - # Run papers-dl extraction hook on non-paper URL - result = subprocess.run( - [sys.executable, str(PAPERSDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=60 - ) - - # Should exit 0 even for non-paper URL - assert result.returncode == 0, f"Should handle non-paper URL gracefully: {result.stderr}" - - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): 
- line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - -def test_config_save_papersdl_false_skips(): - """Test that PAPERSDL_ENABLED=False exits without emitting JSONL.""" - import os - - with tempfile.TemporaryDirectory() as tmpdir: - env = os.environ.copy() - env['PAPERSDL_ENABLED'] = 'False' - - result = subprocess.run( - [sys.executable, str(PAPERSDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}" - - # Feature disabled - temporary failure, should NOT emit JSONL - assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr" - - # Should NOT emit any JSONL - jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] - assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}" - - -def test_config_timeout(): - """Test that PAPERSDL_TIMEOUT config is respected.""" - import os - - binary_path = get_papersdl_binary_path() - assert binary_path, "Binary must be installed for this test" - - with tempfile.TemporaryDirectory() as tmpdir: - env = os.environ.copy() - env['PAPERSDL_BINARY'] = binary_path - env['PAPERSDL_TIMEOUT'] = '5' - - result = subprocess.run( - [sys.executable, str(PAPERSDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - assert result.returncode == 0, "Should complete without hanging" - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git 
a/archivebox/plugins/parse_dom_outlinks/config.json b/archivebox/plugins/parse_dom_outlinks/config.json deleted file mode 100644 index b391981b..00000000 --- a/archivebox/plugins/parse_dom_outlinks/config.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "PARSE_DOM_OUTLINKS_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_DOM_OUTLINKS", "USE_PARSE_DOM_OUTLINKS"], - "description": "Enable DOM outlinks parsing from archived pages" - }, - "PARSE_DOM_OUTLINKS_TIMEOUT": { - "type": "integer", - "default": 30, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for DOM outlinks parsing in seconds" - } - } -} diff --git a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js deleted file mode 100755 index 3076fe61..00000000 --- a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js +++ /dev/null @@ -1,292 +0,0 @@ -#!/usr/bin/env node -/** - * Extract and categorize outgoing links from a page's DOM. 
- * - * Categorizes links by type: - * - hrefs: All links - * - images: - * - css_stylesheets: - * - css_images: CSS background-image: url() - * - js_scripts: ', '', html, flags=re.DOTALL | re.IGNORECASE) - html = re.sub(r']*>.*?', '', html, flags=re.DOTALL | re.IGNORECASE) - html = re.sub(r'<[^>]+>', ' ', html) - html = html.replace(' ', ' ').replace('&', '&') - html = html.replace('<', '<').replace('>', '>') - html = html.replace('"', '"') - html = re.sub(r'\s+', ' ', html) - return html.strip() - - -def find_indexable_content() -> list[tuple[str, str]]: - """Find text content to index from extractor outputs.""" - results = [] - cwd = Path.cwd() - - for extractor, file_pattern in INDEXABLE_FILES: - plugin_dir = cwd / extractor - if not plugin_dir.exists(): - continue - - if '*' in file_pattern: - matches = list(plugin_dir.glob(file_pattern)) - else: - match = plugin_dir / file_pattern - matches = [match] if match.exists() else [] - - for match in matches: - if match.is_file() and match.stat().st_size > 0: - try: - content = match.read_text(encoding='utf-8', errors='ignore') - if content.strip(): - if match.suffix in ('.html', '.htm'): - content = strip_html_tags(content) - results.append((f'{extractor}/{match.name}', content)) - except Exception: - continue - - return results - - -def get_sonic_config() -> dict: - """Get Sonic connection configuration.""" - return { - 'host': get_env('SEARCH_BACKEND_HOST_NAME', '127.0.0.1'), - 'port': get_env_int('SEARCH_BACKEND_PORT', 1491), - 'password': get_env('SEARCH_BACKEND_PASSWORD', 'SecretPassword'), - 'collection': get_env('SONIC_COLLECTION', 'archivebox'), - 'bucket': get_env('SONIC_BUCKET', 'snapshots'), - } - - -def index_in_sonic(snapshot_id: str, texts: list[str]) -> None: - """Index texts in Sonic.""" - try: - from sonic import IngestClient - except ImportError: - raise RuntimeError('sonic-client not installed. 
Run: pip install sonic-client') - - config = get_sonic_config() - - with IngestClient(config['host'], config['port'], config['password']) as ingest: - # Flush existing content - try: - ingest.flush_object(config['collection'], config['bucket'], snapshot_id) - except Exception: - pass - - # Index new content in chunks (Sonic has size limits) - content = ' '.join(texts) - chunk_size = 10000 - for i in range(0, len(content), chunk_size): - chunk = content[i:i + chunk_size] - ingest.push(config['collection'], config['bucket'], snapshot_id, chunk) - - -@click.command() -@click.option('--url', required=True, help='URL that was archived') -@click.option('--snapshot-id', required=True, help='Snapshot UUID') -def main(url: str, snapshot_id: str): - """Index snapshot content in Sonic.""" - - output = None - status = 'failed' - error = '' - indexed_sources = [] - - try: - # Check if this backend is enabled (permanent skips - don't retry) - backend = get_env('SEARCH_BACKEND_ENGINE', 'sqlite') - if backend != 'sonic': - print(f'Skipping Sonic indexing (SEARCH_BACKEND_ENGINE={backend})', file=sys.stderr) - sys.exit(0) # Permanent skip - different backend selected - if not get_env_bool('USE_INDEXING_BACKEND', True): - print('Skipping indexing (USE_INDEXING_BACKEND=False)', file=sys.stderr) - sys.exit(0) # Permanent skip - indexing disabled - else: - contents = find_indexable_content() - indexed_sources = [source for source, _ in contents] - - if not contents: - status = 'skipped' - print('No indexable content found', file=sys.stderr) - else: - texts = [content for _, content in contents] - index_in_sonic(snapshot_id, texts) - status = 'succeeded' - output = OUTPUT_DIR - - except Exception as e: - error = f'{type(e).__name__}: {e}' - status = 'failed' - - if error: - print(f'ERROR: {error}', file=sys.stderr) - - # Search indexing hooks don't emit ArchiveResult - they're utility hooks - # Exit code indicates success/failure - sys.exit(0 if status == 'succeeded' else 1) - - -if 
__name__ == '__main__': - main() diff --git a/archivebox/plugins/search_backend_sonic/search.py b/archivebox/plugins/search_backend_sonic/search.py deleted file mode 100644 index f9c518fd..00000000 --- a/archivebox/plugins/search_backend_sonic/search.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -Sonic search backend - search and flush operations. - -This module provides the search interface for the Sonic backend. -""" - -import os -from typing import List, Iterable - - -def get_sonic_config() -> dict: - """Get Sonic connection configuration.""" - return { - 'host': os.environ.get('SEARCH_BACKEND_HOST_NAME', '127.0.0.1').strip(), - 'port': int(os.environ.get('SEARCH_BACKEND_PORT', '1491')), - 'password': os.environ.get('SEARCH_BACKEND_PASSWORD', 'SecretPassword').strip(), - 'collection': os.environ.get('SONIC_COLLECTION', 'archivebox').strip(), - 'bucket': os.environ.get('SONIC_BUCKET', 'snapshots').strip(), - } - - -def search(query: str) -> List[str]: - """Search for snapshots in Sonic.""" - try: - from sonic import SearchClient - except ImportError: - raise RuntimeError('sonic-client not installed. Run: pip install sonic-client') - - config = get_sonic_config() - - with SearchClient(config['host'], config['port'], config['password']) as search_client: - results = search_client.query(config['collection'], config['bucket'], query, limit=100) - return results - - -def flush(snapshot_ids: Iterable[str]) -> None: - """Remove snapshots from Sonic index.""" - try: - from sonic import IngestClient - except ImportError: - raise RuntimeError('sonic-client not installed. 
Run: pip install sonic-client') - - config = get_sonic_config() - - with IngestClient(config['host'], config['port'], config['password']) as ingest: - for snapshot_id in snapshot_ids: - try: - ingest.flush_object(config['collection'], config['bucket'], snapshot_id) - except Exception: - pass diff --git a/archivebox/plugins/search_backend_sonic/templates/icon.html b/archivebox/plugins/search_backend_sonic/templates/icon.html deleted file mode 100644 index bf81a372..00000000 --- a/archivebox/plugins/search_backend_sonic/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/search_backend_sqlite/__init__.py b/archivebox/plugins/search_backend_sqlite/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/archivebox/plugins/search_backend_sqlite/config.json b/archivebox/plugins/search_backend_sqlite/config.json deleted file mode 100644 index aff5f1b3..00000000 --- a/archivebox/plugins/search_backend_sqlite/config.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "properties": { - "SEARCH_BACKEND_SQLITE_DB": { - "type": "string", - "default": "search.sqlite3", - "x-aliases": ["SQLITEFTS_DB"], - "description": "SQLite FTS database filename" - }, - "SEARCH_BACKEND_SQLITE_SEPARATE_DATABASE": { - "type": "boolean", - "default": true, - "x-aliases": ["FTS_SEPARATE_DATABASE", "SQLITEFTS_SEPARATE_DATABASE"], - "description": "Use separate database file for FTS index" - }, - "SEARCH_BACKEND_SQLITE_TOKENIZERS": { - "type": "string", - "default": "porter unicode61 remove_diacritics 2", - "x-aliases": ["FTS_TOKENIZERS", "SQLITEFTS_TOKENIZERS"], - "description": "FTS5 tokenizer configuration" - } - } -} diff --git a/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py b/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py deleted file mode 100644 index 8a8a21b6..00000000 --- 
a/archivebox/plugins/search_backend_sqlite/on_Snapshot__90_index_sqlite.py +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/env python3 -""" -SQLite FTS5 search backend - indexes snapshot content for full-text search. - -This hook runs after all extractors and indexes text content in SQLite FTS5. -Only runs if SEARCH_BACKEND_ENGINE=sqlite. - -Usage: on_Snapshot__90_index_sqlite.py --url= --snapshot-id= - -Environment variables: - SEARCH_BACKEND_ENGINE: Must be 'sqlite' for this hook to run - USE_INDEXING_BACKEND: Enable search indexing (default: true) - SQLITEFTS_DB: Database filename (default: search.sqlite3) - FTS_TOKENIZERS: FTS5 tokenizer config (default: porter unicode61 remove_diacritics 2) -""" - -import json -import os -import re -import sqlite3 -import sys -from pathlib import Path - -import rich_click as click - - -# Extractor metadata -PLUGIN_NAME = 'index_sqlite' -OUTPUT_DIR = '.' - -# Text file patterns to index, in priority order -INDEXABLE_FILES = [ - ('readability', 'content.txt'), - ('readability', 'content.html'), - ('mercury', 'content.txt'), - ('mercury', 'content.html'), - ('htmltotext', 'output.txt'), - ('singlefile', 'singlefile.html'), - ('dom', 'output.html'), - ('wget', '**/*.html'), - ('wget', '**/*.htm'), - ('title', 'title.txt'), -] - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def strip_html_tags(html: str) -> str: - """Remove HTML tags, keeping text content.""" - html = re.sub(r']*>.*?', '', html, flags=re.DOTALL | re.IGNORECASE) - html = re.sub(r']*>.*?', '', html, flags=re.DOTALL | re.IGNORECASE) - html = re.sub(r'<[^>]+>', ' ', html) - html = html.replace(' ', ' ').replace('&', '&') - html = html.replace('<', '<').replace('>', '>') - html = 
html.replace('"', '"') - html = re.sub(r'\s+', ' ', html) - return html.strip() - - -def find_indexable_content() -> list[tuple[str, str]]: - """Find text content to index from extractor outputs.""" - results = [] - cwd = Path.cwd() - - for extractor, file_pattern in INDEXABLE_FILES: - plugin_dir = cwd / extractor - if not plugin_dir.exists(): - continue - - if '*' in file_pattern: - matches = list(plugin_dir.glob(file_pattern)) - else: - match = plugin_dir / file_pattern - matches = [match] if match.exists() else [] - - for match in matches: - if match.is_file() and match.stat().st_size > 0: - try: - content = match.read_text(encoding='utf-8', errors='ignore') - if content.strip(): - if match.suffix in ('.html', '.htm'): - content = strip_html_tags(content) - results.append((f'{extractor}/{match.name}', content)) - except Exception: - continue - - return results - - -def get_db_path() -> Path: - """Get path to the search index database.""" - data_dir = get_env('DATA_DIR', str(Path.cwd().parent.parent)) - db_name = get_env('SQLITEFTS_DB', 'search.sqlite3') - return Path(data_dir) / db_name - - -def index_in_sqlite(snapshot_id: str, texts: list[str]) -> None: - """Index texts in SQLite FTS5.""" - db_path = get_db_path() - tokenizers = get_env('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2') - conn = sqlite3.connect(str(db_path)) - - try: - # Create FTS5 table if needed - conn.execute(f''' - CREATE VIRTUAL TABLE IF NOT EXISTS search_index - USING fts5(snapshot_id, content, tokenize='{tokenizers}') - ''') - - # Remove existing entries - conn.execute('DELETE FROM search_index WHERE snapshot_id = ?', (snapshot_id,)) - - # Insert new content - content = '\n\n'.join(texts) - conn.execute( - 'INSERT INTO search_index (snapshot_id, content) VALUES (?, ?)', - (snapshot_id, content) - ) - conn.commit() - finally: - conn.close() - - -@click.command() -@click.option('--url', required=True, help='URL that was archived') -@click.option('--snapshot-id', required=True, 
help='Snapshot UUID') -def main(url: str, snapshot_id: str): - """Index snapshot content in SQLite FTS5.""" - - output = None - status = 'failed' - error = '' - indexed_sources = [] - - try: - # Check if this backend is enabled (permanent skips - don't retry) - backend = get_env('SEARCH_BACKEND_ENGINE', 'sqlite') - if backend != 'sqlite': - print(f'Skipping SQLite indexing (SEARCH_BACKEND_ENGINE={backend})', file=sys.stderr) - sys.exit(0) # Permanent skip - different backend selected - if not get_env_bool('USE_INDEXING_BACKEND', True): - print('Skipping indexing (USE_INDEXING_BACKEND=False)', file=sys.stderr) - sys.exit(0) # Permanent skip - indexing disabled - else: - contents = find_indexable_content() - indexed_sources = [source for source, _ in contents] - - if not contents: - status = 'skipped' - print('No indexable content found', file=sys.stderr) - else: - texts = [content for _, content in contents] - index_in_sqlite(snapshot_id, texts) - status = 'succeeded' - output = OUTPUT_DIR - - except Exception as e: - error = f'{type(e).__name__}: {e}' - status = 'failed' - - if error: - print(f'ERROR: {error}', file=sys.stderr) - - # Search indexing hooks don't emit ArchiveResult - they're utility hooks - # Exit code indicates success/failure - sys.exit(0 if status == 'succeeded' else 1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/search_backend_sqlite/search.py b/archivebox/plugins/search_backend_sqlite/search.py deleted file mode 100644 index 0d3f5539..00000000 --- a/archivebox/plugins/search_backend_sqlite/search.py +++ /dev/null @@ -1,70 +0,0 @@ -""" -SQLite FTS5 search backend - search and flush operations. - -This module provides the search interface for the SQLite FTS backend. 
- -Environment variables: - SQLITEFTS_DB: Database filename (default: search.sqlite3) - FTS_SEPARATE_DATABASE: Use separate database file (default: true) - FTS_TOKENIZERS: FTS5 tokenizer config (default: porter unicode61 remove_diacritics 2) -""" - -import os -import sqlite3 -from pathlib import Path -from typing import List, Iterable - - -# Config with old var names for backwards compatibility -SQLITEFTS_DB = os.environ.get('SQLITEFTS_DB', 'search.sqlite3').strip() -FTS_SEPARATE_DATABASE = os.environ.get('FTS_SEPARATE_DATABASE', 'true').lower() in ('true', '1', 'yes') -FTS_TOKENIZERS = os.environ.get('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2').strip() - - -def _get_data_dir() -> Path: - data_dir = os.environ.get('DATA_DIR', '').strip() - if data_dir: - return Path(data_dir) - return Path.cwd() / 'data' - - -def get_db_path() -> Path: - """Get path to the search index database.""" - return _get_data_dir() / SQLITEFTS_DB - - -def search(query: str) -> List[str]: - """Search for snapshots matching the query.""" - db_path = get_db_path() - if not db_path.exists(): - return [] - - conn = sqlite3.connect(str(db_path)) - try: - cursor = conn.execute( - 'SELECT DISTINCT snapshot_id FROM search_index WHERE search_index MATCH ?', - (query,) - ) - return [row[0] for row in cursor.fetchall()] - except sqlite3.OperationalError: - # Table doesn't exist yet - return [] - finally: - conn.close() - - -def flush(snapshot_ids: Iterable[str]) -> None: - """Remove snapshots from the index.""" - db_path = get_db_path() - if not db_path.exists(): - return - - conn = sqlite3.connect(str(db_path)) - try: - for snapshot_id in snapshot_ids: - conn.execute('DELETE FROM search_index WHERE snapshot_id = ?', (snapshot_id,)) - conn.commit() - except sqlite3.OperationalError: - pass # Table doesn't exist - finally: - conn.close() diff --git a/archivebox/plugins/search_backend_sqlite/templates/icon.html b/archivebox/plugins/search_backend_sqlite/templates/icon.html deleted file mode 
100644 index 3c9f8646..00000000 --- a/archivebox/plugins/search_backend_sqlite/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/search_backend_sqlite/tests/test_sqlite_search.py b/archivebox/plugins/search_backend_sqlite/tests/test_sqlite_search.py deleted file mode 100644 index d8d6035f..00000000 --- a/archivebox/plugins/search_backend_sqlite/tests/test_sqlite_search.py +++ /dev/null @@ -1,351 +0,0 @@ -""" -Tests for the SQLite FTS5 search backend. - -Tests cover: -1. Search index creation -2. Indexing snapshots -3. Search queries with real test data -4. Flush operations -5. Edge cases (empty index, special characters) -""" - -import os -import sqlite3 -import tempfile -from pathlib import Path -from unittest.mock import patch - -import pytest -from django.test import TestCase, override_settings - -from archivebox.plugins.search_backend_sqlite.search import ( - get_db_path, - search, - flush, - SQLITEFTS_DB, - FTS_TOKENIZERS, -) - - -class TestSqliteSearchBackend(TestCase): - """Test SQLite FTS5 search backend.""" - - def setUp(self): - """Create a temporary data directory with search index.""" - self.temp_dir = tempfile.mkdtemp() - self.db_path = Path(self.temp_dir) / SQLITEFTS_DB - - # Patch DATA_DIR - self.settings_patch = patch( - 'archivebox.plugins.search_backend_sqlite.search.settings' - ) - self.mock_settings = self.settings_patch.start() - self.mock_settings.DATA_DIR = self.temp_dir - - # Create FTS5 table - self._create_index() - - def tearDown(self): - """Clean up temporary directory.""" - self.settings_patch.stop() - import shutil - shutil.rmtree(self.temp_dir, ignore_errors=True) - - def _create_index(self): - """Create the FTS5 search index table.""" - conn = sqlite3.connect(str(self.db_path)) - try: - conn.execute(f''' - CREATE VIRTUAL TABLE IF NOT EXISTS search_index - USING fts5( - snapshot_id, - url, - title, - content, - tokenize = '{FTS_TOKENIZERS}' - ) - ''') - conn.commit() - finally: - conn.close() - - def 
_index_snapshot(self, snapshot_id: str, url: str, title: str, content: str): - """Add a snapshot to the index.""" - conn = sqlite3.connect(str(self.db_path)) - try: - conn.execute( - 'INSERT INTO search_index (snapshot_id, url, title, content) VALUES (?, ?, ?, ?)', - (snapshot_id, url, title, content) - ) - conn.commit() - finally: - conn.close() - - def test_get_db_path(self): - """get_db_path should return correct path.""" - path = get_db_path() - self.assertEqual(path, Path(self.temp_dir) / SQLITEFTS_DB) - - def test_search_empty_index(self): - """search should return empty list for empty index.""" - results = search('nonexistent') - self.assertEqual(results, []) - - def test_search_no_index_file(self): - """search should return empty list when index file doesn't exist.""" - os.remove(self.db_path) - results = search('test') - self.assertEqual(results, []) - - def test_search_single_result(self): - """search should find matching snapshot.""" - self._index_snapshot( - 'snap-001', - 'https://example.com/page1', - 'Example Page', - 'This is example content about testing.' 
- ) - - results = search('example') - self.assertEqual(len(results), 1) - self.assertEqual(results[0], 'snap-001') - - def test_search_multiple_results(self): - """search should find all matching snapshots.""" - self._index_snapshot('snap-001', 'https://example.com/1', 'Python Tutorial', 'Learn Python programming') - self._index_snapshot('snap-002', 'https://example.com/2', 'Python Guide', 'Advanced Python concepts') - self._index_snapshot('snap-003', 'https://example.com/3', 'JavaScript Basics', 'Learn JavaScript') - - results = search('Python') - self.assertEqual(len(results), 2) - self.assertIn('snap-001', results) - self.assertIn('snap-002', results) - self.assertNotIn('snap-003', results) - - def test_search_title_match(self): - """search should match against title.""" - self._index_snapshot('snap-001', 'https://example.com', 'Django Web Framework', 'Content here') - - results = search('Django') - self.assertEqual(len(results), 1) - self.assertEqual(results[0], 'snap-001') - - def test_search_url_match(self): - """search should match against URL.""" - self._index_snapshot('snap-001', 'https://archivebox.io/docs', 'Title', 'Content') - - results = search('archivebox') - self.assertEqual(len(results), 1) - - def test_search_content_match(self): - """search should match against content.""" - self._index_snapshot( - 'snap-001', - 'https://example.com', - 'Generic Title', - 'This document contains information about cryptography and security.' 
- ) - - results = search('cryptography') - self.assertEqual(len(results), 1) - - def test_search_case_insensitive(self): - """search should be case insensitive.""" - self._index_snapshot('snap-001', 'https://example.com', 'Title', 'PYTHON programming') - - results = search('python') - self.assertEqual(len(results), 1) - - def test_search_stemming(self): - """search should use porter stemmer for word stems.""" - self._index_snapshot('snap-001', 'https://example.com', 'Title', 'Programming concepts') - - # 'program' should match 'programming' with porter stemmer - results = search('program') - self.assertEqual(len(results), 1) - - def test_search_multiple_words(self): - """search should match documents with all words.""" - self._index_snapshot('snap-001', 'https://example.com', 'Web Development', 'Learn web development skills') - self._index_snapshot('snap-002', 'https://example.com', 'Web Design', 'Design beautiful websites') - - results = search('web development') - # FTS5 defaults to OR, so both might match - # With porter stemmer, both should match 'web' - self.assertIn('snap-001', results) - - def test_search_phrase(self): - """search should support phrase queries.""" - self._index_snapshot('snap-001', 'https://example.com', 'Title', 'machine learning algorithms') - self._index_snapshot('snap-002', 'https://example.com', 'Title', 'machine algorithms learning') - - # Phrase search with quotes - results = search('"machine learning"') - self.assertEqual(len(results), 1) - self.assertEqual(results[0], 'snap-001') - - def test_search_distinct_results(self): - """search should return distinct snapshot IDs.""" - # Index same snapshot twice (could happen with multiple fields matching) - self._index_snapshot('snap-001', 'https://python.org', 'Python', 'Python programming language') - - results = search('Python') - self.assertEqual(len(results), 1) - - def test_flush_single(self): - """flush should remove snapshot from index.""" - self._index_snapshot('snap-001', 
'https://example.com', 'Title', 'Content') - self._index_snapshot('snap-002', 'https://example.com', 'Title', 'Content') - - flush(['snap-001']) - - results = search('Content') - self.assertEqual(len(results), 1) - self.assertEqual(results[0], 'snap-002') - - def test_flush_multiple(self): - """flush should remove multiple snapshots.""" - self._index_snapshot('snap-001', 'https://example.com', 'Title', 'Test') - self._index_snapshot('snap-002', 'https://example.com', 'Title', 'Test') - self._index_snapshot('snap-003', 'https://example.com', 'Title', 'Test') - - flush(['snap-001', 'snap-003']) - - results = search('Test') - self.assertEqual(len(results), 1) - self.assertEqual(results[0], 'snap-002') - - def test_flush_nonexistent(self): - """flush should not raise for nonexistent snapshots.""" - # Should not raise - flush(['nonexistent-snap']) - - def test_flush_no_index(self): - """flush should not raise when index doesn't exist.""" - os.remove(self.db_path) - # Should not raise - flush(['snap-001']) - - def test_search_special_characters(self): - """search should handle special characters in queries.""" - self._index_snapshot('snap-001', 'https://example.com', 'C++ Programming', 'Learn C++ basics') - - # FTS5 handles special chars - results = search('C++') - # May or may not match depending on tokenizer config - # At minimum, should not raise - self.assertIsInstance(results, list) - - def test_search_unicode(self): - """search should handle unicode content.""" - self._index_snapshot('snap-001', 'https://example.com', 'Titre Francais', 'cafe resume') - self._index_snapshot('snap-002', 'https://example.com', 'Japanese', 'Hello world') - - # With remove_diacritics, 'cafe' should match - results = search('cafe') - self.assertEqual(len(results), 1) - - -class TestSqliteSearchWithRealData(TestCase): - """Integration tests with realistic archived content.""" - - def setUp(self): - """Create index with realistic test data.""" - self.temp_dir = tempfile.mkdtemp() - 
self.db_path = Path(self.temp_dir) / SQLITEFTS_DB - - self.settings_patch = patch( - 'archivebox.plugins.search_backend_sqlite.search.settings' - ) - self.mock_settings = self.settings_patch.start() - self.mock_settings.DATA_DIR = self.temp_dir - - # Create index - conn = sqlite3.connect(str(self.db_path)) - try: - conn.execute(f''' - CREATE VIRTUAL TABLE IF NOT EXISTS search_index - USING fts5( - snapshot_id, - url, - title, - content, - tokenize = '{FTS_TOKENIZERS}' - ) - ''') - # Index realistic data - test_data = [ - ('snap-001', 'https://github.com/ArchiveBox/ArchiveBox', - 'ArchiveBox - Self-hosted web archiving', - 'Open source self-hosted web archiving. Collects, saves, and displays various types of content.'), - ('snap-002', 'https://docs.python.org/3/tutorial/', - 'Python 3 Tutorial', - 'An informal introduction to Python. Python is an easy to learn, powerful programming language.'), - ('snap-003', 'https://developer.mozilla.org/docs/Web/JavaScript', - 'JavaScript - MDN Web Docs', - 'JavaScript (JS) is a lightweight, interpreted programming language with first-class functions.'), - ('snap-004', 'https://news.ycombinator.com', - 'Hacker News', - 'Social news website focusing on computer science and entrepreneurship.'), - ('snap-005', 'https://en.wikipedia.org/wiki/Web_archiving', - 'Web archiving - Wikipedia', - 'Web archiving is the process of collecting portions of the World Wide Web to ensure the information is preserved.'), - ] - conn.executemany( - 'INSERT INTO search_index (snapshot_id, url, title, content) VALUES (?, ?, ?, ?)', - test_data - ) - conn.commit() - finally: - conn.close() - - def tearDown(self): - """Clean up.""" - self.settings_patch.stop() - import shutil - shutil.rmtree(self.temp_dir, ignore_errors=True) - - def test_search_archivebox(self): - """Search for 'archivebox' should find relevant results.""" - results = search('archivebox') - self.assertIn('snap-001', results) - - def test_search_programming(self): - """Search for 
'programming' should find Python and JS docs.""" - results = search('programming') - self.assertIn('snap-002', results) - self.assertIn('snap-003', results) - - def test_search_web_archiving(self): - """Search for 'web archiving' should find relevant results.""" - results = search('web archiving') - # Both ArchiveBox and Wikipedia should match - self.assertIn('snap-001', results) - self.assertIn('snap-005', results) - - def test_search_github(self): - """Search for 'github' should find URL match.""" - results = search('github') - self.assertIn('snap-001', results) - - def test_search_tutorial(self): - """Search for 'tutorial' should find Python tutorial.""" - results = search('tutorial') - self.assertIn('snap-002', results) - - def test_flush_and_search(self): - """Flushing a snapshot should remove it from search results.""" - # Verify it's there first - results = search('archivebox') - self.assertIn('snap-001', results) - - # Flush it - flush(['snap-001']) - - # Should no longer be found - results = search('archivebox') - self.assertNotIn('snap-001', results) - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/seo/config.json b/archivebox/plugins/seo/config.json deleted file mode 100644 index 43fca2ad..00000000 --- a/archivebox/plugins/seo/config.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "SEO_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_SEO", "USE_SEO"], - "description": "Enable SEO metadata capture" - }, - "SEO_TIMEOUT": { - "type": "integer", - "default": 30, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for SEO capture in seconds" - } - } -} diff --git a/archivebox/plugins/seo/on_Snapshot__38_seo.js b/archivebox/plugins/seo/on_Snapshot__38_seo.js deleted file mode 100755 index cc107d64..00000000 --- 
a/archivebox/plugins/seo/on_Snapshot__38_seo.js +++ /dev/null @@ -1,169 +0,0 @@ -#!/usr/bin/env node -/** - * Extract SEO metadata from a URL. - * - * Extracts all tags including: - * - og:* (Open Graph) - * - twitter:* - * - description, keywords, author - * - Any other meta tags - * - * Usage: on_Snapshot__38_seo.js --url= --snapshot-id= - * Output: Writes seo/seo.json - * - * Environment variables: - * SAVE_SEO: Enable SEO extraction (default: true) - */ - -const fs = require('fs'); -const path = require('path'); -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -const puppeteer = require('puppeteer-core'); - -// Import shared utilities from chrome_utils.js -const { - getEnvBool, - getEnvInt, - parseArgs, - connectToPage, - waitForPageLoaded, -} = require('../chrome/chrome_utils.js'); - -// Extractor metadata -const PLUGIN_NAME = 'seo'; -const OUTPUT_DIR = '.'; -const OUTPUT_FILE = 'seo.json'; -const CHROME_SESSION_DIR = '../chrome'; - -// Extract SEO metadata -async function extractSeo(url) { - // Output directory is current directory (hook already runs in output dir) - const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); - const timeout = getEnvInt('SEO_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000; - let browser = null; - - try { - // Connect to existing Chrome session and get target page - const connection = await connectToPage({ - chromeSessionDir: CHROME_SESSION_DIR, - timeoutMs: timeout, - puppeteer, - }); - browser = connection.browser; - const page = connection.page; - - // Extract all meta tags - const seoData = await page.evaluate(() => { - const metaTags = Array.from(document.querySelectorAll('meta')); - const seo = { - url: window.location.href, - title: document.title || '', - }; - - // Process each meta tag - metaTags.forEach(tag => { - // Get the key (name or property attribute) - const key = tag.getAttribute('name') || tag.getAttribute('property') || 
''; - const content = tag.getAttribute('content') || ''; - - if (key && content) { - // Store by key - seo[key] = content; - } - }); - - // Also get canonical URL if present - const canonical = document.querySelector('link[rel="canonical"]'); - if (canonical) { - seo.canonical = canonical.getAttribute('href'); - } - - // Get language - const htmlLang = document.documentElement.lang; - if (htmlLang) { - seo.language = htmlLang; - } - - return seo; - }); - - // Write output - fs.writeFileSync(outputPath, JSON.stringify(seoData, null, 2)); - - return { success: true, output: outputPath, seoData }; - - } catch (e) { - return { success: false, error: `${e.name}: ${e.message}` }; - } finally { - if (browser) { - browser.disconnect(); - } - } -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__38_seo.js --url= --snapshot-id='); - process.exit(1); - } - - const startTs = new Date(); - let status = 'failed'; - let output = null; - let error = ''; - - try { - // Check if enabled - if (!getEnvBool('SEO_ENABLED', true)) { - console.log('Skipping SEO (SEO_ENABLED=False)'); - // Output clean JSONL (no RESULT_JSON= prefix) - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'skipped', - output_str: 'SEO_ENABLED=False', - })); - process.exit(0); - } - - const timeout = getEnvInt('SEO_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000; - await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 200); - - const result = await extractSeo(url); - - if (result.success) { - status = 'succeeded'; - output = result.output; - const metaCount = Object.keys(result.seoData).length - 2; // Subtract url and title - console.log(`SEO metadata extracted: ${metaCount} meta tags`); - } else { - status = 'failed'; - error = result.error; - } - } catch (e) { - error = `${e.name}: ${e.message}`; - status = 'failed'; - } - - const endTs = new Date(); - - if 
(error) console.error(`ERROR: ${error}`); - - // Output clean JSONL (no RESULT_JSON= prefix) - console.log(JSON.stringify({ - type: 'ArchiveResult', - status, - output_str: output || error || '', - })); - - process.exit(status === 'succeeded' ? 0 : 1); -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/seo/templates/icon.html b/archivebox/plugins/seo/templates/icon.html deleted file mode 100644 index 1306d22d..00000000 --- a/archivebox/plugins/seo/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/seo/tests/test_seo.py b/archivebox/plugins/seo/tests/test_seo.py deleted file mode 100644 index d0e2f09f..00000000 --- a/archivebox/plugins/seo/tests/test_seo.py +++ /dev/null @@ -1,129 +0,0 @@ -""" -Tests for the SEO plugin. - -Tests the real SEO hook with an actual URL to verify -meta tag extraction. -""" - -import json -import subprocess -import sys -import tempfile -import shutil -from pathlib import Path - -from django.test import TestCase - -# Import chrome test helpers -sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests')) -from chrome_test_helpers import ( - chrome_session, - CHROME_NAVIGATE_HOOK, - get_plugin_dir, - get_hook_script, -) - - -# Get the path to the SEO hook -PLUGIN_DIR = get_plugin_dir(__file__) -SEO_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_seo.*') - - -class TestSEOPlugin(TestCase): - """Test the SEO plugin.""" - - def test_seo_hook_exists(self): - """SEO hook script should exist.""" - self.assertIsNotNone(SEO_HOOK, "SEO hook not found in plugin directory") - self.assertTrue(SEO_HOOK.exists(), f"Hook not found: {SEO_HOOK}") - - -class TestSEOWithChrome(TestCase): - """Integration tests for SEO plugin with Chrome.""" - - def setUp(self): - """Set up test environment.""" - self.temp_dir = Path(tempfile.mkdtemp()) - - def tearDown(self): - """Clean up.""" - shutil.rmtree(self.temp_dir, ignore_errors=True) - 
- def test_seo_extracts_meta_tags(self): - """SEO hook should extract meta tags from a real URL.""" - test_url = 'https://example.com' - snapshot_id = 'test-seo-snapshot' - - with chrome_session( - self.temp_dir, - crawl_id='test-seo-crawl', - snapshot_id=snapshot_id, - test_url=test_url, - navigate=False, - timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): - seo_dir = snapshot_chrome_dir.parent / 'seo' - seo_dir.mkdir(exist_ok=True) - - nav_result = subprocess.run( - ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) - self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}") - - # Run SEO hook with the active Chrome session - result = subprocess.run( - ['node', str(SEO_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(seo_dir), - capture_output=True, - text=True, - timeout=60, - env=env - ) - - # Check for output file - seo_output = seo_dir / 'seo.json' - - seo_data = None - - # Try parsing from file first - if seo_output.exists(): - with open(seo_output) as f: - try: - seo_data = json.load(f) - except json.JSONDecodeError: - pass - - # Try parsing from stdout if not in file - if not seo_data: - for line in result.stdout.split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - # SEO data typically has title, description, or og: tags - if any(key in record for key in ['title', 'description', 'og:title', 'canonical']): - seo_data = record - break - except json.JSONDecodeError: - continue - - # Verify hook ran successfully - self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}") - self.assertNotIn('Traceback', result.stderr) - self.assertNotIn('Error:', result.stderr) - - # example.com has a title, so we MUST get SEO data - self.assertIsNotNone(seo_data, "No SEO data extracted from file or stdout") - - 
# Verify we got some SEO data - has_seo_data = any(key in seo_data for key in ['title', 'description', 'og:title', 'canonical', 'meta']) - self.assertTrue(has_seo_data, f"No SEO data extracted: {seo_data}") - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/singlefile/config.json b/archivebox/plugins/singlefile/config.json deleted file mode 100644 index c522efba..00000000 --- a/archivebox/plugins/singlefile/config.json +++ /dev/null @@ -1,77 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "properties": { - "SINGLEFILE_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_SINGLEFILE", "USE_SINGLEFILE"], - "description": "Enable SingleFile archiving" - }, - "SINGLEFILE_BINARY": { - "type": "string", - "default": "single-file", - "x-aliases": ["SINGLE_FILE_BINARY"], - "description": "Path to single-file binary" - }, - "SINGLEFILE_NODE_BINARY": { - "type": "string", - "default": "node", - "x-fallback": "NODE_BINARY", - "description": "Path to Node.js binary" - }, - "SINGLEFILE_CHROME_BINARY": { - "type": "string", - "default": "", - "x-fallback": "CHROME_BINARY", - "description": "Path to Chromium binary" - }, - "SINGLEFILE_TIMEOUT": { - "type": "integer", - "default": 60, - "minimum": 10, - "x-fallback": "TIMEOUT", - "description": "Timeout for SingleFile in seconds" - }, - "SINGLEFILE_USER_AGENT": { - "type": "string", - "default": "", - "x-fallback": "USER_AGENT", - "description": "User agent string" - }, - "SINGLEFILE_COOKIES_FILE": { - "type": "string", - "default": "", - "x-fallback": "COOKIES_FILE", - "description": "Path to cookies file" - }, - "SINGLEFILE_CHECK_SSL_VALIDITY": { - "type": "boolean", - "default": true, - "x-fallback": "CHECK_SSL_VALIDITY", - "description": "Whether to verify SSL certificates" - }, - "SINGLEFILE_CHROME_ARGS": { - "type": "array", - "items": {"type": "string"}, - "default": [], - "x-fallback": 
"CHROME_ARGS", - "description": "Chrome command-line arguments for SingleFile" - }, - "SINGLEFILE_ARGS": { - "type": "array", - "items": {"type": "string"}, - "default": ["--browser-headless"], - "x-aliases": ["SINGLEFILE_DEFAULT_ARGS"], - "description": "Default single-file arguments" - }, - "SINGLEFILE_ARGS_EXTRA": { - "type": "array", - "items": {"type": "string"}, - "default": [], - "x-aliases": ["SINGLEFILE_EXTRA_ARGS"], - "description": "Extra arguments to append to single-file command" - } - } -} diff --git a/archivebox/plugins/singlefile/on_Crawl__45_singlefile_install.py b/archivebox/plugins/singlefile/on_Crawl__45_singlefile_install.py deleted file mode 100755 index f2d22b3e..00000000 --- a/archivebox/plugins/singlefile/on_Crawl__45_singlefile_install.py +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python3 -""" -Emit single-file Binary dependency for the crawl. -""" - -import json -import os -import sys - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def output_binary(name: str, binproviders: str, overrides: dict | None = None): - """Output Binary JSONL record for a dependency.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, - 'machine_id': machine_id, - } - if overrides: - record['overrides'] = overrides - print(json.dumps(record)) - - -def main(): - singlefile_enabled = get_env_bool('SINGLEFILE_ENABLED', True) - - if not singlefile_enabled: - sys.exit(0) - - output_binary( - name='single-file', - binproviders='npm,env', - overrides={'npm': {'packages': ['single-file-cli']}}, - ) - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git 
a/archivebox/plugins/singlefile/on_Crawl__82_singlefile_install.js b/archivebox/plugins/singlefile/on_Crawl__82_singlefile_install.js deleted file mode 100755 index 8abefe4f..00000000 --- a/archivebox/plugins/singlefile/on_Crawl__82_singlefile_install.js +++ /dev/null @@ -1,341 +0,0 @@ -#!/usr/bin/env node -/** - * SingleFile Extension Plugin - * - * Installs and uses the SingleFile Chrome extension for archiving complete web pages. - * Falls back to single-file-cli if the extension is not available. - * - * Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle - * - * Priority: 82 - Must install before Chrome session starts at Crawl level - * Hook: on_Crawl (runs once per crawl, not per snapshot) - * - * This extension automatically: - * - Saves complete web pages as single HTML files - * - Inlines all resources (CSS, JS, images, fonts) - * - Preserves page fidelity better than wget/curl - * - Works with SPAs and dynamically loaded content - */ - -const path = require('path'); -const fs = require('fs'); -const { promisify } = require('util'); -const { exec } = require('child_process'); - -const execAsync = promisify(exec); - -// Import extension utilities -const extensionUtils = require('../chrome/chrome_utils.js'); - -// Extension metadata -const EXTENSION = { - webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle', - name: 'singlefile', -}; - -// Get extensions directory from environment or use default -const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR || - path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions'); - -const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR || - path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads'); - -const OUTPUT_DIR = '.'; -const OUTPUT_FILE = 'singlefile.html'; - -/** - * Install the SingleFile extension - */ -async function installSinglefileExtension() { - 
console.log('[*] Installing SingleFile extension...'); - - // Install the extension - const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR); - - if (!extension) { - console.error('[āŒ] Failed to install SingleFile extension'); - return null; - } - - console.log('[+] SingleFile extension installed'); - console.log('[+] Web pages will be saved as single HTML files'); - - return extension; -} - -/** - * Wait for a specified amount of time - */ -function wait(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); -} - -/** - * Save a page using the SingleFile extension - * - * @param {Object} page - Puppeteer page object - * @param {Object} extension - Extension metadata with dispatchAction method - * @param {Object} options - Additional options - * @returns {Promise} - Path to saved file or null on failure - */ -async function saveSinglefileWithExtension(page, extension, options = {}) { - if (!extension || !extension.version) { - throw new Error('SingleFile extension not found or not loaded'); - } - - const url = await page.url(); - console.error(`[singlefile] Triggering extension for: ${url}`); - - // Check for unsupported URL schemes - const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob']; - const scheme = url.split(':')[0]; - if (URL_SCHEMES_IGNORED.includes(scheme)) { - console.log(`[āš ļø] Skipping SingleFile for URL scheme: ${scheme}`); - return null; - } - - const downloadsDir = options.downloadsDir || CHROME_DOWNLOADS_DIR; - console.error(`[singlefile] Watching downloads dir: ${downloadsDir}`); - - // Ensure downloads directory exists - await fs.promises.mkdir(downloadsDir, { recursive: true }); - - // Get list of existing files to ignore - const files_before = new Set( - (await fs.promises.readdir(downloadsDir)) - .filter(fn => fn.toLowerCase().endsWith('.html') || fn.toLowerCase().endsWith('.htm')) - ); - - // Output directory is current directory (hook already runs in 
output dir) - const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE); - - console.error(`[singlefile] Saving via extension (${extension.id})...`); - - // Bring page to front (extension action button acts on foreground tab) - await page.bringToFront(); - - // Trigger the extension's action (toolbar button click) - console.error('[singlefile] Dispatching extension action...'); - try { - const actionTimeoutMs = options.actionTimeoutMs || 5000; - const actionPromise = extension.dispatchAction(); - const actionResult = await Promise.race([ - actionPromise, - wait(actionTimeoutMs).then(() => 'timeout'), - ]); - if (actionResult === 'timeout') { - console.error(`[singlefile] Extension action did not resolve within ${actionTimeoutMs}ms, continuing...`); - } - } catch (err) { - console.error(`[singlefile] Extension action error: ${err.message || err}`); - } - - // Wait for file to appear in downloads directory - const check_delay = 3000; // 3 seconds - const max_tries = 10; - let files_new = []; - - console.error(`[singlefile] Waiting up to ${(check_delay * max_tries) / 1000}s for download...`); - for (let attempt = 0; attempt < max_tries; attempt++) { - await wait(check_delay); - - const files_after = (await fs.promises.readdir(downloadsDir)) - .filter(fn => fn.toLowerCase().endsWith('.html') || fn.toLowerCase().endsWith('.htm')); - - files_new = files_after.filter(file => !files_before.has(file)); - - if (files_new.length === 0) { - console.error(`[singlefile] No new downloads yet (${attempt + 1}/${max_tries})`); - continue; - } - - console.error(`[singlefile] New download(s) detected: ${files_new.join(', ')}`); - - // Prefer files that match the URL or have SingleFile markers - const url_variants = new Set([url]); - if (url.endsWith('/')) { - url_variants.add(url.slice(0, -1)); - } else { - url_variants.add(`${url}/`); - } - - const scored = []; - for (const file of files_new) { - const dl_path = path.join(downloadsDir, file); - let header = ''; - try { - const dl_text = 
await fs.promises.readFile(dl_path, 'utf-8'); - header = dl_text.slice(0, 200000); - const stat = await fs.promises.stat(dl_path); - console.error(`[singlefile] Download ${file} size=${stat.size} bytes`); - } catch (err) { - // Skip unreadable files - continue; - } - - const header_lower = header.toLowerCase(); - const has_url = Array.from(url_variants).some(v => header.includes(v)); - const has_singlefile_marker = header_lower.includes('singlefile') || header_lower.includes('single-file'); - const score = (has_url ? 2 : 0) + (has_singlefile_marker ? 1 : 0); - scored.push({ file, dl_path, score }); - } - - scored.sort((a, b) => b.score - a.score); - - if (scored.length > 0) { - const best = scored[0]; - if (best.score > 0 || files_new.length === 1) { - console.error(`[singlefile] Moving download from ${best.file} -> ${out_path}`); - await fs.promises.rename(best.dl_path, out_path); - const out_stat = await fs.promises.stat(out_path); - console.error(`[singlefile] Moved file size=${out_stat.size} bytes`); - return out_path; - } - } - - if (files_new.length > 0) { - // Fallback: move the newest file if no clear match found - let newest = null; - let newest_mtime = -1; - for (const file of files_new) { - const dl_path = path.join(downloadsDir, file); - try { - const stat = await fs.promises.stat(dl_path); - if (stat.mtimeMs > newest_mtime) { - newest_mtime = stat.mtimeMs; - newest = { file, dl_path }; - } - } catch (err) {} - } - if (newest) { - console.error(`[singlefile] Moving newest download from ${newest.file} -> ${out_path}`); - await fs.promises.rename(newest.dl_path, out_path); - const out_stat = await fs.promises.stat(out_path); - console.error(`[singlefile] Moved file size=${out_stat.size} bytes`); - return out_path; - } - } - } - - console.error(`[singlefile] Failed to find SingleFile HTML in ${downloadsDir} after ${(check_delay * max_tries) / 1000}s`); - console.error(`[singlefile] New files seen: ${files_new.join(', ')}`); - return null; -} - -/** - * 
Save a page using single-file-cli (fallback method) - * - * @param {string} url - URL to archive - * @param {Object} options - Additional options - * @returns {Promise} - Path to saved file or null on failure - */ -async function saveSinglefileWithCLI(url, options = {}) { - console.log('[*] Falling back to single-file-cli...'); - - // Find single-file binary - let binary = null; - try { - const { stdout } = await execAsync('which single-file'); - binary = stdout.trim(); - } catch (err) { - console.error('[āŒ] single-file-cli not found. Install with: npm install -g single-file-cli'); - return null; - } - - // Output directory is current directory (hook already runs in output dir) - const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE); - - // Build command - const cmd = [ - binary, - '--browser-headless', - url, - out_path, - ]; - - // Add optional args - if (options.userAgent) { - cmd.splice(2, 0, '--browser-user-agent', options.userAgent); - } - if (options.cookiesFile && fs.existsSync(options.cookiesFile)) { - cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile); - } - if (options.ignoreSSL) { - cmd.splice(2, 0, '--browser-ignore-insecure-certs'); - } - - // Execute - try { - const timeout = options.timeout || 120000; - await execAsync(cmd.join(' '), { timeout }); - - if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) { - console.log(`[+] SingleFile saved via CLI: ${out_path}`); - return out_path; - } - - console.error('[āŒ] SingleFile CLI completed but no output file found'); - return null; - } catch (err) { - console.error(`[āŒ] SingleFile CLI error: ${err.message}`); - return null; - } -} - -/** - * Main entry point - install extension before archiving - */ -async function main() { - // Check if extension is already cached - const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json'); - - if (fs.existsSync(cacheFile)) { - try { - const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8')); - const manifestPath = 
path.join(cached.unpacked_path, 'manifest.json'); - - if (fs.existsSync(manifestPath)) { - console.log('[*] SingleFile extension already installed (using cache)'); - return cached; - } - } catch (e) { - // Cache file corrupted, re-install - console.warn('[āš ļø] Extension cache corrupted, re-installing...'); - } - } - - // Install extension - const extension = await installSinglefileExtension(); - - // Export extension metadata for chrome plugin to load - if (extension) { - // Write extension info to a cache file that chrome plugin can read - await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true }); - await fs.promises.writeFile( - cacheFile, - JSON.stringify(extension, null, 2) - ); - console.log(`[+] Extension metadata written to ${cacheFile}`); - } - - return extension; -} - -// Export functions for use by other plugins -module.exports = { - EXTENSION, - installSinglefileExtension, - saveSinglefileWithExtension, - saveSinglefileWithCLI, -}; - -// Run if executed directly -if (require.main === module) { - main().then(() => { - console.log('[āœ“] SingleFile extension setup complete'); - process.exit(0); - }).catch(err => { - console.error('[āŒ] SingleFile extension setup failed:', err); - process.exit(1); - }); -} diff --git a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py deleted file mode 100644 index 4d91e0e7..00000000 --- a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py +++ /dev/null @@ -1,397 +0,0 @@ -#!/usr/bin/env python3 -""" -Archive a URL using SingleFile. 
- -Usage: on_Snapshot__singlefile.py --url= --snapshot-id= -Output: Writes singlefile.html to $PWD - -Environment variables: - SINGLEFILE_ENABLED: Enable SingleFile archiving (default: True) - SINGLEFILE_BINARY: Path to SingleFile binary (default: single-file) - SINGLEFILE_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY) - SINGLEFILE_CHROME_BINARY: Path to Chrome binary (x-fallback: CHROME_BINARY) [unused; shared Chrome session required] - SINGLEFILE_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) - SINGLEFILE_USER_AGENT: User agent string (x-fallback: USER_AGENT) - SINGLEFILE_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE) - SINGLEFILE_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY) - SINGLEFILE_CHROME_ARGS: Chrome command-line arguments (x-fallback: CHROME_ARGS) [unused; shared Chrome session required] - SINGLEFILE_ARGS: Default SingleFile arguments (JSON array) - SINGLEFILE_ARGS_EXTRA: Extra arguments to append (JSON array) -""" - -import json -import os -import subprocess -import sys -import threading -import time -from urllib.request import urlopen -from pathlib import Path -import shutil - -import rich_click as click - - -# Extractor metadata -PLUGIN_NAME = 'singlefile' -BIN_NAME = 'single-file' -BIN_PROVIDERS = 'npm,env' -OUTPUT_DIR = '.' 
-OUTPUT_FILE = 'singlefile.html' -EXTENSION_SAVE_SCRIPT = Path(__file__).parent / 'singlefile_extension_save.js' - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def get_env_int(name: str, default: int = 0) -> int: - try: - return int(get_env(name, str(default))) - except ValueError: - return default - - -def get_env_array(name: str, default: list[str] | None = None) -> list[str]: - """Parse a JSON array from environment variable.""" - val = get_env(name, '') - if not val: - return default if default is not None else [] - try: - result = json.loads(val) - if isinstance(result, list): - return [str(item) for item in result] - return default if default is not None else [] - except json.JSONDecodeError: - return default if default is not None else [] - - -STATICFILE_DIR = '../staticfile' - -def has_staticfile_output() -> bool: - """Check if staticfile extractor already downloaded this URL.""" - staticfile_dir = Path(STATICFILE_DIR) - if not staticfile_dir.exists(): - return False - stdout_log = staticfile_dir / 'stdout.log' - if not stdout_log.exists(): - return False - for line in stdout_log.read_text(errors='ignore').splitlines(): - line = line.strip() - if not line.startswith('{'): - continue - try: - record = json.loads(line) - except json.JSONDecodeError: - continue - if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded': - return True - return False - - -# Chrome session directory (relative to extractor output dir) -# Note: Chrome binary is obtained via CHROME_BINARY env var, not searched for. -# The centralized Chrome binary search is in chrome_utils.js findChromium(). 
-CHROME_SESSION_DIR = '../chrome' - - -def get_cdp_url(wait_seconds: float = 0.0) -> str | None: - """Get CDP URL from chrome plugin if available.""" - cdp_file = Path(CHROME_SESSION_DIR) / 'cdp_url.txt' - deadline = time.time() + max(wait_seconds, 0.0) - while True: - if cdp_file.exists(): - cdp_url = cdp_file.read_text().strip() - return cdp_url or None - if time.time() >= deadline: - return None - time.sleep(0.2) - - -def get_port_from_cdp_url(cdp_url: str) -> str | None: - """Extract port from CDP WebSocket URL (ws://127.0.0.1:PORT/...).""" - import re - match = re.search(r':(\d+)/', cdp_url) - if match: - return match.group(1) - return None - - -def is_cdp_server_available(cdp_remote_url: str) -> bool: - try: - with urlopen(f'{cdp_remote_url}/json/version', timeout=1) as resp: - return resp.status == 200 - except Exception: - return False - - -def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: - """ - Archive URL using SingleFile. - - Requires a Chrome session (from chrome plugin) and connects to it via CDP. 
- - Returns: (success, output_path, error_message) - """ - print(f'[singlefile] CLI mode start url={url}', file=sys.stderr) - # Get config from env (with SINGLEFILE_ prefix, x-fallback handled by config loader) - timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120) - user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '') - check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', True) if get_env('SINGLEFILE_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True) - cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '') - singlefile_args = get_env_array('SINGLEFILE_ARGS', []) - singlefile_args_extra = get_env_array('SINGLEFILE_ARGS_EXTRA', []) - # Chrome args/binary are intentionally ignored because we require a shared Chrome session - - cmd = [binary, *singlefile_args] - - # Try to use existing Chrome session via CDP (prefer HTTP base URL) - cdp_wait = min(10, max(1, timeout // 10)) - cdp_url = get_cdp_url(wait_seconds=cdp_wait) - cdp_remote_url = None - if cdp_url: - if cdp_url.startswith(('http://', 'https://')): - cdp_remote_url = cdp_url - else: - port = get_port_from_cdp_url(cdp_url) - if port: - cdp_remote_url = f'http://127.0.0.1:{port}' - else: - cdp_remote_url = cdp_url - - if cdp_remote_url and not is_cdp_server_available(cdp_remote_url): - cdp_remote_url = None - - if cdp_remote_url: - print(f'[singlefile] Using existing Chrome session: {cdp_remote_url}', file=sys.stderr) - cmd.extend(['--browser-server', cdp_remote_url]) - else: - return False, None, 'No Chrome session found (chrome plugin must run first)' - - # SSL handling - if not check_ssl: - cmd.append('--browser-ignore-insecure-certs') - - if user_agent: - cmd.extend(['--user-agent', user_agent]) - - if cookies_file and Path(cookies_file).is_file(): - cmd.extend(['--browser-cookies-file', cookies_file]) - - # Add extra args from config - if singlefile_args_extra: - cmd.extend(singlefile_args_extra) - - # Output 
directory is current directory (hook already runs in output dir) - output_dir = Path(OUTPUT_DIR) - output_path = output_dir / OUTPUT_FILE - - cmd.extend([url, str(output_path)]) - print(f'[singlefile] CLI command: {" ".join(cmd[:6])} ...', file=sys.stderr) - - try: - output_lines: list[str] = [] - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - ) - - def _read_output() -> None: - if not process.stdout: - return - for line in process.stdout: - output_lines.append(line) - sys.stderr.write(line) - - reader = threading.Thread(target=_read_output, daemon=True) - reader.start() - - try: - process.wait(timeout=timeout) - except subprocess.TimeoutExpired: - process.kill() - reader.join(timeout=1) - return False, None, f'Timed out after {timeout} seconds' - - reader.join(timeout=1) - combined_output = ''.join(output_lines) - - if output_path.exists() and output_path.stat().st_size > 0: - return True, str(output_path), '' - else: - stderr = combined_output - if 'ERR_NAME_NOT_RESOLVED' in stderr: - return False, None, 'DNS resolution failed' - if 'ERR_CONNECTION_REFUSED' in stderr: - return False, None, 'Connection refused' - detail = (stderr or '').strip() - if len(detail) > 2000: - detail = detail[:2000] - cmd_preview = list(cmd) - if '--browser-args' in cmd_preview: - idx = cmd_preview.index('--browser-args') - if idx + 1 < len(cmd_preview): - cmd_preview[idx + 1] = '' - cmd_str = ' '.join(cmd_preview) - return False, None, f'SingleFile failed (cmd={cmd_str}): {detail}' - - except subprocess.TimeoutExpired: - return False, None, f'Timed out after {timeout} seconds' - except Exception as e: - return False, None, f'{type(e).__name__}: {e}' - - -def save_singlefile_with_extension(url: str, timeout: int) -> tuple[bool, str | None, str]: - """Save using the SingleFile Chrome extension via existing Chrome session.""" - print(f'[singlefile] Extension mode start url={url}', file=sys.stderr) - # Only attempt if 
chrome session exists - cdp_url = get_cdp_url(wait_seconds=min(5, max(1, timeout // 10))) - if not cdp_url: - print('[singlefile] No Chrome session found (chrome plugin must run first)', file=sys.stderr) - return False, None, 'No Chrome session found (chrome plugin must run first)' - - if not EXTENSION_SAVE_SCRIPT.exists(): - print(f'[singlefile] Missing helper script: {EXTENSION_SAVE_SCRIPT}', file=sys.stderr) - return False, None, 'SingleFile extension helper script missing' - - node_binary = get_env('SINGLEFILE_NODE_BINARY') or get_env('NODE_BINARY', 'node') - downloads_dir = get_env('CHROME_DOWNLOADS_DIR', '') - extensions_dir = get_env('CHROME_EXTENSIONS_DIR', '') - cmd = [node_binary, str(EXTENSION_SAVE_SCRIPT), f'--url={url}'] - print(f'[singlefile] cdp_url={cdp_url}', file=sys.stderr) - print(f'[singlefile] node={node_binary}', file=sys.stderr) - node_resolved = shutil.which(node_binary) if node_binary else None - print(f'[singlefile] node_resolved={node_resolved}', file=sys.stderr) - print(f'[singlefile] PATH={os.environ.get("PATH","")}', file=sys.stderr) - if downloads_dir: - print(f'[singlefile] CHROME_DOWNLOADS_DIR={downloads_dir}', file=sys.stderr) - if extensions_dir: - print(f'[singlefile] CHROME_EXTENSIONS_DIR={extensions_dir}', file=sys.stderr) - print(f'[singlefile] helper_cmd={" ".join(cmd)}', file=sys.stderr) - - try: - output_lines: list[str] = [] - error_lines: list[str] = [] - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - bufsize=1, - ) - - def _read_stream(stream, sink, label: str) -> None: - if not stream: - return - for line in stream: - sink.append(line) - sys.stderr.write(line) - sys.stderr.flush() - - stdout_thread = threading.Thread(target=_read_stream, args=(process.stdout, output_lines, 'stdout'), daemon=True) - stderr_thread = threading.Thread(target=_read_stream, args=(process.stderr, error_lines, 'stderr'), daemon=True) - stdout_thread.start() - stderr_thread.start() - - try: 
- process.wait(timeout=timeout) - except subprocess.TimeoutExpired: - process.kill() - stdout_thread.join(timeout=1) - stderr_thread.join(timeout=1) - print(f'[singlefile] Extension helper timed out after {timeout}s', file=sys.stderr) - return False, None, f'Timed out after {timeout} seconds' - - stdout_thread.join(timeout=1) - stderr_thread.join(timeout=1) - - result_stdout = ''.join(output_lines).encode('utf-8', errors='replace') - result_stderr = ''.join(error_lines).encode('utf-8', errors='replace') - result_returncode = process.returncode - except Exception as e: - print(f'[singlefile] Extension helper error: {type(e).__name__}: {e}', file=sys.stderr) - return False, None, f'{type(e).__name__}: {e}' - - print(f'[singlefile] helper_returncode={result_returncode}', file=sys.stderr) - print(f'[singlefile] helper_stdout_len={len(result_stdout or b"")}', file=sys.stderr) - print(f'[singlefile] helper_stderr_len={len(result_stderr or b"")}', file=sys.stderr) - - if result_returncode == 0: - # Prefer explicit stdout path, fallback to local output file - out_text = result_stdout.decode('utf-8', errors='replace').strip() - if out_text and Path(out_text).exists(): - print(f'[singlefile] Extension output: {out_text}', file=sys.stderr) - return True, out_text, '' - output_path = Path(OUTPUT_DIR) / OUTPUT_FILE - if output_path.exists() and output_path.stat().st_size > 0: - print(f'[singlefile] Extension output: {output_path}', file=sys.stderr) - return True, str(output_path), '' - return False, None, 'SingleFile extension completed but no output file found' - - stderr = result_stderr.decode('utf-8', errors='replace').strip() - stdout = result_stdout.decode('utf-8', errors='replace').strip() - detail = stderr or stdout - return False, None, detail or 'SingleFile extension failed' - - -@click.command() -@click.option('--url', required=True, help='URL to archive') -@click.option('--snapshot-id', required=True, help='Snapshot UUID') -def main(url: str, snapshot_id: str): - 
"""Archive a URL using SingleFile.""" - - print(f'[singlefile] Hook starting pid={os.getpid()} url={url}', file=sys.stderr) - output = None - status = 'failed' - error = '' - - try: - # Check if SingleFile is enabled - if not get_env_bool('SINGLEFILE_ENABLED', True): - print('Skipping SingleFile (SINGLEFILE_ENABLED=False)', file=sys.stderr) - # Feature disabled - no ArchiveResult, just exit - sys.exit(0) - - # Check if staticfile extractor already handled this (permanent skip) - if has_staticfile_output(): - print('Skipping SingleFile - staticfile extractor already downloaded this', file=sys.stderr) - print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'})) - sys.exit(0) - - # Prefer SingleFile extension via existing Chrome session - timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120) - success, output, error = save_singlefile_with_extension(url, timeout) - status = 'succeeded' if success else 'failed' - - except Exception as e: - error = f'{type(e).__name__}: {e}' - status = 'failed' - - if error: - print(f'ERROR: {error}', file=sys.stderr) - - # Output clean JSONL (no RESULT_JSON= prefix) - result = { - 'type': 'ArchiveResult', - 'status': status, - 'output_str': output or error or '', - } - print(json.dumps(result)) - - sys.exit(0 if status == 'succeeded' else 1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/singlefile/singlefile_extension_save.js b/archivebox/plugins/singlefile/singlefile_extension_save.js deleted file mode 100644 index 7bb8138e..00000000 --- a/archivebox/plugins/singlefile/singlefile_extension_save.js +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env node -/** - * Save a page using the SingleFile Chrome extension via an existing Chrome session. 
- * - * Usage: singlefile_extension_save.js --url= - * Output: prints saved file path on success - */ - -const fs = require('fs'); -const path = require('path'); - -const CHROME_SESSION_DIR = '../chrome'; -const DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR || - path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads'); - -process.env.CHROME_DOWNLOADS_DIR = DOWNLOADS_DIR; - -async function setDownloadDir(page, downloadDir) { - try { - await fs.promises.mkdir(downloadDir, { recursive: true }); - const client = await page.target().createCDPSession(); - try { - await client.send('Page.setDownloadBehavior', { - behavior: 'allow', - downloadPath: downloadDir, - }); - } catch (err) { - // Fallback for newer protocol versions - await client.send('Browser.setDownloadBehavior', { - behavior: 'allow', - downloadPath: downloadDir, - }); - } - } catch (err) { - console.error(`[āš ļø] Failed to set download directory: ${err.message || err}`); - } -} - -function parseArgs() { - const args = {}; - process.argv.slice(2).forEach((arg) => { - if (arg.startsWith('--')) { - const [key, ...valueParts] = arg.slice(2).split('='); - args[key.replace(/-/g, '_')] = valueParts.join('=') || true; - } - }); - return args; -} - -async function main() { - const args = parseArgs(); - const url = args.url; - - if (!url) { - console.error('Usage: singlefile_extension_save.js --url='); - process.exit(1); - } - - console.error(`[singlefile] helper start url=${url}`); - console.error(`[singlefile] downloads_dir=${DOWNLOADS_DIR}`); - if (process.env.CHROME_EXTENSIONS_DIR) { - console.error(`[singlefile] extensions_dir=${process.env.CHROME_EXTENSIONS_DIR}`); - } - - try { - console.error('[singlefile] loading dependencies...'); - const puppeteer = require('puppeteer-core'); - const chromeUtils = require('../chrome/chrome_utils.js'); - const { - EXTENSION, - saveSinglefileWithExtension, - } = require('./on_Crawl__82_singlefile_install.js'); - 
console.error('[singlefile] dependencies loaded'); - - // Ensure extension is installed and metadata is cached - console.error('[singlefile] ensuring extension cache...'); - const extension = await chromeUtils.installExtensionWithCache( - EXTENSION, - { extensionsDir: process.env.CHROME_EXTENSIONS_DIR } - ); - if (!extension) { - console.error('[āŒ] SingleFile extension not installed'); - process.exit(2); - } - if (extension.unpacked_path) { - const runtimeId = chromeUtils.getExtensionId(extension.unpacked_path); - if (runtimeId) { - extension.id = runtimeId; - } - } - console.error(`[singlefile] extension ready id=${extension.id} version=${extension.version}`); - - // Connect to existing Chrome session - console.error('[singlefile] connecting to chrome session...'); - const { browser, page } = await chromeUtils.connectToPage({ - chromeSessionDir: CHROME_SESSION_DIR, - timeoutMs: 60000, - puppeteer, - }); - console.error('[singlefile] connected to chrome'); - - try { - // Ensure CDP target discovery is enabled so service_worker targets appear - try { - const client = await page.createCDPSession(); - await client.send('Target.setDiscoverTargets', { discover: true }); - await client.send('Target.setAutoAttach', { autoAttach: true, waitForDebuggerOnStart: false, flatten: true }); - } catch (err) { - console.error(`[singlefile] failed to enable target discovery: ${err.message || err}`); - } - - // Wait for extension target to be available, then attach dispatchAction - console.error('[singlefile] waiting for extension target...'); - const deadline = Date.now() + 30000; - let matchTarget = null; - let matchInfo = null; - let lastLog = 0; - const wantedName = (extension.name || 'singlefile').toLowerCase(); - - while (Date.now() < deadline && !matchTarget) { - const targets = browser.targets(); - for (const target of targets) { - const info = await chromeUtils.isTargetExtension(target); - if (!info?.target_is_extension || !info?.extension_id) { - continue; - } - const 
manifestName = (info.manifest_name || '').toLowerCase(); - const targetUrl = (info.target_url || '').toLowerCase(); - const nameMatches = manifestName.includes(wantedName) || manifestName.includes('singlefile') || manifestName.includes('single-file'); - const urlMatches = targetUrl.includes('singlefile') || targetUrl.includes('single-file') || targetUrl.includes('single-file-extension'); - if (nameMatches || urlMatches) { - matchTarget = target; - matchInfo = info; - break; - } - } - - if (!matchTarget) { - if (Date.now() - lastLog > 5000) { - const targetsSummary = []; - for (const target of targets) { - const info = await chromeUtils.isTargetExtension(target); - if (!info?.target_is_extension) { - continue; - } - targetsSummary.push({ - type: info.target_type, - url: info.target_url, - extensionId: info.extension_id, - manifestName: info.manifest_name, - }); - } - console.error(`[singlefile] waiting... targets total=${targets.length} extensions=${targetsSummary.length} details=${JSON.stringify(targetsSummary)}`); - lastLog = Date.now(); - } - await new Promise(r => setTimeout(r, 500)); - } - } - - if (!matchTarget || !matchInfo) { - const targets = chromeUtils.getExtensionTargets(browser); - console.error(`[singlefile] extension target not found (name=${extension.name})`); - console.error(`[singlefile] available targets: ${JSON.stringify(targets)}`); - await browser.disconnect(); - process.exit(5); - } - - // Use the runtime extension id from the matched target - extension.id = matchInfo.extension_id; - - console.error('[singlefile] loading extension from target...'); - await chromeUtils.loadExtensionFromTarget([extension], matchTarget); - if (typeof extension.dispatchAction !== 'function') { - const targets = chromeUtils.getExtensionTargets(browser); - console.error(`[singlefile] extension dispatchAction missing for id=${extension.id}`); - console.error(`[singlefile] available targets: ${JSON.stringify(targets)}`); - await browser.disconnect(); - 
process.exit(6); - } - console.error('[singlefile] setting download dir...'); - await setDownloadDir(page, DOWNLOADS_DIR); - - console.error('[singlefile] triggering save via extension...'); - const output = await saveSinglefileWithExtension(page, extension, { downloadsDir: DOWNLOADS_DIR }); - if (output && fs.existsSync(output)) { - console.error(`[singlefile] saved: ${output}`); - console.log(output); - await browser.disconnect(); - process.exit(0); - } - - console.error('[āŒ] SingleFile extension did not produce output'); - await browser.disconnect(); - process.exit(3); - } catch (err) { - await browser.disconnect(); - throw err; - } - } catch (err) { - console.error(`[āŒ] ${err.message || err}`); - process.exit(4); - } -} - -if (require.main === module) { - main(); -} diff --git a/archivebox/plugins/singlefile/templates/card.html b/archivebox/plugins/singlefile/templates/card.html deleted file mode 100644 index 5d7e5614..00000000 --- a/archivebox/plugins/singlefile/templates/card.html +++ /dev/null @@ -1,8 +0,0 @@ - -
- -
diff --git a/archivebox/plugins/singlefile/templates/icon.html b/archivebox/plugins/singlefile/templates/icon.html deleted file mode 100644 index cd055f8b..00000000 --- a/archivebox/plugins/singlefile/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/singlefile/tests/test_singlefile.py b/archivebox/plugins/singlefile/tests/test_singlefile.py deleted file mode 100644 index 8de0a163..00000000 --- a/archivebox/plugins/singlefile/tests/test_singlefile.py +++ /dev/null @@ -1,304 +0,0 @@ -""" -Integration tests for singlefile plugin - -Tests verify: -1. Hook scripts exist with correct naming -2. CLI-based singlefile extraction works -3. Dependencies available via abx-pkg -4. Output contains valid HTML -5. Connects to Chrome session via CDP when available -6. Works with extensions loaded (ublock, etc.) -""" - -import json -import os -import subprocess -import sys -import tempfile -from pathlib import Path - -import pytest - -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_test_env, - get_plugin_dir, - get_hook_script, - chrome_session, - cleanup_chrome, -) - - -PLUGIN_DIR = get_plugin_dir(__file__) -SNAPSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_singlefile.py') -INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__82_singlefile_install.js' -TEST_URL = "https://example.com" - - -def test_snapshot_hook_exists(): - """Verify snapshot extraction hook exists""" - assert SNAPSHOT_HOOK is not None and SNAPSHOT_HOOK.exists(), f"Snapshot hook not found in {PLUGIN_DIR}" - - -def test_snapshot_hook_priority(): - """Test that snapshot hook has correct priority (50)""" - filename = SNAPSHOT_HOOK.name - assert "50" in filename, "SingleFile snapshot hook should have priority 50" - assert filename.startswith("on_Snapshot__50_"), "Should follow priority naming convention" - - -def test_verify_deps_with_abx_pkg(): - """Verify dependencies are available via abx-pkg.""" - from abx_pkg import Binary, EnvProvider - - 
EnvProvider.model_rebuild() - - # Verify node is available - node_binary = Binary(name='node', binproviders=[EnvProvider()]) - node_loaded = node_binary.load() - assert node_loaded and node_loaded.abspath, "Node.js required for singlefile plugin" - - -def test_singlefile_cli_archives_example_com(): - """Test that singlefile archives example.com and produces valid HTML.""" - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - data_dir = tmpdir / 'data' - extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions' - downloads_dir = data_dir / 'personas' / 'Default' / 'chrome_downloads' - user_data_dir = data_dir / 'personas' / 'Default' / 'chrome_user_data' - extensions_dir.mkdir(parents=True, exist_ok=True) - downloads_dir.mkdir(parents=True, exist_ok=True) - user_data_dir.mkdir(parents=True, exist_ok=True) - - env_install = os.environ.copy() - env_install.update({ - 'DATA_DIR': str(data_dir), - 'CHROME_EXTENSIONS_DIR': str(extensions_dir), - 'CHROME_DOWNLOADS_DIR': str(downloads_dir), - }) - - result = subprocess.run( - ['node', str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env_install, - timeout=120, - ) - assert result.returncode == 0, f"Extension install failed: {result.stderr}" - - old_env = os.environ.copy() - os.environ['CHROME_USER_DATA_DIR'] = str(user_data_dir) - os.environ['CHROME_DOWNLOADS_DIR'] = str(downloads_dir) - os.environ['CHROME_EXTENSIONS_DIR'] = str(extensions_dir) - try: - with chrome_session( - tmpdir=tmpdir, - crawl_id='singlefile-cli-crawl', - snapshot_id='singlefile-cli-snap', - test_url=TEST_URL, - navigate=True, - timeout=30, - ) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env): - env['SINGLEFILE_ENABLED'] = 'true' - env['CHROME_EXTENSIONS_DIR'] = str(extensions_dir) - env['CHROME_DOWNLOADS_DIR'] = str(downloads_dir) - - singlefile_output_dir = snapshot_chrome_dir.parent / 'singlefile' - singlefile_output_dir.mkdir(parents=True, exist_ok=True) - - # Run singlefile snapshot hook 
- result = subprocess.run( - [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'], - cwd=singlefile_output_dir, - capture_output=True, - text=True, - env=env, - timeout=120, - ) - finally: - os.environ.clear() - os.environ.update(old_env) - - assert result.returncode == 0, f"Hook execution failed: {result.stderr}" - - # Verify output file exists - output_file = singlefile_output_dir / 'singlefile.html' - assert output_file.exists(), f"singlefile.html not created. stdout: {result.stdout}, stderr: {result.stderr}" - - # Verify it contains real HTML - html_content = output_file.read_text() - assert len(html_content) > 500, "Output file too small to be valid HTML" - assert '' in html_content or ' 500, "Output file too small" - assert 'Example Domain' in html_content, "Should contain example.com content" - else: - # If singlefile couldn't connect to Chrome, it may have failed - # Check if it mentioned browser-server in its args (indicating it tried to use CDP) - assert result.returncode == 0 or 'browser-server' in result.stderr or 'cdp' in result.stderr.lower(), \ - f"Singlefile should attempt CDP connection. 
stderr: {result.stderr}" - - -def test_singlefile_with_extension_uses_existing_chrome(): - """Test SingleFile uses the Chrome extension via existing session (CLI fallback disabled).""" - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - data_dir = tmpdir / 'data' - extensions_dir = data_dir / 'personas' / 'Default' / 'chrome_extensions' - downloads_dir = data_dir / 'personas' / 'Default' / 'chrome_downloads' - user_data_dir = data_dir / 'personas' / 'Default' / 'chrome_user_data' - extensions_dir.mkdir(parents=True, exist_ok=True) - downloads_dir.mkdir(parents=True, exist_ok=True) - user_data_dir.mkdir(parents=True, exist_ok=True) - - env_install = os.environ.copy() - env_install.update({ - 'DATA_DIR': str(data_dir), - 'CHROME_EXTENSIONS_DIR': str(extensions_dir), - 'CHROME_DOWNLOADS_DIR': str(downloads_dir), - }) - - # Install SingleFile extension cache before launching Chrome - result = subprocess.run( - ['node', str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env_install, - timeout=120 - ) - assert result.returncode == 0, f"Extension install failed: {result.stderr}" - - # Launch Chrome session with extensions loaded - old_env = os.environ.copy() - os.environ['CHROME_USER_DATA_DIR'] = str(user_data_dir) - os.environ['CHROME_DOWNLOADS_DIR'] = str(downloads_dir) - os.environ['CHROME_EXTENSIONS_DIR'] = str(extensions_dir) - try: - with chrome_session( - tmpdir=tmpdir, - crawl_id='singlefile-ext-crawl', - snapshot_id='singlefile-ext-snap', - test_url=TEST_URL, - navigate=True, - timeout=30, - ) as (_chrome_proc, _chrome_pid, snapshot_chrome_dir, env): - singlefile_output_dir = tmpdir / 'snapshot' / 'singlefile' - singlefile_output_dir.mkdir(parents=True, exist_ok=True) - - # Ensure ../chrome points to snapshot chrome session (contains target_id.txt) - chrome_dir = singlefile_output_dir.parent / 'chrome' - if not chrome_dir.exists(): - chrome_dir.symlink_to(snapshot_chrome_dir) - - env['SINGLEFILE_ENABLED'] = 'true' - 
env['SINGLEFILE_BINARY'] = '/nonexistent/single-file' # force extension path - env['CHROME_EXTENSIONS_DIR'] = str(extensions_dir) - env['CHROME_DOWNLOADS_DIR'] = str(downloads_dir) - env['CHROME_HEADLESS'] = 'false' - - # Track downloads dir state before run to ensure file is created then moved out - downloads_before = set(downloads_dir.glob('*.html')) - downloads_mtime_before = downloads_dir.stat().st_mtime_ns - - result = subprocess.run( - [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=singlefile-ext-snap'], - cwd=str(singlefile_output_dir), - capture_output=True, - text=True, - env=env, - timeout=120 - ) - - assert result.returncode == 0, f"SingleFile extension run failed: {result.stderr}" - - output_file = singlefile_output_dir / 'singlefile.html' - assert output_file.exists(), f"singlefile.html not created. stdout: {result.stdout}, stderr: {result.stderr}" - html_content = output_file.read_text(errors='ignore') - assert 'Example Domain' in html_content, "Output should contain example.com content" - - # Verify download moved out of downloads dir - downloads_after = set(downloads_dir.glob('*.html')) - new_downloads = downloads_after - downloads_before - downloads_mtime_after = downloads_dir.stat().st_mtime_ns - assert downloads_mtime_after != downloads_mtime_before, "Downloads dir should be modified during extension save" - assert not new_downloads, f"SingleFile download should be moved out of downloads dir, found: {new_downloads}" - finally: - os.environ.clear() - os.environ.update(old_env) - - -def test_singlefile_disabled_skips(): - """Test that SINGLEFILE_ENABLED=False exits without JSONL.""" - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - env = get_test_env() - env['SINGLEFILE_ENABLED'] = 'False' - - result = subprocess.run( - [sys.executable, str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - assert 
result.returncode == 0, f"Should exit 0 when disabled: {result.stderr}" - - # Should NOT emit JSONL when disabled - jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] - assert len(jsonl_lines) == 0, f"Should not emit JSONL when disabled, but got: {jsonl_lines}" - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/ssl/config.json b/archivebox/plugins/ssl/config.json deleted file mode 100644 index d83dbfd3..00000000 --- a/archivebox/plugins/ssl/config.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "SSL_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_SSL", "USE_SSL"], - "description": "Enable SSL certificate capture" - }, - "SSL_TIMEOUT": { - "type": "integer", - "default": 30, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for SSL capture in seconds" - } - } -} diff --git a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js b/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js deleted file mode 100755 index 6559d9fd..00000000 --- a/archivebox/plugins/ssl/on_Snapshot__23_ssl.bg.js +++ /dev/null @@ -1,197 +0,0 @@ -#!/usr/bin/env node -/** - * Extract SSL/TLS certificate details from a URL. - * - * This hook sets up CDP listeners BEFORE chrome_navigate loads the page, - * then waits for navigation to complete. The listener captures SSL details - * during the navigation request. 
- * - * Usage: on_Snapshot__23_ssl.js --url= --snapshot-id= - * Output: Writes ssl.jsonl - */ - -const fs = require('fs'); -const path = require('path'); - -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); - -const puppeteer = require('puppeteer-core'); - -// Import shared utilities from chrome_utils.js -const { - getEnvBool, - getEnvInt, - parseArgs, - connectToPage, - waitForPageLoaded, -} = require('../chrome/chrome_utils.js'); - -const PLUGIN_NAME = 'ssl'; -const OUTPUT_DIR = '.'; -const OUTPUT_FILE = 'ssl.jsonl'; -const CHROME_SESSION_DIR = '../chrome'; - -let browser = null; -let page = null; -let client = null; -let sslCaptured = false; -let shuttingDown = false; - -async function setupListener(url) { - const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); - const timeout = getEnvInt('SSL_TIMEOUT', 30) * 1000; - let targetHost = null; - - // Only extract SSL for HTTPS URLs - if (!url.startsWith('https://')) { - throw new Error('URL is not HTTPS'); - } - - try { - targetHost = new URL(url).host; - } catch (e) { - targetHost = null; - } - - // Connect to Chrome page using shared utility - const { browser, page } = await connectToPage({ - chromeSessionDir: CHROME_SESSION_DIR, - timeoutMs: timeout, - puppeteer, - }); - - client = await page.target().createCDPSession(); - await client.send('Network.enable'); - - client.on('Network.responseReceived', (params) => { - try { - if (sslCaptured) return; - if (params.type && params.type !== 'Document') return; - const response = params.response || {}; - const responseUrl = response.url || ''; - if (!responseUrl.startsWith('http')) return; - - if (targetHost) { - try { - const responseHost = new URL(responseUrl).host; - if (responseHost !== targetHost) return; - } catch (e) { - // Ignore URL parse errors, fall through - } - } - - const securityDetails = response.securityDetails || null; - let sslInfo = { url: responseUrl }; - 
- if (securityDetails) { - sslInfo.protocol = securityDetails.protocol; - sslInfo.subjectName = securityDetails.subjectName; - sslInfo.issuer = securityDetails.issuer; - sslInfo.validFrom = securityDetails.validFrom; - sslInfo.validTo = securityDetails.validTo; - sslInfo.certificateId = securityDetails.subjectName; - sslInfo.securityState = response.securityState || 'secure'; - sslInfo.schemeIsCryptographic = true; - - const sanList = securityDetails.sanList; - if (sanList && sanList.length > 0) { - sslInfo.subjectAlternativeNames = sanList; - } - } else if (responseUrl.startsWith('https://')) { - sslInfo.securityState = response.securityState || 'unknown'; - sslInfo.schemeIsCryptographic = true; - sslInfo.error = 'No security details available'; - } else { - sslInfo.securityState = 'insecure'; - sslInfo.schemeIsCryptographic = false; - } - - fs.writeFileSync(outputPath, JSON.stringify(sslInfo, null, 2)); - sslCaptured = true; - } catch (e) { - // Ignore errors - } - }); - - return { browser, page }; -} - -function emitResult(status = 'succeeded') { - if (shuttingDown) return; - shuttingDown = true; - - const outputStr = sslCaptured ? 
OUTPUT_FILE : OUTPUT_FILE; - console.log(JSON.stringify({ - type: 'ArchiveResult', - status, - output_str: outputStr, - })); -} - -async function handleShutdown(signal) { - console.error(`\nReceived ${signal}, emitting final results...`); - emitResult('succeeded'); - if (browser) { - try { - browser.disconnect(); - } catch (e) {} - } - process.exit(0); -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__23_ssl.js --url= --snapshot-id='); - process.exit(1); - } - - if (!getEnvBool('SSL_ENABLED', true)) { - console.error('Skipping (SSL_ENABLED=False)'); - console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'SSL_ENABLED=False'})); - process.exit(0); - } - - try { - // Set up listener BEFORE navigation - const connection = await setupListener(url); - browser = connection.browser; - page = connection.page; - - // Register signal handlers for graceful shutdown - process.on('SIGTERM', () => handleShutdown('SIGTERM')); - process.on('SIGINT', () => handleShutdown('SIGINT')); - - // Wait for chrome_navigate to complete (non-fatal) - try { - const timeout = getEnvInt('SSL_TIMEOUT', 30) * 1000; - await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4); - } catch (e) { - console.error(`WARN: ${e.message}`); - } - - // console.error('SSL listener active, waiting for cleanup signal...'); - await new Promise(() => {}); // Keep alive until SIGTERM - return; - - } catch (e) { - const error = `${e.name}: ${e.message}`; - console.error(`ERROR: ${error}`); - - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'failed', - output_str: error, - })); - process.exit(1); - } -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/ssl/templates/icon.html b/archivebox/plugins/ssl/templates/icon.html deleted file mode 100644 index 
1707e8b9..00000000 --- a/archivebox/plugins/ssl/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/ssl/tests/test_ssl.py b/archivebox/plugins/ssl/tests/test_ssl.py deleted file mode 100644 index 6f8375c1..00000000 --- a/archivebox/plugins/ssl/tests/test_ssl.py +++ /dev/null @@ -1,147 +0,0 @@ -""" -Tests for the SSL plugin. - -Tests the real SSL hook with an actual HTTPS URL to verify -certificate information extraction. -""" - -import json -import shutil -import subprocess -import sys -import tempfile -import time -from pathlib import Path - -from django.test import TestCase - -# Import chrome test helpers -sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests')) -from chrome_test_helpers import ( - chrome_session, - CHROME_NAVIGATE_HOOK, - get_plugin_dir, - get_hook_script, -) - - -# Get the path to the SSL hook -PLUGIN_DIR = get_plugin_dir(__file__) -SSL_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_ssl.*') - - -class TestSSLPlugin(TestCase): - """Test the SSL plugin with real HTTPS URLs.""" - - def test_ssl_hook_exists(self): - """SSL hook script should exist.""" - self.assertIsNotNone(SSL_HOOK, "SSL hook not found in plugin directory") - self.assertTrue(SSL_HOOK.exists(), f"Hook not found: {SSL_HOOK}") - - -class TestSSLWithChrome(TestCase): - """Integration tests for SSL plugin with Chrome.""" - - def setUp(self): - """Set up test environment.""" - self.temp_dir = Path(tempfile.mkdtemp()) - - def tearDown(self): - """Clean up.""" - shutil.rmtree(self.temp_dir, ignore_errors=True) - - def test_ssl_extracts_certificate_from_https_url(self): - """SSL hook should extract certificate info from a real HTTPS URL.""" - test_url = 'https://example.com' - snapshot_id = 'test-ssl-snapshot' - - with chrome_session( - self.temp_dir, - crawl_id='test-ssl-crawl', - snapshot_id=snapshot_id, - test_url=test_url, - navigate=False, - timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): - ssl_dir = 
snapshot_chrome_dir.parent / 'ssl' - ssl_dir.mkdir(exist_ok=True) - - # Run SSL hook with the active Chrome session (background hook) - result = subprocess.Popen( - ['node', str(SSL_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(ssl_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - nav_result = subprocess.run( - ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=120, - env=env - ) - self.assertEqual(nav_result.returncode, 0, f"Navigation failed: {nav_result.stderr}") - - # Check for output file - ssl_output = ssl_dir / 'ssl.jsonl' - for _ in range(30): - if ssl_output.exists() and ssl_output.stat().st_size > 0: - break - time.sleep(1) - - if result.poll() is None: - result.terminate() - try: - stdout, stderr = result.communicate(timeout=5) - except subprocess.TimeoutExpired: - result.kill() - stdout, stderr = result.communicate() - else: - stdout, stderr = result.communicate() - - ssl_data = None - - # Try parsing from file first - if ssl_output.exists(): - with open(ssl_output) as f: - content = f.read().strip() - if content.startswith('{'): - try: - ssl_data = json.loads(content) - except json.JSONDecodeError: - pass - - # Try parsing from stdout if not in file - if not ssl_data: - for line in stdout.split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if 'protocol' in record or 'issuer' in record or record.get('type') == 'SSL': - ssl_data = record - break - except json.JSONDecodeError: - continue - - # Verify hook ran successfully - self.assertNotIn('Traceback', stderr) - self.assertNotIn('Error:', stderr) - - # example.com uses HTTPS, so we MUST get SSL certificate data - self.assertIsNotNone(ssl_data, "No SSL data extracted from HTTPS URL") - - # Verify we got certificate info - self.assertIn('protocol', ssl_data, f"SSL data missing 
protocol: {ssl_data}") - self.assertTrue( - ssl_data['protocol'].startswith('TLS') or ssl_data['protocol'].startswith('SSL'), - f"Unexpected protocol: {ssl_data['protocol']}" - ) - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/staticfile/config.json b/archivebox/plugins/staticfile/config.json deleted file mode 100644 index 7e6df43c..00000000 --- a/archivebox/plugins/staticfile/config.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "STATICFILE_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_STATICFILE", "USE_STATICFILE"], - "description": "Enable static file detection" - }, - "STATICFILE_TIMEOUT": { - "type": "integer", - "default": 30, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for static file detection in seconds" - } - } -} diff --git a/archivebox/plugins/staticfile/on_Snapshot__26_staticfile.bg.js b/archivebox/plugins/staticfile/on_Snapshot__26_staticfile.bg.js deleted file mode 100644 index 984e15c7..00000000 --- a/archivebox/plugins/staticfile/on_Snapshot__26_staticfile.bg.js +++ /dev/null @@ -1,366 +0,0 @@ -#!/usr/bin/env node -/** - * Detect and download static files using CDP during initial request. - * - * This hook sets up CDP listeners BEFORE chrome_navigate to capture the - * Content-Type from the initial response. If it's a static file (PDF, image, etc.), - * it downloads the content directly using CDP. 
- * - * Usage: on_Snapshot__26_staticfile.bg.js --url= --snapshot-id= - * Output: Downloads static file - */ - -const fs = require('fs'); -const path = require('path'); - -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); - -const puppeteer = require('puppeteer-core'); - -// Import shared utilities from chrome_utils.js -const { - getEnvBool, - getEnvInt, - parseArgs, - connectToPage, - waitForPageLoaded, -} = require('../chrome/chrome_utils.js'); - -const PLUGIN_NAME = 'staticfile'; -const OUTPUT_DIR = '.'; -const CHROME_SESSION_DIR = '../chrome'; - -// Content-Types that indicate static files -const STATIC_CONTENT_TYPES = new Set([ - // Documents - 'application/pdf', - 'application/msword', - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - 'application/vnd.ms-excel', - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', - 'application/vnd.ms-powerpoint', - 'application/vnd.openxmlformats-officedocument.presentationml.presentation', - 'application/rtf', - 'application/epub+zip', - // Images - 'image/png', - 'image/jpeg', - 'image/gif', - 'image/webp', - 'image/svg+xml', - 'image/x-icon', - 'image/bmp', - 'image/tiff', - 'image/avif', - 'image/heic', - 'image/heif', - // Audio - 'audio/mpeg', - 'audio/mp3', - 'audio/wav', - 'audio/flac', - 'audio/aac', - 'audio/ogg', - 'audio/webm', - 'audio/m4a', - 'audio/opus', - // Video - 'video/mp4', - 'video/webm', - 'video/x-matroska', - 'video/avi', - 'video/quicktime', - 'video/x-ms-wmv', - 'video/x-flv', - // Archives - 'application/zip', - 'application/x-tar', - 'application/gzip', - 'application/x-bzip2', - 'application/x-xz', - 'application/x-7z-compressed', - 'application/x-rar-compressed', - 'application/vnd.rar', - // Data - 'application/json', - 'application/xml', - 'text/csv', - 'text/xml', - 'application/x-yaml', - // Executables/Binaries - 'application/octet-stream', - 
'application/x-executable', - 'application/x-msdos-program', - 'application/x-apple-diskimage', - 'application/vnd.debian.binary-package', - 'application/x-rpm', - // Other - 'application/x-bittorrent', - 'application/wasm', -]); - -const STATIC_CONTENT_TYPE_PREFIXES = [ - 'image/', - 'audio/', - 'video/', - 'application/zip', - 'application/x-', -]; - -// Global state -let originalUrl = ''; -let detectedContentType = null; -let isStaticFile = false; -let downloadedFilePath = null; -let downloadError = null; -let page = null; -let browser = null; - -function isStaticContentType(contentType) { - if (!contentType) return false; - - const ct = contentType.split(';')[0].trim().toLowerCase(); - - // Check exact match - if (STATIC_CONTENT_TYPES.has(ct)) return true; - - // Check prefixes - for (const prefix of STATIC_CONTENT_TYPE_PREFIXES) { - if (ct.startsWith(prefix)) return true; - } - - return false; -} - -function sanitizeFilename(str, maxLen = 200) { - return str - .replace(/[^a-zA-Z0-9._-]/g, '_') - .slice(0, maxLen); -} - -function getFilenameFromUrl(url) { - try { - const pathname = new URL(url).pathname; - const filename = path.basename(pathname) || 'downloaded_file'; - return sanitizeFilename(filename); - } catch (e) { - return 'downloaded_file'; - } -} - -function normalizeUrl(url) { - try { - const parsed = new URL(url); - let path = parsed.pathname || ''; - if (path === '/') path = ''; - return `${parsed.origin}${path}`; - } catch (e) { - return url; - } -} - -async function setupStaticFileListener() { - const timeout = getEnvInt('STATICFILE_TIMEOUT', 30) * 1000; - - // Connect to Chrome page using shared utility - const connection = await connectToPage({ - chromeSessionDir: CHROME_SESSION_DIR, - timeoutMs: timeout, - puppeteer, - }); - browser = connection.browser; - page = connection.page; - - // Track the first response to check Content-Type - let firstResponseHandled = false; - - page.on('response', async (response) => { - if (firstResponseHandled) 
return; - - try { - const url = response.url(); - const headers = response.headers(); - const contentType = headers['content-type'] || ''; - const status = response.status(); - - // Only process the main document response - if (normalizeUrl(url) !== normalizeUrl(originalUrl)) return; - if (status < 200 || status >= 300) return; - - firstResponseHandled = true; - detectedContentType = contentType.split(';')[0].trim(); - - console.error(`Detected Content-Type: ${detectedContentType}`); - - // Check if it's a static file - if (!isStaticContentType(detectedContentType)) { - console.error('Not a static file, skipping download'); - return; - } - - isStaticFile = true; - console.error('Static file detected, downloading...'); - - // Download the file - const maxSize = getEnvInt('STATICFILE_MAX_SIZE', 1024 * 1024 * 1024); // 1GB default - const buffer = await response.buffer(); - - if (buffer.length > maxSize) { - downloadError = `File too large: ${buffer.length} bytes > ${maxSize} max`; - return; - } - - // Determine filename - let filename = getFilenameFromUrl(url); - - // Check content-disposition header for better filename - const contentDisp = headers['content-disposition'] || ''; - if (contentDisp.includes('filename=')) { - const match = contentDisp.match(/filename[*]?=["']?([^"';\n]+)/); - if (match) { - filename = sanitizeFilename(match[1].trim()); - } - } - - const outputPath = path.join(OUTPUT_DIR, filename); - fs.writeFileSync(outputPath, buffer); - - downloadedFilePath = filename; - console.error(`Static file downloaded (${buffer.length} bytes): ${filename}`); - - } catch (e) { - downloadError = `${e.name}: ${e.message}`; - console.error(`Error downloading static file: ${downloadError}`); - } - }); - - return { browser, page }; -} - -function handleShutdown(signal) { - console.error(`\nReceived ${signal}, emitting final results...`); - - let result; - - if (!detectedContentType) { - // No Content-Type detected (shouldn't happen, but handle it) - result = { - 
type: 'ArchiveResult', - status: 'skipped', - output_str: 'No Content-Type detected', - plugin: PLUGIN_NAME, - }; - } else if (!isStaticFile) { - // Not a static file (normal case for HTML pages) - result = { - type: 'ArchiveResult', - status: 'skipped', - output_str: `Not a static file (Content-Type: ${detectedContentType})`, - plugin: PLUGIN_NAME, - content_type: detectedContentType, - }; - } else if (downloadError) { - // Static file but download failed - result = { - type: 'ArchiveResult', - status: 'failed', - output_str: downloadError, - plugin: PLUGIN_NAME, - content_type: detectedContentType, - }; - } else if (downloadedFilePath) { - // Static file downloaded successfully - result = { - type: 'ArchiveResult', - status: 'succeeded', - output_str: downloadedFilePath, - plugin: PLUGIN_NAME, - content_type: detectedContentType, - }; - } else { - // Static file detected but no download happened (unexpected) - result = { - type: 'ArchiveResult', - status: 'failed', - output_str: 'Static file detected but download did not complete', - plugin: PLUGIN_NAME, - content_type: detectedContentType, - }; - } - - console.log(JSON.stringify(result)); - process.exit(0); -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - console.error('Usage: on_Snapshot__26_staticfile.bg.js --url= --snapshot-id='); - process.exit(1); - } - - originalUrl = url; - - if (!getEnvBool('STATICFILE_ENABLED', true)) { - console.error('Skipping (STATICFILE_ENABLED=False)'); - console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'STATICFILE_ENABLED=False'})); - process.exit(0); - } - - const timeout = getEnvInt('STATICFILE_TIMEOUT', 30) * 1000; - - // Register signal handlers for graceful shutdown - process.on('SIGTERM', () => handleShutdown('SIGTERM')); - process.on('SIGINT', () => handleShutdown('SIGINT')); - - try { - // Set up static file listener BEFORE navigation - 
await setupStaticFileListener(); - - // Wait for chrome_navigate to complete (non-fatal) - try { - await waitForPageLoaded(CHROME_SESSION_DIR, timeout * 4, 500); - if (!detectedContentType && page) { - try { - const inferred = await page.evaluate(() => document.contentType || ''); - if (inferred) { - detectedContentType = inferred.split(';')[0].trim(); - if (isStaticContentType(detectedContentType)) { - isStaticFile = true; - } - } - } catch (e) { - // Best-effort only - } - } - } catch (e) { - console.error(`WARN: ${e.message}`); - } - - // Keep process alive until killed by cleanup - // console.error('Static file detection complete, waiting for cleanup signal...'); - - // Keep the process alive indefinitely - await new Promise(() => {}); // Never resolves - - } catch (e) { - const error = `${e.name}: ${e.message}`; - console.error(`ERROR: ${error}`); - - console.log(JSON.stringify({ - type: 'ArchiveResult', - status: 'failed', - output_str: error, - })); - process.exit(1); - } -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/staticfile/templates/card.html b/archivebox/plugins/staticfile/templates/card.html deleted file mode 100644 index 6d16cbfa..00000000 --- a/archivebox/plugins/staticfile/templates/card.html +++ /dev/null @@ -1,24 +0,0 @@ - -
- {% if output_path %} - {% if output_path|lower|slice:"-4:" == ".pdf" or "application/pdf" in output_path %} - - {% elif output_path|lower|slice:"-4:" in ".jpg.png.gif.svg.bmp.webp.avif.heic" or output_path|lower|slice:"-5:" == ".jpeg" %} - - {% elif output_path|lower|slice:"-4:" in ".mp4.webm.mov.avi.mkv" or output_path|lower|slice:"-5:" == ".mpeg" %} - - {% else %} - - {% endif %} - {% endif %} -
diff --git a/archivebox/plugins/staticfile/templates/icon.html b/archivebox/plugins/staticfile/templates/icon.html deleted file mode 100644 index bc71e426..00000000 --- a/archivebox/plugins/staticfile/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/staticfile/tests/test_staticfile.py b/archivebox/plugins/staticfile/tests/test_staticfile.py deleted file mode 100644 index f40b0677..00000000 --- a/archivebox/plugins/staticfile/tests/test_staticfile.py +++ /dev/null @@ -1,123 +0,0 @@ -""" -Tests for the staticfile plugin. - -Tests the real staticfile hook with actual URLs to verify -static file detection and download. -""" - -import json -import shutil -import subprocess -import sys -import tempfile -import time -from pathlib import Path - -import pytest -from django.test import TestCase - -# Import chrome test helpers -sys.path.insert(0, str(Path(__file__).parent.parent.parent / 'chrome' / 'tests')) -from chrome_test_helpers import ( - chrome_session, - get_test_env, - get_plugin_dir, - get_hook_script, -) - - -def chrome_available() -> bool: - """Check if Chrome/Chromium is available.""" - for name in ['chromium', 'chromium-browser', 'google-chrome', 'chrome']: - if shutil.which(name): - return True - return False - - -# Get the path to the staticfile hook -PLUGIN_DIR = get_plugin_dir(__file__) -STATICFILE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_staticfile.*') - - -class TestStaticfilePlugin(TestCase): - """Test the staticfile plugin.""" - - def test_staticfile_hook_exists(self): - """Staticfile hook script should exist.""" - self.assertIsNotNone(STATICFILE_HOOK, "Staticfile hook not found in plugin directory") - self.assertTrue(STATICFILE_HOOK.exists(), f"Hook not found: {STATICFILE_HOOK}") - - -class TestStaticfileWithChrome(TestCase): - """Integration tests for staticfile plugin with Chrome.""" - - def setUp(self): - """Set up test environment.""" - self.temp_dir = Path(tempfile.mkdtemp()) - - def tearDown(self): - 
"""Clean up.""" - shutil.rmtree(self.temp_dir, ignore_errors=True) - - def test_staticfile_skips_html_pages(self): - """Staticfile hook should skip HTML pages (not static files).""" - test_url = 'https://example.com' # HTML page, not a static file - snapshot_id = 'test-staticfile-snapshot' - - try: - with chrome_session( - self.temp_dir, - crawl_id='test-staticfile-crawl', - snapshot_id=snapshot_id, - test_url=test_url, - navigate=True, - timeout=30, - ) as (chrome_process, chrome_pid, snapshot_chrome_dir, env): - # Use the environment from chrome_session (already has CHROME_HEADLESS=true) - - - # Run staticfile hook with the active Chrome session (background hook) - result = subprocess.Popen( - ['node', str(STATICFILE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - - # Allow it to run briefly, then terminate (background hook) - time.sleep(3) - if result.poll() is None: - result.terminate() - try: - stdout, stderr = result.communicate(timeout=5) - except subprocess.TimeoutExpired: - result.kill() - stdout, stderr = result.communicate() - else: - stdout, stderr = result.communicate() - - # Verify hook ran without crash - self.assertNotIn('Traceback', stderr) - - # Parse JSONL output to verify it recognized HTML as non-static - for line in stdout.split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - # HTML pages should be skipped - if record.get('status') == 'skipped': - self.assertIn('Not a static file', record.get('output_str', '')) - break - except json.JSONDecodeError: - continue - - except RuntimeError: - raise - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/title/config.json b/archivebox/plugins/title/config.json deleted file mode 100644 index 550c6de2..00000000 --- 
a/archivebox/plugins/title/config.json +++ /dev/null @@ -1,21 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "TITLE_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_TITLE", "USE_TITLE"], - "description": "Enable title extraction" - }, - "TITLE_TIMEOUT": { - "type": "integer", - "default": 30, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for title extraction in seconds" - } - } -} diff --git a/archivebox/plugins/title/on_Snapshot__54_title.js b/archivebox/plugins/title/on_Snapshot__54_title.js deleted file mode 100644 index af89e779..00000000 --- a/archivebox/plugins/title/on_Snapshot__54_title.js +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/env node -/** - * Extract the title of a URL. - * - * Requires a Chrome session (from chrome plugin) and connects to it via CDP - * to get the page title (which includes JS-rendered content). 
- * - * Usage: on_Snapshot__10_title.js --url= --snapshot-id= - * Output: Writes title/title.txt - * - * Environment variables: - * TITLE_TIMEOUT: Timeout in seconds (default: 30) - */ - -const fs = require('fs'); -const path = require('path'); -const puppeteer = require('puppeteer-core'); - -// Import shared utilities from chrome_utils.js -const { - getEnvInt, - parseArgs, - connectToPage, - waitForPageLoaded, -} = require('../chrome/chrome_utils.js'); - -// Extractor metadata -const PLUGIN_NAME = 'title'; -const OUTPUT_DIR = '.'; -const OUTPUT_FILE = 'title.txt'; -const CHROME_SESSION_DIR = '../chrome'; - -async function extractTitle(url) { - // Output directory is current directory (hook already runs in output dir) - const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE); - const timeoutMs = getEnvInt('TITLE_TIMEOUT', getEnvInt('TIMEOUT', 30)) * 1000; - let browser = null; - - try { - const connection = await connectToPage({ - chromeSessionDir: CHROME_SESSION_DIR, - timeoutMs, - puppeteer, - }); - browser = connection.browser; - const page = connection.page; - - await waitForPageLoaded(CHROME_SESSION_DIR, timeoutMs * 4, 200); - - // Get title from page - let title = await page.title(); - - if (!title) { - // Try getting from DOM directly - title = await page.evaluate(() => { - return document.title || - document.querySelector('meta[property="og:title"]')?.content || - document.querySelector('meta[name="twitter:title"]')?.content || - document.querySelector('h1')?.textContent?.trim(); - }); - } - - if (title) { - fs.writeFileSync(outputPath, title, 'utf8'); - return { success: true, output: outputPath, title, method: 'cdp' }; - } - return { success: false, error: 'No title found in Chrome session' }; - } catch (e) { - return { success: false, error: e.message }; - } finally { - if (browser) { - browser.disconnect(); - } - } -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || 
!snapshotId) { - console.error('Usage: on_Snapshot__10_title.js --url= --snapshot-id='); - process.exit(1); - } - - const startTs = new Date(); - let status = 'failed'; - let output = null; - let error = ''; - let extractedTitle = null; - - try { - const result = await extractTitle(url); - - if (result.success) { - status = 'succeeded'; - output = result.output; - extractedTitle = result.title; - console.error(`Title extracted (${result.method}): ${result.title}`); - } else { - status = 'failed'; - error = result.error; - } - } catch (e) { - error = `${e.name}: ${e.message}`; - status = 'failed'; - } - - const endTs = new Date(); - - if (error) { - console.error(`ERROR: ${error}`); - } - - // Update snapshot title via JSONL - if (status === 'succeeded' && extractedTitle) { - console.log(JSON.stringify({ - type: 'Snapshot', - id: snapshotId, - title: extractedTitle - })); - } - - // Output ArchiveResult JSONL - const archiveResult = { - type: 'ArchiveResult', - status, - output_str: output || error || '', - }; - console.log(JSON.stringify(archiveResult)); - - process.exit(status === 'succeeded' ? 0 : 1); -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/title/templates/icon.html b/archivebox/plugins/title/templates/icon.html deleted file mode 100644 index 0cc05a17..00000000 --- a/archivebox/plugins/title/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/archivebox/plugins/title/tests/test_title.py b/archivebox/plugins/title/tests/test_title.py deleted file mode 100644 index 78b2ffbd..00000000 --- a/archivebox/plugins/title/tests/test_title.py +++ /dev/null @@ -1,277 +0,0 @@ -""" -Integration tests for title plugin - -Tests verify: -1. Plugin script exists -2. Node.js is available -3. Title extraction works for real example.com -4. Output file contains actual page title -5. Handles various title sources (, og:title, twitter:title) -6. 
Config options work (TITLE_TIMEOUT) -""" - -import json -import shutil -import subprocess -import tempfile -from pathlib import Path - -import pytest - -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - get_plugin_dir, - get_hook_script, - parse_jsonl_output, - get_test_env, - chrome_session, - CHROME_NAVIGATE_HOOK, -) - - -PLUGIN_DIR = get_plugin_dir(__file__) -TITLE_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_title.*') -TEST_URL = 'https://example.com' - -def run_title_capture(title_dir, snapshot_chrome_dir, env, url, snapshot_id): - nav_result = subprocess.run( - ['node', str(CHROME_NAVIGATE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=120, - env=env, - ) - result = subprocess.run( - ['node', str(TITLE_HOOK), f'--url={url}', f'--snapshot-id={snapshot_id}'], - cwd=title_dir, - capture_output=True, - text=True, - timeout=60, - env=env, - ) - return nav_result, result - - -def test_hook_script_exists(): - """Verify hook script exists.""" - assert TITLE_HOOK.exists(), f"Hook script not found: {TITLE_HOOK}" - - -def test_extracts_title_from_example_com(): - """Test full workflow: extract title from real example.com.""" - - # Check node is available - if not shutil.which('node'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - with chrome_session(tmpdir, test_url=TEST_URL, navigate=False) as (_process, _pid, snapshot_chrome_dir, env): - title_dir = snapshot_chrome_dir.parent / 'title' - title_dir.mkdir(exist_ok=True) - - nav_result, result = run_title_capture( - title_dir, - snapshot_chrome_dir, - env, - TEST_URL, - 'test789', - ) - assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" - - assert result.returncode == 0, f"Extraction failed: {result.stderr}" - - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if 
line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - # Verify output file exists (hook writes to current directory) - title_file = title_dir / 'title.txt' - assert title_file.exists(), "title.txt not created" - - # Verify title contains REAL example.com title - title_text = title_file.read_text().strip() - assert len(title_text) > 0, "Title should not be empty" - assert 'example' in title_text.lower(), "Title should contain 'example'" - - # example.com has title "Example Domain" - assert 'example domain' in title_text.lower(), f"Expected 'Example Domain', got: {title_text}" - - -def test_fails_without_chrome_session(): - """Test that title plugin fails when chrome session is missing.""" - - if not shutil.which('node'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - title_dir = tmpdir / 'snapshot' / 'title' - title_dir.mkdir(parents=True, exist_ok=True) - - # Run title extraction - result = subprocess.run( - ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'], - cwd=title_dir, - capture_output=True, - text=True, - timeout=60, - env=get_test_env(), - ) - - assert result.returncode != 0, f"Should fail without chrome session: {result.stderr}" - assert 'No Chrome session found (chrome plugin must run first)' in (result.stdout + result.stderr) - - -def test_config_timeout_honored(): - """Test that TITLE_TIMEOUT config is respected.""" - - if not shutil.which('node'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Set very short timeout (but example.com should still succeed) - import os - env_override = os.environ.copy() - env_override['TITLE_TIMEOUT'] = '5' - - with chrome_session(tmpdir, test_url=TEST_URL, 
navigate=False) as (_process, _pid, snapshot_chrome_dir, env): - title_dir = snapshot_chrome_dir.parent / 'title' - title_dir.mkdir(exist_ok=True) - env.update(env_override) - - nav_result, result = run_title_capture( - title_dir, - snapshot_chrome_dir, - env, - TEST_URL, - 'testtimeout', - ) - assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" - - # Should complete (success or fail, but not hang) - assert result.returncode in (0, 1), "Should complete without hanging" - - -def test_handles_https_urls(): - """Test that HTTPS URLs work correctly.""" - - if not shutil.which('node'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - with chrome_session(tmpdir, test_url='https://example.org', navigate=False) as (_process, _pid, snapshot_chrome_dir, env): - title_dir = snapshot_chrome_dir.parent / 'title' - title_dir.mkdir(exist_ok=True) - - nav_result, result = run_title_capture( - title_dir, - snapshot_chrome_dir, - env, - 'https://example.org', - 'testhttps', - ) - assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" - - if result.returncode == 0: - # Hook writes to current directory - output_title_file = title_dir / 'title.txt' - if output_title_file.exists(): - title_text = output_title_file.read_text().strip() - assert len(title_text) > 0, "Title should not be empty" - assert 'example' in title_text.lower() - - -def test_handles_404_gracefully(): - """Test that title plugin handles 404 pages. - - Note: example.com returns valid HTML even for 404 pages, so extraction may succeed - with the generic "Example Domain" title. 
- """ - - if not shutil.which('node'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - with chrome_session(tmpdir, test_url='https://example.com/nonexistent-page-404', navigate=False) as ( - _process, - _pid, - snapshot_chrome_dir, - env, - ): - title_dir = snapshot_chrome_dir.parent / 'title' - title_dir.mkdir(exist_ok=True) - - nav_result, result = run_title_capture( - title_dir, - snapshot_chrome_dir, - env, - 'https://example.com/nonexistent-page-404', - 'test404', - ) - assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" - - # May succeed or fail depending on server behavior - # example.com returns "Example Domain" even for 404s - assert result.returncode in (0, 1), "Should complete (may succeed or fail)" - - -def test_handles_redirects(): - """Test that title plugin handles redirects correctly.""" - - if not shutil.which('node'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - with chrome_session(tmpdir, test_url='http://example.com', navigate=False) as ( - _process, - _pid, - snapshot_chrome_dir, - env, - ): - title_dir = snapshot_chrome_dir.parent / 'title' - title_dir.mkdir(exist_ok=True) - - # http://example.com redirects to https://example.com - nav_result, result = run_title_capture( - title_dir, - snapshot_chrome_dir, - env, - 'http://example.com', - 'testredirect', - ) - assert nav_result.returncode == 0, f"Navigation failed: {nav_result.stderr}" - - # Should succeed and follow redirect - if result.returncode == 0: - # Hook writes to current directory - output_title_file = title_dir / 'title.txt' - if output_title_file.exists(): - title_text = output_title_file.read_text().strip() - assert 'example' in title_text.lower() - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/twocaptcha/config.json b/archivebox/plugins/twocaptcha/config.json deleted file mode 100644 index d6c08ecf..00000000 --- 
a/archivebox/plugins/twocaptcha/config.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "TWOCAPTCHA_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["CAPTCHA2_ENABLED", "USE_CAPTCHA2", "USE_TWOCAPTCHA"], - "description": "Enable 2captcha browser extension for automatic CAPTCHA solving" - }, - "TWOCAPTCHA_API_KEY": { - "type": "string", - "default": "", - "x-aliases": ["API_KEY_2CAPTCHA", "CAPTCHA2_API_KEY"], - "x-sensitive": true, - "description": "2captcha API key for CAPTCHA solving service (get from https://2captcha.com)" - }, - "TWOCAPTCHA_RETRY_COUNT": { - "type": "integer", - "default": 3, - "minimum": 0, - "maximum": 10, - "x-aliases": ["CAPTCHA2_RETRY_COUNT"], - "description": "Number of times to retry CAPTCHA solving on error" - }, - "TWOCAPTCHA_RETRY_DELAY": { - "type": "integer", - "default": 5, - "minimum": 0, - "maximum": 60, - "x-aliases": ["CAPTCHA2_RETRY_DELAY"], - "description": "Delay in seconds between CAPTCHA solving retries" - }, - "TWOCAPTCHA_TIMEOUT": { - "type": "integer", - "default": 60, - "minimum": 5, - "x-fallback": "TIMEOUT", - "x-aliases": ["CAPTCHA2_TIMEOUT"], - "description": "Timeout for CAPTCHA solving in seconds" - }, - "TWOCAPTCHA_AUTO_SUBMIT": { - "type": "boolean", - "default": false, - "description": "Automatically submit forms after CAPTCHA is solved" - } - } -} diff --git a/archivebox/plugins/twocaptcha/on_Crawl__83_twocaptcha_install.js b/archivebox/plugins/twocaptcha/on_Crawl__83_twocaptcha_install.js deleted file mode 100755 index 23a1b3f2..00000000 --- a/archivebox/plugins/twocaptcha/on_Crawl__83_twocaptcha_install.js +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env node -/** - * 2Captcha Extension Plugin - * - * Installs and configures the 2captcha Chrome extension for automatic - * CAPTCHA solving during page archiving. 
- * - * Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo - * Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer - * - * Priority: 83 - Must install before Chrome session starts at Crawl level - * Hook: on_Crawl (runs once per crawl, not per snapshot) - * - * Requirements: - * - TWOCAPTCHA_API_KEY environment variable must be set - * - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc. - */ - -// Import extension utilities -const { installExtensionWithCache } = require('../chrome/chrome_utils.js'); - -// Extension metadata -const EXTENSION = { - webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', - name: 'twocaptcha', -}; - -/** - * Main entry point - install extension before archiving - * - * Note: 2captcha configuration is handled by on_Crawl__95_twocaptcha_config.js - * during first-time browser setup to avoid repeated configuration on every snapshot. - * The API key is injected via chrome.storage API once per browser session. 
- */ -async function main() { - const extension = await installExtensionWithCache(EXTENSION); - - if (extension) { - // Check if API key is configured - const apiKey = process.env.TWOCAPTCHA_API_KEY || process.env.API_KEY_2CAPTCHA; - if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') { - console.warn('[āš ļø] 2captcha extension installed but TWOCAPTCHA_API_KEY not configured'); - console.warn('[āš ļø] Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving'); - } else { - console.log('[+] 2captcha extension installed and API key configured'); - } - } - - return extension; -} - -// Export functions for use by other plugins -module.exports = { - EXTENSION, -}; - -// Run if executed directly -if (require.main === module) { - main().then(() => { - console.log('[āœ“] 2captcha extension setup complete'); - process.exit(0); - }).catch(err => { - console.error('[āŒ] 2captcha extension setup failed:', err); - process.exit(1); - }); -} diff --git a/archivebox/plugins/twocaptcha/on_Crawl__95_twocaptcha_config.js b/archivebox/plugins/twocaptcha/on_Crawl__95_twocaptcha_config.js deleted file mode 100755 index 2dd2002f..00000000 --- a/archivebox/plugins/twocaptcha/on_Crawl__95_twocaptcha_config.js +++ /dev/null @@ -1,389 +0,0 @@ -#!/usr/bin/env node -/** - * 2Captcha Extension Configuration - * - * Configures the 2captcha extension with API key and settings after Crawl-level Chrome session starts. - * Runs once per crawl to inject configuration into extension storage. 
- * - * Priority: 95 (after chrome_launch at 90, before snapshots start) - * Hook: on_Crawl (runs once per crawl, not per snapshot) - * - * Config Options (from config.json / environment): - * - TWOCAPTCHA_API_KEY: API key for 2captcha service - * - TWOCAPTCHA_ENABLED: Enable/disable the extension - * - TWOCAPTCHA_RETRY_COUNT: Number of retries on error - * - TWOCAPTCHA_RETRY_DELAY: Delay between retries (seconds) - * - TWOCAPTCHA_AUTO_SUBMIT: Auto-submit forms after solving - * - * Requirements: - * - TWOCAPTCHA_API_KEY environment variable must be set - * - chrome plugin must have loaded extensions (extensions.json must exist) - */ - -const path = require('path'); -const fs = require('fs'); -// Add NODE_MODULES_DIR to module resolution paths if set -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -const puppeteer = require('puppeteer-core'); - -// Get crawl's chrome directory from environment variable set by hooks.py -function getCrawlChromeSessionDir() { - const crawlOutputDir = process.env.CRAWL_OUTPUT_DIR || ''; - if (!crawlOutputDir) { - return null; - } - return path.join(crawlOutputDir, 'chrome'); -} - -const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome'; -const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.twocaptcha_configured'); - -// Get environment variable with default -function getEnv(name, defaultValue = '') { - return (process.env[name] || defaultValue).trim(); -} - -// Get boolean environment variable -function getEnvBool(name, defaultValue = false) { - const val = getEnv(name, '').toLowerCase(); - if (['true', '1', 'yes', 'on'].includes(val)) return true; - if (['false', '0', 'no', 'off'].includes(val)) return false; - return defaultValue; -} - -// Get integer environment variable -function getEnvInt(name, defaultValue = 0) { - const val = parseInt(getEnv(name, String(defaultValue)), 10); - return isNaN(val) ? 
defaultValue : val; -} - -// Parse command line arguments -function parseArgs() { - const args = {}; - process.argv.slice(2).forEach(arg => { - if (arg.startsWith('--')) { - const [key, ...valueParts] = arg.slice(2).split('='); - args[key.replace(/-/g, '_')] = valueParts.join('=') || true; - } - }); - return args; -} - -/** - * Get 2captcha configuration from environment variables. - * Supports both TWOCAPTCHA_* and legacy API_KEY_2CAPTCHA naming. - */ -function getTwoCaptchaConfig() { - const apiKey = getEnv('TWOCAPTCHA_API_KEY') || getEnv('API_KEY_2CAPTCHA') || getEnv('CAPTCHA2_API_KEY'); - const isEnabled = getEnvBool('TWOCAPTCHA_ENABLED', true); - const retryCount = getEnvInt('TWOCAPTCHA_RETRY_COUNT', 3); - const retryDelay = getEnvInt('TWOCAPTCHA_RETRY_DELAY', 5); - const autoSubmit = getEnvBool('TWOCAPTCHA_AUTO_SUBMIT', false); - - // Build the full config object matching the extension's storage structure - // Structure: chrome.storage.local.set({config: {...}}) - return { - // API key - both variants for compatibility - apiKey: apiKey, - api_key: apiKey, - - // Plugin enabled state - isPluginEnabled: isEnabled, - - // Retry settings - repeatOnErrorTimes: retryCount, - repeatOnErrorDelay: retryDelay, - - // Auto-submit setting - autoSubmitForms: autoSubmit, - submitFormsDelay: 0, - - // Enable all CAPTCHA types - enabledForNormal: true, - enabledForRecaptchaV2: true, - enabledForInvisibleRecaptchaV2: true, - enabledForRecaptchaV3: true, - enabledForRecaptchaAudio: false, - enabledForGeetest: true, - enabledForGeetest_v4: true, - enabledForKeycaptcha: true, - enabledForArkoselabs: true, - enabledForLemin: true, - enabledForYandex: true, - enabledForCapyPuzzle: true, - enabledForTurnstile: true, - enabledForAmazonWaf: true, - enabledForMTCaptcha: true, - - // Auto-solve all CAPTCHA types - autoSolveNormal: true, - autoSolveRecaptchaV2: true, - autoSolveInvisibleRecaptchaV2: true, - autoSolveRecaptchaV3: true, - autoSolveRecaptchaAudio: false, - 
autoSolveGeetest: true, - autoSolveGeetest_v4: true, - autoSolveKeycaptcha: true, - autoSolveArkoselabs: true, - autoSolveLemin: true, - autoSolveYandex: true, - autoSolveCapyPuzzle: true, - autoSolveTurnstile: true, - autoSolveAmazonWaf: true, - autoSolveMTCaptcha: true, - - // Other settings with sensible defaults - recaptchaV2Type: 'token', - recaptchaV3MinScore: 0.3, - buttonPosition: 'inner', - useProxy: false, - proxy: '', - proxytype: 'HTTP', - blackListDomain: '', - autoSubmitRules: [], - normalSources: [], - }; -} - -async function configure2Captcha() { - // Check if already configured in this session - if (fs.existsSync(CONFIG_MARKER)) { - console.error('[*] 2captcha already configured in this browser session'); - return { success: true, skipped: true }; - } - - // Get configuration - const config = getTwoCaptchaConfig(); - - // Check if API key is set - if (!config.apiKey || config.apiKey === 'YOUR_API_KEY_HERE') { - console.warn('[!] 2captcha extension loaded but TWOCAPTCHA_API_KEY not configured'); - console.warn('[!] 
Set TWOCAPTCHA_API_KEY environment variable to enable automatic CAPTCHA solving'); - return { success: false, error: 'TWOCAPTCHA_API_KEY not configured' }; - } - - console.error('[*] Configuring 2captcha extension...'); - console.error(`[*] API Key: ${config.apiKey.slice(0, 8)}...${config.apiKey.slice(-4)}`); - console.error(`[*] Enabled: ${config.isPluginEnabled}`); - console.error(`[*] Retry Count: ${config.repeatOnErrorTimes}`); - console.error(`[*] Retry Delay: ${config.repeatOnErrorDelay}s`); - console.error(`[*] Auto Submit: ${config.autoSubmitForms}`); - console.error(`[*] Auto Solve: all CAPTCHA types enabled`); - - try { - // Connect to the existing Chrome session via CDP - const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); - if (!fs.existsSync(cdpFile)) { - return { success: false, error: 'No Chrome session found (chrome plugin must run first)' }; - } - - const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim(); - const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl }); - - try { - // First, navigate to a page to trigger extension content scripts and wake up service worker - console.error('[*] Waking up extension by visiting a page...'); - const triggerPage = await browser.newPage(); - try { - await triggerPage.goto('https://www.google.com', { waitUntil: 'domcontentloaded', timeout: 10000 }); - await new Promise(r => setTimeout(r, 3000)); // Give extension time to initialize - } catch (e) { - console.warn(`[!] 
Trigger page failed: ${e.message}`); - } - try { await triggerPage.close(); } catch (e) {} - - // Get 2captcha extension info from extensions.json - const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json'); - if (!fs.existsSync(extensionsFile)) { - return { success: false, error: 'extensions.json not found - chrome plugin must run first' }; - } - - const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8')); - const captchaExt = extensions.find(ext => ext.name === 'twocaptcha'); - - if (!captchaExt) { - console.error('[*] 2captcha extension not installed, skipping configuration'); - return { success: true, skipped: true }; - } - - if (!captchaExt.id) { - return { success: false, error: '2captcha extension ID not found in extensions.json' }; - } - - const extensionId = captchaExt.id; - console.error(`[*] 2captcha Extension ID: ${extensionId}`); - - // Configure via options page - console.error('[*] Configuring via options page...'); - const optionsUrl = `chrome-extension://${extensionId}/options/options.html`; - - let configPage = await browser.newPage(); - - try { - // Navigate to options page - catch error but continue since page may still load - try { - await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 }); - } catch (navError) { - // Navigation may throw ERR_BLOCKED_BY_CLIENT but page still loads - console.error(`[*] Navigation threw error (may still work): ${navError.message}`); - } - - // Wait a moment for page to settle - await new Promise(r => setTimeout(r, 3000)); - - // Check all pages for the extension page (Chrome may open it in a different tab) - const pages = await browser.pages(); - for (const page of pages) { - const url = page.url(); - if (url.startsWith(`chrome-extension://${extensionId}`)) { - configPage = page; - break; - } - } - - const currentUrl = configPage.url(); - console.error(`[*] Current URL: ${currentUrl}`); - - if (!currentUrl.startsWith(`chrome-extension://${extensionId}`)) { - 
return { success: false, error: `Failed to navigate to options page, got: ${currentUrl}` }; - } - - // Wait for Config object to be available - console.error('[*] Waiting for Config object...'); - await configPage.waitForFunction(() => typeof Config !== 'undefined', { timeout: 10000 }); - - // Use chrome.storage.local.set with the config wrapper - const result = await configPage.evaluate((cfg) => { - return new Promise((resolve) => { - if (typeof chrome !== 'undefined' && chrome.storage) { - chrome.storage.local.set({ config: cfg }, () => { - if (chrome.runtime.lastError) { - resolve({ success: false, error: chrome.runtime.lastError.message }); - } else { - resolve({ success: true, method: 'options_page' }); - } - }); - } else { - resolve({ success: false, error: 'chrome.storage not available' }); - } - }); - }, config); - - if (result.success) { - console.error(`[+] 2captcha configured via ${result.method}`); - - // Verify config was applied by reloading options page and checking form values - console.error('[*] Verifying config by reloading options page...'); - try { - await configPage.reload({ waitUntil: 'networkidle0', timeout: 10000 }); - } catch (e) { - console.error(`[*] Reload threw error (may still work): ${e.message}`); - } - - await new Promise(r => setTimeout(r, 2000)); - - // Wait for Config object again - await configPage.waitForFunction(() => typeof Config !== 'undefined', { timeout: 10000 }); - - // Read back the config using Config.getAll() - const verifyConfig = await configPage.evaluate(async () => { - if (typeof Config !== 'undefined' && typeof Config.getAll === 'function') { - return await Config.getAll(); - } - return null; - }); - - if (!verifyConfig) { - return { success: false, error: 'Could not verify config - Config.getAll() not available' }; - } - - // Check that API key was actually set - const actualApiKey = verifyConfig.apiKey || verifyConfig.api_key; - if (!actualApiKey || actualApiKey !== config.apiKey) { - console.error(`[!] 
Config verification FAILED - API key mismatch`); - console.error(`[!] Expected: ${config.apiKey.slice(0, 8)}...${config.apiKey.slice(-4)}`); - console.error(`[!] Got: ${actualApiKey ? actualApiKey.slice(0, 8) + '...' + actualApiKey.slice(-4) : 'null'}`); - return { success: false, error: 'Config verification failed - API key not set correctly' }; - } - - console.error('[+] Config verified successfully!'); - console.error(`[+] API Key: ${actualApiKey.slice(0, 8)}...${actualApiKey.slice(-4)}`); - console.error(`[+] Plugin Enabled: ${verifyConfig.isPluginEnabled}`); - console.error(`[+] Auto Solve Turnstile: ${verifyConfig.autoSolveTurnstile}`); - - fs.writeFileSync(CONFIG_MARKER, JSON.stringify({ - timestamp: new Date().toISOString(), - method: result.method, - extensionId: extensionId, - verified: true, - config: { - apiKeySet: !!config.apiKey, - isPluginEnabled: config.isPluginEnabled, - repeatOnErrorTimes: config.repeatOnErrorTimes, - repeatOnErrorDelay: config.repeatOnErrorDelay, - autoSubmitForms: config.autoSubmitForms, - autoSolveEnabled: true, - } - }, null, 2)); - return { success: true, method: result.method, verified: true }; - } - - return { success: false, error: result.error || 'Config failed' }; - } finally { - try { await configPage.close(); } catch (e) {} - } - } finally { - browser.disconnect(); - } - } catch (e) { - return { success: false, error: `${e.name}: ${e.message}` }; - } -} - -async function main() { - const args = parseArgs(); - const url = args.url; - const snapshotId = args.snapshot_id; - - if (!url || !snapshotId) { - console.error('Usage: on_Crawl__95_twocaptcha_config.js --url=<url> --snapshot-id=<uuid>'); - process.exit(1); - } - - const startTs = new Date(); - let status = 'failed'; - let error = ''; - - try { - const result = await configure2Captcha(); - - if (result.skipped) { - status = 'skipped'; - } else if (result.success) { - status = 'succeeded'; - } else { - status = 'failed'; - error = result.error || 'Configuration 
failed'; - } - } catch (e) { - error = `${e.name}: ${e.message}`; - status = 'failed'; - } - - const endTs = new Date(); - const duration = (endTs - startTs) / 1000; - - if (error) { - console.error(`ERROR: ${error}`); - } - - // Config hooks don't emit JSONL - they're utility hooks for setup - // Exit code indicates success/failure - - process.exit(status === 'succeeded' || status === 'skipped' ? 0 : 1); -} - -main().catch(e => { - console.error(`Fatal error: ${e.message}`); - process.exit(1); -}); diff --git a/archivebox/plugins/twocaptcha/templates/icon.html b/archivebox/plugins/twocaptcha/templates/icon.html deleted file mode 100644 index e69de29b..00000000 diff --git a/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py deleted file mode 100644 index 4569cb49..00000000 --- a/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py +++ /dev/null @@ -1,338 +0,0 @@ -""" -Integration tests for twocaptcha plugin - -Run with: TWOCAPTCHA_API_KEY=your_key pytest archivebox/plugins/twocaptcha/tests/ -xvs - -NOTE: Chrome 137+ removed --load-extension support, so these tests MUST use Chromium. 
-""" - -import json -import os -import signal -import subprocess -import tempfile -import time -from pathlib import Path - -import pytest - -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - setup_test_env, - launch_chromium_session, - kill_chromium_session, - CHROME_LAUNCH_HOOK, - PLUGINS_ROOT, -) - - -PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__83_twocaptcha_install.js' -CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__95_twocaptcha_config.js' - -TEST_URL = 'https://2captcha.com/demo/cloudflare-turnstile' - - -# Alias for backward compatibility with existing test names -launch_chrome = launch_chromium_session -kill_chrome = kill_chromium_session - - -class TestTwoCaptcha: - """Integration tests requiring TWOCAPTCHA_API_KEY.""" - - @pytest.fixture(autouse=True) - def setup(self): - self.api_key = os.environ.get('TWOCAPTCHA_API_KEY') or os.environ.get('API_KEY_2CAPTCHA') - if not self.api_key: - pytest.fail("TWOCAPTCHA_API_KEY required") - - def test_install_and_load(self): - """Extension installs and loads in Chromium.""" - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - env = setup_test_env(tmpdir) - env['TWOCAPTCHA_API_KEY'] = self.api_key - - # Install - result = subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True, text=True) - assert result.returncode == 0, f"Install failed: {result.stderr}" - - cache = Path(env['CHROME_EXTENSIONS_DIR']) / 'twocaptcha.extension.json' - assert cache.exists() - data = json.loads(cache.read_text()) - assert data['webstore_id'] == 'ifibfemgeogfhoebkmokieepdoobkbpo' - - # Launch Chromium in crawls directory - crawl_id = 'test' - crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id - chrome_dir = crawl_dir / 'chrome' - env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) - process, cdp_url = launch_chrome(env, chrome_dir, crawl_id) - - try: - # Wait for extensions.json to be written - extensions_file = chrome_dir / 'extensions.json' - for i in 
range(20): - if extensions_file.exists(): - break - time.sleep(0.5) - - assert extensions_file.exists(), f"extensions.json not created. Chrome dir files: {list(chrome_dir.iterdir())}" - - exts = json.loads(extensions_file.read_text()) - assert any(e['name'] == 'twocaptcha' for e in exts), f"twocaptcha not loaded: {exts}" - print(f"[+] Extension loaded: id={next(e['id'] for e in exts if e['name']=='twocaptcha')}") - finally: - kill_chrome(process, chrome_dir) - - def test_config_applied(self): - """Configuration is applied to extension and verified via Config.getAll().""" - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - env = setup_test_env(tmpdir) - env['TWOCAPTCHA_API_KEY'] = self.api_key - env['TWOCAPTCHA_RETRY_COUNT'] = '5' - env['TWOCAPTCHA_RETRY_DELAY'] = '10' - - subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True) - - # Launch Chromium in crawls directory - crawl_id = 'cfg' - crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id - chrome_dir = crawl_dir / 'chrome' - env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) - process, cdp_url = launch_chrome(env, chrome_dir, crawl_id) - - try: - # Wait for extensions.json to be written - extensions_file = chrome_dir / 'extensions.json' - for i in range(20): - if extensions_file.exists(): - break - time.sleep(0.5) - assert extensions_file.exists(), f"extensions.json not created" - - result = subprocess.run( - ['node', str(CONFIG_SCRIPT), '--url=https://example.com', '--snapshot-id=test'], - env=env, timeout=30, capture_output=True, text=True - ) - assert result.returncode == 0, f"Config failed: {result.stderr}" - assert (chrome_dir / '.twocaptcha_configured').exists() - - # Verify config via options.html and Config.getAll() - # Get the actual extension ID from the config marker (Chrome computes IDs differently) - config_marker = json.loads((chrome_dir / '.twocaptcha_configured').read_text()) - ext_id = config_marker['extensionId'] - script = f''' -if 
(process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -const puppeteer = require('puppeteer-core'); -(async () => {{ - const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }}); - - // Load options.html and use Config.getAll() to verify - const optionsUrl = 'chrome-extension://{ext_id}/options/options.html'; - const page = await browser.newPage(); - console.error('[*] Loading options page:', optionsUrl); - - // Navigate - catch error but continue since page may still load - try {{ - await page.goto(optionsUrl, {{ waitUntil: 'networkidle0', timeout: 10000 }}); - }} catch (e) {{ - console.error('[*] Navigation threw error (may still work):', e.message); - }} - - // Wait for page to settle - await new Promise(r => setTimeout(r, 2000)); - console.error('[*] Current URL:', page.url()); - - // Wait for Config object to be available - await page.waitForFunction(() => typeof Config !== 'undefined', {{ timeout: 5000 }}); - - // Call Config.getAll() - the extension's own API (returns a Promise) - const cfg = await page.evaluate(async () => await Config.getAll()); - console.error('[*] Config.getAll() returned:', JSON.stringify(cfg)); - - await page.close(); - browser.disconnect(); - console.log(JSON.stringify(cfg)); -}})(); -''' - (tmpdir / 'v.js').write_text(script) - r = subprocess.run(['node', str(tmpdir / 'v.js')], env=env, timeout=30, capture_output=True, text=True) - print(r.stderr) - assert r.returncode == 0, f"Verify failed: {r.stderr}" - - cfg = json.loads(r.stdout.strip().split('\n')[-1]) - print(f"[*] Config from extension: {json.dumps(cfg, indent=2)}") - - # Verify all the fields we care about - assert cfg.get('apiKey') == self.api_key or cfg.get('api_key') == self.api_key, f"API key not set: {cfg}" - assert cfg.get('isPluginEnabled') == True, f"Plugin not enabled: {cfg}" - assert cfg.get('repeatOnErrorTimes') == 5, f"Retry count wrong: {cfg}" - assert cfg.get('repeatOnErrorDelay') == 10, f"Retry delay wrong: {cfg}" 
- assert cfg.get('autoSolveRecaptchaV2') == True, f"autoSolveRecaptchaV2 not enabled: {cfg}" - assert cfg.get('autoSolveRecaptchaV3') == True, f"autoSolveRecaptchaV3 not enabled: {cfg}" - assert cfg.get('autoSolveTurnstile') == True, f"autoSolveTurnstile not enabled: {cfg}" - assert cfg.get('enabledForRecaptchaV2') == True, f"enabledForRecaptchaV2 not enabled: {cfg}" - - print(f"[+] Config verified via Config.getAll()!") - finally: - kill_chrome(process, chrome_dir) - - def test_solves_recaptcha(self): - """Extension attempts to solve CAPTCHA on demo page. - - CRITICAL: DO NOT SKIP OR DISABLE THIS TEST EVEN IF IT'S FLAKY! - - This test is INTENTIONALLY left enabled to expose the REAL, ACTUAL flakiness - of the 2captcha service and demo page. The test failures you see here are NOT - test bugs - they are ACCURATE representations of the real-world reliability - of this CAPTCHA solving service. - - If this test is flaky, that's because 2captcha IS FLAKY in production. - If this test fails intermittently, that's because 2captcha FAILS INTERMITTENTLY in production. - - NEVER EVER hide real flakiness by disabling tests or adding @pytest.mark.skip. - Users NEED to see this failure rate to understand what they're getting into. - - When this test DOES pass, it confirms: - - Extension loads and configures correctly - - 2captcha API key is accepted - - Extension can successfully auto-solve CAPTCHAs - - The entire flow works end-to-end - - When it fails (as it often does): - - Demo page has JavaScript errors (representing real-world broken sites) - - Turnstile tokens expire before solving (representing real-world timing issues) - - 2captcha service may be slow/down (representing real-world service issues) - - This is VALUABLE INFORMATION about the service. DO NOT HIDE IT. 
- """ - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - env = setup_test_env(tmpdir) - env['TWOCAPTCHA_API_KEY'] = self.api_key - - subprocess.run(['node', str(INSTALL_SCRIPT)], env=env, timeout=120, capture_output=True) - - # Launch Chromium in crawls directory - crawl_id = 'solve' - crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id - chrome_dir = crawl_dir / 'chrome' - env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) - process, cdp_url = launch_chrome(env, chrome_dir, crawl_id) - - try: - # Wait for extensions.json to be written - extensions_file = chrome_dir / 'extensions.json' - for i in range(20): - if extensions_file.exists(): - break - time.sleep(0.5) - assert extensions_file.exists(), f"extensions.json not created" - - subprocess.run(['node', str(CONFIG_SCRIPT), '--url=x', '--snapshot-id=x'], env=env, timeout=30, capture_output=True) - - script = f''' -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -const puppeteer = require('puppeteer-core'); -(async () => {{ - const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }}); - const page = await browser.newPage(); - - // Capture console messages from the page (including extension messages) - page.on('console', msg => {{ - const text = msg.text(); - if (text.includes('2captcha') || text.includes('turnstile') || text.includes('captcha')) {{ - console.error('[CONSOLE]', text); - }} - }}); - - await page.setViewport({{ width: 1440, height: 900 }}); - console.error('[*] Loading {TEST_URL}...'); - await page.goto('{TEST_URL}', {{ waitUntil: 'networkidle2', timeout: 30000 }}); - - // Wait for CAPTCHA iframe (minimal wait to avoid token expiration) - console.error('[*] Waiting for CAPTCHA iframe...'); - await page.waitForSelector('iframe', {{ timeout: 30000 }}); - console.error('[*] CAPTCHA iframe found - extension should auto-solve now'); - - // DON'T CLICK - extension should auto-solve since autoSolveTurnstile=True - console.error('[*] Waiting for 
auto-solve (extension configured with autoSolveTurnstile=True)...'); - - // Poll for data-state changes with debug output - console.error('[*] Waiting for CAPTCHA to be solved (up to 150s)...'); - const start = Date.now(); - let solved = false; - let lastState = null; - - while (!solved && (Date.now() - start) < 150000) {{ - const state = await page.evaluate(() => {{ - const solver = document.querySelector('.captcha-solver'); - return {{ - state: solver?.getAttribute('data-state'), - text: solver?.textContent?.trim(), - classList: solver?.className - }}; - }}); - - if (state.state !== lastState) {{ - const elapsed = Math.round((Date.now() - start) / 1000); - console.error(`[*] State change at ${{elapsed}}s: "${{lastState}}" -> "${{state.state}}" (text: "${{state.text?.slice(0, 50)}}")`); - lastState = state.state; - }} - - if (state.state === 'solved') {{ - solved = true; - const elapsed = Math.round((Date.now() - start) / 1000); - console.error('[+] SOLVED in ' + elapsed + 's!'); - break; - }} - - // Check every 2 seconds - await new Promise(r => setTimeout(r, 2000)); - }} - - if (!solved) {{ - const elapsed = Math.round((Date.now() - start) / 1000); - const finalState = await page.evaluate(() => {{ - const solver = document.querySelector('.captcha-solver'); - return {{ - state: solver?.getAttribute('data-state'), - text: solver?.textContent?.trim(), - html: solver?.outerHTML?.slice(0, 200) - }}; - }}); - console.error(`[!] TIMEOUT after ${{elapsed}}s. 
Final state: ${{JSON.stringify(finalState)}}`); - browser.disconnect(); - process.exit(1); - }} - - const final = await page.evaluate(() => {{ - const solver = document.querySelector('.captcha-solver'); - return {{ - solved: true, - state: solver?.getAttribute('data-state'), - text: solver?.textContent?.trim() - }}; - }}); - browser.disconnect(); - console.log(JSON.stringify(final)); -}})(); -''' - (tmpdir / 's.js').write_text(script) - print("\n[*] Solving CAPTCHA (this can take up to 150s for 2captcha API)...") - r = subprocess.run(['node', str(tmpdir / 's.js')], env=env, timeout=200, capture_output=True, text=True) - print(r.stderr) - assert r.returncode == 0, f"Failed: {r.stderr}" - - final = json.loads([l for l in r.stdout.strip().split('\n') if l.startswith('{')][-1]) - assert final.get('solved'), f"Not solved: {final}" - assert final.get('state') == 'solved', f"State not 'solved': {final}" - print(f"[+] SUCCESS! CAPTCHA solved: {final.get('text','')[:50]}") - finally: - kill_chrome(process, chrome_dir) - - -if __name__ == '__main__': - pytest.main([__file__, '-xvs']) diff --git a/archivebox/plugins/ublock/config.json b/archivebox/plugins/ublock/config.json deleted file mode 100644 index f7f47aef..00000000 --- a/archivebox/plugins/ublock/config.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "required_plugins": ["chrome"], - "properties": { - "UBLOCK_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["USE_UBLOCK"], - "description": "Enable uBlock Origin browser extension for ad blocking" - } - } -} diff --git a/archivebox/plugins/ublock/on_Crawl__80_install_ublock_extension.js b/archivebox/plugins/ublock/on_Crawl__80_install_ublock_extension.js deleted file mode 100755 index ea5fd474..00000000 --- a/archivebox/plugins/ublock/on_Crawl__80_install_ublock_extension.js +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env node -/** - * uBlock Origin 
Extension Plugin - * - * Installs and configures the uBlock Origin Chrome extension for ad blocking - * and privacy protection during page archiving. - * - * Extension: https://chromewebstore.google.com/detail/cjpalhdlnbpafiamejdnhcphjbkeiagm - * - * Priority: 80 - Must install before Chrome session starts at Crawl level - * Hook: on_Crawl (runs once per crawl, not per snapshot) - * - * This extension automatically: - * - Blocks ads, trackers, and malware domains - * - Reduces page load time and bandwidth usage - * - Improves privacy during archiving - * - Removes clutter from archived pages - * - Uses efficient blocking with filter lists - */ - -// Import extension utilities -const { installExtensionWithCache } = require('../chrome/chrome_utils.js'); - -// Extension metadata -const EXTENSION = { - webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm', - name: 'ublock', -}; - -/** - * Main entry point - install extension before archiving - * - * Note: uBlock Origin works automatically with default filter lists. - * No configuration needed - blocks ads, trackers, and malware domains out of the box. 
- */ -async function main() { - const extension = await installExtensionWithCache(EXTENSION); - - if (extension) { - console.log('[+] Ads and trackers will be blocked during archiving'); - } - - return extension; -} - -// Export functions for use by other plugins -module.exports = { - EXTENSION, -}; - -// Run if executed directly -if (require.main === module) { - main().then(() => { - console.log('[āœ“] uBlock Origin extension setup complete'); - process.exit(0); - }).catch(err => { - console.error('[āŒ] uBlock Origin extension setup failed:', err); - process.exit(1); - }); -} diff --git a/archivebox/plugins/ublock/templates/icon.html b/archivebox/plugins/ublock/templates/icon.html deleted file mode 100644 index e69de29b..00000000 diff --git a/archivebox/plugins/ublock/tests/test_ublock.py b/archivebox/plugins/ublock/tests/test_ublock.py deleted file mode 100644 index a3ab08a8..00000000 --- a/archivebox/plugins/ublock/tests/test_ublock.py +++ /dev/null @@ -1,725 +0,0 @@ -""" -Unit tests for ublock plugin - -Tests invoke the plugin hook as an external process and verify outputs/side effects. 
-""" - -import json -import os -import subprocess -import tempfile -from pathlib import Path - -import pytest - -from archivebox.plugins.chrome.tests.chrome_test_helpers import ( - setup_test_env, - get_test_env, - launch_chromium_session, - kill_chromium_session, - CHROME_LAUNCH_HOOK, - PLUGINS_ROOT, -) - - -PLUGIN_DIR = Path(__file__).parent.parent -INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_ublock_extension.*'), None) - - -def test_install_script_exists(): - """Verify install script exists""" - assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}" - - -def test_extension_metadata(): - """Test that uBlock Origin extension has correct metadata""" - with tempfile.TemporaryDirectory() as tmpdir: - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions") - - result = subprocess.run( - ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"], - capture_output=True, - text=True, - env=env - ) - - assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}" - - metadata = json.loads(result.stdout) - assert metadata["webstore_id"] == "cjpalhdlnbpafiamejdnhcphjbkeiagm" - assert metadata["name"] == "ublock" - - -def test_install_creates_cache(): - """Test that install creates extension cache""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - - result = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=120 # uBlock is large, may take longer to download - ) - - # Check output mentions installation - assert "uBlock" in result.stdout or "ublock" in result.stdout - - # Check cache file was created - cache_file = ext_dir / "ublock.extension.json" - assert cache_file.exists(), "Cache file should be created" - - # Verify cache 
content - cache_data = json.loads(cache_file.read_text()) - assert cache_data["webstore_id"] == "cjpalhdlnbpafiamejdnhcphjbkeiagm" - assert cache_data["name"] == "ublock" - - -def test_install_twice_uses_cache(): - """Test that running install twice uses existing cache on second run""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - - # First install - downloads the extension - result1 = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=120 # uBlock is large - ) - assert result1.returncode == 0, f"First install failed: {result1.stderr}" - - # Verify cache was created - cache_file = ext_dir / "ublock.extension.json" - assert cache_file.exists(), "Cache file should exist after first install" - - # Second install - should use cache and be faster - result2 = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=30 - ) - assert result2.returncode == 0, f"Second install failed: {result2.stderr}" - - # Second run should mention cache reuse - assert "already installed" in result2.stdout or "cache" in result2.stdout.lower() or result2.returncode == 0 - - -def test_no_configuration_required(): - """Test that uBlock Origin works without configuration""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - # No API keys needed - works with default filter lists - - result = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=120 - ) - - # Should not require any API keys - combined_output = result.stdout + result.stderr - assert "API" not in combined_output or result.returncode == 0 - - -def test_large_extension_size(): - 
"""Test that uBlock Origin is downloaded successfully despite large size""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - - result = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=120 - ) - - # If extension was downloaded, verify it's substantial size - crx_file = ext_dir / "cjpalhdlnbpafiamejdnhcphjbkeiagm__ublock.crx" - if crx_file.exists(): - # uBlock Origin with filter lists is typically 2-5 MB - size_bytes = crx_file.stat().st_size - assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes" - - -def check_ad_blocking(cdp_url: str, test_url: str, env: dict, script_dir: Path) -> dict: - """Check ad blocking effectiveness by counting ad elements on page. - - Returns dict with: - - adElementsFound: int - number of ad-related elements found - - adElementsVisible: int - number of visible ad elements - - blockedRequests: int - number of blocked network requests (ads/trackers) - - totalRequests: int - total network requests made - - percentBlocked: int - percentage of ad elements hidden (0-100) - """ - test_script = f''' -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -const puppeteer = require('puppeteer-core'); - -(async () => {{ - const browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }}); - - const page = await browser.newPage(); - await page.setUserAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); - await page.setViewport({{ width: 1440, height: 900 }}); - - // Track network requests - let blockedRequests = 0; - let totalRequests = 0; - const adDomains = ['doubleclick', 'googlesyndication', 'googleadservices', 'facebook.com/tr', - 'analytics', 'adservice', 'advertising', 'taboola', 
'outbrain', 'criteo', - 'amazon-adsystem', 'ads.yahoo', 'gemini.yahoo', 'yimg.com/cv/', 'beap.gemini']; - - page.on('request', request => {{ - totalRequests++; - const url = request.url().toLowerCase(); - if (adDomains.some(d => url.includes(d))) {{ - // This is an ad request - }} - }}); - - page.on('requestfailed', request => {{ - const url = request.url().toLowerCase(); - if (adDomains.some(d => url.includes(d))) {{ - blockedRequests++; - }} - }}); - - console.error('Navigating to {test_url}...'); - await page.goto('{test_url}', {{ waitUntil: 'domcontentloaded', timeout: 60000 }}); - - // Wait for page to fully render and ads to load - await new Promise(r => setTimeout(r, 5000)); - - // Check for ad elements in the DOM - const result = await page.evaluate(() => {{ - // Common ad-related selectors - const adSelectors = [ - // Generic ad containers - '[class*="ad-"]', '[class*="ad_"]', '[class*="-ad"]', '[class*="_ad"]', - '[id*="ad-"]', '[id*="ad_"]', '[id*="-ad"]', '[id*="_ad"]', - '[class*="advertisement"]', '[id*="advertisement"]', - '[class*="sponsored"]', '[id*="sponsored"]', - // Google ads - 'ins.adsbygoogle', '[data-ad-client]', '[data-ad-slot]', - // Yahoo specific - '[class*="gemini"]', '[data-beacon]', '[class*="native-ad"]', - '[class*="stream-ad"]', '[class*="LDRB"]', '[class*="ntv-ad"]', - // iframes (often ads) - 'iframe[src*="ad"]', 'iframe[src*="doubleclick"]', 'iframe[src*="googlesyndication"]', - // Common ad sizes - '[style*="300px"][style*="250px"]', '[style*="728px"][style*="90px"]', - '[style*="160px"][style*="600px"]', '[style*="320px"][style*="50px"]', - ]; - - let adElementsFound = 0; - let adElementsVisible = 0; - - for (const selector of adSelectors) {{ - try {{ - const elements = document.querySelectorAll(selector); - for (const el of elements) {{ - adElementsFound++; - const style = window.getComputedStyle(el); - const rect = el.getBoundingClientRect(); - const isVisible = style.display !== 'none' && - style.visibility !== 'hidden' && 
- style.opacity !== '0' && - rect.width > 0 && rect.height > 0; - if (isVisible) {{ - adElementsVisible++; - }} - }} - }} catch (e) {{ - // Invalid selector, skip - }} - }} - - return {{ - adElementsFound, - adElementsVisible, - pageTitle: document.title - }}; - }}); - - result.blockedRequests = blockedRequests; - result.totalRequests = totalRequests; - // Calculate how many ad elements were hidden (found but not visible) - const hiddenAds = result.adElementsFound - result.adElementsVisible; - result.percentBlocked = result.adElementsFound > 0 - ? Math.round((hiddenAds / result.adElementsFound) * 100) - : 0; - - console.error('Ad blocking result:', JSON.stringify(result)); - browser.disconnect(); - console.log(JSON.stringify(result)); -}})(); -''' - script_path = script_dir / 'check_ads.js' - script_path.write_text(test_script) - - result = subprocess.run( - ['node', str(script_path)], - cwd=str(script_dir), - capture_output=True, - text=True, - env=env, - timeout=90 - ) - - if result.returncode != 0: - raise RuntimeError(f"Ad check script failed: {result.stderr}") - - output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')] - if not output_lines: - raise RuntimeError(f"No JSON output from ad check: {result.stdout}\nstderr: {result.stderr}") - - return json.loads(output_lines[-1]) - - -# Test URL: Yahoo has many ads that uBlock should block (no mocks) -TEST_URL = 'https://www.yahoo.com/' - - -def test_extension_loads_in_chromium(): - """Verify uBlock extension loads in Chromium by visiting its dashboard page. - - Uses Chromium with --load-extension to load the extension, then navigates - to chrome-extension://<id>/dashboard.html and checks that "uBlock" appears - in the page content. 
- """ - import signal - import time - print("[test] Starting test_extension_loads_in_chromium", flush=True) - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - print(f"[test] tmpdir={tmpdir}", flush=True) - - # Set up isolated env with proper directory structure - env = setup_test_env(tmpdir) - env.setdefault('CHROME_HEADLESS', 'true') - print(f"[test] DATA_DIR={env.get('DATA_DIR')}", flush=True) - print(f"[test] CHROME_BINARY={env.get('CHROME_BINARY')}", flush=True) - - ext_dir = Path(env['CHROME_EXTENSIONS_DIR']) - - # Step 1: Install the uBlock extension - print("[test] Installing uBlock extension...", flush=True) - result = subprocess.run( - ['node', str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=5 - ) - print(f"[test] Extension install rc={result.returncode}", flush=True) - assert result.returncode == 0, f"Extension install failed: {result.stderr}" - - # Verify extension cache was created - cache_file = ext_dir / 'ublock.extension.json' - assert cache_file.exists(), "Extension cache not created" - ext_data = json.loads(cache_file.read_text()) - print(f"[test] Extension installed: {ext_data.get('name')} v{ext_data.get('version')}", flush=True) - - # Step 2: Launch Chromium using the chrome hook (loads extensions automatically) - print(f"[test] NODE_MODULES_DIR={env.get('NODE_MODULES_DIR')}", flush=True) - print(f"[test] puppeteer-core exists: {(Path(env['NODE_MODULES_DIR']) / 'puppeteer-core').exists()}", flush=True) - print("[test] Launching Chromium...", flush=True) - - # Launch Chromium in crawls directory - crawl_id = 'test-ublock' - crawl_dir = Path(env['CRAWLS_DIR']) / crawl_id - crawl_dir.mkdir(parents=True, exist_ok=True) - chrome_dir = crawl_dir / 'chrome' - chrome_dir.mkdir(parents=True, exist_ok=True) - env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) - - chrome_launch_process = subprocess.Popen( - ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], - cwd=str(chrome_dir), - 
stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - env=env - ) - print("[test] Chrome hook started, waiting for CDP...", flush=True) - - # Wait for Chromium to launch and CDP URL to be available - cdp_url = None - import select - for i in range(20): - poll_result = chrome_launch_process.poll() - if poll_result is not None: - stdout, stderr = chrome_launch_process.communicate() - raise RuntimeError(f"Chromium launch failed (exit={poll_result}):\nStdout: {stdout}\nStderr: {stderr}") - cdp_file = chrome_dir / 'cdp_url.txt' - if cdp_file.exists(): - cdp_url = cdp_file.read_text().strip() - print(f"[test] CDP URL found after {i+1} attempts", flush=True) - break - # Read any available stderr - while select.select([chrome_launch_process.stderr], [], [], 0)[0]: - line = chrome_launch_process.stderr.readline() - if not line: - break - print(f"[hook] {line.strip()}", flush=True) - time.sleep(0.3) - - assert cdp_url, "Chromium CDP URL not found after 20s" - print(f"[test] Chromium launched with CDP URL: {cdp_url}", flush=True) - print("[test] Reading hook stderr...", flush=True) - - # Check what extensions were loaded by chrome hook - extensions_file = chrome_dir / 'extensions.json' - if extensions_file.exists(): - loaded_exts = json.loads(extensions_file.read_text()) - print(f"Extensions loaded by chrome hook: {[e.get('name') for e in loaded_exts]}") - else: - print("Warning: extensions.json not found") - - # Get the unpacked extension ID - Chrome computes this from the path - unpacked_path = ext_data.get('unpacked_path', '') - print(f"[test] Extension unpacked path: {unpacked_path}", flush=True) - print("[test] Running puppeteer test script...", flush=True) - - try: - # Step 3: Connect to Chromium and verify extension loads - # First use CDP to get all targets and find extension ID - test_script = f''' -if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); -const puppeteer = require('puppeteer-core'); - -(async () => {{ - const 
browser = await puppeteer.connect({{ browserWSEndpoint: '{cdp_url}' }}); - - // Wait for extension to initialize - await new Promise(r => setTimeout(r, 500)); - - // Use CDP to get all targets including service workers - const pages = await browser.pages(); - const page = pages[0] || await browser.newPage(); - const client = await page.createCDPSession(); - - const {{ targetInfos }} = await client.send('Target.getTargets'); - console.error('All CDP targets:'); - targetInfos.forEach(t => console.error(' -', t.type, t.url.slice(0, 100))); - - // Find any chrome-extension:// URLs - const extTargets = targetInfos.filter(t => t.url.startsWith('chrome-extension://')); - console.error('Extension targets:', extTargets.length); - - // Filter out built-in extensions - const builtinIds = ['nkeimhogjdpnpccoofpliimaahmaaome', 'fignfifoniblkonapihmkfakmlgkbkcf', - 'ahfgeienlihckogmohjhadlkjgocpleb', 'mhjfbmdgcfjbbpaeojofohoefgiehjai']; - const customExts = extTargets.filter(t => {{ - const extId = t.url.split('://')[1].split('/')[0]; - return !builtinIds.includes(extId); - }}); - - if (customExts.length === 0) {{ - console.log(JSON.stringify({{ loaded: false, error: 'No custom extension found via CDP' }})); - browser.disconnect(); - return; - }} - - // Get extension ID from first custom extension - const extId = customExts[0].url.split('://')[1].split('/')[0]; - console.error('Found extension ID:', extId); - - // Try to load dashboard.html - const newPage = await browser.newPage(); - const dashboardUrl = 'chrome-extension://' + extId + '/dashboard.html'; - console.error('Loading:', dashboardUrl); - - try {{ - await newPage.goto(dashboardUrl, {{ waitUntil: 'domcontentloaded', timeout: 15000 }}); - const title = await newPage.title(); - const content = await newPage.content(); - const hasUblock = content.toLowerCase().includes('ublock') || title.toLowerCase().includes('ublock'); - - console.log(JSON.stringify({{ - loaded: true, - extensionId: extId, - pageTitle: title, - 
hasExtensionName: hasUblock, - contentLength: content.length - }})); - }} catch (e) {{ - console.error('Dashboard load failed:', e.message); - console.log(JSON.stringify({{ loaded: true, extensionId: extId, dashboardError: e.message }})); - }} - - browser.disconnect(); -}})(); -''' - script_path = tmpdir / 'test_ublock.js' - script_path.write_text(test_script) - - result = subprocess.run( - ['node', str(script_path)], - cwd=str(tmpdir), - capture_output=True, - text=True, - env=env, - timeout=10 - ) - - print(f"stderr: {result.stderr}") - print(f"stdout: {result.stdout}") - - assert result.returncode == 0, f"Test failed: {result.stderr}" - - output_lines = [l for l in result.stdout.strip().split('\n') if l.startswith('{')] - assert output_lines, f"No JSON output: {result.stdout}" - - test_result = json.loads(output_lines[-1]) - assert test_result.get('loaded'), \ - f"uBlock extension should be loaded in Chromium. Result: {test_result}" - print(f"Extension loaded successfully: {test_result}") - - finally: - # Clean up Chromium - try: - chrome_launch_process.send_signal(signal.SIGTERM) - chrome_launch_process.wait(timeout=5) - except: - pass - chrome_pid_file = chrome_dir / 'chrome.pid' - if chrome_pid_file.exists(): - try: - chrome_pid = int(chrome_pid_file.read_text().strip()) - os.kill(chrome_pid, signal.SIGKILL) - except (OSError, ValueError): - pass - - -def test_blocks_ads_on_yahoo_com(): - """Live test: verify uBlock Origin blocks ads on yahoo.com (real network). - - This test runs TWO browser sessions: - 1. WITHOUT extension - verifies ads are NOT blocked (baseline) - 2. WITH extension - verifies ads ARE blocked - - This ensures we're actually testing the extension's effect, not just - that a test page happens to show ads as blocked. No mocks are used. 
- """ - import time - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Set up isolated env with proper directory structure - env_base = setup_test_env(tmpdir) - env_base['CHROME_HEADLESS'] = 'true' - - # ============================================================ - # STEP 1: BASELINE - Run WITHOUT extension, verify ads are NOT blocked - # ============================================================ - print("\n" + "="*60) - print("STEP 1: BASELINE TEST (no extension)") - print("="*60) - - data_dir = Path(env_base['DATA_DIR']) - - env_no_ext = env_base.copy() - env_no_ext['CHROME_EXTENSIONS_DIR'] = str(data_dir / 'personas' / 'Default' / 'empty_extensions') - (data_dir / 'personas' / 'Default' / 'empty_extensions').mkdir(parents=True, exist_ok=True) - - # Launch baseline Chromium in crawls directory - baseline_crawl_id = 'baseline-no-ext' - baseline_crawl_dir = Path(env_base['CRAWLS_DIR']) / baseline_crawl_id - baseline_crawl_dir.mkdir(parents=True, exist_ok=True) - baseline_chrome_dir = baseline_crawl_dir / 'chrome' - env_no_ext['CRAWL_OUTPUT_DIR'] = str(baseline_crawl_dir) - baseline_process = None - - try: - baseline_process, baseline_cdp_url = launch_chromium_session( - env_no_ext, baseline_chrome_dir, baseline_crawl_id - ) - print(f"Baseline Chromium launched: {baseline_cdp_url}") - - # Wait a moment for browser to be ready - time.sleep(2) - - baseline_result = check_ad_blocking( - baseline_cdp_url, TEST_URL, env_no_ext, tmpdir - ) - - print(f"Baseline result: {baseline_result['adElementsVisible']} visible ads " - f"(found {baseline_result['adElementsFound']} ad elements)") - - finally: - if baseline_process: - kill_chromium_session(baseline_process, baseline_chrome_dir) - - # Verify baseline shows ads ARE visible (not blocked) - if baseline_result['adElementsFound'] == 0: - pytest.fail( - f"Baseline must find ad elements on {TEST_URL}, but found none. " - f"This test requires a real ad-heavy page." 
- ) - - if baseline_result['adElementsVisible'] == 0: - pytest.fail( - f"Baseline must have visible ads on {TEST_URL}, but none were visible. " - f"This likely means another ad blocker is active or network-level blocking is in effect." - ) - - print(f"\nāœ“ Baseline confirmed: {baseline_result['adElementsVisible']} visible ads without extension") - - # ============================================================ - # STEP 2: Install the uBlock extension - # ============================================================ - print("\n" + "="*60) - print("STEP 2: INSTALLING EXTENSION") - print("="*60) - - ext_dir = Path(env_base['CHROME_EXTENSIONS_DIR']) - - result = subprocess.run( - ['node', str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env_base, - timeout=60 - ) - assert result.returncode == 0, f"Extension install failed: {result.stderr}" - - cache_file = ext_dir / 'ublock.extension.json' - assert cache_file.exists(), "Extension cache not created" - ext_data = json.loads(cache_file.read_text()) - print(f"Extension installed: {ext_data.get('name')} v{ext_data.get('version')}") - - # ============================================================ - # STEP 3: Run WITH extension, verify ads ARE blocked - # ============================================================ - print("\n" + "="*60) - print("STEP 3: TEST WITH EXTENSION") - print("="*60) - - # Launch extension test Chromium in crawls directory - ext_crawl_id = 'test-with-ext' - ext_crawl_dir = Path(env_base['CRAWLS_DIR']) / ext_crawl_id - ext_crawl_dir.mkdir(parents=True, exist_ok=True) - ext_chrome_dir = ext_crawl_dir / 'chrome' - env_base['CRAWL_OUTPUT_DIR'] = str(ext_crawl_dir) - ext_process = None - - try: - ext_process, ext_cdp_url = launch_chromium_session( - env_base, ext_chrome_dir, ext_crawl_id - ) - print(f"Extension Chromium launched: {ext_cdp_url}") - - # Check that extension was loaded - extensions_file = ext_chrome_dir / 'extensions.json' - if extensions_file.exists(): - loaded_exts = 
json.loads(extensions_file.read_text()) - print(f"Extensions loaded: {[e.get('name') for e in loaded_exts]}") - - # Verify extension has ID and is initialized - if loaded_exts and loaded_exts[0].get('id'): - ext_id = loaded_exts[0]['id'] - print(f"Extension ID: {ext_id}") - - # Visit the extension dashboard to ensure it's fully loaded - print("Visiting extension dashboard to verify initialization...") - dashboard_script = f''' -const puppeteer = require('{env_base['NODE_MODULES_DIR']}/puppeteer-core'); -(async () => {{ - const browser = await puppeteer.connect({{ - browserWSEndpoint: '{ext_cdp_url}', - defaultViewport: null - }}); - const page = await browser.newPage(); - await page.goto('chrome-extension://{ext_id}/dashboard.html', {{ waitUntil: 'domcontentloaded', timeout: 10000 }}); - const title = await page.title(); - console.log('Dashboard title:', title); - await page.close(); - browser.disconnect(); -}})(); -''' - dash_script_path = tmpdir / 'check_dashboard.js' - dash_script_path.write_text(dashboard_script) - subprocess.run(['node', str(dash_script_path)], capture_output=True, timeout=15, env=env_base) - - # Wait longer for extension to fully initialize filters - # On first run, uBlock needs to download filter lists which can take 10-15 seconds - print("Waiting for uBlock filter lists to download and initialize...") - time.sleep(15) - - ext_result = check_ad_blocking( - ext_cdp_url, TEST_URL, env_base, tmpdir - ) - - print(f"Extension result: {ext_result['adElementsVisible']} visible ads " - f"(found {ext_result['adElementsFound']} ad elements)") - - finally: - if ext_process: - kill_chromium_session(ext_process, ext_chrome_dir) - - # ============================================================ - # STEP 4: Compare results - # ============================================================ - print("\n" + "="*60) - print("STEP 4: COMPARISON") - print("="*60) - print(f"Baseline (no extension): {baseline_result['adElementsVisible']} visible ads") - print(f"With 
extension: {ext_result['adElementsVisible']} visible ads") - - # Calculate reduction in visible ads - ads_blocked = baseline_result['adElementsVisible'] - ext_result['adElementsVisible'] - reduction_percent = (ads_blocked / baseline_result['adElementsVisible'] * 100) if baseline_result['adElementsVisible'] > 0 else 0 - - print(f"Reduction: {ads_blocked} fewer visible ads ({reduction_percent:.0f}% reduction)") - - # Extension should significantly reduce visible ads - assert ext_result['adElementsVisible'] < baseline_result['adElementsVisible'], \ - f"uBlock should reduce visible ads.\n" \ - f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \ - f"With extension: {ext_result['adElementsVisible']} visible ads\n" \ - f"Expected fewer ads with extension." - - # Ensure uBlock actually blocks at least some ad/track requests - assert ext_result['blockedRequests'] > 0, \ - "uBlock should block at least one ad/track request on yahoo.com" - - # Extension should block at least 20% of ads (was consistently blocking 5-13% without proper init time) - assert reduction_percent >= 20, \ - f"uBlock should block at least 20% of ads.\n" \ - f"Baseline: {baseline_result['adElementsVisible']} visible ads\n" \ - f"With extension: {ext_result['adElementsVisible']} visible ads\n" \ - f"Reduction: only {reduction_percent:.0f}% (expected at least 20%)\n" \ - f"Note: Filter lists must be downloaded on first run (takes ~15s)" - - print(f"\nāœ“ SUCCESS: uBlock correctly blocks ads!") - print(f" - Baseline: {baseline_result['adElementsVisible']} visible ads") - print(f" - With extension: {ext_result['adElementsVisible']} visible ads") - print(f" - Blocked: {ads_blocked} ads ({reduction_percent:.0f}% reduction)") diff --git a/archivebox/plugins/wget/config.json b/archivebox/plugins/wget/config.json deleted file mode 100644 index 70893612..00000000 --- a/archivebox/plugins/wget/config.json +++ /dev/null @@ -1,75 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - 
"type": "object", - "additionalProperties": false, - "properties": { - "WGET_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_WGET", "USE_WGET"], - "description": "Enable wget archiving" - }, - "WGET_WARC_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_WARC", "WGET_SAVE_WARC"], - "description": "Save WARC archive file" - }, - "WGET_BINARY": { - "type": "string", - "default": "wget", - "description": "Path to wget binary" - }, - "WGET_TIMEOUT": { - "type": "integer", - "default": 60, - "minimum": 5, - "x-fallback": "TIMEOUT", - "description": "Timeout for wget in seconds" - }, - "WGET_USER_AGENT": { - "type": "string", - "default": "", - "x-fallback": "USER_AGENT", - "description": "User agent string for wget" - }, - "WGET_COOKIES_FILE": { - "type": "string", - "default": "", - "x-fallback": "COOKIES_FILE", - "description": "Path to cookies file" - }, - "WGET_CHECK_SSL_VALIDITY": { - "type": "boolean", - "default": true, - "x-fallback": "CHECK_SSL_VALIDITY", - "description": "Whether to verify SSL certificates" - }, - "WGET_ARGS": { - "type": "array", - "items": {"type": "string"}, - "default": [ - "--no-verbose", - "--adjust-extension", - "--convert-links", - "--force-directories", - "--backup-converted", - "--span-hosts", - "--no-parent", - "--page-requisites", - "--restrict-file-names=windows", - "--tries=2", - "-e", "robots=off" - ], - "x-aliases": ["WGET_DEFAULT_ARGS"], - "description": "Default wget arguments" - }, - "WGET_ARGS_EXTRA": { - "type": "array", - "items": {"type": "string"}, - "default": [], - "x-aliases": ["WGET_EXTRA_ARGS"], - "description": "Extra arguments to append to wget command" - } - } -} diff --git a/archivebox/plugins/wget/on_Crawl__10_wget_install.py b/archivebox/plugins/wget/on_Crawl__10_wget_install.py deleted file mode 100755 index 16d95332..00000000 --- a/archivebox/plugins/wget/on_Crawl__10_wget_install.py +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/env python3 -""" -Emit wget 
Binary dependency for the crawl. -""" - -import json -import os -import sys - - -# Read config from environment (already validated by JSONSchema) -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - -def get_env_int(name: str, default: int = 0) -> int: - try: - return int(get_env(name, str(default))) - except ValueError: - return default - - -def output_binary(name: str, binproviders: str): - """Output Binary JSONL record for a dependency.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def output_machine_config(config: dict): - """Output Machine config JSONL patch.""" - if not config: - return - record = { - 'type': 'Machine', - 'config': config, - } - print(json.dumps(record)) - - -def main(): - warnings = [] - errors = [] - - # Get config values - wget_enabled = get_env_bool('WGET_ENABLED', True) - wget_save_warc = get_env_bool('WGET_SAVE_WARC', True) - wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60) - wget_binary = get_env('WGET_BINARY', 'wget') - - # Compute derived values (USE_WGET for backward compatibility) - use_wget = wget_enabled - - # Validate timeout with warning (not error) - if use_wget and wget_timeout < 20: - warnings.append( - f"WGET_TIMEOUT={wget_timeout} is very low. " - "wget may fail to archive sites if set to less than ~20 seconds. " - "Consider setting WGET_TIMEOUT=60 or higher." 
- ) - - if use_wget: - output_binary(name='wget', binproviders='apt,brew,pip,env') - - # Output computed config patch as JSONL - output_machine_config({ - 'USE_WGET': use_wget, - 'WGET_BINARY': wget_binary, - }) - - for warning in warnings: - print(f"WARNING:{warning}", file=sys.stderr) - - for error in errors: - print(f"ERROR:{error}", file=sys.stderr) - - # Exit with error if any hard errors - sys.exit(1 if errors else 0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/wget/on_Snapshot__06_wget.bg.py b/archivebox/plugins/wget/on_Snapshot__06_wget.bg.py deleted file mode 100644 index f62b21b5..00000000 --- a/archivebox/plugins/wget/on_Snapshot__06_wget.bg.py +++ /dev/null @@ -1,233 +0,0 @@ -#!/usr/bin/env python3 -""" -Archive a URL using wget. - -Usage: on_Snapshot__06_wget.bg.py --url=<url> --snapshot-id=<uuid> -Output: Downloads files to $PWD - -Environment variables: - WGET_ENABLED: Enable wget archiving (default: True) - WGET_WARC_ENABLED: Save WARC file (default: True) - WGET_BINARY: Path to wget binary (default: wget) - WGET_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) - WGET_USER_AGENT: User agent string (x-fallback: USER_AGENT) - WGET_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE) - WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (x-fallback: CHECK_SSL_VALIDITY) - WGET_ARGS: Default wget arguments (JSON array) - WGET_ARGS_EXTRA: Extra arguments to append (JSON array) -""" - -import json -import os -import re -import subprocess -import sys -from datetime import datetime, timezone -from pathlib import Path - -import rich_click as click - - -# Extractor metadata -PLUGIN_NAME = 'wget' -BIN_NAME = 'wget' -BIN_PROVIDERS = 'apt,brew,env' -OUTPUT_DIR = '.' 
- - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def get_env_int(name: str, default: int = 0) -> int: - try: - return int(get_env(name, str(default))) - except ValueError: - return default - - -def get_env_array(name: str, default: list[str] | None = None) -> list[str]: - """Parse a JSON array from environment variable.""" - val = get_env(name, '') - if not val: - return default if default is not None else [] - try: - result = json.loads(val) - if isinstance(result, list): - return [str(item) for item in result] - return default if default is not None else [] - except json.JSONDecodeError: - return default if default is not None else [] - - -STATICFILE_DIR = '../staticfile' - -def has_staticfile_output() -> bool: - """Check if staticfile extractor already downloaded this URL.""" - staticfile_dir = Path(STATICFILE_DIR) - if not staticfile_dir.exists(): - return False - stdout_log = staticfile_dir / 'stdout.log' - if not stdout_log.exists(): - return False - for line in stdout_log.read_text(errors='ignore').splitlines(): - line = line.strip() - if not line.startswith('{'): - continue - try: - record = json.loads(line) - except json.JSONDecodeError: - continue - if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded': - return True - return False - - - - -def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]: - """ - Archive URL using wget. 
- - Returns: (success, output_path, error_message) - """ - # Get config from env (with WGET_ prefix, x-fallback handled by config loader) - timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60) - user_agent = get_env('WGET_USER_AGENT') or get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)') - check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', True) if get_env('WGET_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True) - cookies_file = get_env('WGET_COOKIES_FILE') or get_env('COOKIES_FILE', '') - wget_args = get_env_array('WGET_ARGS', []) - wget_args_extra = get_env_array('WGET_ARGS_EXTRA', []) - - # Feature toggles - warc_enabled = get_env_bool('WGET_WARC_ENABLED', True) - - # Build wget command (later options take precedence) - cmd = [ - binary, - *wget_args, - f'--timeout={timeout}', - ] - - if user_agent: - cmd.append(f'--user-agent={user_agent}') - - if warc_enabled: - warc_dir = Path('warc') - warc_dir.mkdir(exist_ok=True) - warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp())) - cmd.append(f'--warc-file={warc_path}') - else: - cmd.append('--timestamping') - - if cookies_file and Path(cookies_file).is_file(): - cmd.extend(['--load-cookies', cookies_file]) - - if not check_ssl: - cmd.extend(['--no-check-certificate', '--no-hsts']) - - if wget_args_extra: - cmd.extend(wget_args_extra) - - cmd.append(url) - - # Run wget - try: - result = subprocess.run( - cmd, - capture_output=True, - text=True, - timeout=timeout * 2, # Allow extra time for large downloads - ) - - # Find downloaded files - downloaded_files = [ - f for f in Path('.').rglob('*') - if f.is_file() and f.name != '.gitkeep' and not str(f).startswith('warc/') - ] - - if not downloaded_files: - if result.returncode != 0: - return False, None, f'wget failed (exit={result.returncode})' - return False, None, 'No files downloaded' - - # Find main HTML file - html_files = [ - f for f in downloaded_files - if re.search(r'\.[Ss]?[Hh][Tt][Mm][Ll]?$', 
str(f)) - ] - output_path = str(html_files[0]) if html_files else str(downloaded_files[0]) - - # Parse download stats from wget output - stderr_text = (result.stderr or '') - output_tail = stderr_text.strip().split('\n')[-3:] if stderr_text else [] - files_count = len(downloaded_files) - - return True, output_path, '' - - except subprocess.TimeoutExpired: - return False, None, f'Timed out after {timeout * 2} seconds' - except Exception as e: - return False, None, f'{type(e).__name__}: {e}' - - -@click.command() -@click.option('--url', required=True, help='URL to archive') -@click.option('--snapshot-id', required=True, help='Snapshot UUID') -def main(url: str, snapshot_id: str): - """Archive a URL using wget.""" - - output = None - status = 'failed' - error = '' - - try: - # Check if wget is enabled - if not get_env_bool('WGET_ENABLED', True): - print('Skipping wget (WGET_ENABLED=False)', file=sys.stderr) - # Temporary failure (config disabled) - NO JSONL emission - sys.exit(0) - - # Check if staticfile extractor already handled this (permanent skip) - if has_staticfile_output(): - print('Skipping wget - staticfile extractor already downloaded this', file=sys.stderr) - print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'})) - sys.exit(0) - - # Get binary from environment - binary = get_env('WGET_BINARY', 'wget') - - # Run extraction - success, output, error = save_wget(url, binary) - - if success: - # Success - emit ArchiveResult - result = { - 'type': 'ArchiveResult', - 'status': 'succeeded', - 'output_str': output or '' - } - print(json.dumps(result)) - sys.exit(0) - else: - # Transient error - emit NO JSONL - print(f'ERROR: {error}', file=sys.stderr) - sys.exit(1) - - except Exception as e: - # Transient error - emit NO JSONL - print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/wget/templates/card.html 
b/archivebox/plugins/wget/templates/card.html deleted file mode 100644 index 550db449..00000000 --- a/archivebox/plugins/wget/templates/card.html +++ /dev/null @@ -1,8 +0,0 @@ -<!-- Wget thumbnail - scaled down iframe preview of mirrored site --> -<div class="extractor-thumbnail wget-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;"> - <iframe src="{{ output_path }}" - style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;" - loading="lazy" - sandbox="allow-same-origin"> - </iframe> -</div> diff --git a/archivebox/plugins/wget/templates/icon.html b/archivebox/plugins/wget/templates/icon.html deleted file mode 100644 index 430432cf..00000000 --- a/archivebox/plugins/wget/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ -<span class="abx-output-icon abx-output-icon--wget" title="Wget"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><path d="M12 4v10"/><path d="M8 10l4 4 4-4"/><path d="M4 20h16"/></svg></span> diff --git a/archivebox/plugins/wget/tests/test_wget.py b/archivebox/plugins/wget/tests/test_wget.py deleted file mode 100644 index 52c1fc55..00000000 --- a/archivebox/plugins/wget/tests/test_wget.py +++ /dev/null @@ -1,433 +0,0 @@ -""" -Integration tests for wget plugin - -Tests verify: - pass -1. Validate hook checks for wget binary -2. Verify deps with abx-pkg -3. Config options work (WGET_ENABLED, WGET_SAVE_WARC, etc.) -4. Extraction works against real example.com -5. Output files contain actual page content -6. Skip cases work (WGET_ENABLED=False, staticfile present) -7. 
Failure cases handled (404, network errors) -""" - -import json -import os -import shutil -import subprocess -import sys -import tempfile -import uuid -from pathlib import Path - -import pytest - - -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.*')) -BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Binary__install_using_brew_provider.py' -APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Binary__install_using_apt_provider.py' -TEST_URL = 'https://example.com' - - -def test_hook_script_exists(): - """Verify hook script exists.""" - assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}" - - -def test_verify_deps_with_abx_pkg(): - """Verify wget is available via abx-pkg.""" - from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides - - wget_binary = Binary(name='wget', binproviders=[AptProvider(), BrewProvider(), EnvProvider()]) - wget_loaded = wget_binary.load() - - if wget_loaded and wget_loaded.abspath: - assert True, "wget is available" - else: - pass - - -def test_reports_missing_dependency_when_not_installed(): - """Test that script reports DEPENDENCY_NEEDED when wget is not found.""" - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Run with empty PATH so binary won't be found - env = {'PATH': '/nonexistent', 'HOME': str(tmpdir)} - - result = subprocess.run( - [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env - ) - - # Missing binary is a transient error - should exit 1 with no JSONL - assert result.returncode == 1, "Should exit 1 when dependency missing" - - # Should NOT emit JSONL (transient error - will be retried) - jsonl_lines = [line for line in result.stdout.strip().split('\n') - if line.strip().startswith('{')] - assert len(jsonl_lines) == 0, "Should not emit JSONL for transient error (missing binary)" - - # Should log error to 
stderr - assert 'wget' in result.stderr.lower() or 'error' in result.stderr.lower(), \ - "Should report error in stderr" - - -def test_can_install_wget_via_provider(): - """Test that wget can be installed via brew/apt provider hooks.""" - - # Determine which provider to use - if shutil.which('brew'): - provider_hook = BREW_HOOK - provider_name = 'brew' - elif shutil.which('apt-get'): - provider_hook = APT_HOOK - provider_name = 'apt' - else: - pass - - assert provider_hook.exists(), f"Provider hook not found: {provider_hook}" - - # Test installation via provider hook - binary_id = str(uuid.uuid4()) - machine_id = str(uuid.uuid4()) - - result = subprocess.run( - [ - sys.executable, - str(provider_hook), - '--binary-id', binary_id, - '--machine-id', machine_id, - '--name', 'wget', - '--binproviders', 'apt,brew,env' - ], - capture_output=True, - text=True, - timeout=300 # Installation can take time - ) - - # Should succeed (wget installs successfully or is already installed) - assert result.returncode == 0, f"{provider_name} install failed: {result.stderr}" - - # Should output Binary JSONL record - assert 'Binary' in result.stdout or 'wget' in result.stderr, \ - f"Should output installation info: stdout={result.stdout}, stderr={result.stderr}" - - # Parse JSONL if present - if result.stdout.strip(): - pass - for line in result.stdout.strip().split('\n'): - pass - try: - record = json.loads(line) - if record.get('type') == 'Binary': - assert record['name'] == 'wget' - assert record['binprovider'] in ['brew', 'apt'] - assert record['abspath'], "Should have binary path" - assert Path(record['abspath']).exists(), f"Binary should exist at {record['abspath']}" - break - except json.JSONDecodeError: - continue - - # Verify wget is now available - result = subprocess.run(['which', 'wget'], capture_output=True, text=True) - assert result.returncode == 0, "wget should be available after installation" - - -def test_archives_example_com(): - """Test full workflow: ensure wget 
installed then archive example.com.""" - - # First ensure wget is installed via provider - if shutil.which('brew'): - provider_hook = BREW_HOOK - elif shutil.which('apt-get'): - provider_hook = APT_HOOK - else: - pass - - # Run installation (idempotent - will succeed if already installed) - install_result = subprocess.run( - [ - sys.executable, - str(provider_hook), - '--dependency-id', str(uuid.uuid4()), - '--bin-name', 'wget', - '--bin-providers', 'apt,brew,env' - ], - capture_output=True, - text=True, - timeout=300 - ) - - if install_result.returncode != 0: - pass - - # Now test archiving - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Run wget extraction - result = subprocess.run( - [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=120 - ) - - assert result.returncode == 0, f"Extraction failed: {result.stderr}" - - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - # Verify files were downloaded - downloaded_files = list(tmpdir.rglob('*.html')) + list(tmpdir.rglob('*.htm')) - assert len(downloaded_files) > 0, "No HTML files downloaded" - - # Find main HTML file (should contain example.com) - main_html = None - for html_file in downloaded_files: - content = html_file.read_text(errors='ignore') - if 'example domain' in content.lower(): - main_html = html_file - break - - assert main_html is not None, "Could not find main HTML file with example.com content" - - # Verify HTML content contains REAL example.com text - html_content = 
main_html.read_text(errors='ignore') - assert len(html_content) > 200, f"HTML content too short: {len(html_content)} bytes" - assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML" - assert ('this domain' in html_content.lower() or - 'illustrative examples' in html_content.lower()), \ - "Missing example.com description text" - assert ('iana' in html_content.lower() or - 'more information' in html_content.lower()), \ - "Missing IANA reference" - - -def test_config_save_wget_false_skips(): - """Test that WGET_ENABLED=False exits without emitting JSONL.""" - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Set WGET_ENABLED=False - env = os.environ.copy() - env['WGET_ENABLED'] = 'False' - - result = subprocess.run( - [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - # Should exit 0 when feature disabled - assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}" - - # Feature disabled - no JSONL emission, just logs to stderr - assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr" - - # Should NOT emit any JSONL - jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] - assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}" - - -def test_config_save_warc(): - """Test that WGET_SAVE_WARC=True creates WARC files.""" - - # Ensure wget is available - if not shutil.which('wget'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Set WGET_SAVE_WARC=True explicitly - env = os.environ.copy() - env['WGET_SAVE_WARC'] = 'True' - - result = subprocess.run( - [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testwarc'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=120 - ) 
- - if result.returncode == 0: - # Look for WARC files in warc/ subdirectory - warc_dir = tmpdir / 'warc' - if warc_dir.exists(): - warc_files = list(warc_dir.rglob('*')) - warc_files = [f for f in warc_files if f.is_file()] - assert len(warc_files) > 0, "WARC file not created when WGET_SAVE_WARC=True" - - -def test_staticfile_present_skips(): - """Test that wget skips when staticfile already downloaded.""" - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Create directory structure like real ArchiveBox: - # tmpdir/ - # staticfile/ <- staticfile extractor output - # wget/ <- wget extractor runs here, looks for ../staticfile - staticfile_dir = tmpdir / 'staticfile' - staticfile_dir.mkdir() - (staticfile_dir / 'stdout.log').write_text('{"type":"ArchiveResult","status":"succeeded","output_str":"index.html"}\n') - - wget_dir = tmpdir / 'wget' - wget_dir.mkdir() - - result = subprocess.run( - [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'teststatic'], - cwd=wget_dir, # Run from wget subdirectory - capture_output=True, - text=True, - timeout=30 - ) - - # Should skip with permanent skip JSONL - assert result.returncode == 0, "Should exit 0 when permanently skipping" - - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should emit ArchiveResult JSONL for permanent skip" - assert result_json['status'] == 'skipped', f"Should have status='skipped': {result_json}" - assert 'staticfile' in result_json.get('output_str', '').lower(), "Should mention staticfile in output_str" - - -def test_handles_404_gracefully(): - """Test that wget fails gracefully on 404.""" - - if not shutil.which('wget'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - 
tmpdir = Path(tmpdir) - - # Try to download non-existent page - result = subprocess.run( - [sys.executable, str(WGET_HOOK), '--url', 'https://example.com/nonexistent-page-404', '--snapshot-id', 'test404'], - cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - ) - - # Should fail - assert result.returncode != 0, "Should fail on 404" - combined = result.stdout + result.stderr - assert '404' in combined or 'Not Found' in combined or 'No files downloaded' in combined, \ - "Should report 404 or no files downloaded" - - -def test_config_timeout_honored(): - """Test that WGET_TIMEOUT config is respected.""" - - if not shutil.which('wget'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Set very short timeout - env = os.environ.copy() - env['WGET_TIMEOUT'] = '5' - - # This should still succeed for example.com (it's fast) - result = subprocess.run( - [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - # Verify it completed (success or fail, but didn't hang) - assert result.returncode in (0, 1), "Should complete (success or fail)" - - -def test_config_user_agent(): - """Test that WGET_USER_AGENT config is used.""" - - if not shutil.which('wget'): - pass - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Set custom user agent - env = os.environ.copy() - env['WGET_USER_AGENT'] = 'TestBot/1.0' - - result = subprocess.run( - [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=120 - ) - - # Should succeed (example.com doesn't block) - if result.returncode == 0: - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': 
- result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/ytdlp/config.json b/archivebox/plugins/ytdlp/config.json deleted file mode 100644 index 2a98e24e..00000000 --- a/archivebox/plugins/ytdlp/config.json +++ /dev/null @@ -1,92 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "additionalProperties": false, - "properties": { - "YTDLP_ENABLED": { - "type": "boolean", - "default": true, - "x-aliases": [ - "MEDIA_ENABLED", - "SAVE_MEDIA", - "USE_MEDIA", - "USE_YTDLP", - "FETCH_MEDIA", - "SAVE_YTDLP" - ], - "description": "Enable video/audio downloading with yt-dlp" - }, - "YTDLP_BINARY": { - "type": "string", - "default": "yt-dlp", - "x-aliases": ["YOUTUBEDL_BINARY", "YOUTUBE_DL_BINARY"], - "description": "Path to yt-dlp binary" - }, - "YTDLP_NODE_BINARY": { - "type": "string", - "default": "node", - "x-fallback": "NODE_BINARY", - "description": "Path to Node.js binary for yt-dlp JS runtime" - }, - "YTDLP_TIMEOUT": { - "type": "integer", - "default": 3600, - "minimum": 30, - "x-fallback": "TIMEOUT", - "x-aliases": ["MEDIA_TIMEOUT"], - "description": "Timeout for yt-dlp downloads in seconds" - }, - "YTDLP_COOKIES_FILE": { - "type": "string", - "default": "", - "x-fallback": "COOKIES_FILE", - "description": "Path to cookies file" - }, - "YTDLP_MAX_SIZE": { - "type": "string", - "default": "750m", - "pattern": "^\\d+[kmgKMG]?$", - "x-aliases": ["MEDIA_MAX_SIZE"], - "description": "Maximum file size for yt-dlp downloads" - }, - "YTDLP_CHECK_SSL_VALIDITY": { - "type": "boolean", - "default": true, - "x-fallback": "CHECK_SSL_VALIDITY", - "description": "Whether to verify SSL certificates" - }, - "YTDLP_ARGS": { - "type": "array", - "items": { "type": "string" }, - "default": [ - 
"--restrict-filenames", - "--trim-filenames=128", - "--write-description", - "--write-info-json", - "--write-thumbnail", - "--write-sub", - "--write-auto-subs", - "--convert-subs=srt", - "--yes-playlist", - "--continue", - "--no-abort-on-error", - "--ignore-errors", - "--geo-bypass", - "--add-metadata", - "--no-progress", - "--remote-components=ejs:github", - "-o", - "%(title)s.%(ext)s" - ], - "x-aliases": ["YTDLP_DEFAULT_ARGS"], - "description": "Default yt-dlp arguments" - }, - "YTDLP_ARGS_EXTRA": { - "type": "array", - "items": { "type": "string" }, - "default": [], - "x-aliases": ["YTDLP_EXTRA_ARGS"], - "description": "Extra arguments to append to yt-dlp command" - } - } -} diff --git a/archivebox/plugins/ytdlp/on_Crawl__15_ytdlp_install.py b/archivebox/plugins/ytdlp/on_Crawl__15_ytdlp_install.py deleted file mode 100755 index 7b81b5d9..00000000 --- a/archivebox/plugins/ytdlp/on_Crawl__15_ytdlp_install.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -""" -Emit yt-dlp (and related) Binary dependencies for the crawl. 
-""" - -import json -import os -import sys - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def output_binary(name: str, binproviders: str, overrides: dict | None = None): - """Output Binary JSONL record for a dependency.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'binproviders': binproviders, - 'machine_id': machine_id, - } - if overrides: - record['overrides'] = overrides - print(json.dumps(record)) - - -def main(): - ytdlp_enabled = get_env_bool('YTDLP_ENABLED', True) - - if not ytdlp_enabled: - sys.exit(0) - - output_binary( - name='yt-dlp', - binproviders='pip,brew,apt,env', - overrides={'pip': {'packages': ['yt-dlp[default]']}}, - ) - - # Node.js (required by several JS-based extractors, declared here per legacy binaries.jsonl) - output_binary( - name='node', - binproviders='apt,brew,env', - overrides={'apt': {'packages': ['nodejs']}}, - ) - - # ffmpeg (used by media extraction) - output_binary(name='ffmpeg', binproviders='apt,brew,env') - - sys.exit(0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/ytdlp/on_Snapshot__02_ytdlp.bg.py b/archivebox/plugins/ytdlp/on_Snapshot__02_ytdlp.bg.py deleted file mode 100644 index fbf841ae..00000000 --- a/archivebox/plugins/ytdlp/on_Snapshot__02_ytdlp.bg.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env python3 -""" -Download video/audio from a URL using yt-dlp. 
- -Usage: on_Snapshot__02_ytdlp.bg.py --url=<url> --snapshot-id=<uuid> -Output: Downloads video/audio files to $PWD - -Environment variables: - YTDLP_ENABLED: Enable yt-dlp extraction (default: True) - YTDLP_BINARY: Path to yt-dlp binary (default: yt-dlp) - YTDLP_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY) - YTDLP_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) - YTDLP_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE) - YTDLP_MAX_SIZE: Maximum file size (default: 750m) - YTDLP_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY) - YTDLP_ARGS: Default yt-dlp arguments (JSON array) - YTDLP_ARGS_EXTRA: Extra arguments to append (JSON array) -""" - -import json -import os -import subprocess -import sys -import threading -from pathlib import Path - -import rich_click as click - - - - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def get_env_int(name: str, default: int = 0) -> int: - try: - return int(get_env(name, str(default))) - except ValueError: - return default - - -def get_env_array(name: str, default: list[str] | None = None) -> list[str]: - """Parse a JSON array from environment variable.""" - val = get_env(name, '') - if not val: - return default if default is not None else [] - try: - result = json.loads(val) - if isinstance(result, list): - return [str(item) for item in result] - return default if default is not None else [] - except json.JSONDecodeError: - return default if default is not None else [] - - -STATICFILE_DIR = '../staticfile' - -def has_staticfile_output() -> bool: - """Check if staticfile extractor already downloaded this URL.""" - staticfile_dir = Path(STATICFILE_DIR) - if not staticfile_dir.exists(): - 
return False - stdout_log = staticfile_dir / 'stdout.log' - if not stdout_log.exists(): - return False - for line in stdout_log.read_text(errors='ignore').splitlines(): - line = line.strip() - if not line.startswith('{'): - continue - try: - record = json.loads(line) - except json.JSONDecodeError: - continue - if record.get('type') == 'ArchiveResult' and record.get('status') == 'succeeded': - return True - return False - - -def save_ytdlp(url: str, binary: str) -> tuple[bool, str | None, str]: - """ - Download video/audio using yt-dlp. - - Returns: (success, output_path, error_message) - """ - # Get config from env (with YTDLP_ prefix, x-fallback handled by config loader) - timeout = get_env_int('YTDLP_TIMEOUT') or get_env_int('TIMEOUT', 3600) - check_ssl = get_env_bool('YTDLP_CHECK_SSL_VALIDITY', True) if get_env('YTDLP_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True) - cookies_file = get_env('YTDLP_COOKIES_FILE') or get_env('COOKIES_FILE', '') - max_size = get_env('YTDLP_MAX_SIZE', '750m') - node_binary = get_env('YTDLP_NODE_BINARY') or get_env('NODE_BINARY', 'node') - ytdlp_args = get_env_array('YTDLP_ARGS', []) - ytdlp_args_extra = get_env_array('YTDLP_ARGS_EXTRA', []) - - # Output directory is current directory (hook already runs in output dir) - output_dir = Path('.') - - # Build command (later options take precedence) - cmd = [ - binary, - *ytdlp_args, - # Format with max_size limit (appended after YTDLP_ARGS so it can be overridden by YTDLP_ARGS_EXTRA) - f'--format=(bv*+ba/b)[filesize<={max_size}][filesize_approx<=?{max_size}]/(bv*+ba/b)', - f'--js-runtimes=node:{node_binary}', - ] - - if not check_ssl: - cmd.append('--no-check-certificate') - - if cookies_file and Path(cookies_file).is_file(): - cmd.extend(['--cookies', cookies_file]) - - if ytdlp_args_extra: - cmd.extend(ytdlp_args_extra) - - if '--newline' not in cmd: - cmd.append('--newline') - - cmd.append(url) - - try: - print(f'[ytdlp] Starting download (timeout={timeout}s)', 
file=sys.stderr) - - output_lines: list[str] = [] - process = subprocess.Popen( - cmd, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - bufsize=1, - ) - - def _read_output() -> None: - if not process.stdout: - return - for line in process.stdout: - output_lines.append(line) - sys.stderr.write(line) - - reader = threading.Thread(target=_read_output, daemon=True) - reader.start() - - try: - process.wait(timeout=timeout) - except subprocess.TimeoutExpired: - process.kill() - reader.join(timeout=1) - return False, None, f'Timed out after {timeout} seconds' - - reader.join(timeout=1) - combined_output = ''.join(output_lines) - - # Check if any media files were downloaded - media_extensions = ( - '.mp4', '.webm', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.m4v', - '.mp3', '.m4a', '.ogg', '.wav', '.flac', '.aac', '.opus', - '.json', '.jpg', '.png', '.webp', '.jpeg', - '.vtt', '.srt', '.ass', '.lrc', - '.description', - ) - - downloaded_files = [ - f for f in output_dir.glob('*') - if f.is_file() and f.suffix.lower() in media_extensions - ] - - if downloaded_files: - # Return first video/audio file, or first file if no media - video_audio = [ - f for f in downloaded_files - if f.suffix.lower() in ('.mp4', '.webm', '.mkv', '.avi', '.mov', '.mp3', '.m4a', '.ogg', '.wav', '.flac') - ] - output = str(video_audio[0]) if video_audio else str(downloaded_files[0]) - return True, output, '' - else: - stderr = combined_output - - # These are NOT errors - page simply has no downloadable media - # Return success with no output (legitimate "nothing to download") - if 'ERROR: Unsupported URL' in stderr: - return True, None, '' # Not a media site - success, no output - if 'URL could be a direct video link' in stderr: - return True, None, '' # Not a supported media URL - success, no output - if process.returncode == 0: - return True, None, '' # yt-dlp exited cleanly, just no media - success - - # These ARE errors - something went wrong - if 'HTTP Error 404' in stderr: - 
return False, None, '404 Not Found' - if 'HTTP Error 403' in stderr: - return False, None, '403 Forbidden' - if 'Unable to extract' in stderr: - return False, None, 'Unable to extract media info' - - return False, None, f'yt-dlp error: {stderr}' - - except subprocess.TimeoutExpired: - return False, None, f'Timed out after {timeout} seconds' - except Exception as e: - return False, None, f'{type(e).__name__}: {e}' - - -@click.command() -@click.option('--url', required=True, help='URL to download video/audio from') -@click.option('--snapshot-id', required=True, help='Snapshot UUID') -def main(url: str, snapshot_id: str): - """Download video/audio from a URL using yt-dlp.""" - - try: - # Check if yt-dlp downloading is enabled - if not get_env_bool('YTDLP_ENABLED', True): - print('Skipping ytdlp (YTDLP_ENABLED=False)', file=sys.stderr) - # Temporary failure (config disabled) - NO JSONL emission - sys.exit(0) - - # Check if staticfile extractor already handled this (permanent skip) - if has_staticfile_output(): - print('Skipping ytdlp - staticfile extractor already downloaded this', file=sys.stderr) - print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'})) - sys.exit(0) - - # Get binary from environment - binary = get_env('YTDLP_BINARY', 'yt-dlp') - - # Run extraction - success, output, error = save_ytdlp(url, binary) - - if success: - # Success - emit ArchiveResult - result = { - 'type': 'ArchiveResult', - 'status': 'succeeded', - 'output_str': output or '' - } - print(json.dumps(result)) - sys.exit(0) - else: - # Transient error - emit NO JSONL - print(f'ERROR: {error}', file=sys.stderr) - sys.exit(1) - - except Exception as e: - # Transient error - emit NO JSONL - print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/ytdlp/templates/card.html b/archivebox/plugins/ytdlp/templates/card.html deleted file mode 100644 index 
6fe32098..00000000 --- a/archivebox/plugins/ytdlp/templates/card.html +++ /dev/null @@ -1,17 +0,0 @@ -<!-- YT-DLP output list --> -{% if media_files %} - <div class="loose-items" style="pointer-events: auto;"> - {% for file in media_files %} - <a href="{{ file.url|default:file.path|urlencode }}" target="preview" - title="{{ file.name }}"> - 📄 {{ file.name }} - </a> - {% endfor %} - </div> -{% else %} - <div class="thumbnail-compact" data-plugin="ytdlp" data-compact="1"> - <span class="thumbnail-compact-icon">🎬</span> - <span class="thumbnail-compact-label">YT-DLP</span> - <span class="thumbnail-compact-meta">media</span> - </div> -{% endif %} diff --git a/archivebox/plugins/ytdlp/templates/full.html b/archivebox/plugins/ytdlp/templates/full.html deleted file mode 100644 index 6a4b2b35..00000000 --- a/archivebox/plugins/ytdlp/templates/full.html +++ /dev/null @@ -1,10 +0,0 @@ -<!-- YT-DLP fullscreen - full video/audio player --> -<div class="extractor-fullscreen ytdlp-fullscreen" style="width: 100%; height: 100vh; background: #000; display: flex; align-items: center; justify-content: center;"> - <video src="{{ output_path }}" - style="max-width: 100%; max-height: 100%;" - controls - autoplay - preload="auto"> - Your browser does not support the video tag. 
- </video> -</div> diff --git a/archivebox/plugins/ytdlp/templates/icon.html b/archivebox/plugins/ytdlp/templates/icon.html deleted file mode 100644 index bf0e4ee4..00000000 --- a/archivebox/plugins/ytdlp/templates/icon.html +++ /dev/null @@ -1 +0,0 @@ -<span class="abx-output-icon abx-output-icon--ytdlp" title="Video"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><rect x="3" y="6" width="18" height="12" rx="2"/><path d="M10 9l5 3-5 3z"/></svg></span> diff --git a/archivebox/plugins/ytdlp/tests/test_ytdlp.py b/archivebox/plugins/ytdlp/tests/test_ytdlp.py deleted file mode 100644 index 561c4324..00000000 --- a/archivebox/plugins/ytdlp/tests/test_ytdlp.py +++ /dev/null @@ -1,202 +0,0 @@ -""" -Integration tests for ytdlp plugin - -Tests verify: -1. Hook script exists -2. Verify deps with abx-pkg -3. YT-DLP extraction works on video URLs -4. JSONL output is correct -5. Config options work (YTDLP_ENABLED, YTDLP_TIMEOUT) -6. 
Handles non-video URLs gracefully -""" - -import json -import subprocess -import sys -import tempfile -import time -from pathlib import Path -import pytest - -PLUGIN_DIR = Path(__file__).parent.parent -PLUGINS_ROOT = PLUGIN_DIR.parent -YTDLP_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_ytdlp.*'), None) -TEST_URL = 'https://example.com/video.mp4' - -def test_hook_script_exists(): - """Verify on_Snapshot hook exists.""" - assert YTDLP_HOOK.exists(), f"Hook not found: {YTDLP_HOOK}" - - -def test_verify_deps_with_abx_pkg(): - """Verify yt-dlp, node, and ffmpeg are available via abx-pkg.""" - from abx_pkg import Binary, PipProvider, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides - - missing_binaries = [] - - # Verify yt-dlp is available - ytdlp_binary = Binary(name='yt-dlp', binproviders=[PipProvider(), EnvProvider()]) - ytdlp_loaded = ytdlp_binary.load() - if not (ytdlp_loaded and ytdlp_loaded.abspath): - missing_binaries.append('yt-dlp') - - # Verify node is available (yt-dlp needs it for JS extraction) - node_binary = Binary( - name='node', - binproviders=[AptProvider(), BrewProvider(), EnvProvider()] - ) - node_loaded = node_binary.load() - if not (node_loaded and node_loaded.abspath): - missing_binaries.append('node') - - # Verify ffmpeg is available (yt-dlp needs it for video conversion) - ffmpeg_binary = Binary(name='ffmpeg', binproviders=[AptProvider(), BrewProvider(), EnvProvider()]) - ffmpeg_loaded = ffmpeg_binary.load() - if not (ffmpeg_loaded and ffmpeg_loaded.abspath): - missing_binaries.append('ffmpeg') - - if missing_binaries: - pass - -def test_handles_non_video_url(): - """Test that ytdlp extractor handles non-video URLs gracefully via hook.""" - # Prerequisites checked by earlier test - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Run ytdlp extraction hook on non-video URL - result = subprocess.run( - [sys.executable, str(YTDLP_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'], - 
cwd=tmpdir, - capture_output=True, - text=True, - timeout=60 - ) - - # Should exit 0 even for non-media URL - assert result.returncode == 0, f"Should handle non-media URL gracefully: {result.stderr}" - - # Parse clean JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - pass - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, "Should have ArchiveResult JSONL output" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - -def test_config_ytdlp_enabled_false_skips(): - """Test that YTDLP_ENABLED=False exits without emitting JSONL.""" - import os - - with tempfile.TemporaryDirectory() as tmpdir: - env = os.environ.copy() - env['YTDLP_ENABLED'] = 'False' - - result = subprocess.run( - [sys.executable, str(YTDLP_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=30 - ) - - assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}" - - # Feature disabled - temporary failure, should NOT emit JSONL - assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr" - - # Should NOT emit any JSONL - jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] - assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}" - - -def test_config_timeout(): - """Test that YTDLP_TIMEOUT config is respected (also via MEDIA_TIMEOUT alias).""" - import os - - with tempfile.TemporaryDirectory() as tmpdir: - env = os.environ.copy() - env['YTDLP_TIMEOUT'] = '5' - - start_time = time.time() - result = subprocess.run( - [sys.executable, str(YTDLP_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'], - cwd=tmpdir, - 
capture_output=True, - text=True, - env=env, - timeout=10 # Should complete in 5s, use 10s as safety margin - ) - elapsed_time = time.time() - start_time - - assert result.returncode == 0, f"Should complete without hanging: {result.stderr}" - # Allow 1 second overhead for subprocess startup and Python interpreter - assert elapsed_time <= 6.0, f"Should complete within 6 seconds (5s timeout + 1s overhead), took {elapsed_time:.2f}s" - - -def test_real_youtube_url(): - """Test that yt-dlp can extract video/audio from a real YouTube URL.""" - import os - - with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) - - # Use a short, stable YouTube video (YouTube's own about video) - youtube_url = 'https://www.youtube.com/watch?v=jNQXAC9IVRw' # "Me at the zoo" - first YouTube video - - env = os.environ.copy() - env['YTDLP_TIMEOUT'] = '120' # Give it time to download - - start_time = time.time() - result = subprocess.run( - [sys.executable, str(YTDLP_HOOK), '--url', youtube_url, '--snapshot-id', 'testyoutube'], - cwd=tmpdir, - capture_output=True, - text=True, - env=env, - timeout=180 - ) - elapsed_time = time.time() - start_time - - # Should succeed - assert result.returncode == 0, f"Should extract video/audio successfully: {result.stderr}" - - # Parse JSONL output - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass - - assert result_json, f"Should have ArchiveResult JSONL output. 
stdout: {result.stdout}" - assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - - # Check that some video/audio files were downloaded - output_files = list(tmpdir.glob('**/*')) - media_files = [f for f in output_files if f.is_file() and f.suffix.lower() in ('.mp4', '.webm', '.mkv', '.m4a', '.mp3', '.json', '.jpg', '.webp')] - - assert len(media_files) > 0, f"Should have downloaded at least one video/audio file. Files: {output_files}" - - print(f"Successfully extracted {len(media_files)} file(s) in {elapsed_time:.2f}s") - - -if __name__ == '__main__': - pytest.main([__file__, '-v']) diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index f4e670cb..b98f7f95 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -2,7 +2,7 @@ Search module for ArchiveBox. Search indexing is handled by search backend hooks in plugins: - archivebox/plugins/search_backend_*/on_Snapshot__*_index_*.py + abx_plugins/plugins/search_backend_*/on_Snapshot__*_index_*.py This module provides the query interface that dynamically discovers search backend plugins using the hooks system. 
diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index 86bd85c8..b2b5bcc9 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -275,6 +275,21 @@ -moz-osx-font-smoothing: grayscale; } + /* Snapshot admin actions: hide label/colon and remove card border */ + #content-main form .field-admin_actions > label, + #content form .field-admin_actions > label, + #content-main form .field-admin_actions label, + #content form .field-admin_actions label { + display: none !important; + } + + #content-main form fieldset.actions-card, + #content form fieldset.actions-card { + border: none !important; + box-shadow: none !important; + background: transparent !important; + } + /* Readonly fields styling */ #content-main form fieldset .readonly, #content form fieldset .readonly { diff --git a/archivebox/templates/admin/progress_monitor.html b/archivebox/templates/admin/progress_monitor.html index 5fc449e6..733ad9eb 100644 --- a/archivebox/templates/admin/progress_monitor.html +++ b/archivebox/templates/admin/progress_monitor.html @@ -608,10 +608,6 @@ </div> </div> - <div class="thumbnail-strip empty" id="thumbnail-strip"> - <span class="thumbnail-label">Recent:</span> - </div> - <div class="tree-container" id="tree-container"> <div class="idle-message" id="idle-message">No active crawls</div> <div id="crawl-tree"></div> @@ -625,7 +621,7 @@ const treeContainer = document.getElementById('tree-container'); const crawlTree = document.getElementById('crawl-tree'); const idleMessage = document.getElementById('idle-message'); - const thumbnailStrip = document.getElementById('thumbnail-strip'); + const thumbnailStrip = null; let pollInterval = null; let pollDelayMs = 1000; @@ -697,65 +693,8 @@ } - function renderThumbnail(thumb, isNew) { - const ext = (thumb.embed_path || '').toLowerCase().split('.').pop(); - const isImage = ['png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico'].includes(ext); - - const item = 
document.createElement('a'); - item.className = 'thumbnail-item' + (isNew ? ' new' : ''); - item.href = `/admin/core/snapshot/${thumb.snapshot_id}/change/`; - item.title = `${thumb.plugin}: ${thumb.snapshot_url}`; - item.dataset.id = thumb.id; - - const archiveUrl = thumb.archive_url || thumb.archive_path; - if (isImage && archiveUrl) { - item.innerHTML = ` - <img src="${archiveUrl}" alt="${thumb.plugin}" loading="lazy" onerror="this.parentElement.innerHTML='<div class=\\'thumbnail-fallback\\'>${getPluginIcon(thumb.plugin)}</div><span class=\\'thumbnail-plugin\\'>${thumb.plugin}</span>'"> - <span class="thumbnail-plugin">${thumb.plugin}</span> - `; - } else { - item.innerHTML = ` - <div class="thumbnail-fallback">${getPluginIcon(thumb.plugin)}</div> - <span class="thumbnail-plugin">${thumb.plugin}</span> - `; - } - - return item; - } - - function updateThumbnails(thumbnails) { - if (!thumbnails || thumbnails.length === 0) { - thumbnailStrip.classList.add('empty'); - return; - } - - thumbnailStrip.classList.remove('empty'); - - // Find new thumbnails (ones we haven't seen before) - const newThumbs = thumbnails.filter(t => !knownThumbnailIds.has(t.id)); - - // Add new thumbnails to the beginning (after the label) - const label = thumbnailStrip.querySelector('.thumbnail-label'); - newThumbs.reverse().forEach(thumb => { - const item = renderThumbnail(thumb, true); - if (label.nextSibling) { - thumbnailStrip.insertBefore(item, label.nextSibling); - } else { - thumbnailStrip.appendChild(item); - } - knownThumbnailIds.add(thumb.id); - }); - - // Limit to 20 thumbnails (remove old ones) - const items = thumbnailStrip.querySelectorAll('.thumbnail-item'); - if (items.length > 20) { - for (let i = 20; i < items.length; i++) { - const id = items[i].dataset.id; - knownThumbnailIds.delete(id); - items[i].remove(); - } - } - } + function renderThumbnail(thumb, isNew) { return null; } + function updateThumbnails(thumbnails) {} function renderExtractor(extractor) { const icon = 
extractor.status === 'started' ? '↻' : @@ -1009,8 +948,7 @@ crawlTree.innerHTML = ''; } - // Update thumbnail strip with recently completed results - updateThumbnails(data.recent_thumbnails || []); + // Recent thumbnails removed } function fetchProgress() { diff --git a/archivebox/templates/core/snapshot.html b/archivebox/templates/core/snapshot.html index 6adbf7c4..0ad5a226 100644 --- a/archivebox/templates/core/snapshot.html +++ b/archivebox/templates/core/snapshot.html @@ -717,7 +717,7 @@ <p class="card-text"><code>{{ result_info.path }}</code></p> </a> <a href="{{ display_url }}" target="preview"> - <h4 class="card-title">{{ result_info.name|plugin_display_name|title }}</h4> + <h4 class="card-title">{{ result_info.name|title }}</h4> </a> {% if result_info.result %} {% with plugin_base=result_info.name|plugin_name %} diff --git a/archivebox/tests/test_auth_ldap.py b/archivebox/tests/test_auth_ldap.py index a56d29f7..10972acd 100644 --- a/archivebox/tests/test_auth_ldap.py +++ b/archivebox/tests/test_auth_ldap.py @@ -63,7 +63,7 @@ class TestLDAPConfig(unittest.TestCase): config = LDAPConfig( LDAP_ENABLED=True, - LDAP_SERVER_URI="ldap://localhost:389", + LDAP_SERVER_URI="ldap://ldap-test.localhost:389", LDAP_BIND_DN="cn=admin,dc=example,dc=com", LDAP_BIND_PASSWORD="password", LDAP_USER_BASE="ou=users,dc=example,dc=com", @@ -172,7 +172,7 @@ class TestArchiveBoxWithLDAP(unittest.TestCase): env={ **os.environ, 'LDAP_ENABLED': 'False', - 'LDAP_SERVER_URI': 'ldap://localhost:389', + 'LDAP_SERVER_URI': 'ldap://ldap-test.localhost:389', } ) diff --git a/archivebox/tests/test_hooks.py b/archivebox/tests/test_hooks.py index 308633ba..9d0afa0e 100755 --- a/archivebox/tests/test_hooks.py +++ b/archivebox/tests/test_hooks.py @@ -468,7 +468,7 @@ class TestPluginMetadata(unittest.TestCase): def test_plugin_name_added(self): """run_hook() should add plugin name to records.""" # Simulate what run_hook() does - script = Path('/archivebox/plugins/wget/on_Snapshot__50_wget.py') + 
script = Path('/abx_plugins/plugins/wget/on_Snapshot__50_wget.py') plugin_name = script.parent.name record = {'type': 'ArchiveResult', 'status': 'succeeded'} diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py index 6465ef88..c83d4a55 100644 --- a/archivebox/workers/orchestrator.py +++ b/archivebox/workers/orchestrator.py @@ -3,13 +3,16 @@ Orchestrator for managing worker processes. The Orchestrator polls the Crawl queue and spawns CrawlWorkers as needed. -Architecture: - Orchestrator (polls Crawl queue) - └── CrawlWorker(s) (one per active Crawl) - └── SnapshotWorker(s) (one per Snapshot, up to limit) - └── Hook Processes (sequential, forked by SnapshotWorker) +Orchestrator (takes list of specific crawls | polls for pending queued crawls forever) spawns: +└── CrawlWorker(s) (one per active Crawl) + └── SnapshotWorker(s) (one per Snapshot, up to limit) + └── Hook Processes (sequential, forked by SnapshotWorker) + e.g on_Snapshot__23_save_pdf.js + on_Snapshot__24_save_screenshot.js + ... Usage: + # Default: runs forever (for use as subprocess of server) orchestrator = Orchestrator(exit_on_idle=False) orchestrator.runloop() diff --git a/pyproject.toml b/pyproject.toml index 65983d51..23f34ab7 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,6 +84,7 @@ dependencies = [ "yt-dlp>=2024.1.0", # for: media extractor ### Binary/Package Management "abx-pkg>=0.1.0", # for: detecting, versioning, and installing binaries via apt/brew/pip/npm + "abx-plugins>=0.1.0", # shared plugin package (sourced from uv workspace in local dev) "gallery-dl>=1.31.1", ### UUID7 backport for Python <3.14 "uuid7>=0.1.0; python_version < '3.14'", # for: uuid7 support on Python 3.13 (provides uuid_extensions module) @@ -164,6 +165,9 @@ package = true python-version = "3.13" # compile-bytecode = true +[tool.uv.sources] +abx-plugins = { workspace = true } + [build-system] requires = ["pdm-backend"] build-backend = "pdm.backend"