diff --git a/.github/workflows/test-parallel.yml b/.github/workflows/test-parallel.yml index 98edb7cc..77db7ac6 100644 --- a/.github/workflows/test-parallel.yml +++ b/.github/workflows/test-parallel.yml @@ -86,57 +86,33 @@ jobs: python-version: ${{ matrix.python }} architecture: x64 + - name: Install uv + uses: astral-sh/setup-uv@v4 + with: + version: "latest" + - name: Set up Node JS uses: actions/setup-node@v4 with: - node-version: 20.10.0 + node-version: 22 - - name: Get pip cache dir - id: pip-cache - run: | - echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT - - - name: Cache pip + - name: Cache uv uses: actions/cache@v3 - id: cache-pip with: - path: ${{ steps.pip-cache.outputs.dir }} - key: ${{ runner.os }}-${{ matrix.python }}-venv-${{ hashFiles('setup.py') }} + path: ~/.cache/uv + key: ${{ runner.os }}-${{ matrix.python }}-uv-${{ hashFiles('pyproject.toml', 'uv.lock') }} restore-keys: | - ${{ runner.os }}-${{ matrix.python }}-venv- + ${{ runner.os }}-${{ matrix.python }}-uv- - uses: awalsh128/cache-apt-pkgs-action@latest with: - packages: ripgrep build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 python3-minimal gnupg2 curl wget python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps - version: 1.0 + packages: git ripgrep build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 python3-minimal gnupg2 curl wget python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps + version: 1.1 - - name: Install pip dependencies + - name: Install dependencies with uv run: | - python -m pip install --upgrade pip setuptools wheel pytest bottle build - python -m pip install -r requirements.txt - python -m pip install -e .[sonic,ldap] - - - name: Get npm cache dir - id: npm-cache - run: | - echo "dir=$GITHUB_WORKSPACE/node_modules" >> $GITHUB_OUTPUT - - - name: Cache npm - uses: actions/cache@v3 - id: cache-npm - with: - path: ${{ steps.npm-cache.outputs.dir }} - key: ${{ runner.os }}-node_modules-${{ hashFiles('package-lock.json') }} - restore-keys: | - ${{ runner.os }}-node_modules - - - name: Install npm requirements - run: | - npm install - echo "SINGLEFILE_BINARY=$GITHUB_WORKSPACE/node_modules/.bin/single-file" >> $GITHUB_ENV - echo "READABILITY_BINARY=$GITHUB_WORKSPACE/node_modules/.bin/readability-extractor" >> $GITHUB_ENV - echo "MERCURY_BINARY=$GITHUB_WORKSPACE/node_modules/.bin/mercury-parser" >> $GITHUB_ENV + uv sync --dev --all-extras - name: Run test - ${{ matrix.test.name }} run: | - python -m pytest -xvs "${{ matrix.test.path }}" --basetemp=tests/out --ignore=archivebox/pkgs + uv run pytest -xvs "${{ matrix.test.path }}" --basetemp=tests/out --ignore=archivebox/pkgs diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 90542bed..c30061c2 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -26,9 +26,7 @@ from archivebox.misc.system import get_dir_size, atomic_write from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode from archivebox.misc.hashing import get_dir_info from archivebox.hooks import ( - EXTRACTOR_INDEXING_PRECEDENCE, get_plugins, get_plugin_name, get_plugin_icon, - DEFAULT_PLUGIN_ICONS, ) from archivebox.base_models.models import ( ModelWithUUID, ModelWithSerializers, ModelWithOutputDir, @@ -1931,16 +1929,6 @@ class SnapshotMachine(BaseStateMachine, strict_states=True): ) -class ArchiveResultManager(models.Manager): - def indexable(self, sorted: bool = True): - INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE] - qs = self.get_queryset().filter(plugin__in=INDEXABLE_METHODS, status='succeeded') - if sorted: - precedence = [When(plugin=method, then=Value(p)) for method, p in EXTRACTOR_INDEXING_PRECEDENCE] - qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000), output_field=IntegerField())).order_by('indexing_precedence') - return qs - - class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine): class StatusChoices(models.TextChoices): QUEUED = 'queued', 'Queued' @@ -2000,8 +1988,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi state_field_name = 'status' active_state = StatusChoices.STARTED - objects = ArchiveResultManager() - class Meta(TypedModelMeta): app_label = 'core' verbose_name = 'Archive Result' diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 2c0ffcb5..3cc8e83e 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -619,20 +619,6 @@ def is_parser_plugin(plugin: str) -> bool: return name.startswith('parse_') and name.endswith('_urls') -# Precedence order for search indexing (lower number = higher priority) -# Used to select which plugin's output to use for full-text search -# Plugin names here should match the part after the numeric prefix -# e.g., '31_readability' -> 'readability' -EXTRACTOR_INDEXING_PRECEDENCE = [ - ('readability', 1), - ('mercury', 2), - ('htmltotext', 3), - ('singlefile', 4), - ('dom', 5), - ('wget', 6), -] - - def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]: """ Get the list of enabled plugins based on config and available hooks. @@ -960,25 +946,6 @@ DEFAULT_TEMPLATES = { ''', } -# Default icons for known extractor plugins (emoji or short HTML) -DEFAULT_PLUGIN_ICONS = { - 'screenshot': '📷', - 'pdf': '📄', - 'singlefile': '📦', - 'dom': '🌐', - 'wget': '📥', - 'media': '🎬', - 'git': '📂', - 'readability': '📖', - 'mercury': '☿️', - 'favicon': '⭐', - 'title': '📝', - 'headers': '📋', - 'archive_org': '🏛️', - 'htmltotext': '📃', - 'warc': '🗄️', -} - def get_plugin_template(plugin: str, template_name: str, fallback: bool = True) -> Optional[str]: """ @@ -1018,10 +985,7 @@ def get_plugin_template(plugin: str, template_name: str, fallback: bool = True) def get_plugin_icon(plugin: str) -> str: """ - Get the icon for a plugin. - - First checks for plugin-provided icon.html template, - then falls back to DEFAULT_PLUGIN_ICONS. + Get the icon for a plugin from its icon.html template. Args: plugin: Plugin name (e.g., 'screenshot', '15_singlefile') @@ -1029,15 +993,13 @@ def get_plugin_icon(plugin: str) -> str: Returns: Icon HTML/emoji string. """ - base_name = get_plugin_name(plugin) - # Try plugin-provided icon template icon_template = get_plugin_template(plugin, 'icon', fallback=False) if icon_template: return icon_template.strip() - # Fall back to default icon - return DEFAULT_PLUGIN_ICONS.get(base_name, '📁') + # Fall back to generic folder icon + return '📁' def get_all_plugin_icons() -> Dict[str, str]: diff --git a/archivebox/plugins/chrome/config.json b/archivebox/plugins/chrome/config.json index 5fc7c32b..56316089 100644 --- a/archivebox/plugins/chrome/config.json +++ b/archivebox/plugins/chrome/config.json @@ -9,10 +9,10 @@ "x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"], "description": "Path to Chrome/Chromium binary" }, - "NODE_BINARY": { + "CHROME_NODE_BINARY": { "type": "string", "default": "node", - "x-aliases": ["NODEJS_BINARY"], + "x-fallback": "NODE_BINARY", "description": "Path to Node.js binary (for Puppeteer)" }, "CHROME_TIMEOUT": { @@ -50,16 +50,19 @@ "x-fallback": "USER_AGENT", "description": "User agent string for Chrome" }, - "CHROME_EXTRA_ARGS": { - "type": "string", - "default": "", - "description": "Extra command-line arguments for Chrome (space-separated)" + "CHROME_ARGS": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["CHROME_DEFAULT_ARGS"], + "description": "Default Chrome command-line arguments" }, - "CHROME_CHECK_SSL_VALIDITY": { - "type": "boolean", - "default": true, - "x-fallback": "CHECK_SSL_VALIDITY", - "description": "Whether to verify SSL certificates" + "CHROME_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["CHROME_EXTRA_ARGS"], + "description": "Extra arguments to append to Chrome command" } } } diff --git a/archivebox/plugins/favicon/config.json b/archivebox/plugins/favicon/config.json index 6be0a26e..4c67e18f 100644 --- a/archivebox/plugins/favicon/config.json +++ b/archivebox/plugins/favicon/config.json @@ -21,12 +21,6 @@ "default": "", "x-fallback": "USER_AGENT", "description": "User agent string" - }, - "FAVICON_CHECK_SSL_VALIDITY": { - "type": "boolean", - "default": true, - "x-fallback": "CHECK_SSL_VALIDITY", - "description": "Whether to verify SSL certificates" } } } diff --git a/archivebox/plugins/forumdl/config.json b/archivebox/plugins/forumdl/config.json index ac26ea37..9e9ea10a 100644 --- a/archivebox/plugins/forumdl/config.json +++ b/archivebox/plugins/forumdl/config.json @@ -27,21 +27,25 @@ "enum": ["jsonl", "warc", "mbox", "maildir", "mh", "mmdf", "babyl"], "description": "Output format for forum downloads" }, - "FORUMDL_TEXTIFY": { - "type": "boolean", - "default": false, - "description": "Convert HTML content to plaintext (keep false to preserve HTML)" - }, "FORUMDL_CHECK_SSL_VALIDITY": { "type": "boolean", "default": true, "x-fallback": "CHECK_SSL_VALIDITY", "description": "Whether to verify SSL certificates" }, - "FORUMDL_EXTRA_ARGS": { - "type": "string", - "default": "", - "description": "Extra arguments for forum-dl (space-separated)" + "FORUMDL_ARGS": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["FORUMDL_DEFAULT_ARGS"], + "description": "Default forum-dl arguments" + }, + "FORUMDL_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["FORUMDL_EXTRA_ARGS"], + "description": "Extra arguments to append to forum-dl command" } } } diff --git a/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py b/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py index 4d0e0f79..3fe7a94a 100755 --- a/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py +++ b/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py @@ -6,19 +6,13 @@ Usage: on_Snapshot__forumdl.py --url= --snapshot-id= Output: Downloads forum content to $PWD/ Environment variables: - FORUMDL_BINARY: Path to forum-dl binary - FORUMDL_TIMEOUT: Timeout in seconds (default: 3600 for large forums) + FORUMDL_ENABLED: Enable forum downloading (default: True) + FORUMDL_BINARY: Path to forum-dl binary (default: forum-dl) + FORUMDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) FORUMDL_OUTPUT_FORMAT: Output format (default: jsonl) - FORUMDL_TEXTIFY: Convert HTML to plaintext (default: False - keeps HTML) - FORUMDL_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True) - FORUMDL_EXTRA_ARGS: Extra arguments for forum-dl (space-separated) - - # Forum-dl feature toggles - SAVE_FORUMDL: Enable forum-dl forum extraction (default: True) - - # Fallback to ARCHIVING_CONFIG values if FORUMDL_* not set: - TIMEOUT: Fallback timeout - CHECK_SSL_VALIDITY: Fallback SSL check + FORUMDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY) + FORUMDL_ARGS: Default forum-dl arguments (JSON array) + FORUMDL_ARGS_EXTRA: Extra arguments to append (JSON array) """ import json @@ -78,6 +72,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]: """ @@ -85,11 +93,11 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ - # Get config from env - timeout = get_env_int('TIMEOUT', 3600) - check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True) - textify = get_env_bool('FORUMDL_TEXTIFY', False) - extra_args = get_env('FORUMDL_EXTRA_ARGS', '') + # Get config from env (with FORUMDL_ prefix, x-fallback handled by config loader) + timeout = get_env_int('FORUMDL_TIMEOUT') or get_env_int('TIMEOUT', 3600) + check_ssl = get_env_bool('FORUMDL_CHECK_SSL_VALIDITY', True) if get_env('FORUMDL_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True) + forumdl_args = get_env_array('FORUMDL_ARGS', []) + forumdl_args_extra = get_env_array('FORUMDL_ARGS_EXTRA', []) output_format = get_env('FORUMDL_OUTPUT_FORMAT', 'jsonl') # Output directory is current directory (hook already runs in output dir) @@ -108,16 +116,13 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]: output_file = output_dir / f'forum.{output_format}' # Build command - cmd = [binary, '-f', output_format, '-o', str(output_file)] - - if textify: - cmd.append('--textify') + cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)] if not check_ssl: cmd.append('--no-check-certificate') - if extra_args: - cmd.extend(extra_args.split()) + if forumdl_args_extra: + cmd.extend(forumdl_args_extra) cmd.append(url) diff --git a/archivebox/plugins/gallerydl/config.json b/archivebox/plugins/gallerydl/config.json index 92dab2cd..522a4b22 100644 --- a/archivebox/plugins/gallerydl/config.json +++ b/archivebox/plugins/gallerydl/config.json @@ -21,6 +21,12 @@ "x-fallback": "TIMEOUT", "description": "Timeout for gallery downloads in seconds" }, + "GALLERYDL_COOKIES_FILE": { + "type": "string", + "default": "", + "x-fallback": "COOKIES_FILE", + "description": "Path to cookies file" + }, "GALLERYDL_CHECK_SSL_VALIDITY": { "type": "boolean", "default": true, @@ -34,12 +40,15 @@ "--write-metadata", "--write-info-json" ], + "x-aliases": ["GALLERYDL_DEFAULT_ARGS"], "description": "Default gallery-dl arguments" }, - "GALLERYDL_EXTRA_ARGS": { - "type": "string", - "default": "", - "description": "Extra arguments for gallery-dl (space-separated)" + "GALLERYDL_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["GALLERYDL_EXTRA_ARGS"], + "description": "Extra arguments to append to gallery-dl command" } } } diff --git a/archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py b/archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py index 86ff868b..78c1128a 100755 --- a/archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py +++ b/archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py @@ -6,20 +6,13 @@ Usage: on_Snapshot__gallerydl.py --url= --snapshot-id= Output: Downloads gallery images to $PWD/gallerydl/ Environment variables: - GALLERYDL_BINARY: Path to gallery-dl binary - GALLERYDL_TIMEOUT: Timeout in seconds (default: 3600 for large galleries) - GALLERYDL_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True) - GALLERYDL_EXTRA_ARGS: Extra arguments for gallery-dl (space-separated) - COOKIES_FILE: Path to cookies file for authentication - - # Gallery-dl feature toggles - USE_GALLERYDL: Enable gallery-dl gallery extraction (default: True) - SAVE_GALLERYDL: Alias for USE_GALLERYDL - - # Fallback to ARCHIVING_CONFIG values if GALLERYDL_* not set: - GALLERYDL_TIMEOUT: Fallback timeout for gallery downloads - TIMEOUT: Fallback timeout - CHECK_SSL_VALIDITY: Fallback SSL check + GALLERYDL_ENABLED: Enable gallery-dl gallery extraction (default: True) + GALLERYDL_BINARY: Path to gallery-dl binary (default: gallery-dl) + GALLERYDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) + GALLERYDL_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE) + GALLERYDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY) + GALLERYDL_ARGS: Default gallery-dl arguments (JSON array) + GALLERYDL_ARGS_EXTRA: Extra arguments to append (JSON array) """ import json @@ -58,6 +51,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + STATICFILE_DIR = '../staticfile' def has_staticfile_output() -> bool: @@ -66,35 +73,27 @@ def has_staticfile_output() -> bool: return staticfile_dir.exists() and any(staticfile_dir.iterdir()) -# Default gallery-dl args -def get_gallerydl_default_args() -> list[str]: - """Build default gallery-dl arguments.""" - return [ - '--write-metadata', - '--write-info-json', - ] - - def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: """ Download gallery using gallery-dl. Returns: (success, output_path, error_message) """ - # Get config from env - timeout = get_env_int('TIMEOUT', 3600) - check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True) - extra_args = get_env('GALLERYDL_EXTRA_ARGS', '') - cookies_file = get_env('COOKIES_FILE', '') + # Get config from env (with GALLERYDL_ prefix, x-fallback handled by config loader) + timeout = get_env_int('GALLERYDL_TIMEOUT') or get_env_int('TIMEOUT', 3600) + check_ssl = get_env_bool('GALLERYDL_CHECK_SSL_VALIDITY', True) if get_env('GALLERYDL_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True) + gallerydl_args = get_env_array('GALLERYDL_ARGS', []) + gallerydl_args_extra = get_env_array('GALLERYDL_ARGS_EXTRA', []) + cookies_file = get_env('GALLERYDL_COOKIES_FILE') or get_env('COOKIES_FILE', '') # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) - # Build command (later options take precedence) + # Build command # Use -D for exact directory (flat structure) instead of -d (nested structure) cmd = [ binary, - *get_gallerydl_default_args(), + *gallerydl_args, '-D', str(output_dir), ] @@ -104,8 +103,8 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: if cookies_file and Path(cookies_file).exists(): cmd.extend(['-C', cookies_file]) - if extra_args: - cmd.extend(extra_args.split()) + if gallerydl_args_extra: + cmd.extend(gallerydl_args_extra) cmd.append(url) diff --git a/archivebox/plugins/git/config.json b/archivebox/plugins/git/config.json index 125cb6ec..da0a3b02 100644 --- a/archivebox/plugins/git/config.json +++ b/archivebox/plugins/git/config.json @@ -26,16 +26,19 @@ "default": "github.com,gitlab.com,bitbucket.org,gist.github.com,codeberg.org,gitea.com,git.sr.ht", "description": "Comma-separated list of domains to treat as git repositories" }, - "GIT_CLONE_DEPTH": { - "type": "integer", - "default": 1, - "minimum": 0, - "description": "Depth of git clone (0 for full history, 1 for shallow)" + "GIT_ARGS": { + "type": "array", + "items": {"type": "string"}, + "default": ["clone", "--depth=1", "--recursive"], + "x-aliases": ["GIT_DEFAULT_ARGS"], + "description": "Default git arguments" }, - "GIT_EXTRA_ARGS": { - "type": "string", - "default": "", - "description": "Extra arguments for git clone" + "GIT_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["GIT_EXTRA_ARGS"], + "description": "Extra arguments to append to git command" } } } diff --git a/archivebox/plugins/git/on_Snapshot__62_git.py b/archivebox/plugins/git/on_Snapshot__62_git.py index 37f6e245..943be861 100644 --- a/archivebox/plugins/git/on_Snapshot__62_git.py +++ b/archivebox/plugins/git/on_Snapshot__62_git.py @@ -8,7 +8,8 @@ Output: Clones repository to $PWD/repo Environment variables: GIT_BINARY: Path to git binary GIT_TIMEOUT: Timeout in seconds (default: 120) - GIT_ARGS: Extra arguments for git clone (space-separated) + GIT_ARGS: Default git arguments (JSON array, default: ["clone", "--depth=1", "--recursive"]) + GIT_ARGS_EXTRA: Extra arguments to append (JSON array, default: []) # Fallback to ARCHIVING_CONFIG values if GIT_* not set: TIMEOUT: Fallback timeout @@ -41,6 +42,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + def is_git_url(url: str) -> bool: """Check if URL looks like a git repository.""" git_patterns = [ @@ -61,19 +76,10 @@ def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ timeout = get_env_int('GIT_TIMEOUT') or get_env_int('TIMEOUT', 120) - extra_args = get_env('GIT_ARGS') + git_args = get_env_array('GIT_ARGS', []) + git_args_extra = get_env_array('GIT_ARGS_EXTRA', []) - cmd = [ - binary, - 'clone', - '--depth=1', - '--recursive', - ] - - if extra_args: - cmd.extend(extra_args.split()) - - cmd.extend([url, OUTPUT_DIR]) + cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR] try: result = subprocess.run(cmd, capture_output=True, timeout=timeout) diff --git a/archivebox/plugins/infiniscroll/config.json b/archivebox/plugins/infiniscroll/config.json new file mode 100644 index 00000000..8f0304ad --- /dev/null +++ b/archivebox/plugins/infiniscroll/config.json @@ -0,0 +1,46 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "additionalProperties": false, + "required_plugins": ["chrome"], + "properties": { + "INFINISCROLL_ENABLED": { + "type": "boolean", + "default": true, + "x-aliases": ["SAVE_INFINISCROLL", "USE_INFINISCROLL"], + "description": "Enable infinite scroll page expansion" + }, + "INFINISCROLL_TIMEOUT": { + "type": "integer", + "default": 120, + "minimum": 10, + "x-fallback": "TIMEOUT", + "description": "Maximum timeout for scrolling in seconds" + }, + "INFINISCROLL_SCROLL_DELAY": { + "type": "integer", + "default": 2000, + "minimum": 500, + "description": "Delay between scrolls in milliseconds" + }, + "INFINISCROLL_SCROLL_DISTANCE": { + "type": "integer", + "default": 1600, + "minimum": 100, + "description": "Distance to scroll per step in pixels" + }, + "INFINISCROLL_SCROLL_LIMIT": { + "type": "integer", + "default": 10, + "minimum": 1, + "maximum": 100, + "description": "Maximum number of scroll steps" + }, + "INFINISCROLL_MIN_HEIGHT": { + "type": "integer", + "default": 16000, + "minimum": 1000, + "description": "Minimum page height to scroll to in pixels" + } + } +} diff --git a/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js b/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js new file mode 100755 index 00000000..905f1c12 --- /dev/null +++ b/archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js @@ -0,0 +1,267 @@ +#!/usr/bin/env node +/** + * Scroll the page down to trigger infinite scroll / lazy loading. + * + * Scrolls down 1 page at a time, up to INFINISCROLL_SCROLL_LIMIT times, + * ensuring at least INFINISCROLL_MIN_HEIGHT (default 16,000px) is reached. + * Stops early if no new content loads after a scroll. + * + * Usage: on_Snapshot__45_infiniscroll.js --url= --snapshot-id= + * Output: JSONL with scroll stats (no files created) + * + * Environment variables: + * INFINISCROLL_ENABLED: Enable/disable (default: true) + * INFINISCROLL_TIMEOUT: Max timeout in seconds (default: 120) + * INFINISCROLL_SCROLL_DELAY: Delay between scrolls in ms (default: 2000) + * INFINISCROLL_SCROLL_DISTANCE: Pixels per scroll (default: 1600) + * INFINISCROLL_SCROLL_LIMIT: Max scroll iterations (default: 10) + * INFINISCROLL_MIN_HEIGHT: Min page height to reach in px (default: 16000) + */ + +function getEnv(name, defaultValue = '') { + return (process.env[name] || defaultValue).trim(); +} + +function getEnvBool(name, defaultValue = false) { + const val = getEnv(name, '').toLowerCase(); + if (['true', '1', 'yes', 'on'].includes(val)) return true; + if (['false', '0', 'no', 'off'].includes(val)) return false; + return defaultValue; +} + +function getEnvInt(name, defaultValue = 0) { + const val = parseInt(getEnv(name, String(defaultValue)), 10); + return isNaN(val) ? defaultValue : val; +} + +// Check if infiniscroll is enabled BEFORE requiring puppeteer +if (!getEnvBool('INFINISCROLL_ENABLED', true)) { + console.error('Skipping infiniscroll (INFINISCROLL_ENABLED=False)'); + process.exit(0); +} + +const fs = require('fs'); +const path = require('path'); +const puppeteer = require('puppeteer-core'); + +const PLUGIN_NAME = 'infiniscroll'; +const CHROME_SESSION_DIR = '../chrome'; + +function parseArgs() { + const args = {}; + process.argv.slice(2).forEach(arg => { + if (arg.startsWith('--')) { + const [key, ...valueParts] = arg.slice(2).split('='); + args[key.replace(/-/g, '_')] = valueParts.join('=') || true; + } + }); + return args; +} + +function getCdpUrl() { + const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); + if (fs.existsSync(cdpFile)) { + return fs.readFileSync(cdpFile, 'utf8').trim(); + } + return null; +} + +function getPageId() { + const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt'); + if (fs.existsSync(targetIdFile)) { + return fs.readFileSync(targetIdFile, 'utf8').trim(); + } + return null; +} + +async function waitForChromeTabLoaded(timeoutMs = 60000) { + const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json'); + const startTime = Date.now(); + + while (Date.now() - startTime < timeoutMs) { + if (fs.existsSync(navigationFile)) { + return true; + } + await new Promise(resolve => setTimeout(resolve, 100)); + } + return false; +} + +function sleep(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); +} + +async function scrollDown(page, options = {}) { + const { + timeout = 120000, + scrollDelay = 2000, + scrollDistance = 1600, + scrollLimit = 10, + minHeight = 16000, + } = options; + + const startTime = Date.now(); + const startingHeight = await page.evaluate(() => document.body.scrollHeight); + let lastHeight = startingHeight; + let scrollCount = 0; + let scrollPosition = 0; + + // Scroll to top first + await page.evaluate(() => { + window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); + }); + await sleep(500); + + while (scrollCount < scrollLimit) { + // Check timeout + const elapsed = Date.now() - startTime; + if (elapsed >= timeout) { + console.error(`Timeout reached after ${scrollCount} scrolls`); + break; + } + + scrollPosition = (scrollCount + 1) * scrollDistance; + console.error(`Scrolling down ${scrollCount + 1}x ${scrollDistance}px... (${scrollPosition}/${lastHeight})`); + + await page.evaluate((yOffset) => { + window.scrollTo({ top: yOffset, left: 0, behavior: 'smooth' }); + }, scrollPosition); + + scrollCount++; + await sleep(scrollDelay); + + // Check if new content was added (infinite scroll detection) + const newHeight = await page.evaluate(() => document.body.scrollHeight); + const addedPx = newHeight - lastHeight; + + if (addedPx > 0) { + console.error(`Detected infini-scrolling: ${lastHeight}+${addedPx} => ${newHeight}`); + } else if (scrollPosition >= newHeight + scrollDistance) { + // Reached the bottom + if (scrollCount > 2) { + console.error(`Reached bottom of page at ${newHeight}px`); + break; + } + } + + lastHeight = newHeight; + + // Check if we've reached minimum height and can stop + if (lastHeight >= minHeight && scrollPosition >= lastHeight) { + console.error(`Reached minimum height target (${minHeight}px)`); + break; + } + } + + // Scroll to absolute bottom + if (scrollPosition < lastHeight) { + await page.evaluate(() => { + window.scrollTo({ top: document.body.scrollHeight, left: 0, behavior: 'smooth' }); + }); + await sleep(scrollDelay); + } + + // Scroll back to top + console.error(`Reached bottom of page at ${lastHeight}px, scrolling back to top...`); + await page.evaluate(() => { + window.scrollTo({ top: 0, left: 0, behavior: 'smooth' }); + }); + await sleep(scrollDelay); + + const totalElapsed = Date.now() - startTime; + + return { + scrollCount, + finalHeight: lastHeight, + startingHeight, + elapsedMs: totalElapsed, + }; +} + +async function main() { + const args = parseArgs(); + const url = args.url; + const snapshotId = args.snapshot_id; + + if (!url || !snapshotId) { + console.error('Usage: on_Snapshot__45_infiniscroll.js --url= --snapshot-id='); + process.exit(1); + } + + const timeout = getEnvInt('INFINISCROLL_TIMEOUT', 120) * 1000; + const scrollDelay = getEnvInt('INFINISCROLL_SCROLL_DELAY', 2000); + const scrollDistance = getEnvInt('INFINISCROLL_SCROLL_DISTANCE', 1600); + const scrollLimit = getEnvInt('INFINISCROLL_SCROLL_LIMIT', 10); + const minHeight = getEnvInt('INFINISCROLL_MIN_HEIGHT', 16000); + + const cdpUrl = getCdpUrl(); + if (!cdpUrl) { + console.error('ERROR: Chrome CDP URL not found (chrome plugin must run first)'); + process.exit(1); + } + + // Wait for page to be loaded + const pageLoaded = await waitForChromeTabLoaded(60000); + if (!pageLoaded) { + console.error('ERROR: Page not loaded after 60s (chrome_navigate must complete first)'); + process.exit(1); + } + + let browser = null; + try { + browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl }); + + const pages = await browser.pages(); + if (pages.length === 0) { + throw new Error('No pages found in browser'); + } + + // Find the right page by target ID + const targetId = getPageId(); + let page = null; + if (targetId) { + page = pages.find(p => { + const target = p.target(); + return target && target._targetId === targetId; + }); + } + if (!page) { + page = pages[pages.length - 1]; + } + + console.error(`Starting infinite scroll on ${url}`); + const result = await scrollDown(page, { + timeout, + scrollDelay, + scrollDistance, + scrollLimit, + minHeight, + }); + + browser.disconnect(); + + const elapsedSec = (result.elapsedMs / 1000).toFixed(1); + const finalHeightStr = result.finalHeight.toLocaleString(); + const addedHeight = result.finalHeight - result.startingHeight; + const addedStr = addedHeight > 0 ? `+${addedHeight.toLocaleString()}px new content` : 'no new content'; + const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}) over ${elapsedSec}s`; + + console.error(`Success: ${outputStr}`); + console.log(JSON.stringify({ + type: 'ArchiveResult', + status: 'succeeded', + output_str: outputStr, + })); + process.exit(0); + + } catch (e) { + if (browser) browser.disconnect(); + console.error(`ERROR: ${e.name}: ${e.message}`); + process.exit(1); + } +} + +main().catch(e => { + console.error(`Fatal error: ${e.message}`); + process.exit(1); +}); diff --git a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py new file mode 100644 index 00000000..7a178958 --- /dev/null +++ b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py @@ -0,0 +1,352 @@ +""" +Integration tests for infiniscroll plugin + +Tests verify: +1. Hook script exists +2. Dependencies installed via chrome validation hooks +3. Verify deps with abx-pkg +4. INFINISCROLL_ENABLED=False skips without JSONL +5. Fails gracefully when no chrome session exists +6. Full integration test: scrolls page and outputs stats +7. Config options work (scroll limit, min height) +""" + +import json +import os +import re +import signal +import subprocess +import time +import tempfile +from pathlib import Path + +import pytest + + +PLUGIN_DIR = Path(__file__).parent.parent +PLUGINS_ROOT = PLUGIN_DIR.parent +INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None) +CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js' +CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js' +CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None) +TEST_URL = 'https://www.singsing.movie/' + + +def get_node_modules_dir(): + """Get NODE_MODULES_DIR for tests, checking env first.""" + # Check if NODE_PATH is already set in environment + if os.environ.get('NODE_PATH'): + return Path(os.environ['NODE_PATH']) + # Otherwise compute from LIB_DIR + from archivebox.config.common import STORAGE_CONFIG + lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR)) + return lib_dir / 'npm' / 'node_modules' + + +NODE_MODULES_DIR = get_node_modules_dir() + + +def get_test_env(): + """Get environment with NODE_PATH set correctly.""" + env = os.environ.copy() + env['NODE_PATH'] = str(NODE_MODULES_DIR) + return env + + +def test_hook_script_exists(): + """Verify on_Snapshot hook exists.""" + assert INFINISCROLL_HOOK is not None, "Infiniscroll hook not found" + assert INFINISCROLL_HOOK.exists(), f"Hook not found: {INFINISCROLL_HOOK}" + + +def test_verify_deps_with_abx_pkg(): + """Verify dependencies are available via abx-pkg after hook installation.""" + from abx_pkg import Binary, EnvProvider, BinProviderOverrides + + EnvProvider.model_rebuild() + + # Verify node is available + node_binary = Binary(name='node', binproviders=[EnvProvider()]) + node_loaded = node_binary.load() + assert node_loaded and node_loaded.abspath, "Node.js required for infiniscroll plugin" + + +def test_config_infiniscroll_disabled_skips(): + """Test that INFINISCROLL_ENABLED=False exits without emitting JSONL.""" + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + env = get_test_env() + env['INFINISCROLL_ENABLED'] = 'False' + + result = subprocess.run( + ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + timeout=30 + ) + + assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}" + assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr" + + # Should NOT emit any JSONL + jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] + assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, got: {jsonl_lines}" + + +def test_fails_gracefully_without_chrome_session(): + """Test that hook fails gracefully when no chrome session exists.""" + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + result = subprocess.run( + ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'], + cwd=tmpdir, + capture_output=True, + text=True, + env=get_test_env(), + timeout=30 + ) + + # Should fail (exit 1) when no chrome session + assert result.returncode != 0, "Should fail when no chrome session exists" + # Error could be about chrome/CDP not found, or puppeteer module missing + err_lower = result.stderr.lower() + assert any(x in err_lower for x in ['chrome', 'cdp', 'puppeteer', 'module']), \ + f"Should mention chrome/CDP/puppeteer in error: {result.stderr}" + + +def setup_chrome_session(tmpdir): + """Helper to set up Chrome session with tab and navigation.""" + crawl_dir = Path(tmpdir) / 'crawl' + crawl_dir.mkdir() + chrome_dir = crawl_dir / 'chrome' + + env = get_test_env() + env['CHROME_HEADLESS'] = 'true' + + # Launch Chrome at crawl level + chrome_launch_process = subprocess.Popen( + ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-infiniscroll'], + cwd=str(crawl_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) + + # Wait for Chrome to launch + for i in range(15): + if chrome_launch_process.poll() is not None: + stdout, stderr = chrome_launch_process.communicate() + raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}") + if (chrome_dir / 'cdp_url.txt').exists(): + break + time.sleep(1) + + if not (chrome_dir / 'cdp_url.txt').exists(): + raise RuntimeError("Chrome CDP URL not found after 15s") + + chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) + + # Create snapshot directory structure + snapshot_dir = Path(tmpdir) / 'snapshot' + snapshot_dir.mkdir() + snapshot_chrome_dir = snapshot_dir / 'chrome' + snapshot_chrome_dir.mkdir() + + # Create tab + tab_env = env.copy() + tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) + result = subprocess.run( + ['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll', '--crawl-id=test-infiniscroll'], + cwd=str(snapshot_chrome_dir), + capture_output=True, + text=True, + timeout=60, + env=tab_env + ) + if result.returncode != 0: + raise RuntimeError(f"Tab creation failed: {result.stderr}") + + # Navigate to URL + result = subprocess.run( + ['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'], + cwd=str(snapshot_chrome_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + if result.returncode != 0: + raise RuntimeError(f"Navigation failed: {result.stderr}") + + return chrome_launch_process, chrome_pid, snapshot_chrome_dir + + +def cleanup_chrome(chrome_launch_process, chrome_pid): + """Helper to clean up Chrome processes.""" + try: + chrome_launch_process.send_signal(signal.SIGTERM) + chrome_launch_process.wait(timeout=5) + except: + pass + try: + os.kill(chrome_pid, signal.SIGKILL) + except OSError: + pass + + +def test_scrolls_page_and_outputs_stats(): + """Integration test: scroll page and verify JSONL output format.""" + with tempfile.TemporaryDirectory() as tmpdir: + chrome_launch_process = None + chrome_pid = None + try: + chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir) + + # Create infiniscroll output directory (sibling to chrome) + infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll' + infiniscroll_dir.mkdir() + + # Run infiniscroll hook + env = get_test_env() + env['INFINISCROLL_SCROLL_LIMIT'] = '3' # Limit scrolls for faster test + env['INFINISCROLL_SCROLL_DELAY'] = '500' # Faster scrolling + env['INFINISCROLL_MIN_HEIGHT'] = '1000' # Lower threshold for test + + result = subprocess.run( + ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'], + cwd=str(infiniscroll_dir), + capture_output=True, + text=True, + timeout=60, + env=env + ) + + assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}\nStdout: {result.stdout}" + + # Parse JSONL output + result_json = None + for line in result.stdout.strip().split('\n'): + line = line.strip() + if line.startswith('{'): + try: + record = json.loads(line) + if record.get('type') == 'ArchiveResult': + result_json = record + break + except json.JSONDecodeError: + pass + + assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {result.stdout}" + assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" + + # Verify output_str format: "scrolled to X,XXXpx (+Y,YYYpx new content) over Z.Zs" + output_str = result_json.get('output_str', '') + assert output_str.startswith('scrolled to'), f"output_str should start with 'scrolled to': {output_str}" + assert 'px' in output_str, f"output_str should contain pixel count: {output_str}" + assert re.search(r'over \d+(\.\d+)?s', output_str), f"output_str should contain duration: {output_str}" + + # Verify no files created in output directory + output_files = list(infiniscroll_dir.iterdir()) + assert len(output_files) == 0, f"Should not create any files, but found: {output_files}" + + finally: + if chrome_launch_process and chrome_pid: + cleanup_chrome(chrome_launch_process, chrome_pid) + + +def test_config_scroll_limit_honored(): + """Test that INFINISCROLL_SCROLL_LIMIT config is respected.""" + with tempfile.TemporaryDirectory() as tmpdir: + chrome_launch_process = None + chrome_pid = None + try: + chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir) + + infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll' + infiniscroll_dir.mkdir() + + # Set scroll limit to 2 + env = get_test_env() + env['INFINISCROLL_SCROLL_LIMIT'] = '2' + env['INFINISCROLL_SCROLL_DELAY'] = '500' + env['INFINISCROLL_MIN_HEIGHT'] = '100000' # High threshold so limit kicks in + + result = subprocess.run( + ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'], + cwd=str(infiniscroll_dir), + capture_output=True, + text=True, + timeout=60, + env=env + ) + + assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}" + + # Parse output and verify scroll count + result_json = None + for line in result.stdout.strip().split('\n'): + if line.strip().startswith('{'): + try: + record = json.loads(line) + if record.get('type') == 'ArchiveResult': + result_json = record + break + except json.JSONDecodeError: + pass + + assert result_json is not None, "Should have JSONL output" + output_str = result_json.get('output_str', '') + + # Verify output format and that it completed (scroll limit enforced internally) + assert output_str.startswith('scrolled to'), f"Should have valid output_str: {output_str}" + assert result_json['status'] == 'succeeded', f"Should succeed with scroll limit: {result_json}" + + finally: + if chrome_launch_process and chrome_pid: + cleanup_chrome(chrome_launch_process, chrome_pid) + + +def test_config_timeout_honored(): + """Test that INFINISCROLL_TIMEOUT config is respected.""" + with tempfile.TemporaryDirectory() as tmpdir: + chrome_launch_process = None + chrome_pid = None + try: + chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir) + + infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll' + infiniscroll_dir.mkdir() + + # Set very short timeout + env = get_test_env() + env['INFINISCROLL_TIMEOUT'] = '3' # 3 seconds + env['INFINISCROLL_SCROLL_DELAY'] = '2000' # 2s delay - timeout should trigger + env['INFINISCROLL_SCROLL_LIMIT'] = '100' # High limit + env['INFINISCROLL_MIN_HEIGHT'] = '100000' + + start_time = time.time() + result = subprocess.run( + ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'], + cwd=str(infiniscroll_dir), + capture_output=True, + text=True, + timeout=30, + env=env + ) + elapsed = time.time() - start_time + + # Should complete within reasonable time (timeout + buffer) + assert elapsed < 15, f"Should respect timeout, took {elapsed:.1f}s" + assert result.returncode == 0, f"Should complete even with timeout: {result.stderr}" + + finally: + if chrome_launch_process and chrome_pid: + cleanup_chrome(chrome_launch_process, chrome_pid) + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/mercury/config.json b/archivebox/plugins/mercury/config.json index 184f3efc..039c38a7 100644 --- a/archivebox/plugins/mercury/config.json +++ b/archivebox/plugins/mercury/config.json @@ -15,17 +15,26 @@ "x-aliases": ["POSTLIGHT_PARSER_BINARY"], "description": "Path to Mercury/Postlight parser binary" }, - "NODE_BINARY": { - "type": "string", - "default": "node", - "description": "Path to Node.js binary" - }, "MERCURY_TIMEOUT": { "type": "integer", "default": 30, "minimum": 5, "x-fallback": "TIMEOUT", "description": "Timeout for Mercury in seconds" + }, + "MERCURY_ARGS": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["MERCURY_DEFAULT_ARGS"], + "description": "Default Mercury parser arguments" + }, + "MERCURY_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["MERCURY_EXTRA_ARGS"], + "description": "Extra arguments to append to Mercury parser command" } } } diff --git a/archivebox/plugins/mercury/on_Snapshot__56_mercury.py b/archivebox/plugins/mercury/on_Snapshot__56_mercury.py index 4c182137..5b710711 100644 --- a/archivebox/plugins/mercury/on_Snapshot__56_mercury.py +++ b/archivebox/plugins/mercury/on_Snapshot__56_mercury.py @@ -8,8 +8,8 @@ Output: Creates mercury/ directory with content.html, content.txt, article.json Environment variables: MERCURY_BINARY: Path to postlight-parser binary MERCURY_TIMEOUT: Timeout in seconds (default: 60) - - # Fallback to ARCHIVING_CONFIG values if MERCURY_* not set: + MERCURY_ARGS: Default Mercury arguments (JSON array) + MERCURY_ARGS_EXTRA: Extra arguments to append (JSON array) TIMEOUT: Fallback timeout Note: Requires postlight-parser: npm install -g @postlight/parser @@ -51,6 +51,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]: """ Extract article using Mercury Parser. @@ -58,13 +72,15 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ timeout = get_env_int('MERCURY_TIMEOUT') or get_env_int('TIMEOUT', 60) + mercury_args = get_env_array('MERCURY_ARGS', []) + mercury_args_extra = get_env_array('MERCURY_ARGS_EXTRA', []) # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) try: # Get text version - cmd_text = [binary, url, '--format=text'] + cmd_text = [binary, *mercury_args, *mercury_args_extra, url, '--format=text'] result_text = subprocess.run(cmd_text, capture_output=True, timeout=timeout) if result_text.returncode != 0: @@ -84,7 +100,7 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]: (output_dir / 'content.txt').write_text(text_content, encoding='utf-8') # Get HTML version - cmd_html = [binary, url, '--format=html'] + cmd_html = [binary, *mercury_args, *mercury_args_extra, url, '--format=html'] result_html = subprocess.run(cmd_html, capture_output=True, timeout=timeout) try: diff --git a/archivebox/plugins/papersdl/config.json b/archivebox/plugins/papersdl/config.json index 4d96d3bd..2c6eb342 100644 --- a/archivebox/plugins/papersdl/config.json +++ b/archivebox/plugins/papersdl/config.json @@ -21,10 +21,19 @@ "x-fallback": "TIMEOUT", "description": "Timeout for paper downloads in seconds" }, - "PAPERSDL_EXTRA_ARGS": { - "type": "string", - "default": "", - "description": "Extra arguments for papers-dl (space-separated)" + "PAPERSDL_ARGS": { + "type": "array", + "items": {"type": "string"}, + "default": ["fetch"], + "x-aliases": ["PAPERSDL_DEFAULT_ARGS"], + "description": "Default papers-dl arguments" + }, + "PAPERSDL_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["PAPERSDL_EXTRA_ARGS"], + "description": "Extra arguments to append to papers-dl command" } } } diff --git a/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py b/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py index a75dc4ea..859d911e 100755 --- a/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py +++ b/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py @@ -8,7 +8,8 @@ Output: Downloads paper PDFs to $PWD/ Environment variables: PAPERSDL_BINARY: Path to papers-dl binary PAPERSDL_TIMEOUT: Timeout in seconds (default: 300 for paper downloads) - PAPERSDL_EXTRA_ARGS: Extra arguments for papers-dl (space-separated) + PAPERSDL_ARGS: Default papers-dl arguments (JSON array, default: ["fetch"]) + PAPERSDL_ARGS_EXTRA: Extra arguments to append (JSON array) # papers-dl feature toggles SAVE_PAPERSDL: Enable papers-dl paper extraction (default: True) @@ -54,6 +55,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + def extract_doi_from_url(url: str) -> str | None: """Extract DOI from common paper URLs.""" # Match DOI pattern in URL @@ -72,7 +87,8 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]: """ # Get config from env timeout = get_env_int('TIMEOUT', 300) - extra_args = get_env('PAPERSDL_EXTRA_ARGS', '') + papersdl_args = get_env_array('PAPERSDL_ARGS', []) + papersdl_args_extra = get_env_array('PAPERSDL_ARGS_EXTRA', []) # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) @@ -85,11 +101,11 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]: else: identifier = doi - # Build command - papers-dl fetch -o - cmd = [binary, 'fetch', identifier, '-o', str(output_dir)] + # Build command - papers-dl -o + cmd = [binary, *papersdl_args, identifier, '-o', str(output_dir)] - if extra_args: - cmd.extend(extra_args.split()) + if papersdl_args_extra: + cmd.extend(papersdl_args_extra) try: result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True) diff --git a/archivebox/plugins/readability/config.json b/archivebox/plugins/readability/config.json index b6db094c..90173047 100644 --- a/archivebox/plugins/readability/config.json +++ b/archivebox/plugins/readability/config.json @@ -14,17 +14,26 @@ "default": "readability-extractor", "description": "Path to readability-extractor binary" }, - "NODE_BINARY": { - "type": "string", - "default": "node", - "description": "Path to Node.js binary" - }, "READABILITY_TIMEOUT": { "type": "integer", "default": 30, "minimum": 5, "x-fallback": "TIMEOUT", "description": "Timeout for Readability in seconds" + }, + "READABILITY_ARGS": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["READABILITY_DEFAULT_ARGS"], + "description": "Default Readability arguments" + }, + "READABILITY_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["READABILITY_EXTRA_ARGS"], + "description": "Extra arguments to append to Readability command" } } } diff --git a/archivebox/plugins/readability/on_Snapshot__55_readability.py b/archivebox/plugins/readability/on_Snapshot__55_readability.py index 41970437..2777479a 100644 --- a/archivebox/plugins/readability/on_Snapshot__55_readability.py +++ b/archivebox/plugins/readability/on_Snapshot__55_readability.py @@ -8,8 +8,8 @@ Output: Creates readability/ directory with content.html, content.txt, article.j Environment variables: READABILITY_BINARY: Path to readability-extractor binary READABILITY_TIMEOUT: Timeout in seconds (default: 60) - - # Fallback to ARCHIVING_CONFIG values if READABILITY_* not set: + READABILITY_ARGS: Default Readability arguments (JSON array) + READABILITY_ARGS_EXTRA: Extra arguments to append (JSON array) TIMEOUT: Fallback timeout Note: Requires readability-extractor from https://github.com/ArchiveBox/readability-extractor @@ -44,6 +44,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + def find_html_source() -> str | None: """Find HTML content from other extractors in the snapshot directory.""" # Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories @@ -73,6 +87,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ timeout = get_env_int('READABILITY_TIMEOUT') or get_env_int('TIMEOUT', 60) + readability_args = get_env_array('READABILITY_ARGS', []) + readability_args_extra = get_env_array('READABILITY_ARGS_EXTRA', []) # Find HTML source html_source = find_html_source() @@ -84,7 +100,7 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]: try: # Run readability-extractor (outputs JSON by default) - cmd = [binary, html_source] + cmd = [binary, *readability_args, *readability_args_extra, html_source] result = subprocess.run(cmd, capture_output=True, timeout=timeout) if result.returncode != 0: diff --git a/archivebox/plugins/search_backend_ripgrep/config.json b/archivebox/plugins/search_backend_ripgrep/config.json index 0753c938..49c5c885 100644 --- a/archivebox/plugins/search_backend_ripgrep/config.json +++ b/archivebox/plugins/search_backend_ripgrep/config.json @@ -3,25 +3,32 @@ "type": "object", "additionalProperties": false, "properties": { - "SEARCH_BACKEND_RIPGREP_BINARY": { + "RIPGREP_BINARY": { "type": "string", "default": "rg", - "x-aliases": ["RIPGREP_BINARY"], "description": "Path to ripgrep binary" }, - "SEARCH_BACKEND_RIPGREP_IGNORE_EXTENSIONS": { - "type": "string", - "default": "css,js,orig,svg", - "x-aliases": ["RIPGREP_IGNORE_EXTENSIONS"], - "description": "Comma-separated file extensions to ignore" - }, - "SEARCH_BACKEND_RIPGREP_TIMEOUT": { + "RIPGREP_TIMEOUT": { "type": "integer", "default": 90, "minimum": 5, "x-fallback": "TIMEOUT", "x-aliases": ["SEARCH_BACKEND_TIMEOUT"], "description": "Search timeout in seconds" + }, + "RIPGREP_ARGS": { + "type": "array", + "items": { "type": "string" }, + "default": ["--files-with-matches", "--no-messages", "--ignore-case"], + "x-aliases": ["RIPGREP_DEFAULT_ARGS"], + "description": "Default ripgrep arguments" + }, + "RIPGREP_ARGS_EXTRA": { + "type": "array", + "items": { "type": "string" }, + "default": [], + "x-aliases": ["RIPGREP_EXTRA_ARGS"], + "description": "Extra arguments to append to ripgrep command" } } } diff --git a/archivebox/plugins/search_backend_ripgrep/search.py b/archivebox/plugins/search_backend_ripgrep/search.py index 135b392e..140a32d1 100644 --- a/archivebox/plugins/search_backend_ripgrep/search.py +++ b/archivebox/plugins/search_backend_ripgrep/search.py @@ -6,10 +6,12 @@ using ripgrep (rg). This is simpler but slower for large archives. Environment variables: RIPGREP_BINARY: Path to ripgrep binary (default: rg) - RIPGREP_IGNORE_EXTENSIONS: Comma-separated extensions to ignore (default: css,js,orig,svg) - SEARCH_BACKEND_TIMEOUT: Search timeout in seconds (default: 90) + RIPGREP_ARGS: Default ripgrep arguments (JSON array) + RIPGREP_ARGS_EXTRA: Extra arguments to append (JSON array) + RIPGREP_TIMEOUT: Search timeout in seconds (default: 90) """ +import json import os import subprocess import shutil @@ -19,39 +21,57 @@ from typing import List, Iterable from django.conf import settings -# Config with old var names for backwards compatibility -RIPGREP_BINARY = os.environ.get('RIPGREP_BINARY', 'rg').strip() -RIPGREP_IGNORE_EXTENSIONS = os.environ.get('RIPGREP_IGNORE_EXTENSIONS', 'css,js,orig,svg').strip() -SEARCH_BACKEND_TIMEOUT = int(os.environ.get('SEARCH_BACKEND_TIMEOUT', '90')) +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + + +def get_env_int(name: str, default: int = 0) -> int: + try: + return int(get_env(name, str(default))) + except ValueError: + return default + + +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] def search(query: str) -> List[str]: """Search for snapshots using ripgrep.""" - rg_binary = shutil.which(RIPGREP_BINARY) or RIPGREP_BINARY + rg_binary = get_env('RIPGREP_BINARY', 'rg') + rg_binary = shutil.which(rg_binary) or rg_binary if not rg_binary or not Path(rg_binary).exists(): - raise RuntimeError(f'ripgrep binary not found ({RIPGREP_BINARY}). Install with: apt install ripgrep') + raise RuntimeError(f'ripgrep binary not found. Install with: apt install ripgrep') + + timeout = get_env_int('RIPGREP_TIMEOUT', 90) + ripgrep_args = get_env_array('RIPGREP_ARGS', []) + ripgrep_args_extra = get_env_array('RIPGREP_ARGS_EXTRA', []) archive_dir = Path(settings.ARCHIVE_DIR) if not archive_dir.exists(): return [] - # Build ignore pattern from config - ignore_pattern = f'*.{{{RIPGREP_IGNORE_EXTENSIONS}}}' - cmd = [ rg_binary, - f'--type-add=ignore:{ignore_pattern}', - '--type-not=ignore', - '--files-with-matches', - '--no-messages', - '--ignore-case', + *ripgrep_args, + *ripgrep_args_extra, '--regexp', query, str(archive_dir), ] try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=SEARCH_BACKEND_TIMEOUT) + result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) # Extract snapshot IDs from file paths # Paths look like: archive///file.txt diff --git a/archivebox/plugins/singlefile/config.json b/archivebox/plugins/singlefile/config.json index ddfec833..ee546272 100644 --- a/archivebox/plugins/singlefile/config.json +++ b/archivebox/plugins/singlefile/config.json @@ -15,11 +15,18 @@ "x-aliases": ["SINGLE_FILE_BINARY"], "description": "Path to single-file binary" }, - "NODE_BINARY": { + "SINGLEFILE_NODE_BINARY": { "type": "string", "default": "node", + "x-fallback": "NODE_BINARY", "description": "Path to Node.js binary" }, + "SINGLEFILE_CHROME_BINARY": { + "type": "string", + "default": "", + "x-fallback": "CHROME_BINARY", + "description": "Path to Chrome/Chromium binary" + }, "SINGLEFILE_TIMEOUT": { "type": "integer", "default": 60, @@ -39,16 +46,25 @@ "x-fallback": "COOKIES_FILE", "description": "Path to cookies file" }, + "SINGLEFILE_CHECK_SSL_VALIDITY": { + "type": "boolean", + "default": true, + "x-fallback": "CHECK_SSL_VALIDITY", + "description": "Whether to verify SSL certificates" + }, "SINGLEFILE_ARGS": { "type": "array", "items": {"type": "string"}, - "default": [], + "default": ["--browser-headless"], + "x-aliases": ["SINGLEFILE_DEFAULT_ARGS"], "description": "Default single-file arguments" }, - "SINGLEFILE_EXTRA_ARGS": { - "type": "string", - "default": "", - "description": "Extra arguments for single-file" + "SINGLEFILE_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["SINGLEFILE_EXTRA_ARGS"], + "description": "Extra arguments to append to single-file command" } } } diff --git a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py index cfda31aa..c7dc1686 100644 --- a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py +++ b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py @@ -6,24 +6,16 @@ Usage: on_Snapshot__singlefile.py --url= --snapshot-id= Output: Writes singlefile.html to $PWD Environment variables: - SINGLEFILE_BINARY: Path to SingleFile binary - SINGLEFILE_TIMEOUT: Timeout in seconds (default: 120) - SINGLEFILE_USER_AGENT: User agent string (optional) - SINGLEFILE_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True) - SINGLEFILE_COOKIES_FILE: Path to cookies file (optional) - SINGLEFILE_EXTRA_ARGS: Extra arguments for SingleFile (space-separated) - - # Feature toggle - SAVE_SINGLEFILE: Enable SingleFile archiving (default: True) - - # Chrome binary (SingleFile needs Chrome) - CHROME_BINARY: Path to Chrome/Chromium binary - - # Fallback to ARCHIVING_CONFIG values if SINGLEFILE_* not set: - TIMEOUT: Fallback timeout - USER_AGENT: Fallback user agent - CHECK_SSL_VALIDITY: Fallback SSL check - COOKIES_FILE: Fallback cookies file + SINGLEFILE_ENABLED: Enable SingleFile archiving (default: True) + SINGLEFILE_BINARY: Path to SingleFile binary (default: single-file) + SINGLEFILE_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY) + SINGLEFILE_CHROME_BINARY: Path to Chrome binary (x-fallback: CHROME_BINARY) + SINGLEFILE_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) + SINGLEFILE_USER_AGENT: User agent string (x-fallback: USER_AGENT) + SINGLEFILE_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE) + SINGLEFILE_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY) + SINGLEFILE_ARGS: Default SingleFile arguments (JSON array) + SINGLEFILE_ARGS_EXTRA: Extra arguments to append (JSON array) """ import json @@ -63,6 +55,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + STATICFILE_DIR = '../staticfile' def has_staticfile_output() -> bool: @@ -121,15 +127,16 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ - # Get config from env (with SINGLEFILE_ prefix or fallback to ARCHIVING_CONFIG style) + # Get config from env (with SINGLEFILE_ prefix, x-fallback handled by config loader) timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120) user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '') - check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True)) + check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', True) if get_env('SINGLEFILE_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True) cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '') - extra_args = get_env('SINGLEFILE_EXTRA_ARGS', '') - chrome = get_env('CHROME_BINARY', '') + singlefile_args = get_env_array('SINGLEFILE_ARGS', []) + singlefile_args_extra = get_env_array('SINGLEFILE_ARGS_EXTRA', []) + chrome = get_env('SINGLEFILE_CHROME_BINARY') or get_env('CHROME_BINARY', '') - cmd = [binary] + cmd = [binary, *singlefile_args] # Try to use existing Chrome session via CDP cdp_url = get_cdp_url() @@ -142,11 +149,6 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: elif chrome: cmd.extend(['--browser-executable-path', chrome]) - # Common options - cmd.extend([ - '--browser-headless', - ]) - # SSL handling if not check_ssl: cmd.append('--browser-ignore-insecure-certs') @@ -157,8 +159,9 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: if cookies_file and Path(cookies_file).is_file(): cmd.extend(['--browser-cookies-file', cookies_file]) - if extra_args: - cmd.extend(extra_args.split()) + # Add extra args from config + if singlefile_args_extra: + cmd.extend(singlefile_args_extra) # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) diff --git a/archivebox/plugins/wget/config.json b/archivebox/plugins/wget/config.json index 968791ac..70893612 100644 --- a/archivebox/plugins/wget/config.json +++ b/archivebox/plugins/wget/config.json @@ -9,18 +9,12 @@ "x-aliases": ["SAVE_WGET", "USE_WGET"], "description": "Enable wget archiving" }, - "WGET_SAVE_WARC": { + "WGET_WARC_ENABLED": { "type": "boolean", "default": true, - "x-aliases": ["SAVE_WARC"], + "x-aliases": ["SAVE_WARC", "WGET_SAVE_WARC"], "description": "Save WARC archive file" }, - "WGET_SAVE_REQUISITES": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_WGET_REQUISITES"], - "description": "Download page requisites (CSS, JS, images)" - }, "WGET_BINARY": { "type": "string", "default": "wget", @@ -39,25 +33,17 @@ "x-fallback": "USER_AGENT", "description": "User agent string for wget" }, - "WGET_CHECK_SSL_VALIDITY": { - "type": "boolean", - "default": true, - "x-fallback": "CHECK_SSL_VALIDITY", - "x-aliases": ["CHECK_SSL_VALIDITY"], - "description": "Whether to verify SSL certificates" - }, "WGET_COOKIES_FILE": { "type": "string", "default": "", "x-fallback": "COOKIES_FILE", "description": "Path to cookies file" }, - "WGET_RESTRICT_FILE_NAMES": { - "type": "string", - "default": "windows", - "enum": ["windows", "unix", "ascii", "nocontrol", "lowercase", "uppercase"], - "x-fallback": "RESTRICT_FILE_NAMES", - "description": "Filename restriction mode" + "WGET_CHECK_SSL_VALIDITY": { + "type": "boolean", + "default": true, + "x-fallback": "CHECK_SSL_VALIDITY", + "description": "Whether to verify SSL certificates" }, "WGET_ARGS": { "type": "array", @@ -70,14 +56,20 @@ "--backup-converted", "--span-hosts", "--no-parent", + "--page-requisites", + "--restrict-file-names=windows", + "--tries=2", "-e", "robots=off" ], + "x-aliases": ["WGET_DEFAULT_ARGS"], "description": "Default wget arguments" }, - "WGET_EXTRA_ARGS": { - "type": "string", - "default": "", - "description": "Extra arguments for wget (space-separated)" + "WGET_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["WGET_EXTRA_ARGS"], + "description": "Extra arguments to append to wget command" } } } diff --git a/archivebox/plugins/wget/on_Snapshot__61_wget.py b/archivebox/plugins/wget/on_Snapshot__61_wget.py index b605ea6c..8d4372d5 100644 --- a/archivebox/plugins/wget/on_Snapshot__61_wget.py +++ b/archivebox/plugins/wget/on_Snapshot__61_wget.py @@ -6,25 +6,15 @@ Usage: on_Snapshot__wget.py --url= --snapshot-id= Output: Downloads files to $PWD Environment variables: - WGET_BINARY: Path to wget binary (optional, falls back to PATH) - WGET_TIMEOUT: Timeout in seconds (default: 60) - WGET_USER_AGENT: User agent string - WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True) - WGET_COOKIES_FILE: Path to cookies file (optional) - WGET_RESTRICT_FILE_NAMES: Filename restriction mode (default: windows) - WGET_EXTRA_ARGS: Extra arguments for wget (space-separated) - - # Wget feature toggles - SAVE_WGET: Enable wget archiving (default: True) - SAVE_WARC: Save WARC file (default: True) - SAVE_WGET_REQUISITES: Download page requisites (default: True) - - # Fallback to ARCHIVING_CONFIG values if WGET_* not set: - TIMEOUT: Fallback timeout - USER_AGENT: Fallback user agent - CHECK_SSL_VALIDITY: Fallback SSL check - COOKIES_FILE: Fallback cookies file - RESTRICT_FILE_NAMES: Fallback filename restriction + WGET_ENABLED: Enable wget archiving (default: True) + WGET_WARC_ENABLED: Save WARC file (default: True) + WGET_BINARY: Path to wget binary (default: wget) + WGET_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) + WGET_USER_AGENT: User agent string (x-fallback: USER_AGENT) + WGET_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE) + WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (x-fallback: CHECK_SSL_VALIDITY) + WGET_ARGS: Default wget arguments (JSON array) + WGET_ARGS_EXTRA: Extra arguments to append (JSON array) """ import json @@ -65,6 +55,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + STATICFILE_DIR = '../staticfile' def has_staticfile_output() -> bool: @@ -73,17 +77,6 @@ def has_staticfile_output() -> bool: return staticfile_dir.exists() and any(staticfile_dir.iterdir()) -# Default wget args (from old WGET_CONFIG) -WGET_DEFAULT_ARGS = [ - '--no-verbose', - '--adjust-extension', - '--convert-links', - '--force-directories', - '--backup-converted', - '--span-hosts', - '--no-parent', - '-e', 'robots=off', -] def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]: @@ -92,36 +85,28 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ - # Get config from env (with WGET_ prefix or fallback to ARCHIVING_CONFIG style) + # Get config from env (with WGET_ prefix, x-fallback handled by config loader) timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60) user_agent = get_env('WGET_USER_AGENT') or get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)') - check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True)) + check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', True) if get_env('WGET_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True) cookies_file = get_env('WGET_COOKIES_FILE') or get_env('COOKIES_FILE', '') - restrict_names = get_env('WGET_RESTRICT_FILE_NAMES') or get_env('RESTRICT_FILE_NAMES', 'windows') - extra_args = get_env('WGET_EXTRA_ARGS', '') + wget_args = get_env_array('WGET_ARGS', []) + wget_args_extra = get_env_array('WGET_ARGS_EXTRA', []) # Feature toggles - save_warc = get_env_bool('WGET_SAVE_WARC', True) - save_requisites = get_env_bool('WGET_SAVE_REQUISITES', True) + warc_enabled = get_env_bool('WGET_WARC_ENABLED', True) # Build wget command (later options take precedence) cmd = [ binary, - *WGET_DEFAULT_ARGS, + *wget_args, f'--timeout={timeout}', - '--tries=2', ] if user_agent: cmd.append(f'--user-agent={user_agent}') - if restrict_names: - cmd.append(f'--restrict-file-names={restrict_names}') - - if save_requisites: - cmd.append('--page-requisites') - - if save_warc: + if warc_enabled: warc_dir = Path('warc') warc_dir.mkdir(exist_ok=True) warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp())) @@ -135,8 +120,8 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]: if not check_ssl: cmd.extend(['--no-check-certificate', '--no-hsts']) - if extra_args: - cmd.extend(extra_args.split()) + if wget_args_extra: + cmd.extend(wget_args_extra) cmd.append(url) diff --git a/archivebox/plugins/ytdlp/binaries.jsonl b/archivebox/plugins/ytdlp/binaries.jsonl index beb44a4a..05240fd2 100644 --- a/archivebox/plugins/ytdlp/binaries.jsonl +++ b/archivebox/plugins/ytdlp/binaries.jsonl @@ -1,3 +1,3 @@ -{"type": "Binary", "name": "yt-dlp", "binproviders": "pip,brew,apt,env"} +{"type": "Binary", "name": "yt-dlp", "binproviders": "pip,brew,apt,env", "overrides": {"pip": {"packages": "yt-dlp[default]"}}} {"type": "Binary", "name": "node", "binproviders": "apt,brew,env", "overrides": {"apt": {"packages": ["nodejs"]}}} {"type": "Binary", "name": "ffmpeg", "binproviders": "apt,brew,env"} diff --git a/archivebox/plugins/ytdlp/config.json b/archivebox/plugins/ytdlp/config.json index 69ae5566..eb76ac3b 100644 --- a/archivebox/plugins/ytdlp/config.json +++ b/archivebox/plugins/ytdlp/config.json @@ -6,15 +6,28 @@ "YTDLP_ENABLED": { "type": "boolean", "default": true, - "x-aliases": ["MEDIA_ENABLED", "SAVE_MEDIA", "USE_MEDIA", "USE_YTDLP", "FETCH_MEDIA", "SAVE_YTDLP"], + "x-aliases": [ + "MEDIA_ENABLED", + "SAVE_MEDIA", + "USE_MEDIA", + "USE_YTDLP", + "FETCH_MEDIA", + "SAVE_YTDLP" + ], "description": "Enable video/audio downloading with yt-dlp" }, "YTDLP_BINARY": { "type": "string", "default": "yt-dlp", - "x-aliases": ["MEDIA_BINARY", "YOUTUBEDL_BINARY", "YOUTUBE_DL_BINARY"], + "x-aliases": ["YOUTUBEDL_BINARY", "YOUTUBE_DL_BINARY"], "description": "Path to yt-dlp binary" }, + "YTDLP_NODE_BINARY": { + "type": "string", + "default": "node", + "x-fallback": "NODE_BINARY", + "description": "Path to Node.js binary for yt-dlp JS runtime" + }, "YTDLP_TIMEOUT": { "type": "integer", "default": 3600, @@ -23,6 +36,12 @@ "x-aliases": ["MEDIA_TIMEOUT"], "description": "Timeout for yt-dlp downloads in seconds" }, + "YTDLP_COOKIES_FILE": { + "type": "string", + "default": "", + "x-fallback": "COOKIES_FILE", + "description": "Path to cookies file" + }, "YTDLP_MAX_SIZE": { "type": "string", "default": "750m", @@ -34,15 +53,14 @@ "type": "boolean", "default": true, "x-fallback": "CHECK_SSL_VALIDITY", - "x-aliases": ["MEDIA_CHECK_SSL_VALIDITY"], "description": "Whether to verify SSL certificates" }, "YTDLP_ARGS": { "type": "array", - "items": {"type": "string"}, + "items": { "type": "string" }, "default": [ "--restrict-filenames", - "--trim-filenames", "128", + "--trim-filenames=128", "--write-description", "--write-info-json", "--write-thumbnail", @@ -56,16 +74,19 @@ "--geo-bypass", "--add-metadata", "--no-progress", - "-o", "%(title)s.%(ext)s" + "--remote-components ejs:github", + "-o", + "%(title)s.%(ext)s" ], - "x-aliases": ["MEDIA_ARGS"], - "description": "Default yt-dlp arguments (override to customize behavior)" + "x-aliases": ["YTDLP_DEFAULT_ARGS"], + "description": "Default yt-dlp arguments" }, - "YTDLP_EXTRA_ARGS": { - "type": "string", - "default": "", - "x-aliases": ["MEDIA_EXTRA_ARGS"], - "description": "Extra arguments for yt-dlp (space-separated, appended after YTDLP_ARGS)" + "YTDLP_ARGS_EXTRA": { + "type": "array", + "items": { "type": "string" }, + "default": [], + "x-aliases": ["YTDLP_EXTRA_ARGS"], + "description": "Extra arguments to append to yt-dlp command" } } } diff --git a/archivebox/plugins/ytdlp/on_Snapshot__63_ytdlp.bg.py b/archivebox/plugins/ytdlp/on_Snapshot__63_ytdlp.bg.py index 6471645d..d8faae21 100644 --- a/archivebox/plugins/ytdlp/on_Snapshot__63_ytdlp.bg.py +++ b/archivebox/plugins/ytdlp/on_Snapshot__63_ytdlp.bg.py @@ -3,24 +3,18 @@ Download video/audio from a URL using yt-dlp. Usage: on_Snapshot__ytdlp.py --url= --snapshot-id= -Output: Downloads video/audio files to $PWD/ytdlp/ +Output: Downloads video/audio files to $PWD Environment variables: - YTDLP_BINARY: Path to yt-dlp binary - YTDLP_TIMEOUT: Timeout in seconds (default: 3600 for large downloads) - YTDLP_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True) - YTDLP_ARGS: JSON array of yt-dlp arguments (overrides defaults) - YTDLP_EXTRA_ARGS: Extra arguments for yt-dlp (space-separated, appended) - YTDLP_MAX_SIZE: Maximum file size (default: 750m) - - # Feature toggles (with backwards-compatible aliases) YTDLP_ENABLED: Enable yt-dlp extraction (default: True) - SAVE_YTDLP: Alias for YTDLP_ENABLED - MEDIA_ENABLED: Backwards-compatible alias for YTDLP_ENABLED - - # Fallback to ARCHIVING_CONFIG values if YTDLP_* not set: - TIMEOUT: Fallback timeout - CHECK_SSL_VALIDITY: Fallback SSL check + YTDLP_BINARY: Path to yt-dlp binary (default: yt-dlp) + YTDLP_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY) + YTDLP_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) + YTDLP_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE) + YTDLP_MAX_SIZE: Maximum file size (default: 750m) + YTDLP_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY) + YTDLP_ARGS: Default yt-dlp arguments (JSON array) + YTDLP_ARGS_EXTRA: Extra arguments to append (JSON array) """ import json @@ -32,11 +26,6 @@ from pathlib import Path import rich_click as click -# Extractor metadata -PLUGIN_NAME = 'ytdlp' -BIN_NAME = 'yt-dlp' -BIN_PROVIDERS = 'pip,apt,brew,env' -OUTPUT_DIR = '.' def get_env(name: str, default: str = '') -> str: @@ -59,6 +48,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + STATICFILE_DIR = '../staticfile' def has_staticfile_output() -> bool: @@ -67,69 +70,41 @@ def has_staticfile_output() -> bool: return staticfile_dir.exists() and any(staticfile_dir.iterdir()) -# Default yt-dlp args (can be overridden via YTDLP_ARGS env var) -YTDLP_DEFAULT_ARGS = [ - '--restrict-filenames', - '--trim-filenames', '128', - '--write-description', - '--write-info-json', - '--write-thumbnail', - '--write-sub', - '--write-auto-subs', - '--convert-subs=srt', - '--yes-playlist', - '--continue', - '--no-abort-on-error', - '--ignore-errors', - '--geo-bypass', - '--add-metadata', - '--no-progress', - '-o', '%(title)s.%(ext)s', -] - - -def get_ytdlp_args() -> list[str]: - """Get yt-dlp arguments from YTDLP_ARGS env var or use defaults.""" - ytdlp_args_str = get_env('YTDLP_ARGS', '') - if ytdlp_args_str: - try: - # Try to parse as JSON array - args = json.loads(ytdlp_args_str) - if isinstance(args, list): - return [str(arg) for arg in args] - except json.JSONDecodeError: - pass - return YTDLP_DEFAULT_ARGS - - def save_ytdlp(url: str, binary: str) -> tuple[bool, str | None, str]: """ Download video/audio using yt-dlp. Returns: (success, output_path, error_message) """ - # Get config from env (YTDLP_* primary, MEDIA_* as fallback via aliases) - timeout = get_env_int('TIMEOUT', 3600) - check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True) - extra_args = get_env('YTDLP_EXTRA_ARGS', '') - max_size = get_env('YTDLP_MAX_SIZE', '') or get_env('MEDIA_MAX_SIZE', '750m') + # Get config from env (with YTDLP_ prefix, x-fallback handled by config loader) + timeout = get_env_int('YTDLP_TIMEOUT') or get_env_int('TIMEOUT', 3600) + check_ssl = get_env_bool('YTDLP_CHECK_SSL_VALIDITY', True) if get_env('YTDLP_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True) + cookies_file = get_env('YTDLP_COOKIES_FILE') or get_env('COOKIES_FILE', '') + max_size = get_env('YTDLP_MAX_SIZE', '750m') + node_binary = get_env('YTDLP_NODE_BINARY') or get_env('NODE_BINARY', 'node') + ytdlp_args = get_env_array('YTDLP_ARGS', []) + ytdlp_args_extra = get_env_array('YTDLP_ARGS_EXTRA', []) # Output directory is current directory (hook already runs in output dir) - output_dir = Path(OUTPUT_DIR) + output_dir = Path('.') - # Build command using configurable YTDLP_ARGS (later options take precedence) + # Build command (later options take precedence) cmd = [ binary, - *get_ytdlp_args(), - # Format with max_size limit (appended after YTDLP_ARGS so it can be overridden by YTDLP_EXTRA_ARGS) + *ytdlp_args, + # Format with max_size limit (appended after YTDLP_ARGS so it can be overridden by YTDLP_ARGS_EXTRA) f'--format=(bv*+ba/b)[filesize<={max_size}][filesize_approx<=?{max_size}]/(bv*+ba/b)', + f'--js-runtimes=node:{node_binary}', ] if not check_ssl: cmd.append('--no-check-certificate') - if extra_args: - cmd.extend(extra_args.split()) + if cookies_file and Path(cookies_file).is_file(): + cmd.extend(['--cookies', cookies_file]) + + if ytdlp_args_extra: + cmd.extend(ytdlp_args_extra) cmd.append(url) @@ -193,9 +168,8 @@ def main(url: str, snapshot_id: str): """Download video/audio from a URL using yt-dlp.""" try: - # Check if yt-dlp downloading is enabled (YTDLP_ENABLED primary, MEDIA_ENABLED fallback) - ytdlp_enabled = get_env_bool('YTDLP_ENABLED', True) and get_env_bool('MEDIA_ENABLED', True) - if not ytdlp_enabled: + # Check if yt-dlp downloading is enabled + if not get_env_bool('YTDLP_ENABLED', True): print('Skipping ytdlp (YTDLP_ENABLED=False)', file=sys.stderr) # Temporary failure (config disabled) - NO JSONL emission sys.exit(0)