From 967c5d53e0e315d893369329de7b632409b9fcff Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 29 Dec 2025 13:11:34 -0800 Subject: [PATCH] make plugin config more consistent --- archivebox/plugins/chrome/config.json | 25 +++-- archivebox/plugins/favicon/config.json | 6 - archivebox/plugins/forumdl/config.json | 22 ++-- .../forumdl/on_Snapshot__65_forumdl.bg.py | 51 +++++---- archivebox/plugins/gallerydl/config.json | 17 ++- .../gallerydl/on_Snapshot__64_gallerydl.bg.py | 63 ++++++----- archivebox/plugins/git/config.json | 21 ++-- archivebox/plugins/git/on_Snapshot__62_git.py | 32 +++--- archivebox/plugins/mercury/config.json | 19 +++- .../mercury/on_Snapshot__56_mercury.py | 24 +++- archivebox/plugins/papersdl/config.json | 17 ++- .../papersdl/on_Snapshot__66_papersdl.bg.py | 28 ++++- archivebox/plugins/readability/config.json | 19 +++- .../on_Snapshot__55_readability.py | 22 +++- .../search_backend_ripgrep/config.json | 25 +++-- .../plugins/search_backend_ripgrep/search.py | 54 ++++++--- archivebox/plugins/singlefile/config.json | 28 ++++- .../singlefile/on_Snapshot__50_singlefile.py | 63 ++++++----- archivebox/plugins/wget/config.json | 42 +++---- .../plugins/wget/on_Snapshot__61_wget.py | 79 ++++++------- archivebox/plugins/ytdlp/binaries.jsonl | 2 +- archivebox/plugins/ytdlp/config.json | 27 +++-- .../plugins/ytdlp/on_Snapshot__63_ytdlp.bg.py | 105 +++++++----------- 23 files changed, 452 insertions(+), 339 deletions(-) diff --git a/archivebox/plugins/chrome/config.json b/archivebox/plugins/chrome/config.json index 5fc7c32b..56316089 100644 --- a/archivebox/plugins/chrome/config.json +++ b/archivebox/plugins/chrome/config.json @@ -9,10 +9,10 @@ "x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"], "description": "Path to Chrome/Chromium binary" }, - "NODE_BINARY": { + "CHROME_NODE_BINARY": { "type": "string", "default": "node", - "x-aliases": ["NODEJS_BINARY"], + "x-fallback": "NODE_BINARY", "description": "Path to Node.js binary (for Puppeteer)" }, "CHROME_TIMEOUT": { @@ -50,16 +50,19 @@ "x-fallback": "USER_AGENT", "description": "User agent string for Chrome" }, - "CHROME_EXTRA_ARGS": { - "type": "string", - "default": "", - "description": "Extra command-line arguments for Chrome (space-separated)" + "CHROME_ARGS": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["CHROME_DEFAULT_ARGS"], + "description": "Default Chrome command-line arguments" }, - "CHROME_CHECK_SSL_VALIDITY": { - "type": "boolean", - "default": true, - "x-fallback": "CHECK_SSL_VALIDITY", - "description": "Whether to verify SSL certificates" + "CHROME_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["CHROME_EXTRA_ARGS"], + "description": "Extra arguments to append to Chrome command" } } } diff --git a/archivebox/plugins/favicon/config.json b/archivebox/plugins/favicon/config.json index 6be0a26e..4c67e18f 100644 --- a/archivebox/plugins/favicon/config.json +++ b/archivebox/plugins/favicon/config.json @@ -21,12 +21,6 @@ "default": "", "x-fallback": "USER_AGENT", "description": "User agent string" - }, - "FAVICON_CHECK_SSL_VALIDITY": { - "type": "boolean", - "default": true, - "x-fallback": "CHECK_SSL_VALIDITY", - "description": "Whether to verify SSL certificates" } } } diff --git a/archivebox/plugins/forumdl/config.json b/archivebox/plugins/forumdl/config.json index ac26ea37..9e9ea10a 100644 --- a/archivebox/plugins/forumdl/config.json +++ b/archivebox/plugins/forumdl/config.json @@ -27,21 +27,25 @@ "enum": ["jsonl", "warc", 
"mbox", "maildir", "mh", "mmdf", "babyl"], "description": "Output format for forum downloads" }, - "FORUMDL_TEXTIFY": { - "type": "boolean", - "default": false, - "description": "Convert HTML content to plaintext (keep false to preserve HTML)" - }, "FORUMDL_CHECK_SSL_VALIDITY": { "type": "boolean", "default": true, "x-fallback": "CHECK_SSL_VALIDITY", "description": "Whether to verify SSL certificates" }, - "FORUMDL_EXTRA_ARGS": { - "type": "string", - "default": "", - "description": "Extra arguments for forum-dl (space-separated)" + "FORUMDL_ARGS": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["FORUMDL_DEFAULT_ARGS"], + "description": "Default forum-dl arguments" + }, + "FORUMDL_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["FORUMDL_EXTRA_ARGS"], + "description": "Extra arguments to append to forum-dl command" } } } diff --git a/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py b/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py index 4d0e0f79..3fe7a94a 100755 --- a/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py +++ b/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py @@ -6,19 +6,13 @@ Usage: on_Snapshot__forumdl.py --url= --snapshot-id= Output: Downloads forum content to $PWD/ Environment variables: - FORUMDL_BINARY: Path to forum-dl binary - FORUMDL_TIMEOUT: Timeout in seconds (default: 3600 for large forums) + FORUMDL_ENABLED: Enable forum downloading (default: True) + FORUMDL_BINARY: Path to forum-dl binary (default: forum-dl) + FORUMDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) FORUMDL_OUTPUT_FORMAT: Output format (default: jsonl) - FORUMDL_TEXTIFY: Convert HTML to plaintext (default: False - keeps HTML) - FORUMDL_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True) - FORUMDL_EXTRA_ARGS: Extra arguments for forum-dl (space-separated) - - # Forum-dl feature toggles - SAVE_FORUMDL: Enable forum-dl forum extraction (default: True) - - # Fallback to ARCHIVING_CONFIG values if FORUMDL_* not set: - TIMEOUT: Fallback timeout - CHECK_SSL_VALIDITY: Fallback SSL check + FORUMDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY) + FORUMDL_ARGS: Default forum-dl arguments (JSON array) + FORUMDL_ARGS_EXTRA: Extra arguments to append (JSON array) """ import json @@ -78,6 +72,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]: """ @@ -85,11 +93,11 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ - # Get config from env - timeout = get_env_int('TIMEOUT', 3600) - check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True) - textify = get_env_bool('FORUMDL_TEXTIFY', False) - extra_args = get_env('FORUMDL_EXTRA_ARGS', '') + # Get config from env (with FORUMDL_ prefix, x-fallback handled by config loader) + timeout = get_env_int('FORUMDL_TIMEOUT') or get_env_int('TIMEOUT', 3600) + check_ssl = 
get_env_bool('FORUMDL_CHECK_SSL_VALIDITY', True) if get_env('FORUMDL_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True) + forumdl_args = get_env_array('FORUMDL_ARGS', []) + forumdl_args_extra = get_env_array('FORUMDL_ARGS_EXTRA', []) output_format = get_env('FORUMDL_OUTPUT_FORMAT', 'jsonl') # Output directory is current directory (hook already runs in output dir) @@ -108,16 +116,13 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]: output_file = output_dir / f'forum.{output_format}' # Build command - cmd = [binary, '-f', output_format, '-o', str(output_file)] - - if textify: - cmd.append('--textify') + cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)] if not check_ssl: cmd.append('--no-check-certificate') - if extra_args: - cmd.extend(extra_args.split()) + if forumdl_args_extra: + cmd.extend(forumdl_args_extra) cmd.append(url) diff --git a/archivebox/plugins/gallerydl/config.json b/archivebox/plugins/gallerydl/config.json index 92dab2cd..522a4b22 100644 --- a/archivebox/plugins/gallerydl/config.json +++ b/archivebox/plugins/gallerydl/config.json @@ -21,6 +21,12 @@ "x-fallback": "TIMEOUT", "description": "Timeout for gallery downloads in seconds" }, + "GALLERYDL_COOKIES_FILE": { + "type": "string", + "default": "", + "x-fallback": "COOKIES_FILE", + "description": "Path to cookies file" + }, "GALLERYDL_CHECK_SSL_VALIDITY": { "type": "boolean", "default": true, @@ -34,12 +40,15 @@ "--write-metadata", "--write-info-json" ], + "x-aliases": ["GALLERYDL_DEFAULT_ARGS"], "description": "Default gallery-dl arguments" }, - "GALLERYDL_EXTRA_ARGS": { - "type": "string", - "default": "", - "description": "Extra arguments for gallery-dl (space-separated)" + "GALLERYDL_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["GALLERYDL_EXTRA_ARGS"], + "description": "Extra arguments to append to gallery-dl command" } } } diff --git a/archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py b/archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py index 86ff868b..78c1128a 100755 --- a/archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py +++ b/archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py @@ -6,20 +6,13 @@ Usage: on_Snapshot__gallerydl.py --url= --snapshot-id= Output: Downloads gallery images to $PWD/gallerydl/ Environment variables: - GALLERYDL_BINARY: Path to gallery-dl binary - GALLERYDL_TIMEOUT: Timeout in seconds (default: 3600 for large galleries) - GALLERYDL_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True) - GALLERYDL_EXTRA_ARGS: Extra arguments for gallery-dl (space-separated) - COOKIES_FILE: Path to cookies file for authentication - - # Gallery-dl feature toggles - USE_GALLERYDL: Enable gallery-dl gallery extraction (default: True) - SAVE_GALLERYDL: Alias for USE_GALLERYDL - - # Fallback to ARCHIVING_CONFIG values if GALLERYDL_* not set: - GALLERYDL_TIMEOUT: Fallback timeout for gallery downloads - TIMEOUT: Fallback timeout - CHECK_SSL_VALIDITY: Fallback SSL check + GALLERYDL_ENABLED: Enable gallery-dl gallery extraction (default: True) + GALLERYDL_BINARY: Path to gallery-dl binary (default: gallery-dl) + GALLERYDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) + GALLERYDL_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE) + GALLERYDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY) + GALLERYDL_ARGS: Default gallery-dl arguments (JSON array) + GALLERYDL_ARGS_EXTRA: Extra arguments to append (JSON 
array) """ import json @@ -58,6 +51,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + STATICFILE_DIR = '../staticfile' def has_staticfile_output() -> bool: @@ -66,35 +73,27 @@ def has_staticfile_output() -> bool: return staticfile_dir.exists() and any(staticfile_dir.iterdir()) -# Default gallery-dl args -def get_gallerydl_default_args() -> list[str]: - """Build default gallery-dl arguments.""" - return [ - '--write-metadata', - '--write-info-json', - ] - - def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: """ Download gallery using gallery-dl. Returns: (success, output_path, error_message) """ - # Get config from env - timeout = get_env_int('TIMEOUT', 3600) - check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True) - extra_args = get_env('GALLERYDL_EXTRA_ARGS', '') - cookies_file = get_env('COOKIES_FILE', '') + # Get config from env (with GALLERYDL_ prefix, x-fallback handled by config loader) + timeout = get_env_int('GALLERYDL_TIMEOUT') or get_env_int('TIMEOUT', 3600) + check_ssl = get_env_bool('GALLERYDL_CHECK_SSL_VALIDITY', True) if get_env('GALLERYDL_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True) + gallerydl_args = get_env_array('GALLERYDL_ARGS', []) + gallerydl_args_extra = get_env_array('GALLERYDL_ARGS_EXTRA', []) + cookies_file = get_env('GALLERYDL_COOKIES_FILE') or get_env('COOKIES_FILE', '') # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) - # Build command (later options take precedence) + # Build command # Use -D for exact directory (flat structure) instead of -d (nested structure) cmd = [ binary, - *get_gallerydl_default_args(), + *gallerydl_args, '-D', str(output_dir), ] @@ -104,8 +103,8 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]: if cookies_file and Path(cookies_file).exists(): cmd.extend(['-C', cookies_file]) - if extra_args: - cmd.extend(extra_args.split()) + if gallerydl_args_extra: + cmd.extend(gallerydl_args_extra) cmd.append(url) diff --git a/archivebox/plugins/git/config.json b/archivebox/plugins/git/config.json index 125cb6ec..da0a3b02 100644 --- a/archivebox/plugins/git/config.json +++ b/archivebox/plugins/git/config.json @@ -26,16 +26,19 @@ "default": "github.com,gitlab.com,bitbucket.org,gist.github.com,codeberg.org,gitea.com,git.sr.ht", "description": "Comma-separated list of domains to treat as git repositories" }, - "GIT_CLONE_DEPTH": { - "type": "integer", - "default": 1, - "minimum": 0, - "description": "Depth of git clone (0 for full history, 1 for shallow)" + "GIT_ARGS": { + "type": "array", + "items": {"type": "string"}, + "default": ["clone", "--depth=1", "--recursive"], + "x-aliases": ["GIT_DEFAULT_ARGS"], + "description": "Default git arguments" }, - "GIT_EXTRA_ARGS": { - "type": "string", - "default": "", - "description": "Extra arguments for git clone" + "GIT_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["GIT_EXTRA_ARGS"], + "description": "Extra arguments to append to git command" } } } diff --git 
a/archivebox/plugins/git/on_Snapshot__62_git.py b/archivebox/plugins/git/on_Snapshot__62_git.py index 37f6e245..943be861 100644 --- a/archivebox/plugins/git/on_Snapshot__62_git.py +++ b/archivebox/plugins/git/on_Snapshot__62_git.py @@ -8,7 +8,8 @@ Output: Clones repository to $PWD/repo Environment variables: GIT_BINARY: Path to git binary GIT_TIMEOUT: Timeout in seconds (default: 120) - GIT_ARGS: Extra arguments for git clone (space-separated) + GIT_ARGS: Default git arguments (JSON array, default: ["clone", "--depth=1", "--recursive"]) + GIT_ARGS_EXTRA: Extra arguments to append (JSON array, default: []) # Fallback to ARCHIVING_CONFIG values if GIT_* not set: TIMEOUT: Fallback timeout @@ -41,6 +42,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + def is_git_url(url: str) -> bool: """Check if URL looks like a git repository.""" git_patterns = [ @@ -61,19 +76,10 @@ def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ timeout = get_env_int('GIT_TIMEOUT') or get_env_int('TIMEOUT', 120) - extra_args = get_env('GIT_ARGS') + git_args = get_env_array('GIT_ARGS', []) + git_args_extra = get_env_array('GIT_ARGS_EXTRA', []) - cmd = [ - binary, - 'clone', - '--depth=1', - '--recursive', - ] - - if extra_args: - cmd.extend(extra_args.split()) - - cmd.extend([url, OUTPUT_DIR]) + cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR] try: result = subprocess.run(cmd, capture_output=True, timeout=timeout) diff --git a/archivebox/plugins/mercury/config.json b/archivebox/plugins/mercury/config.json index 184f3efc..039c38a7 100644 --- a/archivebox/plugins/mercury/config.json +++ b/archivebox/plugins/mercury/config.json @@ -15,17 +15,26 @@ "x-aliases": ["POSTLIGHT_PARSER_BINARY"], "description": "Path to Mercury/Postlight parser binary" }, - "NODE_BINARY": { - "type": "string", - "default": "node", - "description": "Path to Node.js binary" - }, "MERCURY_TIMEOUT": { "type": "integer", "default": 30, "minimum": 5, "x-fallback": "TIMEOUT", "description": "Timeout for Mercury in seconds" + }, + "MERCURY_ARGS": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["MERCURY_DEFAULT_ARGS"], + "description": "Default Mercury parser arguments" + }, + "MERCURY_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["MERCURY_EXTRA_ARGS"], + "description": "Extra arguments to append to Mercury parser command" } } } diff --git a/archivebox/plugins/mercury/on_Snapshot__56_mercury.py b/archivebox/plugins/mercury/on_Snapshot__56_mercury.py index 4c182137..5b710711 100644 --- a/archivebox/plugins/mercury/on_Snapshot__56_mercury.py +++ b/archivebox/plugins/mercury/on_Snapshot__56_mercury.py @@ -8,8 +8,8 @@ Output: Creates mercury/ directory with content.html, content.txt, article.json Environment variables: MERCURY_BINARY: Path to postlight-parser binary MERCURY_TIMEOUT: Timeout in seconds (default: 60) - - # Fallback to ARCHIVING_CONFIG values if MERCURY_* not set: + MERCURY_ARGS: Default Mercury arguments (JSON 
array) + MERCURY_ARGS_EXTRA: Extra arguments to append (JSON array) TIMEOUT: Fallback timeout Note: Requires postlight-parser: npm install -g @postlight/parser @@ -51,6 +51,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]: """ Extract article using Mercury Parser. @@ -58,13 +72,15 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ timeout = get_env_int('MERCURY_TIMEOUT') or get_env_int('TIMEOUT', 60) + mercury_args = get_env_array('MERCURY_ARGS', []) + mercury_args_extra = get_env_array('MERCURY_ARGS_EXTRA', []) # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) try: # Get text version - cmd_text = [binary, url, '--format=text'] + cmd_text = [binary, *mercury_args, *mercury_args_extra, url, '--format=text'] result_text = subprocess.run(cmd_text, capture_output=True, timeout=timeout) if result_text.returncode != 0: @@ -84,7 +100,7 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]: (output_dir / 'content.txt').write_text(text_content, encoding='utf-8') # Get HTML version - cmd_html = [binary, url, '--format=html'] + cmd_html = [binary, *mercury_args, *mercury_args_extra, url, '--format=html'] result_html = subprocess.run(cmd_html, capture_output=True, timeout=timeout) try: diff --git a/archivebox/plugins/papersdl/config.json b/archivebox/plugins/papersdl/config.json index 4d96d3bd..2c6eb342 100644 --- a/archivebox/plugins/papersdl/config.json +++ b/archivebox/plugins/papersdl/config.json @@ -21,10 +21,19 @@ "x-fallback": "TIMEOUT", "description": "Timeout for paper downloads in seconds" }, - "PAPERSDL_EXTRA_ARGS": { - "type": "string", - "default": "", - "description": "Extra arguments for papers-dl (space-separated)" + "PAPERSDL_ARGS": { + "type": "array", + "items": {"type": "string"}, + "default": ["fetch"], + "x-aliases": ["PAPERSDL_DEFAULT_ARGS"], + "description": "Default papers-dl arguments" + }, + "PAPERSDL_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["PAPERSDL_EXTRA_ARGS"], + "description": "Extra arguments to append to papers-dl command" } } } diff --git a/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py b/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py index a75dc4ea..859d911e 100755 --- a/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py +++ b/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py @@ -8,7 +8,8 @@ Output: Downloads paper PDFs to $PWD/ Environment variables: PAPERSDL_BINARY: Path to papers-dl binary PAPERSDL_TIMEOUT: Timeout in seconds (default: 300 for paper downloads) - PAPERSDL_EXTRA_ARGS: Extra arguments for papers-dl (space-separated) + PAPERSDL_ARGS: Default papers-dl arguments (JSON array, default: ["fetch"]) + PAPERSDL_ARGS_EXTRA: Extra arguments to append (JSON array) # papers-dl feature toggles SAVE_PAPERSDL: Enable papers-dl paper extraction (default: True) @@ -54,6 
+55,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + def extract_doi_from_url(url: str) -> str | None: """Extract DOI from common paper URLs.""" # Match DOI pattern in URL @@ -72,7 +87,8 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]: """ # Get config from env timeout = get_env_int('TIMEOUT', 300) - extra_args = get_env('PAPERSDL_EXTRA_ARGS', '') + papersdl_args = get_env_array('PAPERSDL_ARGS', []) + papersdl_args_extra = get_env_array('PAPERSDL_ARGS_EXTRA', []) # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) @@ -85,11 +101,11 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]: else: identifier = doi - # Build command - papers-dl fetch -o - cmd = [binary, 'fetch', identifier, '-o', str(output_dir)] + # Build command - papers-dl -o + cmd = [binary, *papersdl_args, identifier, '-o', str(output_dir)] - if extra_args: - cmd.extend(extra_args.split()) + if papersdl_args_extra: + cmd.extend(papersdl_args_extra) try: result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True) diff --git a/archivebox/plugins/readability/config.json b/archivebox/plugins/readability/config.json index b6db094c..90173047 100644 --- a/archivebox/plugins/readability/config.json +++ b/archivebox/plugins/readability/config.json @@ -14,17 +14,26 @@ "default": "readability-extractor", "description": "Path to readability-extractor binary" }, - "NODE_BINARY": { - "type": "string", - "default": "node", - "description": "Path to Node.js binary" - }, "READABILITY_TIMEOUT": { "type": "integer", "default": 30, "minimum": 5, "x-fallback": "TIMEOUT", "description": "Timeout for Readability in seconds" + }, + "READABILITY_ARGS": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["READABILITY_DEFAULT_ARGS"], + "description": "Default Readability arguments" + }, + "READABILITY_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["READABILITY_EXTRA_ARGS"], + "description": "Extra arguments to append to Readability command" } } } diff --git a/archivebox/plugins/readability/on_Snapshot__55_readability.py b/archivebox/plugins/readability/on_Snapshot__55_readability.py index 41970437..2777479a 100644 --- a/archivebox/plugins/readability/on_Snapshot__55_readability.py +++ b/archivebox/plugins/readability/on_Snapshot__55_readability.py @@ -8,8 +8,8 @@ Output: Creates readability/ directory with content.html, content.txt, article.j Environment variables: READABILITY_BINARY: Path to readability-extractor binary READABILITY_TIMEOUT: Timeout in seconds (default: 60) - - # Fallback to ARCHIVING_CONFIG values if READABILITY_* not set: + READABILITY_ARGS: Default Readability arguments (JSON array) + READABILITY_ARGS_EXTRA: Extra arguments to append (JSON array) TIMEOUT: Fallback timeout Note: Requires readability-extractor from https://github.com/ArchiveBox/readability-extractor @@ -44,6 +44,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def 
get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + def find_html_source() -> str | None: """Find HTML content from other extractors in the snapshot directory.""" # Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories @@ -73,6 +87,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ timeout = get_env_int('READABILITY_TIMEOUT') or get_env_int('TIMEOUT', 60) + readability_args = get_env_array('READABILITY_ARGS', []) + readability_args_extra = get_env_array('READABILITY_ARGS_EXTRA', []) # Find HTML source html_source = find_html_source() @@ -84,7 +100,7 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]: try: # Run readability-extractor (outputs JSON by default) - cmd = [binary, html_source] + cmd = [binary, *readability_args, *readability_args_extra, html_source] result = subprocess.run(cmd, capture_output=True, timeout=timeout) if result.returncode != 0: diff --git a/archivebox/plugins/search_backend_ripgrep/config.json b/archivebox/plugins/search_backend_ripgrep/config.json index 0753c938..49c5c885 100644 --- a/archivebox/plugins/search_backend_ripgrep/config.json +++ b/archivebox/plugins/search_backend_ripgrep/config.json @@ -3,25 +3,32 @@ "type": "object", "additionalProperties": false, "properties": { - "SEARCH_BACKEND_RIPGREP_BINARY": { + "RIPGREP_BINARY": { "type": "string", "default": "rg", - "x-aliases": ["RIPGREP_BINARY"], "description": "Path to ripgrep binary" }, - "SEARCH_BACKEND_RIPGREP_IGNORE_EXTENSIONS": { - "type": "string", - "default": "css,js,orig,svg", - "x-aliases": ["RIPGREP_IGNORE_EXTENSIONS"], - "description": "Comma-separated file extensions to ignore" - }, - "SEARCH_BACKEND_RIPGREP_TIMEOUT": { + "RIPGREP_TIMEOUT": { "type": "integer", "default": 90, "minimum": 5, "x-fallback": "TIMEOUT", "x-aliases": ["SEARCH_BACKEND_TIMEOUT"], "description": "Search timeout in seconds" + }, + "RIPGREP_ARGS": { + "type": "array", + "items": { "type": "string" }, + "default": ["--files-with-matches", "--no-messages", "--ignore-case"], + "x-aliases": ["RIPGREP_DEFAULT_ARGS"], + "description": "Default ripgrep arguments" + }, + "RIPGREP_ARGS_EXTRA": { + "type": "array", + "items": { "type": "string" }, + "default": [], + "x-aliases": ["RIPGREP_EXTRA_ARGS"], + "description": "Extra arguments to append to ripgrep command" } } } diff --git a/archivebox/plugins/search_backend_ripgrep/search.py b/archivebox/plugins/search_backend_ripgrep/search.py index 135b392e..140a32d1 100644 --- a/archivebox/plugins/search_backend_ripgrep/search.py +++ b/archivebox/plugins/search_backend_ripgrep/search.py @@ -6,10 +6,12 @@ using ripgrep (rg). This is simpler but slower for large archives. 
Environment variables: RIPGREP_BINARY: Path to ripgrep binary (default: rg) - RIPGREP_IGNORE_EXTENSIONS: Comma-separated extensions to ignore (default: css,js,orig,svg) - SEARCH_BACKEND_TIMEOUT: Search timeout in seconds (default: 90) + RIPGREP_ARGS: Default ripgrep arguments (JSON array) + RIPGREP_ARGS_EXTRA: Extra arguments to append (JSON array) + RIPGREP_TIMEOUT: Search timeout in seconds (default: 90) """ +import json import os import subprocess import shutil @@ -19,39 +21,57 @@ from typing import List, Iterable from django.conf import settings -# Config with old var names for backwards compatibility -RIPGREP_BINARY = os.environ.get('RIPGREP_BINARY', 'rg').strip() -RIPGREP_IGNORE_EXTENSIONS = os.environ.get('RIPGREP_IGNORE_EXTENSIONS', 'css,js,orig,svg').strip() -SEARCH_BACKEND_TIMEOUT = int(os.environ.get('SEARCH_BACKEND_TIMEOUT', '90')) +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + + +def get_env_int(name: str, default: int = 0) -> int: + try: + return int(get_env(name, str(default))) + except ValueError: + return default + + +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] def search(query: str) -> List[str]: """Search for snapshots using ripgrep.""" - rg_binary = shutil.which(RIPGREP_BINARY) or RIPGREP_BINARY + rg_binary = get_env('RIPGREP_BINARY', 'rg') + rg_binary = shutil.which(rg_binary) or rg_binary if not rg_binary or not Path(rg_binary).exists(): - raise RuntimeError(f'ripgrep binary not found ({RIPGREP_BINARY}). Install with: apt install ripgrep') + raise RuntimeError(f'ripgrep binary not found ({rg_binary}). 
Install with: apt install ripgrep') + + timeout = get_env_int('RIPGREP_TIMEOUT', 90) + ripgrep_args = get_env_array('RIPGREP_ARGS', []) + ripgrep_args_extra = get_env_array('RIPGREP_ARGS_EXTRA', []) archive_dir = Path(settings.ARCHIVE_DIR) if not archive_dir.exists(): return [] - # Build ignore pattern from config - ignore_pattern = f'*.{{{RIPGREP_IGNORE_EXTENSIONS}}}' - cmd = [ rg_binary, - f'--type-add=ignore:{ignore_pattern}', - '--type-not=ignore', - '--files-with-matches', - '--no-messages', - '--ignore-case', + *ripgrep_args, + *ripgrep_args_extra, '--regexp', query, str(archive_dir), ] try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=SEARCH_BACKEND_TIMEOUT) + result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) # Extract snapshot IDs from file paths # Paths look like: archive///file.txt diff --git a/archivebox/plugins/singlefile/config.json b/archivebox/plugins/singlefile/config.json index ddfec833..ee546272 100644 --- a/archivebox/plugins/singlefile/config.json +++ b/archivebox/plugins/singlefile/config.json @@ -15,11 +15,18 @@ "x-aliases": ["SINGLE_FILE_BINARY"], "description": "Path to single-file binary" }, - "NODE_BINARY": { + "SINGLEFILE_NODE_BINARY": { "type": "string", "default": "node", + "x-fallback": "NODE_BINARY", "description": "Path to Node.js binary" }, + "SINGLEFILE_CHROME_BINARY": { + "type": "string", + "default": "", + "x-fallback": "CHROME_BINARY", + "description": "Path to Chrome/Chromium binary" + }, "SINGLEFILE_TIMEOUT": { "type": "integer", "default": 60, @@ -39,16 +46,25 @@ "x-fallback": "COOKIES_FILE", "description": "Path to cookies file" }, + "SINGLEFILE_CHECK_SSL_VALIDITY": { + "type": "boolean", + "default": true, + "x-fallback": "CHECK_SSL_VALIDITY", + "description": "Whether to verify SSL certificates" + }, "SINGLEFILE_ARGS": { "type": "array", "items": {"type": "string"}, - "default": [], + "default": ["--browser-headless"], + "x-aliases": ["SINGLEFILE_DEFAULT_ARGS"], "description": "Default single-file arguments" }, - "SINGLEFILE_EXTRA_ARGS": { - "type": "string", - "default": "", - "description": "Extra arguments for single-file" + "SINGLEFILE_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["SINGLEFILE_EXTRA_ARGS"], + "description": "Extra arguments to append to single-file command" } } } diff --git a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py index cfda31aa..c7dc1686 100644 --- a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py +++ b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py @@ -6,24 +6,16 @@ Usage: on_Snapshot__singlefile.py --url= --snapshot-id= Output: Writes singlefile.html to $PWD Environment variables: - SINGLEFILE_BINARY: Path to SingleFile binary - SINGLEFILE_TIMEOUT: Timeout in seconds (default: 120) - SINGLEFILE_USER_AGENT: User agent string (optional) - SINGLEFILE_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True) - SINGLEFILE_COOKIES_FILE: Path to cookies file (optional) - SINGLEFILE_EXTRA_ARGS: Extra arguments for SingleFile (space-separated) - - # Feature toggle - SAVE_SINGLEFILE: Enable SingleFile archiving (default: True) - - # Chrome binary (SingleFile needs Chrome) - CHROME_BINARY: Path to Chrome/Chromium binary - - # Fallback to ARCHIVING_CONFIG values if SINGLEFILE_* not set: - TIMEOUT: Fallback timeout - USER_AGENT: Fallback user agent - CHECK_SSL_VALIDITY: Fallback SSL check - COOKIES_FILE: Fallback 
cookies file + SINGLEFILE_ENABLED: Enable SingleFile archiving (default: True) + SINGLEFILE_BINARY: Path to SingleFile binary (default: single-file) + SINGLEFILE_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY) + SINGLEFILE_CHROME_BINARY: Path to Chrome binary (x-fallback: CHROME_BINARY) + SINGLEFILE_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) + SINGLEFILE_USER_AGENT: User agent string (x-fallback: USER_AGENT) + SINGLEFILE_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE) + SINGLEFILE_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY) + SINGLEFILE_ARGS: Default SingleFile arguments (JSON array) + SINGLEFILE_ARGS_EXTRA: Extra arguments to append (JSON array) """ import json @@ -63,6 +55,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + STATICFILE_DIR = '../staticfile' def has_staticfile_output() -> bool: @@ -121,15 +127,16 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ - # Get config from env (with SINGLEFILE_ prefix or fallback to ARCHIVING_CONFIG style) + # Get config from env (with SINGLEFILE_ prefix, x-fallback handled by config loader) timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120) user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '') - check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True)) + check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', True) if get_env('SINGLEFILE_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True) cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '') - extra_args = get_env('SINGLEFILE_EXTRA_ARGS', '') - chrome = get_env('CHROME_BINARY', '') + singlefile_args = get_env_array('SINGLEFILE_ARGS', []) + singlefile_args_extra = get_env_array('SINGLEFILE_ARGS_EXTRA', []) + chrome = get_env('SINGLEFILE_CHROME_BINARY') or get_env('CHROME_BINARY', '') - cmd = [binary] + cmd = [binary, *singlefile_args] # Try to use existing Chrome session via CDP cdp_url = get_cdp_url() @@ -142,11 +149,6 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: elif chrome: cmd.extend(['--browser-executable-path', chrome]) - # Common options - cmd.extend([ - '--browser-headless', - ]) - # SSL handling if not check_ssl: cmd.append('--browser-ignore-insecure-certs') @@ -157,8 +159,9 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]: if cookies_file and Path(cookies_file).is_file(): cmd.extend(['--browser-cookies-file', cookies_file]) - if extra_args: - cmd.extend(extra_args.split()) + # Add extra args from config + if singlefile_args_extra: + cmd.extend(singlefile_args_extra) # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) diff --git a/archivebox/plugins/wget/config.json b/archivebox/plugins/wget/config.json index 968791ac..70893612 100644 --- a/archivebox/plugins/wget/config.json +++ b/archivebox/plugins/wget/config.json @@ -9,18 
+9,12 @@ "x-aliases": ["SAVE_WGET", "USE_WGET"], "description": "Enable wget archiving" }, - "WGET_SAVE_WARC": { + "WGET_WARC_ENABLED": { "type": "boolean", "default": true, - "x-aliases": ["SAVE_WARC"], + "x-aliases": ["SAVE_WARC", "WGET_SAVE_WARC"], "description": "Save WARC archive file" }, - "WGET_SAVE_REQUISITES": { - "type": "boolean", - "default": true, - "x-aliases": ["SAVE_WGET_REQUISITES"], - "description": "Download page requisites (CSS, JS, images)" - }, "WGET_BINARY": { "type": "string", "default": "wget", @@ -39,25 +33,17 @@ "x-fallback": "USER_AGENT", "description": "User agent string for wget" }, - "WGET_CHECK_SSL_VALIDITY": { - "type": "boolean", - "default": true, - "x-fallback": "CHECK_SSL_VALIDITY", - "x-aliases": ["CHECK_SSL_VALIDITY"], - "description": "Whether to verify SSL certificates" - }, "WGET_COOKIES_FILE": { "type": "string", "default": "", "x-fallback": "COOKIES_FILE", "description": "Path to cookies file" }, - "WGET_RESTRICT_FILE_NAMES": { - "type": "string", - "default": "windows", - "enum": ["windows", "unix", "ascii", "nocontrol", "lowercase", "uppercase"], - "x-fallback": "RESTRICT_FILE_NAMES", - "description": "Filename restriction mode" + "WGET_CHECK_SSL_VALIDITY": { + "type": "boolean", + "default": true, + "x-fallback": "CHECK_SSL_VALIDITY", + "description": "Whether to verify SSL certificates" }, "WGET_ARGS": { "type": "array", @@ -70,14 +56,20 @@ "--backup-converted", "--span-hosts", "--no-parent", + "--page-requisites", + "--restrict-file-names=windows", + "--tries=2", "-e", "robots=off" ], + "x-aliases": ["WGET_DEFAULT_ARGS"], "description": "Default wget arguments" }, - "WGET_EXTRA_ARGS": { - "type": "string", - "default": "", - "description": "Extra arguments for wget (space-separated)" + "WGET_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["WGET_EXTRA_ARGS"], + "description": "Extra arguments to append to wget command" } } } diff --git a/archivebox/plugins/wget/on_Snapshot__61_wget.py b/archivebox/plugins/wget/on_Snapshot__61_wget.py index b605ea6c..8d4372d5 100644 --- a/archivebox/plugins/wget/on_Snapshot__61_wget.py +++ b/archivebox/plugins/wget/on_Snapshot__61_wget.py @@ -6,25 +6,15 @@ Usage: on_Snapshot__wget.py --url= --snapshot-id= Output: Downloads files to $PWD Environment variables: - WGET_BINARY: Path to wget binary (optional, falls back to PATH) - WGET_TIMEOUT: Timeout in seconds (default: 60) - WGET_USER_AGENT: User agent string - WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True) - WGET_COOKIES_FILE: Path to cookies file (optional) - WGET_RESTRICT_FILE_NAMES: Filename restriction mode (default: windows) - WGET_EXTRA_ARGS: Extra arguments for wget (space-separated) - - # Wget feature toggles - SAVE_WGET: Enable wget archiving (default: True) - SAVE_WARC: Save WARC file (default: True) - SAVE_WGET_REQUISITES: Download page requisites (default: True) - - # Fallback to ARCHIVING_CONFIG values if WGET_* not set: - TIMEOUT: Fallback timeout - USER_AGENT: Fallback user agent - CHECK_SSL_VALIDITY: Fallback SSL check - COOKIES_FILE: Fallback cookies file - RESTRICT_FILE_NAMES: Fallback filename restriction + WGET_ENABLED: Enable wget archiving (default: True) + WGET_WARC_ENABLED: Save WARC file (default: True) + WGET_BINARY: Path to wget binary (default: wget) + WGET_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) + WGET_USER_AGENT: User agent string (x-fallback: USER_AGENT) + WGET_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE) + 
WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (x-fallback: CHECK_SSL_VALIDITY) + WGET_ARGS: Default wget arguments (JSON array) + WGET_ARGS_EXTRA: Extra arguments to append (JSON array) """ import json @@ -65,6 +55,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + STATICFILE_DIR = '../staticfile' def has_staticfile_output() -> bool: @@ -73,17 +77,6 @@ def has_staticfile_output() -> bool: return staticfile_dir.exists() and any(staticfile_dir.iterdir()) -# Default wget args (from old WGET_CONFIG) -WGET_DEFAULT_ARGS = [ - '--no-verbose', - '--adjust-extension', - '--convert-links', - '--force-directories', - '--backup-converted', - '--span-hosts', - '--no-parent', - '-e', 'robots=off', -] def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]: @@ -92,36 +85,28 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ - # Get config from env (with WGET_ prefix or fallback to ARCHIVING_CONFIG style) + # Get config from env (with WGET_ prefix, x-fallback handled by config loader) timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60) user_agent = get_env('WGET_USER_AGENT') or get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)') - check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True)) + check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', True) if get_env('WGET_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True) cookies_file = get_env('WGET_COOKIES_FILE') or get_env('COOKIES_FILE', '') - restrict_names = get_env('WGET_RESTRICT_FILE_NAMES') or get_env('RESTRICT_FILE_NAMES', 'windows') - extra_args = get_env('WGET_EXTRA_ARGS', '') + wget_args = get_env_array('WGET_ARGS', []) + wget_args_extra = get_env_array('WGET_ARGS_EXTRA', []) # Feature toggles - save_warc = get_env_bool('WGET_SAVE_WARC', True) - save_requisites = get_env_bool('WGET_SAVE_REQUISITES', True) + warc_enabled = get_env_bool('WGET_WARC_ENABLED', True) # Build wget command (later options take precedence) cmd = [ binary, - *WGET_DEFAULT_ARGS, + *wget_args, f'--timeout={timeout}', - '--tries=2', ] if user_agent: cmd.append(f'--user-agent={user_agent}') - if restrict_names: - cmd.append(f'--restrict-file-names={restrict_names}') - - if save_requisites: - cmd.append('--page-requisites') - - if save_warc: + if warc_enabled: warc_dir = Path('warc') warc_dir.mkdir(exist_ok=True) warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp())) @@ -135,8 +120,8 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]: if not check_ssl: cmd.extend(['--no-check-certificate', '--no-hsts']) - if extra_args: - cmd.extend(extra_args.split()) + if wget_args_extra: + cmd.extend(wget_args_extra) cmd.append(url) diff --git a/archivebox/plugins/ytdlp/binaries.jsonl b/archivebox/plugins/ytdlp/binaries.jsonl index beb44a4a..05240fd2 100644 --- a/archivebox/plugins/ytdlp/binaries.jsonl +++ b/archivebox/plugins/ytdlp/binaries.jsonl @@ -1,3 +1,3 @@ -{"type": "Binary", "name": "yt-dlp", 
"binproviders": "pip,brew,apt,env"} +{"type": "Binary", "name": "yt-dlp", "binproviders": "pip,brew,apt,env", "overrides": {"pip": {"packages": "yt-dlp[default]"}}} {"type": "Binary", "name": "node", "binproviders": "apt,brew,env", "overrides": {"apt": {"packages": ["nodejs"]}}} {"type": "Binary", "name": "ffmpeg", "binproviders": "apt,brew,env"} diff --git a/archivebox/plugins/ytdlp/config.json b/archivebox/plugins/ytdlp/config.json index 69ae5566..6c9a74b6 100644 --- a/archivebox/plugins/ytdlp/config.json +++ b/archivebox/plugins/ytdlp/config.json @@ -15,6 +15,12 @@ "x-aliases": ["MEDIA_BINARY", "YOUTUBEDL_BINARY", "YOUTUBE_DL_BINARY"], "description": "Path to yt-dlp binary" }, + "YTDLP_NODE_BINARY": { + "type": "string", + "default": "node", + "x-fallback": "NODE_BINARY", + "description": "Path to Node.js binary for yt-dlp JS runtime" + }, "YTDLP_TIMEOUT": { "type": "integer", "default": 3600, @@ -23,6 +29,12 @@ "x-aliases": ["MEDIA_TIMEOUT"], "description": "Timeout for yt-dlp downloads in seconds" }, + "YTDLP_COOKIES_FILE": { + "type": "string", + "default": "", + "x-fallback": "COOKIES_FILE", + "description": "Path to cookies file" + }, "YTDLP_MAX_SIZE": { "type": "string", "default": "750m", @@ -58,14 +70,15 @@ "--no-progress", "-o", "%(title)s.%(ext)s" ], - "x-aliases": ["MEDIA_ARGS"], - "description": "Default yt-dlp arguments (override to customize behavior)" + "x-aliases": ["YTDLP_DEFAULT_ARGS", "MEDIA_ARGS"], + "description": "Default yt-dlp arguments" }, - "YTDLP_EXTRA_ARGS": { - "type": "string", - "default": "", - "x-aliases": ["MEDIA_EXTRA_ARGS"], - "description": "Extra arguments for yt-dlp (space-separated, appended after YTDLP_ARGS)" + "YTDLP_ARGS_EXTRA": { + "type": "array", + "items": {"type": "string"}, + "default": [], + "x-aliases": ["YTDLP_EXTRA_ARGS", "MEDIA_ARGS_EXTRA", "MEDIA_EXTRA_ARGS"], + "description": "Extra arguments to append to yt-dlp command" } } } diff --git a/archivebox/plugins/ytdlp/on_Snapshot__63_ytdlp.bg.py b/archivebox/plugins/ytdlp/on_Snapshot__63_ytdlp.bg.py index 6471645d..b8112b16 100644 --- a/archivebox/plugins/ytdlp/on_Snapshot__63_ytdlp.bg.py +++ b/archivebox/plugins/ytdlp/on_Snapshot__63_ytdlp.bg.py @@ -3,24 +3,18 @@ Download video/audio from a URL using yt-dlp. 
Usage: on_Snapshot__ytdlp.py --url= --snapshot-id= -Output: Downloads video/audio files to $PWD/ytdlp/ +Output: Downloads video/audio files to $PWD Environment variables: - YTDLP_BINARY: Path to yt-dlp binary - YTDLP_TIMEOUT: Timeout in seconds (default: 3600 for large downloads) - YTDLP_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True) - YTDLP_ARGS: JSON array of yt-dlp arguments (overrides defaults) - YTDLP_EXTRA_ARGS: Extra arguments for yt-dlp (space-separated, appended) - YTDLP_MAX_SIZE: Maximum file size (default: 750m) - - # Feature toggles (with backwards-compatible aliases) YTDLP_ENABLED: Enable yt-dlp extraction (default: True) - SAVE_YTDLP: Alias for YTDLP_ENABLED - MEDIA_ENABLED: Backwards-compatible alias for YTDLP_ENABLED - - # Fallback to ARCHIVING_CONFIG values if YTDLP_* not set: - TIMEOUT: Fallback timeout - CHECK_SSL_VALIDITY: Fallback SSL check + YTDLP_BINARY: Path to yt-dlp binary (default: yt-dlp) + YTDLP_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY) + YTDLP_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT) + YTDLP_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE) + YTDLP_MAX_SIZE: Maximum file size (default: 750m) + YTDLP_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY) + YTDLP_ARGS: Default yt-dlp arguments (JSON array) + YTDLP_ARGS_EXTRA: Extra arguments to append (JSON array) """ import json @@ -59,6 +53,20 @@ def get_env_int(name: str, default: int = 0) -> int: return default +def get_env_array(name: str, default: list[str] | None = None) -> list[str]: + """Parse a JSON array from environment variable.""" + val = get_env(name, '') + if not val: + return default if default is not None else [] + try: + result = json.loads(val) + if isinstance(result, list): + return [str(item) for item in result] + return default if default is not None else [] + except json.JSONDecodeError: + return default if default is not None else [] + + STATICFILE_DIR = '../staticfile' def has_staticfile_output() -> bool: @@ -67,69 +75,41 @@ def has_staticfile_output() -> bool: return staticfile_dir.exists() and any(staticfile_dir.iterdir()) -# Default yt-dlp args (can be overridden via YTDLP_ARGS env var) -YTDLP_DEFAULT_ARGS = [ - '--restrict-filenames', - '--trim-filenames', '128', - '--write-description', - '--write-info-json', - '--write-thumbnail', - '--write-sub', - '--write-auto-subs', - '--convert-subs=srt', - '--yes-playlist', - '--continue', - '--no-abort-on-error', - '--ignore-errors', - '--geo-bypass', - '--add-metadata', - '--no-progress', - '-o', '%(title)s.%(ext)s', -] - - -def get_ytdlp_args() -> list[str]: - """Get yt-dlp arguments from YTDLP_ARGS env var or use defaults.""" - ytdlp_args_str = get_env('YTDLP_ARGS', '') - if ytdlp_args_str: - try: - # Try to parse as JSON array - args = json.loads(ytdlp_args_str) - if isinstance(args, list): - return [str(arg) for arg in args] - except json.JSONDecodeError: - pass - return YTDLP_DEFAULT_ARGS - - def save_ytdlp(url: str, binary: str) -> tuple[bool, str | None, str]: """ Download video/audio using yt-dlp. 
Returns: (success, output_path, error_message) """ - # Get config from env (YTDLP_* primary, MEDIA_* as fallback via aliases) - timeout = get_env_int('TIMEOUT', 3600) - check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True) - extra_args = get_env('YTDLP_EXTRA_ARGS', '') - max_size = get_env('YTDLP_MAX_SIZE', '') or get_env('MEDIA_MAX_SIZE', '750m') + # Get config from env (with YTDLP_ prefix, x-fallback handled by config loader) + timeout = get_env_int('YTDLP_TIMEOUT') or get_env_int('TIMEOUT', 3600) + check_ssl = get_env_bool('YTDLP_CHECK_SSL_VALIDITY', True) if get_env('YTDLP_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True) + cookies_file = get_env('YTDLP_COOKIES_FILE') or get_env('COOKIES_FILE', '') + max_size = get_env('YTDLP_MAX_SIZE', '750m') + node_binary = get_env('YTDLP_NODE_BINARY') or get_env('NODE_BINARY', 'node') + ytdlp_args = get_env_array('YTDLP_ARGS', []) + ytdlp_args_extra = get_env_array('YTDLP_ARGS_EXTRA', []) # Output directory is current directory (hook already runs in output dir) output_dir = Path(OUTPUT_DIR) - # Build command using configurable YTDLP_ARGS (later options take precedence) + # Build command (later options take precedence) cmd = [ binary, - *get_ytdlp_args(), - # Format with max_size limit (appended after YTDLP_ARGS so it can be overridden by YTDLP_EXTRA_ARGS) + *ytdlp_args, + # Format with max_size limit (appended after YTDLP_ARGS so it can be overridden by YTDLP_ARGS_EXTRA) f'--format=(bv*+ba/b)[filesize<={max_size}][filesize_approx<=?{max_size}]/(bv*+ba/b)', + f'--js-runtimes=node:{node_binary}', ] if not check_ssl: cmd.append('--no-check-certificate') - if extra_args: - cmd.extend(extra_args.split()) + if cookies_file and Path(cookies_file).is_file(): + cmd.extend(['--cookies', cookies_file]) + + if ytdlp_args_extra: + cmd.extend(ytdlp_args_extra) cmd.append(url) @@ -193,9 +173,8 @@ def main(url: str, snapshot_id: str): """Download video/audio from a URL using yt-dlp.""" try: - # Check if yt-dlp downloading is enabled (YTDLP_ENABLED primary, MEDIA_ENABLED fallback) - ytdlp_enabled = get_env_bool('YTDLP_ENABLED', True) and get_env_bool('MEDIA_ENABLED', True) - if not ytdlp_enabled: + # Check if yt-dlp downloading is enabled + if not get_env_bool('YTDLP_ENABLED', True): print('Skipping ytdlp (YTDLP_ENABLED=False)', file=sys.stderr) # Temporary failure (config disabled) - NO JSONL emission sys.exit(0)
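
Note on the new array-valued options: every *_ARGS / *_ARGS_EXTRA variable is a JSON array rather than a space-separated string, so a single argument can safely contain spaces. A minimal sketch of the round-trip, using the same get_env_array helper each hook above defines (the WGET_ARGS_EXTRA values here are illustrative, not defaults):

import json
import os


def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Parse a JSON array from an env var (same helper as in the hooks above)."""
    val = os.environ.get(name, '').strip()
    if not val:
        return default if default is not None else []
    try:
        result = json.loads(val)
        if isinstance(result, list):
            return [str(item) for item in result]
        return default if default is not None else []
    except json.JSONDecodeError:
        return default if default is not None else []


# An argument with an embedded space survives as one argv entry:
os.environ['WGET_ARGS_EXTRA'] = json.dumps(['--header=Accept-Language: en-US,en'])
assert get_env_array('WGET_ARGS_EXTRA') == ['--header=Accept-Language: en-US,en']

# A legacy space-separated string is not valid JSON, so the default is returned:
os.environ['WGET_ARGS_EXTRA'] = '--no-verbose --tries=2'
assert get_env_array('WGET_ARGS_EXTRA') == []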
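Each hook now assembles its command as [binary, *ARGS, computed flags, *ARGS_EXTRA, url], so user-supplied extras land last and, per the "later options take precedence" comment kept in save_wget(), can override the defaults for tools that honor that. A sketch of the ordering with the wget defaults from this patch; the --tries=5 override and the check_ssl/user_agent values are hypothetical:

wget_args = [  # defaults copied from wget/config.json above
    '--no-verbose', '--adjust-extension', '--convert-links',
    '--force-directories', '--backup-converted', '--span-hosts',
    '--no-parent', '--page-requisites', '--restrict-file-names=windows',
    '--tries=2', '-e', 'robots=off',
]
wget_args_extra = ['--tries=5']  # hypothetical user override via WGET_ARGS_EXTRA
timeout, check_ssl = 60, False
user_agent = 'Mozilla/5.0 (compatible; ArchiveBox/1.0)'

cmd = ['wget', *wget_args, f'--timeout={timeout}']
if user_agent:
    cmd.append(f'--user-agent={user_agent}')
if not check_ssl:
    cmd.extend(['--no-check-certificate', '--no-hsts'])
if wget_args_extra:
    cmd.extend(wget_args_extra)  # appended last: --tries=5 supersedes the default --tries=2
cmd.append('https://example.com')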
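The schemas lean on two custom keywords: x-aliases (legacy names for the same key) and x-fallback (a global key consulted when the plugin-scoped one is unset). The hooks assume the config loader resolves these before they run ("x-fallback handled by config loader"); the loader itself is not part of this diff, so the lookup order below is a hypothetical sketch and resolve_config_value is an illustrative name:

def resolve_config_value(key: str, schema: dict, env: dict[str, str]) -> object:
    """Hypothetical lookup order for one key: exact name, then each
    x-alias (legacy name), then the x-fallback global, then the schema default."""
    prop = schema['properties'][key]
    if key in env:
        return env[key]
    for alias in prop.get('x-aliases', []):
        if alias in env:
            return env[alias]
    fallback = prop.get('x-fallback')
    if fallback and fallback in env:
        return env[fallback]
    return prop.get('default')

# e.g. with WGET_TIMEOUT unset and only the global set:
#   resolve_config_value('WGET_TIMEOUT', schema, {'TIMEOUT': '120'}) -> '120'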