make plugin config more consistent

This commit is contained in:
Nick Sweeting
2025-12-29 13:11:34 -08:00
parent 8d76b2b0c6
commit 967c5d53e0
23 changed files with 452 additions and 339 deletions

View File

@@ -9,10 +9,10 @@
"x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"],
"description": "Path to Chrome/Chromium binary"
},
"NODE_BINARY": {
"CHROME_NODE_BINARY": {
"type": "string",
"default": "node",
"x-aliases": ["NODEJS_BINARY"],
"x-fallback": "NODE_BINARY",
"description": "Path to Node.js binary (for Puppeteer)"
},
"CHROME_TIMEOUT": {
@@ -50,16 +50,19 @@
"x-fallback": "USER_AGENT",
"description": "User agent string for Chrome"
},
"CHROME_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra command-line arguments for Chrome (space-separated)"
"CHROME_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["CHROME_DEFAULT_ARGS"],
"description": "Default Chrome command-line arguments"
},
"CHROME_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
"CHROME_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["CHROME_EXTRA_ARGS"],
"description": "Extra arguments to append to Chrome command"
}
}
}

View File

@@ -21,12 +21,6 @@
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string"
},
"FAVICON_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
}
}
}

View File

@@ -27,21 +27,25 @@
"enum": ["jsonl", "warc", "mbox", "maildir", "mh", "mmdf", "babyl"],
"description": "Output format for forum downloads"
},
"FORUMDL_TEXTIFY": {
"type": "boolean",
"default": false,
"description": "Convert HTML content to plaintext (keep false to preserve HTML)"
},
"FORUMDL_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"FORUMDL_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for forum-dl (space-separated)"
"FORUMDL_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["FORUMDL_DEFAULT_ARGS"],
"description": "Default forum-dl arguments"
},
"FORUMDL_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["FORUMDL_EXTRA_ARGS"],
"description": "Extra arguments to append to forum-dl command"
}
}
}

View File

@@ -6,19 +6,13 @@ Usage: on_Snapshot__forumdl.py --url=<url> --snapshot-id=<uuid>
Output: Downloads forum content to $PWD/
Environment variables:
FORUMDL_BINARY: Path to forum-dl binary
FORUMDL_TIMEOUT: Timeout in seconds (default: 3600 for large forums)
FORUMDL_ENABLED: Enable forum downloading (default: True)
FORUMDL_BINARY: Path to forum-dl binary (default: forum-dl)
FORUMDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
FORUMDL_OUTPUT_FORMAT: Output format (default: jsonl)
FORUMDL_TEXTIFY: Convert HTML to plaintext (default: False - keeps HTML)
FORUMDL_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
FORUMDL_EXTRA_ARGS: Extra arguments for forum-dl (space-separated)
# Forum-dl feature toggles
SAVE_FORUMDL: Enable forum-dl forum extraction (default: True)
# Fallback to ARCHIVING_CONFIG values if FORUMDL_* not set:
TIMEOUT: Fallback timeout
CHECK_SSL_VALIDITY: Fallback SSL check
FORUMDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
FORUMDL_ARGS: Default forum-dl arguments (JSON array)
FORUMDL_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
import json
@@ -78,6 +72,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Read *name* from the environment and parse it as a JSON array of strings.

    Returns *default* (or [] when default is None) if the variable is unset,
    empty, not valid JSON, or not a JSON list.
    """
    fallback = [] if default is None else default
    raw = get_env(name, '')
    if not raw:
        return fallback
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return fallback
    if isinstance(parsed, list):
        return [str(entry) for entry in parsed]
    return fallback
def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
@@ -85,11 +93,11 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
# Get config from env
timeout = get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
textify = get_env_bool('FORUMDL_TEXTIFY', False)
extra_args = get_env('FORUMDL_EXTRA_ARGS', '')
# Get config from env (with FORUMDL_ prefix, x-fallback handled by config loader)
timeout = get_env_int('FORUMDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('FORUMDL_CHECK_SSL_VALIDITY', True) if get_env('FORUMDL_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
forumdl_args = get_env_array('FORUMDL_ARGS', [])
forumdl_args_extra = get_env_array('FORUMDL_ARGS_EXTRA', [])
output_format = get_env('FORUMDL_OUTPUT_FORMAT', 'jsonl')
# Output directory is current directory (hook already runs in output dir)
@@ -108,16 +116,13 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
output_file = output_dir / f'forum.{output_format}'
# Build command
cmd = [binary, '-f', output_format, '-o', str(output_file)]
if textify:
cmd.append('--textify')
cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)]
if not check_ssl:
cmd.append('--no-check-certificate')
if extra_args:
cmd.extend(extra_args.split())
if forumdl_args_extra:
cmd.extend(forumdl_args_extra)
cmd.append(url)

View File

@@ -21,6 +21,12 @@
"x-fallback": "TIMEOUT",
"description": "Timeout for gallery downloads in seconds"
},
"GALLERYDL_COOKIES_FILE": {
"type": "string",
"default": "",
"x-fallback": "COOKIES_FILE",
"description": "Path to cookies file"
},
"GALLERYDL_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
@@ -34,12 +40,15 @@
"--write-metadata",
"--write-info-json"
],
"x-aliases": ["GALLERYDL_DEFAULT_ARGS"],
"description": "Default gallery-dl arguments"
},
"GALLERYDL_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for gallery-dl (space-separated)"
"GALLERYDL_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["GALLERYDL_EXTRA_ARGS"],
"description": "Extra arguments to append to gallery-dl command"
}
}
}

View File

@@ -6,20 +6,13 @@ Usage: on_Snapshot__gallerydl.py --url=<url> --snapshot-id=<uuid>
Output: Downloads gallery images to $PWD/gallerydl/
Environment variables:
GALLERYDL_BINARY: Path to gallery-dl binary
GALLERYDL_TIMEOUT: Timeout in seconds (default: 3600 for large galleries)
GALLERYDL_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
GALLERYDL_EXTRA_ARGS: Extra arguments for gallery-dl (space-separated)
COOKIES_FILE: Path to cookies file for authentication
# Gallery-dl feature toggles
USE_GALLERYDL: Enable gallery-dl gallery extraction (default: True)
SAVE_GALLERYDL: Alias for USE_GALLERYDL
# Fallback to ARCHIVING_CONFIG values if GALLERYDL_* not set:
GALLERYDL_TIMEOUT: Fallback timeout for gallery downloads
TIMEOUT: Fallback timeout
CHECK_SSL_VALIDITY: Fallback SSL check
GALLERYDL_ENABLED: Enable gallery-dl gallery extraction (default: True)
GALLERYDL_BINARY: Path to gallery-dl binary (default: gallery-dl)
GALLERYDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
GALLERYDL_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
GALLERYDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
GALLERYDL_ARGS: Default gallery-dl arguments (JSON array)
GALLERYDL_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
import json
@@ -58,6 +51,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Read *name* from the environment and parse it as a JSON array of strings.

    Falls back to *default* (or []) when the variable is unset, empty,
    malformed JSON, or not a list.
    """
    fallback = [] if default is None else default
    raw = get_env(name, '')
    if not raw:
        return fallback
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        return fallback
    return [str(item) for item in decoded] if isinstance(decoded, list) else fallback
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
@@ -66,35 +73,27 @@ def has_staticfile_output() -> bool:
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
# Default gallery-dl args
def get_gallerydl_default_args() -> list[str]:
    """Return the baseline command-line flags always passed to gallery-dl."""
    flags = ['--write-metadata', '--write-info-json']
    return flags
def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
Download gallery using gallery-dl.
Returns: (success, output_path, error_message)
"""
# Get config from env
timeout = get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
extra_args = get_env('GALLERYDL_EXTRA_ARGS', '')
cookies_file = get_env('COOKIES_FILE', '')
# Get config from env (with GALLERYDL_ prefix, x-fallback handled by config loader)
timeout = get_env_int('GALLERYDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('GALLERYDL_CHECK_SSL_VALIDITY', True) if get_env('GALLERYDL_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
gallerydl_args = get_env_array('GALLERYDL_ARGS', [])
gallerydl_args_extra = get_env_array('GALLERYDL_ARGS_EXTRA', [])
cookies_file = get_env('GALLERYDL_COOKIES_FILE') or get_env('COOKIES_FILE', '')
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
# Build command (later options take precedence)
# Build command
# Use -D for exact directory (flat structure) instead of -d (nested structure)
cmd = [
binary,
*get_gallerydl_default_args(),
*gallerydl_args,
'-D', str(output_dir),
]
@@ -104,8 +103,8 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
if cookies_file and Path(cookies_file).exists():
cmd.extend(['-C', cookies_file])
if extra_args:
cmd.extend(extra_args.split())
if gallerydl_args_extra:
cmd.extend(gallerydl_args_extra)
cmd.append(url)

View File

@@ -26,16 +26,19 @@
"default": "github.com,gitlab.com,bitbucket.org,gist.github.com,codeberg.org,gitea.com,git.sr.ht",
"description": "Comma-separated list of domains to treat as git repositories"
},
"GIT_CLONE_DEPTH": {
"type": "integer",
"default": 1,
"minimum": 0,
"description": "Depth of git clone (0 for full history, 1 for shallow)"
"GIT_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": ["clone", "--depth=1", "--recursive"],
"x-aliases": ["GIT_DEFAULT_ARGS"],
"description": "Default git arguments"
},
"GIT_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for git clone"
"GIT_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["GIT_EXTRA_ARGS"],
"description": "Extra arguments to append to git command"
}
}
}

View File

@@ -8,7 +8,8 @@ Output: Clones repository to $PWD/repo
Environment variables:
GIT_BINARY: Path to git binary
GIT_TIMEOUT: Timeout in seconds (default: 120)
GIT_ARGS: Extra arguments for git clone (space-separated)
GIT_ARGS: Default git arguments (JSON array, default: ["clone", "--depth=1", "--recursive"])
GIT_ARGS_EXTRA: Extra arguments to append (JSON array, default: [])
# Fallback to ARCHIVING_CONFIG values if GIT_* not set:
TIMEOUT: Fallback timeout
@@ -41,6 +42,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Read *name* from the environment and parse it as a JSON array of strings.

    Returns *default* (or [] when default is None) for an unset/empty
    variable, invalid JSON, or a JSON value that is not a list.
    """
    fallback = [] if default is None else default
    raw = get_env(name, '')
    if not raw:
        return fallback
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return fallback
    if isinstance(parsed, list):
        return [str(entry) for entry in parsed]
    return fallback
def is_git_url(url: str) -> bool:
"""Check if URL looks like a git repository."""
git_patterns = [
@@ -61,19 +76,10 @@ def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
timeout = get_env_int('GIT_TIMEOUT') or get_env_int('TIMEOUT', 120)
extra_args = get_env('GIT_ARGS')
git_args = get_env_array('GIT_ARGS', [])
git_args_extra = get_env_array('GIT_ARGS_EXTRA', [])
cmd = [
binary,
'clone',
'--depth=1',
'--recursive',
]
if extra_args:
cmd.extend(extra_args.split())
cmd.extend([url, OUTPUT_DIR])
cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR]
try:
result = subprocess.run(cmd, capture_output=True, timeout=timeout)

View File

@@ -15,17 +15,26 @@
"x-aliases": ["POSTLIGHT_PARSER_BINARY"],
"description": "Path to Mercury/Postlight parser binary"
},
"NODE_BINARY": {
"type": "string",
"default": "node",
"description": "Path to Node.js binary"
},
"MERCURY_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for Mercury in seconds"
},
"MERCURY_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["MERCURY_DEFAULT_ARGS"],
"description": "Default Mercury parser arguments"
},
"MERCURY_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["MERCURY_EXTRA_ARGS"],
"description": "Extra arguments to append to Mercury parser command"
}
}
}

View File

@@ -8,8 +8,8 @@ Output: Creates mercury/ directory with content.html, content.txt, article.json
Environment variables:
MERCURY_BINARY: Path to postlight-parser binary
MERCURY_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
# Fallback to ARCHIVING_CONFIG values if MERCURY_* not set:
MERCURY_ARGS: Default Mercury arguments (JSON array)
MERCURY_ARGS_EXTRA: Extra arguments to append (JSON array)
TIMEOUT: Fallback timeout
Note: Requires postlight-parser: npm install -g @postlight/parser
@@ -51,6 +51,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Read *name* from the environment and parse it as a JSON array of strings.

    Any failure mode (unset, empty, bad JSON, non-list JSON) yields
    *default*, or [] when no default is given.
    """
    fallback = [] if default is None else default
    raw = get_env(name, '')
    if not raw:
        return fallback
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        return fallback
    return [str(item) for item in decoded] if isinstance(decoded, list) else fallback
def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
Extract article using Mercury Parser.
@@ -58,13 +72,15 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
timeout = get_env_int('MERCURY_TIMEOUT') or get_env_int('TIMEOUT', 60)
mercury_args = get_env_array('MERCURY_ARGS', [])
mercury_args_extra = get_env_array('MERCURY_ARGS_EXTRA', [])
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
try:
# Get text version
cmd_text = [binary, url, '--format=text']
cmd_text = [binary, *mercury_args, *mercury_args_extra, url, '--format=text']
result_text = subprocess.run(cmd_text, capture_output=True, timeout=timeout)
if result_text.returncode != 0:
@@ -84,7 +100,7 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
(output_dir / 'content.txt').write_text(text_content, encoding='utf-8')
# Get HTML version
cmd_html = [binary, url, '--format=html']
cmd_html = [binary, *mercury_args, *mercury_args_extra, url, '--format=html']
result_html = subprocess.run(cmd_html, capture_output=True, timeout=timeout)
try:

View File

@@ -21,10 +21,19 @@
"x-fallback": "TIMEOUT",
"description": "Timeout for paper downloads in seconds"
},
"PAPERSDL_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for papers-dl (space-separated)"
"PAPERSDL_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": ["fetch"],
"x-aliases": ["PAPERSDL_DEFAULT_ARGS"],
"description": "Default papers-dl arguments"
},
"PAPERSDL_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["PAPERSDL_EXTRA_ARGS"],
"description": "Extra arguments to append to papers-dl command"
}
}
}

View File

@@ -8,7 +8,8 @@ Output: Downloads paper PDFs to $PWD/
Environment variables:
PAPERSDL_BINARY: Path to papers-dl binary
PAPERSDL_TIMEOUT: Timeout in seconds (default: 300 for paper downloads)
PAPERSDL_EXTRA_ARGS: Extra arguments for papers-dl (space-separated)
PAPERSDL_ARGS: Default papers-dl arguments (JSON array, default: ["fetch"])
PAPERSDL_ARGS_EXTRA: Extra arguments to append (JSON array)
# papers-dl feature toggles
SAVE_PAPERSDL: Enable papers-dl paper extraction (default: True)
@@ -54,6 +55,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Read *name* from the environment and parse it as a JSON array of strings.

    Returns *default* (or [] when default is None) if the variable is unset,
    empty, not valid JSON, or not a JSON list.
    """
    fallback = [] if default is None else default
    raw = get_env(name, '')
    if not raw:
        return fallback
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return fallback
    if isinstance(parsed, list):
        return [str(entry) for entry in parsed]
    return fallback
def extract_doi_from_url(url: str) -> str | None:
"""Extract DOI from common paper URLs."""
# Match DOI pattern in URL
@@ -72,7 +87,8 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
# Get config from env
timeout = get_env_int('TIMEOUT', 300)
extra_args = get_env('PAPERSDL_EXTRA_ARGS', '')
papersdl_args = get_env_array('PAPERSDL_ARGS', [])
papersdl_args_extra = get_env_array('PAPERSDL_ARGS_EXTRA', [])
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
@@ -85,11 +101,11 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
else:
identifier = doi
# Build command - papers-dl fetch <identifier> -o <output_dir>
cmd = [binary, 'fetch', identifier, '-o', str(output_dir)]
# Build command - papers-dl <args> <identifier> -o <output_dir>
cmd = [binary, *papersdl_args, identifier, '-o', str(output_dir)]
if extra_args:
cmd.extend(extra_args.split())
if papersdl_args_extra:
cmd.extend(papersdl_args_extra)
try:
result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)

View File

@@ -14,17 +14,26 @@
"default": "readability-extractor",
"description": "Path to readability-extractor binary"
},
"NODE_BINARY": {
"type": "string",
"default": "node",
"description": "Path to Node.js binary"
},
"READABILITY_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for Readability in seconds"
},
"READABILITY_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["READABILITY_DEFAULT_ARGS"],
"description": "Default Readability arguments"
},
"READABILITY_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["READABILITY_EXTRA_ARGS"],
"description": "Extra arguments to append to Readability command"
}
}
}

View File

@@ -8,8 +8,8 @@ Output: Creates readability/ directory with content.html, content.txt, article.j
Environment variables:
READABILITY_BINARY: Path to readability-extractor binary
READABILITY_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
# Fallback to ARCHIVING_CONFIG values if READABILITY_* not set:
READABILITY_ARGS: Default Readability arguments (JSON array)
READABILITY_ARGS_EXTRA: Extra arguments to append (JSON array)
TIMEOUT: Fallback timeout
Note: Requires readability-extractor from https://github.com/ArchiveBox/readability-extractor
@@ -44,6 +44,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Read *name* from the environment and parse it as a JSON array of strings.

    Falls back to *default* (or []) when the variable is unset, empty,
    malformed JSON, or not a list.
    """
    fallback = [] if default is None else default
    raw = get_env(name, '')
    if not raw:
        return fallback
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        return fallback
    return [str(item) for item in decoded] if isinstance(decoded, list) else fallback
def find_html_source() -> str | None:
"""Find HTML content from other extractors in the snapshot directory."""
# Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
@@ -73,6 +87,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
timeout = get_env_int('READABILITY_TIMEOUT') or get_env_int('TIMEOUT', 60)
readability_args = get_env_array('READABILITY_ARGS', [])
readability_args_extra = get_env_array('READABILITY_ARGS_EXTRA', [])
# Find HTML source
html_source = find_html_source()
@@ -84,7 +100,7 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
try:
# Run readability-extractor (outputs JSON by default)
cmd = [binary, html_source]
cmd = [binary, *readability_args, *readability_args_extra, html_source]
result = subprocess.run(cmd, capture_output=True, timeout=timeout)
if result.returncode != 0:

View File

@@ -3,25 +3,32 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SEARCH_BACKEND_RIPGREP_BINARY": {
"RIPGREP_BINARY": {
"type": "string",
"default": "rg",
"x-aliases": ["RIPGREP_BINARY"],
"description": "Path to ripgrep binary"
},
"SEARCH_BACKEND_RIPGREP_IGNORE_EXTENSIONS": {
"type": "string",
"default": "css,js,orig,svg",
"x-aliases": ["RIPGREP_IGNORE_EXTENSIONS"],
"description": "Comma-separated file extensions to ignore"
},
"SEARCH_BACKEND_RIPGREP_TIMEOUT": {
"RIPGREP_TIMEOUT": {
"type": "integer",
"default": 90,
"minimum": 5,
"x-fallback": "TIMEOUT",
"x-aliases": ["SEARCH_BACKEND_TIMEOUT"],
"description": "Search timeout in seconds"
},
"RIPGREP_ARGS": {
"type": "array",
"items": { "type": "string" },
"default": ["--files-with-matches", "--no-messages", "--ignore-case"],
"x-aliases": ["RIPGREP_DEFAULT_ARGS"],
"description": "Default ripgrep arguments"
},
"RIPGREP_ARGS_EXTRA": {
"type": "array",
"items": { "type": "string" },
"default": [],
"x-aliases": ["RIPGREP_EXTRA_ARGS"],
"description": "Extra arguments to append to ripgrep command"
}
}
}

View File

@@ -6,10 +6,12 @@ using ripgrep (rg). This is simpler but slower for large archives.
Environment variables:
RIPGREP_BINARY: Path to ripgrep binary (default: rg)
RIPGREP_IGNORE_EXTENSIONS: Comma-separated extensions to ignore (default: css,js,orig,svg)
SEARCH_BACKEND_TIMEOUT: Search timeout in seconds (default: 90)
RIPGREP_ARGS: Default ripgrep arguments (JSON array)
RIPGREP_ARGS_EXTRA: Extra arguments to append (JSON array)
RIPGREP_TIMEOUT: Search timeout in seconds (default: 90)
"""
import json
import os
import subprocess
import shutil
@@ -19,39 +21,57 @@ from typing import List, Iterable
from django.conf import settings
# Config with old var names for backwards compatibility
RIPGREP_BINARY = os.environ.get('RIPGREP_BINARY', 'rg').strip()
RIPGREP_IGNORE_EXTENSIONS = os.environ.get('RIPGREP_IGNORE_EXTENSIONS', 'css,js,orig,svg').strip()
SEARCH_BACKEND_TIMEOUT = int(os.environ.get('SEARCH_BACKEND_TIMEOUT', '90'))
def get_env(name: str, default: str = '') -> str:
    """Fetch an environment variable, stripping surrounding whitespace."""
    value = os.environ.get(name, default)
    return value.strip()
def get_env_int(name: str, default: int = 0) -> int:
    """Fetch an environment variable as an int, returning *default* on failure."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Read *name* from the environment and parse it as a JSON array of strings.

    Returns *default* (or [] when default is None) for an unset/empty
    variable, invalid JSON, or a JSON value that is not a list.
    """
    fallback = [] if default is None else default
    raw = get_env(name, '')
    if not raw:
        return fallback
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return fallback
    if isinstance(parsed, list):
        return [str(entry) for entry in parsed]
    return fallback
def search(query: str) -> List[str]:
"""Search for snapshots using ripgrep."""
rg_binary = shutil.which(RIPGREP_BINARY) or RIPGREP_BINARY
rg_binary = get_env('RIPGREP_BINARY', 'rg')
rg_binary = shutil.which(rg_binary) or rg_binary
if not rg_binary or not Path(rg_binary).exists():
raise RuntimeError(f'ripgrep binary not found ({RIPGREP_BINARY}). Install with: apt install ripgrep')
raise RuntimeError(f'ripgrep binary not found. Install with: apt install ripgrep')
timeout = get_env_int('RIPGREP_TIMEOUT', 90)
ripgrep_args = get_env_array('RIPGREP_ARGS', [])
ripgrep_args_extra = get_env_array('RIPGREP_ARGS_EXTRA', [])
archive_dir = Path(settings.ARCHIVE_DIR)
if not archive_dir.exists():
return []
# Build ignore pattern from config
ignore_pattern = f'*.{{{RIPGREP_IGNORE_EXTENSIONS}}}'
cmd = [
rg_binary,
f'--type-add=ignore:{ignore_pattern}',
'--type-not=ignore',
'--files-with-matches',
'--no-messages',
'--ignore-case',
*ripgrep_args,
*ripgrep_args_extra,
'--regexp',
query,
str(archive_dir),
]
try:
result = subprocess.run(cmd, capture_output=True, text=True, timeout=SEARCH_BACKEND_TIMEOUT)
result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
# Extract snapshot IDs from file paths
# Paths look like: archive/<snapshot_id>/<extractor>/file.txt

View File

@@ -15,11 +15,18 @@
"x-aliases": ["SINGLE_FILE_BINARY"],
"description": "Path to single-file binary"
},
"NODE_BINARY": {
"SINGLEFILE_NODE_BINARY": {
"type": "string",
"default": "node",
"x-fallback": "NODE_BINARY",
"description": "Path to Node.js binary"
},
"SINGLEFILE_CHROME_BINARY": {
"type": "string",
"default": "",
"x-fallback": "CHROME_BINARY",
"description": "Path to Chrome/Chromium binary"
},
"SINGLEFILE_TIMEOUT": {
"type": "integer",
"default": 60,
@@ -39,16 +46,25 @@
"x-fallback": "COOKIES_FILE",
"description": "Path to cookies file"
},
"SINGLEFILE_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"SINGLEFILE_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"default": ["--browser-headless"],
"x-aliases": ["SINGLEFILE_DEFAULT_ARGS"],
"description": "Default single-file arguments"
},
"SINGLEFILE_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for single-file"
"SINGLEFILE_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["SINGLEFILE_EXTRA_ARGS"],
"description": "Extra arguments to append to single-file command"
}
}
}

View File

@@ -6,24 +6,16 @@ Usage: on_Snapshot__singlefile.py --url=<url> --snapshot-id=<uuid>
Output: Writes singlefile.html to $PWD
Environment variables:
SINGLEFILE_BINARY: Path to SingleFile binary
SINGLEFILE_TIMEOUT: Timeout in seconds (default: 120)
SINGLEFILE_USER_AGENT: User agent string (optional)
SINGLEFILE_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
SINGLEFILE_COOKIES_FILE: Path to cookies file (optional)
SINGLEFILE_EXTRA_ARGS: Extra arguments for SingleFile (space-separated)
# Feature toggle
SAVE_SINGLEFILE: Enable SingleFile archiving (default: True)
# Chrome binary (SingleFile needs Chrome)
CHROME_BINARY: Path to Chrome/Chromium binary
# Fallback to ARCHIVING_CONFIG values if SINGLEFILE_* not set:
TIMEOUT: Fallback timeout
USER_AGENT: Fallback user agent
CHECK_SSL_VALIDITY: Fallback SSL check
COOKIES_FILE: Fallback cookies file
SINGLEFILE_ENABLED: Enable SingleFile archiving (default: True)
SINGLEFILE_BINARY: Path to SingleFile binary (default: single-file)
SINGLEFILE_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY)
SINGLEFILE_CHROME_BINARY: Path to Chrome binary (x-fallback: CHROME_BINARY)
SINGLEFILE_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
SINGLEFILE_USER_AGENT: User agent string (x-fallback: USER_AGENT)
SINGLEFILE_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
SINGLEFILE_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
SINGLEFILE_ARGS: Default SingleFile arguments (JSON array)
SINGLEFILE_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
import json
@@ -63,6 +55,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Read *name* from the environment and parse it as a JSON array of strings.

    Any failure mode (unset, empty, bad JSON, non-list JSON) yields
    *default*, or [] when no default is given.
    """
    fallback = [] if default is None else default
    raw = get_env(name, '')
    if not raw:
        return fallback
    try:
        decoded = json.loads(raw)
    except json.JSONDecodeError:
        return fallback
    return [str(item) for item in decoded] if isinstance(decoded, list) else fallback
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
@@ -121,15 +127,16 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
# Get config from env (with SINGLEFILE_ prefix or fallback to ARCHIVING_CONFIG style)
# Get config from env (with SINGLEFILE_ prefix, x-fallback handled by config loader)
timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '')
check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', True) if get_env('SINGLEFILE_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '')
extra_args = get_env('SINGLEFILE_EXTRA_ARGS', '')
chrome = get_env('CHROME_BINARY', '')
singlefile_args = get_env_array('SINGLEFILE_ARGS', [])
singlefile_args_extra = get_env_array('SINGLEFILE_ARGS_EXTRA', [])
chrome = get_env('SINGLEFILE_CHROME_BINARY') or get_env('CHROME_BINARY', '')
cmd = [binary]
cmd = [binary, *singlefile_args]
# Try to use existing Chrome session via CDP
cdp_url = get_cdp_url()
@@ -142,11 +149,6 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
elif chrome:
cmd.extend(['--browser-executable-path', chrome])
# Common options
cmd.extend([
'--browser-headless',
])
# SSL handling
if not check_ssl:
cmd.append('--browser-ignore-insecure-certs')
@@ -157,8 +159,9 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
if cookies_file and Path(cookies_file).is_file():
cmd.extend(['--browser-cookies-file', cookies_file])
if extra_args:
cmd.extend(extra_args.split())
# Add extra args from config
if singlefile_args_extra:
cmd.extend(singlefile_args_extra)
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)

View File

@@ -9,18 +9,12 @@
"x-aliases": ["SAVE_WGET", "USE_WGET"],
"description": "Enable wget archiving"
},
"WGET_SAVE_WARC": {
"WGET_WARC_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_WARC"],
"x-aliases": ["SAVE_WARC", "WGET_SAVE_WARC"],
"description": "Save WARC archive file"
},
"WGET_SAVE_REQUISITES": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_WGET_REQUISITES"],
"description": "Download page requisites (CSS, JS, images)"
},
"WGET_BINARY": {
"type": "string",
"default": "wget",
@@ -39,25 +33,17 @@
"x-fallback": "USER_AGENT",
"description": "User agent string for wget"
},
"WGET_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"x-aliases": ["CHECK_SSL_VALIDITY"],
"description": "Whether to verify SSL certificates"
},
"WGET_COOKIES_FILE": {
"type": "string",
"default": "",
"x-fallback": "COOKIES_FILE",
"description": "Path to cookies file"
},
"WGET_RESTRICT_FILE_NAMES": {
"type": "string",
"default": "windows",
"enum": ["windows", "unix", "ascii", "nocontrol", "lowercase", "uppercase"],
"x-fallback": "RESTRICT_FILE_NAMES",
"description": "Filename restriction mode"
"WGET_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"WGET_ARGS": {
"type": "array",
@@ -70,14 +56,20 @@
"--backup-converted",
"--span-hosts",
"--no-parent",
"--page-requisites",
"--restrict-file-names=windows",
"--tries=2",
"-e", "robots=off"
],
"x-aliases": ["WGET_DEFAULT_ARGS"],
"description": "Default wget arguments"
},
"WGET_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for wget (space-separated)"
"WGET_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["WGET_EXTRA_ARGS"],
"description": "Extra arguments to append to wget command"
}
}
}

View File

@@ -6,25 +6,15 @@ Usage: on_Snapshot__wget.py --url=<url> --snapshot-id=<uuid>
Output: Downloads files to $PWD
Environment variables:
WGET_BINARY: Path to wget binary (optional, falls back to PATH)
WGET_TIMEOUT: Timeout in seconds (default: 60)
WGET_USER_AGENT: User agent string
WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
WGET_COOKIES_FILE: Path to cookies file (optional)
WGET_RESTRICT_FILE_NAMES: Filename restriction mode (default: windows)
WGET_EXTRA_ARGS: Extra arguments for wget (space-separated)
# Wget feature toggles
SAVE_WGET: Enable wget archiving (default: True)
SAVE_WARC: Save WARC file (default: True)
SAVE_WGET_REQUISITES: Download page requisites (default: True)
# Fallback to ARCHIVING_CONFIG values if WGET_* not set:
TIMEOUT: Fallback timeout
USER_AGENT: Fallback user agent
CHECK_SSL_VALIDITY: Fallback SSL check
COOKIES_FILE: Fallback cookies file
RESTRICT_FILE_NAMES: Fallback filename restriction
WGET_ENABLED: Enable wget archiving (default: True)
WGET_WARC_ENABLED: Save WARC file (default: True)
WGET_BINARY: Path to wget binary (default: wget)
WGET_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
WGET_USER_AGENT: User agent string (x-fallback: USER_AGENT)
WGET_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (x-fallback: CHECK_SSL_VALIDITY)
WGET_ARGS: Default wget arguments (JSON array)
WGET_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
import json
@@ -65,6 +55,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Parse a JSON array of strings from the environment variable *name*.

    Args:
        name: Environment variable to read (via get_env).
        default: Value returned when the variable is unset/empty, contains
            malformed JSON, or holds valid JSON that is not an array.
            Defaults to an empty list.

    Returns:
        The parsed list with every element coerced to str, or *default*.
    """
    # Compute the fallback once instead of repeating the None-check three times.
    fallback = [] if default is None else default
    val = get_env(name, '')
    if not val:
        return fallback
    try:
        result = json.loads(val)
    except json.JSONDecodeError:
        # Malformed JSON is treated the same as "not set" (best-effort config).
        return fallback
    if isinstance(result, list):
        return [str(item) for item in result]
    # Valid JSON but not an array (e.g. a bare string or object).
    return fallback
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
@@ -73,17 +77,6 @@ def has_staticfile_output() -> bool:
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
# Default wget args (from old WGET_CONFIG)
# Baseline arguments for every wget invocation; options appended later by
# save_wget() (timeout, user-agent, etc.) take precedence over these.
WGET_DEFAULT_ARGS = [
    '--no-verbose',         # quiet output: errors and basic info only
    '--adjust-extension',   # append .html/.css etc. to match Content-Type
    '--convert-links',      # rewrite links so the mirror browses offline
    '--force-directories',  # always create the host/path directory hierarchy
    '--backup-converted',   # keep a .orig copy of each file before link rewriting
    '--span-hosts',         # allow fetching assets from other hosts
    '--no-parent',          # never ascend above the starting URL's path
    '-e', 'robots=off',     # ignore robots.txt (user-initiated archiving, not crawling)
]
def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
@@ -92,36 +85,28 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
# Get config from env (with WGET_ prefix or fallback to ARCHIVING_CONFIG style)
# Get config from env (with WGET_ prefix, x-fallback handled by config loader)
timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
user_agent = get_env('WGET_USER_AGENT') or get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', True) if get_env('WGET_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
cookies_file = get_env('WGET_COOKIES_FILE') or get_env('COOKIES_FILE', '')
restrict_names = get_env('WGET_RESTRICT_FILE_NAMES') or get_env('RESTRICT_FILE_NAMES', 'windows')
extra_args = get_env('WGET_EXTRA_ARGS', '')
wget_args = get_env_array('WGET_ARGS', [])
wget_args_extra = get_env_array('WGET_ARGS_EXTRA', [])
# Feature toggles
save_warc = get_env_bool('WGET_SAVE_WARC', True)
save_requisites = get_env_bool('WGET_SAVE_REQUISITES', True)
warc_enabled = get_env_bool('WGET_WARC_ENABLED', True)
# Build wget command (later options take precedence)
cmd = [
binary,
*WGET_DEFAULT_ARGS,
*wget_args,
f'--timeout={timeout}',
'--tries=2',
]
if user_agent:
cmd.append(f'--user-agent={user_agent}')
if restrict_names:
cmd.append(f'--restrict-file-names={restrict_names}')
if save_requisites:
cmd.append('--page-requisites')
if save_warc:
if warc_enabled:
warc_dir = Path('warc')
warc_dir.mkdir(exist_ok=True)
warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
@@ -135,8 +120,8 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
if not check_ssl:
cmd.extend(['--no-check-certificate', '--no-hsts'])
if extra_args:
cmd.extend(extra_args.split())
if wget_args_extra:
cmd.extend(wget_args_extra)
cmd.append(url)

View File

@@ -1,3 +1,3 @@
{"type": "Binary", "name": "yt-dlp", "binproviders": "pip,brew,apt,env"}
{"type": "Binary", "name": "yt-dlp", "binproviders": "pip,brew,apt,env", "overrides": {"pip": {"packages": "yt-dlp[default]"}}}
{"type": "Binary", "name": "node", "binproviders": "apt,brew,env", "overrides": {"apt": {"packages": ["nodejs"]}}}
{"type": "Binary", "name": "ffmpeg", "binproviders": "apt,brew,env"}

View File

@@ -15,6 +15,12 @@
"x-aliases": ["MEDIA_BINARY", "YOUTUBEDL_BINARY", "YOUTUBE_DL_BINARY"],
"description": "Path to yt-dlp binary"
},
"YTDLP_NODE_BINARY": {
"type": "string",
"default": "node",
"x-fallback": "NODE_BINARY",
"description": "Path to Node.js binary for yt-dlp JS runtime"
},
"YTDLP_TIMEOUT": {
"type": "integer",
"default": 3600,
@@ -23,6 +29,12 @@
"x-aliases": ["MEDIA_TIMEOUT"],
"description": "Timeout for yt-dlp downloads in seconds"
},
"YTDLP_COOKIES_FILE": {
"type": "string",
"default": "",
"x-fallback": "COOKIES_FILE",
"description": "Path to cookies file"
},
"YTDLP_MAX_SIZE": {
"type": "string",
"default": "750m",
@@ -58,14 +70,15 @@
"--no-progress",
"-o", "%(title)s.%(ext)s"
],
"x-aliases": ["MEDIA_ARGS"],
"description": "Default yt-dlp arguments (override to customize behavior)"
"x-aliases": ["YTDLP_DEFAULT_ARGS", "MEDIA_ARGS"],
"description": "Default yt-dlp arguments"
},
"YTDLP_EXTRA_ARGS": {
"type": "string",
"default": "",
"x-aliases": ["MEDIA_EXTRA_ARGS"],
"description": "Extra arguments for yt-dlp (space-separated, appended after YTDLP_ARGS)"
"YTDLP_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["YTDLP_EXTRA_ARGS", "MEDIA_ARGS_EXTRA", "MEDIA_EXTRA_ARGS"],
"description": "Extra arguments to append to yt-dlp command"
}
}
}

View File

@@ -3,24 +3,18 @@
Download video/audio from a URL using yt-dlp.
Usage: on_Snapshot__ytdlp.py --url=<url> --snapshot-id=<uuid>
Output: Downloads video/audio files to $PWD/ytdlp/
Output: Downloads video/audio files to $PWD
Environment variables:
YTDLP_BINARY: Path to yt-dlp binary
YTDLP_TIMEOUT: Timeout in seconds (default: 3600 for large downloads)
YTDLP_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
YTDLP_ARGS: JSON array of yt-dlp arguments (overrides defaults)
YTDLP_EXTRA_ARGS: Extra arguments for yt-dlp (space-separated, appended)
YTDLP_MAX_SIZE: Maximum file size (default: 750m)
# Feature toggles (with backwards-compatible aliases)
YTDLP_ENABLED: Enable yt-dlp extraction (default: True)
SAVE_YTDLP: Alias for YTDLP_ENABLED
MEDIA_ENABLED: Backwards-compatible alias for YTDLP_ENABLED
# Fallback to ARCHIVING_CONFIG values if YTDLP_* not set:
TIMEOUT: Fallback timeout
CHECK_SSL_VALIDITY: Fallback SSL check
YTDLP_BINARY: Path to yt-dlp binary (default: yt-dlp)
YTDLP_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY)
YTDLP_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
YTDLP_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
YTDLP_MAX_SIZE: Maximum file size (default: 750m)
YTDLP_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
YTDLP_ARGS: Default yt-dlp arguments (JSON array)
YTDLP_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
import json
@@ -59,6 +53,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Parse a JSON array of strings from the environment variable *name*.

    Args:
        name: Environment variable to read (via get_env).
        default: Value returned when the variable is unset/empty, contains
            malformed JSON, or holds valid JSON that is not an array.
            Defaults to an empty list.

    Returns:
        The parsed list with every element coerced to str, or *default*.
    """
    # Compute the fallback once instead of repeating the None-check three times.
    fallback = [] if default is None else default
    val = get_env(name, '')
    if not val:
        return fallback
    try:
        result = json.loads(val)
    except json.JSONDecodeError:
        # Malformed JSON is treated the same as "not set" (best-effort config).
        return fallback
    if isinstance(result, list):
        return [str(item) for item in result]
    # Valid JSON but not an array (e.g. a bare string or object).
    return fallback
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
@@ -67,69 +75,41 @@ def has_staticfile_output() -> bool:
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
# Default yt-dlp args (can be overridden via YTDLP_ARGS env var)
YTDLP_DEFAULT_ARGS = [
    '--restrict-filenames',      # ASCII-only, shell-safe output filenames
    '--trim-filenames', '128',   # cap output filename length at 128 chars
    '--write-description',       # save the video description to a sidecar file
    '--write-info-json',         # save full metadata as .info.json
    '--write-thumbnail',         # save the thumbnail image
    '--write-sub',               # download available subtitles
    '--write-auto-subs',         # also download auto-generated subtitles
    '--convert-subs=srt',        # normalize all subtitles to SRT
    '--yes-playlist',            # download the whole playlist if the URL is one
    '--continue',                # resume partially-downloaded files
    '--no-abort-on-error',       # keep going when one item in a batch fails
    '--ignore-errors',           # skip unavailable videos instead of failing
    '--geo-bypass',              # attempt to bypass geographic restrictions
    '--add-metadata',            # embed metadata into the downloaded file
    '--no-progress',             # suppress the progress bar (cleaner logs)
    '-o', '%(title)s.%(ext)s',   # output template: video title + extension
]
def get_ytdlp_args() -> list[str]:
    """Return the yt-dlp argument list.

    Reads YTDLP_ARGS from the environment (expected to be a JSON array) and
    falls back to YTDLP_DEFAULT_ARGS when it is unset, not valid JSON, or
    valid JSON that is not a list.
    """
    raw = get_env('YTDLP_ARGS', '')
    if not raw:
        return YTDLP_DEFAULT_ARGS
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return YTDLP_DEFAULT_ARGS
    if not isinstance(parsed, list):
        return YTDLP_DEFAULT_ARGS
    return [str(arg) for arg in parsed]
def save_ytdlp(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
Download video/audio using yt-dlp.
Returns: (success, output_path, error_message)
"""
# Get config from env (YTDLP_* primary, MEDIA_* as fallback via aliases)
timeout = get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
extra_args = get_env('YTDLP_EXTRA_ARGS', '')
max_size = get_env('YTDLP_MAX_SIZE', '') or get_env('MEDIA_MAX_SIZE', '750m')
# Get config from env (with YTDLP_ prefix, x-fallback handled by config loader)
timeout = get_env_int('YTDLP_TIMEOUT') or get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('YTDLP_CHECK_SSL_VALIDITY', True) if get_env('YTDLP_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
cookies_file = get_env('YTDLP_COOKIES_FILE') or get_env('COOKIES_FILE', '')
max_size = get_env('YTDLP_MAX_SIZE', '750m')
node_binary = get_env('YTDLP_NODE_BINARY') or get_env('NODE_BINARY', 'node')
ytdlp_args = get_env_array('YTDLP_ARGS', [])
ytdlp_args_extra = get_env_array('YTDLP_ARGS_EXTRA', [])
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
# Build command using configurable YTDLP_ARGS (later options take precedence)
# Build command (later options take precedence)
cmd = [
binary,
*get_ytdlp_args(),
# Format with max_size limit (appended after YTDLP_ARGS so it can be overridden by YTDLP_EXTRA_ARGS)
*ytdlp_args,
# Format with max_size limit (appended after YTDLP_ARGS so it can be overridden by YTDLP_ARGS_EXTRA)
f'--format=(bv*+ba/b)[filesize<={max_size}][filesize_approx<=?{max_size}]/(bv*+ba/b)',
f'--js-runtimes=node:{node_binary}',
]
if not check_ssl:
cmd.append('--no-check-certificate')
if extra_args:
cmd.extend(extra_args.split())
if cookies_file and Path(cookies_file).is_file():
cmd.extend(['--cookies', cookies_file])
if ytdlp_args_extra:
cmd.extend(ytdlp_args_extra)
cmd.append(url)
@@ -193,9 +173,8 @@ def main(url: str, snapshot_id: str):
"""Download video/audio from a URL using yt-dlp."""
try:
# Check if yt-dlp downloading is enabled (YTDLP_ENABLED primary, MEDIA_ENABLED fallback)
ytdlp_enabled = get_env_bool('YTDLP_ENABLED', True) and get_env_bool('MEDIA_ENABLED', True)
if not ytdlp_enabled:
# Check if yt-dlp downloading is enabled
if not get_env_bool('YTDLP_ENABLED', True):
print('Skipping ytdlp (YTDLP_ENABLED=False)', file=sys.stderr)
# Temporary failure (config disabled) - NO JSONL emission
sys.exit(0)