Merge remote-tracking branch 'origin/dev' into claude/add-max-url-attempts-oBHCD

This commit is contained in:
Claude
2025-12-29 21:29:01 +00:00
29 changed files with 1150 additions and 445 deletions

View File

@@ -86,57 +86,33 @@ jobs:
python-version: ${{ matrix.python }}
architecture: x64
- name: Install uv
uses: astral-sh/setup-uv@v4
with:
version: "latest"
- name: Set up Node JS
uses: actions/setup-node@v4
with:
node-version: 20.10.0
node-version: 22
- name: Get pip cache dir
id: pip-cache
run: |
echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT
- name: Cache pip
- name: Cache uv
uses: actions/cache@v3
id: cache-pip
with:
path: ${{ steps.pip-cache.outputs.dir }}
key: ${{ runner.os }}-${{ matrix.python }}-venv-${{ hashFiles('setup.py') }}
path: ~/.cache/uv
key: ${{ runner.os }}-${{ matrix.python }}-uv-${{ hashFiles('pyproject.toml', 'uv.lock') }}
restore-keys: |
${{ runner.os }}-${{ matrix.python }}-venv-
${{ runner.os }}-${{ matrix.python }}-uv-
- uses: awalsh128/cache-apt-pkgs-action@latest
with:
packages: ripgrep build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 python3-minimal gnupg2 curl wget python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps
version: 1.0
packages: git ripgrep build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 python3-minimal gnupg2 curl wget python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps
version: 1.1
- name: Install pip dependencies
- name: Install dependencies with uv
run: |
python -m pip install --upgrade pip setuptools wheel pytest bottle build
python -m pip install -r requirements.txt
python -m pip install -e .[sonic,ldap]
- name: Get npm cache dir
id: npm-cache
run: |
echo "dir=$GITHUB_WORKSPACE/node_modules" >> $GITHUB_OUTPUT
- name: Cache npm
uses: actions/cache@v3
id: cache-npm
with:
path: ${{ steps.npm-cache.outputs.dir }}
key: ${{ runner.os }}-node_modules-${{ hashFiles('package-lock.json') }}
restore-keys: |
${{ runner.os }}-node_modules
- name: Install npm requirements
run: |
npm install
echo "SINGLEFILE_BINARY=$GITHUB_WORKSPACE/node_modules/.bin/single-file" >> $GITHUB_ENV
echo "READABILITY_BINARY=$GITHUB_WORKSPACE/node_modules/.bin/readability-extractor" >> $GITHUB_ENV
echo "MERCURY_BINARY=$GITHUB_WORKSPACE/node_modules/.bin/mercury-parser" >> $GITHUB_ENV
uv sync --dev --all-extras
- name: Run test - ${{ matrix.test.name }}
run: |
python -m pytest -xvs "${{ matrix.test.path }}" --basetemp=tests/out --ignore=archivebox/pkgs
uv run pytest -xvs "${{ matrix.test.path }}" --basetemp=tests/out --ignore=archivebox/pkgs

View File

@@ -26,9 +26,7 @@ from archivebox.misc.system import get_dir_size, atomic_write
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
from archivebox.misc.hashing import get_dir_info
from archivebox.hooks import (
EXTRACTOR_INDEXING_PRECEDENCE,
get_plugins, get_plugin_name, get_plugin_icon,
DEFAULT_PLUGIN_ICONS,
)
from archivebox.base_models.models import (
ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
@@ -1931,16 +1929,6 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
)
class ArchiveResultManager(models.Manager):
    def indexable(self, sorted: bool = True):
        """Return a queryset of ArchiveResults usable for full-text indexing.

        Only results whose plugin appears in EXTRACTOR_INDEXING_PRECEDENCE and
        whose status is 'succeeded' are included.  When *sorted* is True, the
        queryset is ordered by the precedence table (lower number = higher
        priority); plugins missing from the table sort last (precedence 1000).
        """
        INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE]
        qs = self.get_queryset().filter(plugin__in=INDEXABLE_METHODS, status='succeeded')
        if sorted:
            # Translate the (plugin, priority) table into CASE/WHEN annotations
            precedence = [When(plugin=method, then=Value(p)) for method, p in EXTRACTOR_INDEXING_PRECEDENCE]
            qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000), output_field=IntegerField())).order_by('indexing_precedence')
        return qs
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
@@ -2000,8 +1988,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
state_field_name = 'status'
active_state = StatusChoices.STARTED
objects = ArchiveResultManager()
class Meta(TypedModelMeta):
app_label = 'core'
verbose_name = 'Archive Result'

View File

@@ -619,20 +619,6 @@ def is_parser_plugin(plugin: str) -> bool:
return name.startswith('parse_') and name.endswith('_urls')
# Precedence order for search indexing (lower number = higher priority)
# Used to select which plugin's output to use for full-text search
# Plugin names here should match the part after the numeric prefix
# e.g., '31_readability' -> 'readability'
EXTRACTOR_INDEXING_PRECEDENCE = [
('readability', 1),
('mercury', 2),
('htmltotext', 3),
('singlefile', 4),
('dom', 5),
('wget', 6),
]
def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
"""
Get the list of enabled plugins based on config and available hooks.
@@ -960,25 +946,6 @@ DEFAULT_TEMPLATES = {
''',
}
# Default icons for known extractor plugins (emoji or short HTML)
DEFAULT_PLUGIN_ICONS = {
'screenshot': '📷',
'pdf': '📄',
'singlefile': '📦',
'dom': '🌐',
'wget': '📥',
'media': '🎬',
'git': '📂',
'readability': '📖',
'mercury': '☿️',
'favicon': '',
'title': '📝',
'headers': '📋',
'archive_org': '🏛️',
'htmltotext': '📃',
'warc': '🗄️',
}
def get_plugin_template(plugin: str, template_name: str, fallback: bool = True) -> Optional[str]:
"""
@@ -1018,10 +985,7 @@ def get_plugin_template(plugin: str, template_name: str, fallback: bool = True)
def get_plugin_icon(plugin: str) -> str:
"""
Get the icon for a plugin.
First checks for plugin-provided icon.html template,
then falls back to DEFAULT_PLUGIN_ICONS.
Get the icon for a plugin from its icon.html template.
Args:
plugin: Plugin name (e.g., 'screenshot', '15_singlefile')
@@ -1029,15 +993,13 @@ def get_plugin_icon(plugin: str) -> str:
Returns:
Icon HTML/emoji string.
"""
base_name = get_plugin_name(plugin)
# Try plugin-provided icon template
icon_template = get_plugin_template(plugin, 'icon', fallback=False)
if icon_template:
return icon_template.strip()
# Fall back to default icon
return DEFAULT_PLUGIN_ICONS.get(base_name, '📁')
# Fall back to generic folder icon
return '📁'
def get_all_plugin_icons() -> Dict[str, str]:

View File

@@ -9,10 +9,10 @@
"x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"],
"description": "Path to Chrome/Chromium binary"
},
"NODE_BINARY": {
"CHROME_NODE_BINARY": {
"type": "string",
"default": "node",
"x-aliases": ["NODEJS_BINARY"],
"x-fallback": "NODE_BINARY",
"description": "Path to Node.js binary (for Puppeteer)"
},
"CHROME_TIMEOUT": {
@@ -50,16 +50,19 @@
"x-fallback": "USER_AGENT",
"description": "User agent string for Chrome"
},
"CHROME_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra command-line arguments for Chrome (space-separated)"
"CHROME_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["CHROME_DEFAULT_ARGS"],
"description": "Default Chrome command-line arguments"
},
"CHROME_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
"CHROME_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["CHROME_EXTRA_ARGS"],
"description": "Extra arguments to append to Chrome command"
}
}
}

View File

@@ -21,12 +21,6 @@
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string"
},
"FAVICON_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
}
}
}

View File

@@ -27,21 +27,25 @@
"enum": ["jsonl", "warc", "mbox", "maildir", "mh", "mmdf", "babyl"],
"description": "Output format for forum downloads"
},
"FORUMDL_TEXTIFY": {
"type": "boolean",
"default": false,
"description": "Convert HTML content to plaintext (keep false to preserve HTML)"
},
"FORUMDL_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"FORUMDL_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for forum-dl (space-separated)"
"FORUMDL_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["FORUMDL_DEFAULT_ARGS"],
"description": "Default forum-dl arguments"
},
"FORUMDL_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["FORUMDL_EXTRA_ARGS"],
"description": "Extra arguments to append to forum-dl command"
}
}
}

View File

@@ -6,19 +6,13 @@ Usage: on_Snapshot__forumdl.py --url=<url> --snapshot-id=<uuid>
Output: Downloads forum content to $PWD/
Environment variables:
FORUMDL_BINARY: Path to forum-dl binary
FORUMDL_TIMEOUT: Timeout in seconds (default: 3600 for large forums)
FORUMDL_ENABLED: Enable forum downloading (default: True)
FORUMDL_BINARY: Path to forum-dl binary (default: forum-dl)
FORUMDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
FORUMDL_OUTPUT_FORMAT: Output format (default: jsonl)
FORUMDL_TEXTIFY: Convert HTML to plaintext (default: False - keeps HTML)
FORUMDL_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
FORUMDL_EXTRA_ARGS: Extra arguments for forum-dl (space-separated)
# Forum-dl feature toggles
SAVE_FORUMDL: Enable forum-dl forum extraction (default: True)
# Fallback to ARCHIVING_CONFIG values if FORUMDL_* not set:
TIMEOUT: Fallback timeout
CHECK_SSL_VALIDITY: Fallback SSL check
FORUMDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
FORUMDL_ARGS: Default forum-dl arguments (JSON array)
FORUMDL_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
import json
@@ -78,6 +72,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Parse a JSON-encoded array from environment variable *name*.

    Args:
        name: Environment variable to read (e.g. 'FORUMDL_ARGS').
        default: Value returned on any failure; None means an empty list.

    Returns:
        The decoded list with every element coerced to str, or the fallback
        when the variable is unset/empty, invalid JSON, or not a JSON list.
    """
    # Compute the fallback once instead of repeating the expression three times
    fallback = default if default is not None else []
    val = get_env(name, '')
    if not val:
        return fallback
    try:
        result = json.loads(val)
    except json.JSONDecodeError:
        return fallback
    if not isinstance(result, list):
        return fallback
    return [str(item) for item in result]
def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
@@ -85,11 +93,11 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
# Get config from env
timeout = get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
textify = get_env_bool('FORUMDL_TEXTIFY', False)
extra_args = get_env('FORUMDL_EXTRA_ARGS', '')
# Get config from env (with FORUMDL_ prefix, x-fallback handled by config loader)
timeout = get_env_int('FORUMDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('FORUMDL_CHECK_SSL_VALIDITY', True) if get_env('FORUMDL_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
forumdl_args = get_env_array('FORUMDL_ARGS', [])
forumdl_args_extra = get_env_array('FORUMDL_ARGS_EXTRA', [])
output_format = get_env('FORUMDL_OUTPUT_FORMAT', 'jsonl')
# Output directory is current directory (hook already runs in output dir)
@@ -108,16 +116,13 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
output_file = output_dir / f'forum.{output_format}'
# Build command
cmd = [binary, '-f', output_format, '-o', str(output_file)]
if textify:
cmd.append('--textify')
cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)]
if not check_ssl:
cmd.append('--no-check-certificate')
if extra_args:
cmd.extend(extra_args.split())
if forumdl_args_extra:
cmd.extend(forumdl_args_extra)
cmd.append(url)

View File

@@ -21,6 +21,12 @@
"x-fallback": "TIMEOUT",
"description": "Timeout for gallery downloads in seconds"
},
"GALLERYDL_COOKIES_FILE": {
"type": "string",
"default": "",
"x-fallback": "COOKIES_FILE",
"description": "Path to cookies file"
},
"GALLERYDL_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
@@ -34,12 +40,15 @@
"--write-metadata",
"--write-info-json"
],
"x-aliases": ["GALLERYDL_DEFAULT_ARGS"],
"description": "Default gallery-dl arguments"
},
"GALLERYDL_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for gallery-dl (space-separated)"
"GALLERYDL_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["GALLERYDL_EXTRA_ARGS"],
"description": "Extra arguments to append to gallery-dl command"
}
}
}

View File

@@ -6,20 +6,13 @@ Usage: on_Snapshot__gallerydl.py --url=<url> --snapshot-id=<uuid>
Output: Downloads gallery images to $PWD/gallerydl/
Environment variables:
GALLERYDL_BINARY: Path to gallery-dl binary
GALLERYDL_TIMEOUT: Timeout in seconds (default: 3600 for large galleries)
GALLERYDL_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
GALLERYDL_EXTRA_ARGS: Extra arguments for gallery-dl (space-separated)
COOKIES_FILE: Path to cookies file for authentication
# Gallery-dl feature toggles
USE_GALLERYDL: Enable gallery-dl gallery extraction (default: True)
SAVE_GALLERYDL: Alias for USE_GALLERYDL
# Fallback to ARCHIVING_CONFIG values if GALLERYDL_* not set:
GALLERYDL_TIMEOUT: Fallback timeout for gallery downloads
TIMEOUT: Fallback timeout
CHECK_SSL_VALIDITY: Fallback SSL check
GALLERYDL_ENABLED: Enable gallery-dl gallery extraction (default: True)
GALLERYDL_BINARY: Path to gallery-dl binary (default: gallery-dl)
GALLERYDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
GALLERYDL_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
GALLERYDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
GALLERYDL_ARGS: Default gallery-dl arguments (JSON array)
GALLERYDL_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
import json
@@ -58,6 +51,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Parse a JSON-encoded array from environment variable *name*.

    Args:
        name: Environment variable to read (e.g. 'GALLERYDL_ARGS').
        default: Value returned on any failure; None means an empty list.

    Returns:
        The decoded list with every element coerced to str, or the fallback
        when the variable is unset/empty, invalid JSON, or not a JSON list.
    """
    # Compute the fallback once instead of repeating the expression three times
    fallback = default if default is not None else []
    val = get_env(name, '')
    if not val:
        return fallback
    try:
        result = json.loads(val)
    except json.JSONDecodeError:
        return fallback
    if not isinstance(result, list):
        return fallback
    return [str(item) for item in result]
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
@@ -66,35 +73,27 @@ def has_staticfile_output() -> bool:
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
# Default gallery-dl args
def get_gallerydl_default_args() -> list[str]:
    """Return the baseline CLI flags passed to every gallery-dl invocation."""
    default_flags = [
        '--write-metadata',
        '--write-info-json',
    ]
    return default_flags
def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
Download gallery using gallery-dl.
Returns: (success, output_path, error_message)
"""
# Get config from env
timeout = get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
extra_args = get_env('GALLERYDL_EXTRA_ARGS', '')
cookies_file = get_env('COOKIES_FILE', '')
# Get config from env (with GALLERYDL_ prefix, x-fallback handled by config loader)
timeout = get_env_int('GALLERYDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('GALLERYDL_CHECK_SSL_VALIDITY', True) if get_env('GALLERYDL_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
gallerydl_args = get_env_array('GALLERYDL_ARGS', [])
gallerydl_args_extra = get_env_array('GALLERYDL_ARGS_EXTRA', [])
cookies_file = get_env('GALLERYDL_COOKIES_FILE') or get_env('COOKIES_FILE', '')
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
# Build command (later options take precedence)
# Build command
# Use -D for exact directory (flat structure) instead of -d (nested structure)
cmd = [
binary,
*get_gallerydl_default_args(),
*gallerydl_args,
'-D', str(output_dir),
]
@@ -104,8 +103,8 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
if cookies_file and Path(cookies_file).exists():
cmd.extend(['-C', cookies_file])
if extra_args:
cmd.extend(extra_args.split())
if gallerydl_args_extra:
cmd.extend(gallerydl_args_extra)
cmd.append(url)

View File

@@ -26,16 +26,19 @@
"default": "github.com,gitlab.com,bitbucket.org,gist.github.com,codeberg.org,gitea.com,git.sr.ht",
"description": "Comma-separated list of domains to treat as git repositories"
},
"GIT_CLONE_DEPTH": {
"type": "integer",
"default": 1,
"minimum": 0,
"description": "Depth of git clone (0 for full history, 1 for shallow)"
"GIT_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": ["clone", "--depth=1", "--recursive"],
"x-aliases": ["GIT_DEFAULT_ARGS"],
"description": "Default git arguments"
},
"GIT_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for git clone"
"GIT_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["GIT_EXTRA_ARGS"],
"description": "Extra arguments to append to git command"
}
}
}

View File

@@ -8,7 +8,8 @@ Output: Clones repository to $PWD/repo
Environment variables:
GIT_BINARY: Path to git binary
GIT_TIMEOUT: Timeout in seconds (default: 120)
GIT_ARGS: Extra arguments for git clone (space-separated)
GIT_ARGS: Default git arguments (JSON array, default: ["clone", "--depth=1", "--recursive"])
GIT_ARGS_EXTRA: Extra arguments to append (JSON array, default: [])
# Fallback to ARCHIVING_CONFIG values if GIT_* not set:
TIMEOUT: Fallback timeout
@@ -41,6 +42,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Parse a JSON-encoded array from environment variable *name*.

    Args:
        name: Environment variable to read (e.g. 'GIT_ARGS').
        default: Value returned on any failure; None means an empty list.

    Returns:
        The decoded list with every element coerced to str, or the fallback
        when the variable is unset/empty, invalid JSON, or not a JSON list.
    """
    # Compute the fallback once instead of repeating the expression three times
    fallback = default if default is not None else []
    val = get_env(name, '')
    if not val:
        return fallback
    try:
        result = json.loads(val)
    except json.JSONDecodeError:
        return fallback
    if not isinstance(result, list):
        return fallback
    return [str(item) for item in result]
def is_git_url(url: str) -> bool:
"""Check if URL looks like a git repository."""
git_patterns = [
@@ -61,19 +76,10 @@ def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
timeout = get_env_int('GIT_TIMEOUT') or get_env_int('TIMEOUT', 120)
extra_args = get_env('GIT_ARGS')
git_args = get_env_array('GIT_ARGS', [])
git_args_extra = get_env_array('GIT_ARGS_EXTRA', [])
cmd = [
binary,
'clone',
'--depth=1',
'--recursive',
]
if extra_args:
cmd.extend(extra_args.split())
cmd.extend([url, OUTPUT_DIR])
cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR]
try:
result = subprocess.run(cmd, capture_output=True, timeout=timeout)

View File

@@ -0,0 +1,46 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"INFINISCROLL_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_INFINISCROLL", "USE_INFINISCROLL"],
"description": "Enable infinite scroll page expansion"
},
"INFINISCROLL_TIMEOUT": {
"type": "integer",
"default": 120,
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Maximum timeout for scrolling in seconds"
},
"INFINISCROLL_SCROLL_DELAY": {
"type": "integer",
"default": 2000,
"minimum": 500,
"description": "Delay between scrolls in milliseconds"
},
"INFINISCROLL_SCROLL_DISTANCE": {
"type": "integer",
"default": 1600,
"minimum": 100,
"description": "Distance to scroll per step in pixels"
},
"INFINISCROLL_SCROLL_LIMIT": {
"type": "integer",
"default": 10,
"minimum": 1,
"maximum": 100,
"description": "Maximum number of scroll steps"
},
"INFINISCROLL_MIN_HEIGHT": {
"type": "integer",
"default": 16000,
"minimum": 1000,
"description": "Minimum page height to scroll to in pixels"
}
}
}

View File

@@ -0,0 +1,267 @@
#!/usr/bin/env node
/**
* Scroll the page down to trigger infinite scroll / lazy loading.
*
* Scrolls down 1 page at a time, up to INFINISCROLL_SCROLL_LIMIT times,
* ensuring at least INFINISCROLL_MIN_HEIGHT (default 16,000px) is reached.
* Stops early if no new content loads after a scroll.
*
* Usage: on_Snapshot__45_infiniscroll.js --url=<url> --snapshot-id=<uuid>
* Output: JSONL with scroll stats (no files created)
*
* Environment variables:
* INFINISCROLL_ENABLED: Enable/disable (default: true)
* INFINISCROLL_TIMEOUT: Max timeout in seconds (default: 120)
* INFINISCROLL_SCROLL_DELAY: Delay between scrolls in ms (default: 2000)
* INFINISCROLL_SCROLL_DISTANCE: Pixels per scroll (default: 1600)
* INFINISCROLL_SCROLL_LIMIT: Max scroll iterations (default: 10)
* INFINISCROLL_MIN_HEIGHT: Min page height to reach in px (default: 16000)
*/
function getEnv(name, defaultValue = '') {
    // Read an env var, fall back to defaultValue when unset/empty, strip whitespace.
    const raw = process.env[name];
    return (raw || defaultValue).trim();
}
function getEnvBool(name, defaultValue = false) {
    // Interpret common truthy/falsy spellings; anything else yields defaultValue.
    const normalized = getEnv(name, '').toLowerCase();
    const truthy = ['true', '1', 'yes', 'on'];
    const falsy = ['false', '0', 'no', 'off'];
    if (truthy.includes(normalized)) return true;
    if (falsy.includes(normalized)) return false;
    return defaultValue;
}
function getEnvInt(name, defaultValue = 0) {
    // Parse a base-10 integer from the env var; non-numeric values fall back.
    const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
    return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Check if infiniscroll is enabled BEFORE requiring puppeteer,
// so a disabled run exits instantly even if puppeteer-core is not installed.
if (!getEnvBool('INFINISCROLL_ENABLED', true)) {
    console.error('Skipping infiniscroll (INFINISCROLL_ENABLED=False)');
    process.exit(0);
}

const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');

const PLUGIN_NAME = 'infiniscroll';
// Session artifacts (cdp_url.txt, target_id.txt, navigation.json) are read
// from the sibling chrome/ output dir written by the chrome plugin hooks.
const CHROME_SESSION_DIR = '../chrome';
function parseArgs() {
    // Convert --key=value CLI flags into an object; a bare --flag becomes true.
    // Dashes in key names are normalized to underscores (snapshot-id -> snapshot_id).
    const parsed = {};
    for (const token of process.argv.slice(2)) {
        if (!token.startsWith('--')) continue;
        const [rawKey, ...rest] = token.slice(2).split('=');
        parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
    }
    return parsed;
}
function getCdpUrl() {
    // cdp_url.txt is written by the chrome launch hook once CDP is listening.
    const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
    if (!fs.existsSync(cdpFile)) return null;
    return fs.readFileSync(cdpFile, 'utf8').trim();
}
function getPageId() {
    // target_id.txt identifies the CDP target created for this snapshot's tab.
    const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
    if (!fs.existsSync(targetIdFile)) return null;
    return fs.readFileSync(targetIdFile, 'utf8').trim();
}
async function waitForChromeTabLoaded(timeoutMs = 60000) {
    // Poll for navigation.json (written when chrome_navigate finishes),
    // checking every 100ms until the deadline passes.
    const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
        if (fs.existsSync(navigationFile)) return true;
        await new Promise(resolve => setTimeout(resolve, 100));
    }
    return false;
}
function sleep(ms) {
    // Promise-based delay helper for use with await.
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
/**
 * Repeatedly scroll the page downward to trigger lazy loading / infinite scroll.
 *
 * Scrolls in fixed steps of scrollDistance px, up to scrollLimit times or until
 * timeout, detecting newly loaded content by watching document.body.scrollHeight
 * grow.  Finishes by scrolling to the absolute bottom and back to the top.
 *
 * @param {object} page - puppeteer Page, already navigated to the target URL
 * @param {object} options - { timeout (ms), scrollDelay (ms), scrollDistance (px),
 *                             scrollLimit, minHeight (px) }
 * @returns {Promise<{scrollCount, finalHeight, startingHeight, elapsedMs}>}
 */
async function scrollDown(page, options = {}) {
    const {
        timeout = 120000,
        scrollDelay = 2000,
        scrollDistance = 1600,
        scrollLimit = 10,
        minHeight = 16000,
    } = options;
    const startTime = Date.now();
    const startingHeight = await page.evaluate(() => document.body.scrollHeight);
    let lastHeight = startingHeight;
    let scrollCount = 0;
    let scrollPosition = 0;
    // Scroll to top first
    await page.evaluate(() => {
        window.scrollTo({ top: 0, left: 0, behavior: 'smooth' });
    });
    await sleep(500);
    while (scrollCount < scrollLimit) {
        // Check timeout
        const elapsed = Date.now() - startTime;
        if (elapsed >= timeout) {
            console.error(`Timeout reached after ${scrollCount} scrolls`);
            break;
        }
        // Absolute target offset for this step
        scrollPosition = (scrollCount + 1) * scrollDistance;
        console.error(`Scrolling down ${scrollCount + 1}x ${scrollDistance}px... (${scrollPosition}/${lastHeight})`);
        await page.evaluate((yOffset) => {
            window.scrollTo({ top: yOffset, left: 0, behavior: 'smooth' });
        }, scrollPosition);
        scrollCount++;
        // Give the page time to fetch and render new content
        await sleep(scrollDelay);
        // Check if new content was added (infinite scroll detection)
        const newHeight = await page.evaluate(() => document.body.scrollHeight);
        const addedPx = newHeight - lastHeight;
        if (addedPx > 0) {
            console.error(`Detected infini-scrolling: ${lastHeight}+${addedPx} => ${newHeight}`);
        } else if (scrollPosition >= newHeight + scrollDistance) {
            // Reached the bottom
            // (only stop early after a couple of steps, so slow first loads
            // don't end the scroll prematurely)
            if (scrollCount > 2) {
                console.error(`Reached bottom of page at ${newHeight}px`);
                break;
            }
        }
        lastHeight = newHeight;
        // Check if we've reached minimum height and can stop
        if (lastHeight >= minHeight && scrollPosition >= lastHeight) {
            console.error(`Reached minimum height target (${minHeight}px)`);
            break;
        }
    }
    // Scroll to absolute bottom
    if (scrollPosition < lastHeight) {
        await page.evaluate(() => {
            window.scrollTo({ top: document.body.scrollHeight, left: 0, behavior: 'smooth' });
        });
        await sleep(scrollDelay);
    }
    // Scroll back to top
    console.error(`Reached bottom of page at ${lastHeight}px, scrolling back to top...`);
    await page.evaluate(() => {
        window.scrollTo({ top: 0, left: 0, behavior: 'smooth' });
    });
    await sleep(scrollDelay);
    const totalElapsed = Date.now() - startTime;
    return {
        scrollCount,
        finalHeight: lastHeight,
        startingHeight,
        elapsedMs: totalElapsed,
    };
}
/**
 * Entry point: connect to the already-running Chrome session (started by the
 * chrome plugin hooks), locate this snapshot's tab, run the scroll routine,
 * and emit an ArchiveResult JSONL record on stdout.  Progress goes to stderr.
 */
async function main() {
    const args = parseArgs();
    const url = args.url;
    const snapshotId = args.snapshot_id;
    if (!url || !snapshotId) {
        console.error('Usage: on_Snapshot__45_infiniscroll.js --url=<url> --snapshot-id=<uuid>');
        process.exit(1);
    }
    // Config from INFINISCROLL_* env vars; timeout is seconds -> milliseconds
    const timeout = getEnvInt('INFINISCROLL_TIMEOUT', 120) * 1000;
    const scrollDelay = getEnvInt('INFINISCROLL_SCROLL_DELAY', 2000);
    const scrollDistance = getEnvInt('INFINISCROLL_SCROLL_DISTANCE', 1600);
    const scrollLimit = getEnvInt('INFINISCROLL_SCROLL_LIMIT', 10);
    const minHeight = getEnvInt('INFINISCROLL_MIN_HEIGHT', 16000);
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
        console.error('ERROR: Chrome CDP URL not found (chrome plugin must run first)');
        process.exit(1);
    }
    // Wait for page to be loaded
    const pageLoaded = await waitForChromeTabLoaded(60000);
    if (!pageLoaded) {
        console.error('ERROR: Page not loaded after 60s (chrome_navigate must complete first)');
        process.exit(1);
    }
    let browser = null;
    try {
        browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
        const pages = await browser.pages();
        if (pages.length === 0) {
            throw new Error('No pages found in browser');
        }
        // Find the right page by target ID
        const targetId = getPageId();
        let page = null;
        if (targetId) {
            // NOTE(review): relies on puppeteer's private _targetId — confirm
            // this survives puppeteer-core upgrades.
            page = pages.find(p => {
                const target = p.target();
                return target && target._targetId === targetId;
            });
        }
        if (!page) {
            // Fall back to the most recently listed tab when no target matches
            page = pages[pages.length - 1];
        }
        console.error(`Starting infinite scroll on ${url}`);
        const result = await scrollDown(page, {
            timeout,
            scrollDelay,
            scrollDistance,
            scrollLimit,
            minHeight,
        });
        // disconnect() (not close()) leaves Chrome running for later hooks
        browser.disconnect();
        // Summarize final height, newly loaded pixels, and elapsed seconds
        const elapsedSec = (result.elapsedMs / 1000).toFixed(1);
        const finalHeightStr = result.finalHeight.toLocaleString();
        const addedHeight = result.finalHeight - result.startingHeight;
        const addedStr = addedHeight > 0 ? `+${addedHeight.toLocaleString()}px new content` : 'no new content';
        const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}) over ${elapsedSec}s`;
        console.error(`Success: ${outputStr}`);
        console.log(JSON.stringify({
            type: 'ArchiveResult',
            status: 'succeeded',
            output_str: outputStr,
        }));
        process.exit(0);
    } catch (e) {
        if (browser) browser.disconnect();
        console.error(`ERROR: ${e.name}: ${e.message}`);
        process.exit(1);
    }
}

main().catch(e => {
    console.error(`Fatal error: ${e.message}`);
    process.exit(1);
});

View File

@@ -0,0 +1,352 @@
"""
Integration tests for infiniscroll plugin
Tests verify:
1. Hook script exists
2. Dependencies installed via chrome validation hooks
3. Verify deps with abx-pkg
4. INFINISCROLL_ENABLED=False skips without JSONL
5. Fails gracefully when no chrome session exists
6. Full integration test: scrolls page and outputs stats
7. Config options work (scroll limit, min height)
"""
import json
import os
import re
import signal
import subprocess
import time
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
TEST_URL = 'https://www.singsing.movie/'
def get_node_modules_dir():
    """Locate the node_modules dir used by tests; an explicit NODE_PATH env wins."""
    node_path = os.environ.get('NODE_PATH')
    if node_path:
        return Path(node_path)
    # No NODE_PATH set: derive <LIB_DIR>/npm/node_modules from archivebox config
    from archivebox.config.common import STORAGE_CONFIG
    lib_dir = Path(os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR))
    return lib_dir / 'npm' / 'node_modules'
NODE_MODULES_DIR = get_node_modules_dir()
def get_test_env():
    """Return a copy of the current environment with NODE_PATH set for hooks."""
    env = dict(os.environ)
    env['NODE_PATH'] = str(NODE_MODULES_DIR)
    return env
def test_hook_script_exists():
    """The on_Snapshot infiniscroll hook file must be present on disk."""
    hook = INFINISCROLL_HOOK
    assert hook is not None, "Infiniscroll hook not found"
    assert hook.exists(), f"Hook not found: {hook}"
def test_verify_deps_with_abx_pkg():
    """Verify dependencies are available via abx-pkg after hook installation."""
    from abx_pkg import Binary, EnvProvider, BinProviderOverrides
    EnvProvider.model_rebuild()
    # Node.js must resolve from the environment for the JS hook to run
    node_binary = Binary(name='node', binproviders=[EnvProvider()])
    loaded = node_binary.load()
    assert loaded and loaded.abspath, "Node.js required for infiniscroll plugin"
def test_config_infiniscroll_disabled_skips():
    """INFINISCROLL_ENABLED=False should exit 0 without emitting any JSONL."""
    with tempfile.TemporaryDirectory() as tmp:
        env = get_test_env()
        env['INFINISCROLL_ENABLED'] = 'False'
        proc = subprocess.run(
            ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'],
            cwd=Path(tmp),
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )
        assert proc.returncode == 0, f"Should exit 0 when feature disabled: {proc.stderr}"
        assert 'Skipping' in proc.stderr or 'False' in proc.stderr, "Should log skip reason to stderr"
        # Nothing that looks like a JSONL record may appear on stdout
        jsonl_lines = [ln for ln in proc.stdout.strip().split('\n') if ln.strip().startswith('{')]
        assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, got: {jsonl_lines}"
def test_fails_gracefully_without_chrome_session():
    """Without a chrome session dir, the hook must exit non-zero with a clear error."""
    with tempfile.TemporaryDirectory() as tmp:
        proc = subprocess.run(
            ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'],
            cwd=Path(tmp),
            capture_output=True,
            text=True,
            env=get_test_env(),
            timeout=30,
        )
        assert proc.returncode != 0, "Should fail when no chrome session exists"
        # Accept either a chrome/CDP complaint or a missing puppeteer module error
        err_lower = proc.stderr.lower()
        assert any(marker in err_lower for marker in ['chrome', 'cdp', 'puppeteer', 'module']), \
            f"Should mention chrome/CDP/puppeteer in error: {proc.stderr}"
def setup_chrome_session(tmpdir):
    """Helper to set up Chrome session with tab and navigation.

    Mirrors the real hook execution order: launch Chrome at the crawl level,
    create a tab for TEST_URL at the snapshot level, then navigate it.

    Returns:
        (chrome_launch_process, chrome_pid, snapshot_chrome_dir)

    Raises:
        RuntimeError: if any stage fails or Chrome never writes cdp_url.txt.
    """
    crawl_dir = Path(tmpdir) / 'crawl'
    crawl_dir.mkdir()
    chrome_dir = crawl_dir / 'chrome'
    env = get_test_env()
    env['CHROME_HEADLESS'] = 'true'
    # Launch Chrome at crawl level (long-running background hook)
    chrome_launch_process = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-infiniscroll'],
        cwd=str(crawl_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env
    )
    # Wait for Chrome to launch: cdp_url.txt appears once CDP is listening
    for i in range(15):
        if chrome_launch_process.poll() is not None:
            # Launcher died early — surface its output for debugging
            stdout, stderr = chrome_launch_process.communicate()
            raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
        if (chrome_dir / 'cdp_url.txt').exists():
            break
        time.sleep(1)
    if not (chrome_dir / 'cdp_url.txt').exists():
        raise RuntimeError("Chrome CDP URL not found after 15s")
    chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
    # Create snapshot directory structure
    snapshot_dir = Path(tmpdir) / 'snapshot'
    snapshot_dir.mkdir()
    snapshot_chrome_dir = snapshot_dir / 'chrome'
    snapshot_chrome_dir.mkdir()
    # Create tab (tab hook needs CRAWL_OUTPUT_DIR to find the crawl-level session)
    tab_env = env.copy()
    tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
    result = subprocess.run(
        ['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll', '--crawl-id=test-infiniscroll'],
        cwd=str(snapshot_chrome_dir),
        capture_output=True,
        text=True,
        timeout=60,
        env=tab_env
    )
    if result.returncode != 0:
        raise RuntimeError(f"Tab creation failed: {result.stderr}")
    # Navigate to URL
    result = subprocess.run(
        ['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
        cwd=str(snapshot_chrome_dir),
        capture_output=True,
        text=True,
        timeout=120,
        env=env
    )
    if result.returncode != 0:
        raise RuntimeError(f"Navigation failed: {result.stderr}")
    return chrome_launch_process, chrome_pid, snapshot_chrome_dir
def cleanup_chrome(chrome_launch_process, chrome_pid):
    """Best-effort teardown of the Chrome launcher process and Chrome itself.

    First asks the launcher subprocess to exit gracefully via SIGTERM
    (waiting up to 5s), then force-kills the browser by pid. Both steps
    tolerate processes that have already exited.

    Args:
        chrome_launch_process: subprocess.Popen handle for the launch hook.
        chrome_pid: OS pid of the Chrome browser process.
    """
    try:
        chrome_launch_process.send_signal(signal.SIGTERM)
        chrome_launch_process.wait(timeout=5)
    # Narrow catch instead of bare `except:` so KeyboardInterrupt/SystemExit
    # are not swallowed; these are the failures the two calls can raise.
    except (OSError, subprocess.TimeoutExpired):
        pass
    try:
        os.kill(chrome_pid, signal.SIGKILL)
    except OSError:
        # Chrome already exited (ProcessLookupError is an OSError subclass)
        pass
def test_scrolls_page_and_outputs_stats():
    """Integration test: scroll page and verify JSONL output format."""
    with tempfile.TemporaryDirectory() as tmpdir:
        chrome_launch_process = None
        chrome_pid = None
        try:
            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
            # Infiniscroll output dir lives next to the chrome dir (sibling)
            infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
            infiniscroll_dir.mkdir()
            # Configure a short, fast scroll run
            env = get_test_env()
            env['INFINISCROLL_SCROLL_LIMIT'] = '3'    # Limit scrolls for faster test
            env['INFINISCROLL_SCROLL_DELAY'] = '500'  # Faster scrolling
            env['INFINISCROLL_MIN_HEIGHT'] = '1000'   # Lower threshold for test
            result = subprocess.run(
                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
                cwd=str(infiniscroll_dir),
                capture_output=True,
                text=True,
                timeout=60,
                env=env,
            )
            assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}\nStdout: {result.stdout}"
            # Scan stdout for the ArchiveResult JSONL record
            result_json = None
            for raw_line in result.stdout.strip().split('\n'):
                candidate = raw_line.strip()
                if not candidate.startswith('{'):
                    continue
                try:
                    record = json.loads(candidate)
                except json.JSONDecodeError:
                    continue
                if record.get('type') == 'ArchiveResult':
                    result_json = record
                    break
            assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {result.stdout}"
            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
            # output_str format: "scrolled to X,XXXpx (+Y,YYYpx new content) over Z.Zs"
            output_str = result_json.get('output_str', '')
            assert output_str.startswith('scrolled to'), f"output_str should start with 'scrolled to': {output_str}"
            assert 'px' in output_str, f"output_str should contain pixel count: {output_str}"
            assert re.search(r'over \d+(\.\d+)?s', output_str), f"output_str should contain duration: {output_str}"
            # The hook must not leave any files behind in its output dir
            output_files = list(infiniscroll_dir.iterdir())
            assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"
        finally:
            if chrome_launch_process and chrome_pid:
                cleanup_chrome(chrome_launch_process, chrome_pid)
def test_config_scroll_limit_honored():
    """Test that INFINISCROLL_SCROLL_LIMIT config is respected."""
    with tempfile.TemporaryDirectory() as tmpdir:
        chrome_launch_process = None
        chrome_pid = None
        try:
            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
            infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
            infiniscroll_dir.mkdir()
            # Cap scrolling at 2 iterations; min-height is set high so the
            # limit (not the height threshold) is what ends the run.
            env = get_test_env()
            env['INFINISCROLL_SCROLL_LIMIT'] = '2'
            env['INFINISCROLL_SCROLL_DELAY'] = '500'
            env['INFINISCROLL_MIN_HEIGHT'] = '100000'  # High threshold so limit kicks in
            result = subprocess.run(
                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'],
                cwd=str(infiniscroll_dir),
                capture_output=True,
                text=True,
                timeout=60,
                env=env,
            )
            assert result.returncode == 0, f"Infiniscroll failed: {result.stderr}"
            # Scan stdout for the ArchiveResult JSONL record
            result_json = None
            for raw_line in result.stdout.strip().split('\n'):
                if not raw_line.strip().startswith('{'):
                    continue
                try:
                    record = json.loads(raw_line)
                except json.JSONDecodeError:
                    continue
                if record.get('type') == 'ArchiveResult':
                    result_json = record
                    break
            assert result_json is not None, "Should have JSONL output"
            output_str = result_json.get('output_str', '')
            # Output format is valid and the run completed -- the scroll
            # limit itself is enforced inside the hook.
            assert output_str.startswith('scrolled to'), f"Should have valid output_str: {output_str}"
            assert result_json['status'] == 'succeeded', f"Should succeed with scroll limit: {result_json}"
        finally:
            if chrome_launch_process and chrome_pid:
                cleanup_chrome(chrome_launch_process, chrome_pid)
def test_config_timeout_honored():
    """Test that INFINISCROLL_TIMEOUT config is respected."""
    with tempfile.TemporaryDirectory() as tmpdir:
        chrome_launch_process = None
        chrome_pid = None
        try:
            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)
            infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
            infiniscroll_dir.mkdir()
            # 3s timeout with a 2s per-scroll delay and effectively
            # unreachable scroll limit/height: the timeout must be what
            # stops the run.
            env = get_test_env()
            env['INFINISCROLL_TIMEOUT'] = '3'
            env['INFINISCROLL_SCROLL_DELAY'] = '2000'
            env['INFINISCROLL_SCROLL_LIMIT'] = '100'
            env['INFINISCROLL_MIN_HEIGHT'] = '100000'
            started = time.time()
            result = subprocess.run(
                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'],
                cwd=str(infiniscroll_dir),
                capture_output=True,
                text=True,
                timeout=30,
                env=env,
            )
            elapsed = time.time() - started
            # Configured timeout plus a generous buffer
            assert elapsed < 15, f"Should respect timeout, took {elapsed:.1f}s"
            assert result.returncode == 0, f"Should complete even with timeout: {result.stderr}"
        finally:
            if chrome_launch_process and chrome_pid:
                cleanup_chrome(chrome_launch_process, chrome_pid)
# Allow running this test module directly (outside a pytest invocation)
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -15,17 +15,26 @@
"x-aliases": ["POSTLIGHT_PARSER_BINARY"],
"description": "Path to Mercury/Postlight parser binary"
},
"NODE_BINARY": {
"type": "string",
"default": "node",
"description": "Path to Node.js binary"
},
"MERCURY_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for Mercury in seconds"
},
"MERCURY_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["MERCURY_DEFAULT_ARGS"],
"description": "Default Mercury parser arguments"
},
"MERCURY_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["MERCURY_EXTRA_ARGS"],
"description": "Extra arguments to append to Mercury parser command"
}
}
}

View File

@@ -8,8 +8,8 @@ Output: Creates mercury/ directory with content.html, content.txt, article.json
Environment variables:
MERCURY_BINARY: Path to postlight-parser binary
MERCURY_TIMEOUT: Timeout in seconds (default: 60)
# Fallback to ARCHIVING_CONFIG values if MERCURY_* not set:
MERCURY_ARGS: Default Mercury arguments (JSON array)
MERCURY_ARGS_EXTRA: Extra arguments to append (JSON array)
TIMEOUT: Fallback timeout
Note: Requires postlight-parser: npm install -g @postlight/parser
@@ -51,6 +51,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
"""Parse a JSON array from environment variable."""
val = get_env(name, '')
if not val:
return default if default is not None else []
try:
result = json.loads(val)
if isinstance(result, list):
return [str(item) for item in result]
return default if default is not None else []
except json.JSONDecodeError:
return default if default is not None else []
def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
Extract article using Mercury Parser.
@@ -58,13 +72,15 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
timeout = get_env_int('MERCURY_TIMEOUT') or get_env_int('TIMEOUT', 60)
mercury_args = get_env_array('MERCURY_ARGS', [])
mercury_args_extra = get_env_array('MERCURY_ARGS_EXTRA', [])
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
try:
# Get text version
cmd_text = [binary, url, '--format=text']
cmd_text = [binary, *mercury_args, *mercury_args_extra, url, '--format=text']
result_text = subprocess.run(cmd_text, capture_output=True, timeout=timeout)
if result_text.returncode != 0:
@@ -84,7 +100,7 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
(output_dir / 'content.txt').write_text(text_content, encoding='utf-8')
# Get HTML version
cmd_html = [binary, url, '--format=html']
cmd_html = [binary, *mercury_args, *mercury_args_extra, url, '--format=html']
result_html = subprocess.run(cmd_html, capture_output=True, timeout=timeout)
try:

View File

@@ -21,10 +21,19 @@
"x-fallback": "TIMEOUT",
"description": "Timeout for paper downloads in seconds"
},
"PAPERSDL_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for papers-dl (space-separated)"
"PAPERSDL_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": ["fetch"],
"x-aliases": ["PAPERSDL_DEFAULT_ARGS"],
"description": "Default papers-dl arguments"
},
"PAPERSDL_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["PAPERSDL_EXTRA_ARGS"],
"description": "Extra arguments to append to papers-dl command"
}
}
}

View File

@@ -8,7 +8,8 @@ Output: Downloads paper PDFs to $PWD/
Environment variables:
PAPERSDL_BINARY: Path to papers-dl binary
PAPERSDL_TIMEOUT: Timeout in seconds (default: 300 for paper downloads)
PAPERSDL_EXTRA_ARGS: Extra arguments for papers-dl (space-separated)
PAPERSDL_ARGS: Default papers-dl arguments (JSON array, default: ["fetch"])
PAPERSDL_ARGS_EXTRA: Extra arguments to append (JSON array)
# papers-dl feature toggles
SAVE_PAPERSDL: Enable papers-dl paper extraction (default: True)
@@ -54,6 +55,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
"""Parse a JSON array from environment variable."""
val = get_env(name, '')
if not val:
return default if default is not None else []
try:
result = json.loads(val)
if isinstance(result, list):
return [str(item) for item in result]
return default if default is not None else []
except json.JSONDecodeError:
return default if default is not None else []
def extract_doi_from_url(url: str) -> str | None:
"""Extract DOI from common paper URLs."""
# Match DOI pattern in URL
@@ -72,7 +87,8 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
# Get config from env
timeout = get_env_int('TIMEOUT', 300)
extra_args = get_env('PAPERSDL_EXTRA_ARGS', '')
papersdl_args = get_env_array('PAPERSDL_ARGS', [])
papersdl_args_extra = get_env_array('PAPERSDL_ARGS_EXTRA', [])
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
@@ -85,11 +101,11 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
else:
identifier = doi
# Build command - papers-dl fetch <identifier> -o <output_dir>
cmd = [binary, 'fetch', identifier, '-o', str(output_dir)]
# Build command - papers-dl <args> <identifier> -o <output_dir>
cmd = [binary, *papersdl_args, identifier, '-o', str(output_dir)]
if extra_args:
cmd.extend(extra_args.split())
if papersdl_args_extra:
cmd.extend(papersdl_args_extra)
try:
result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)

View File

@@ -14,17 +14,26 @@
"default": "readability-extractor",
"description": "Path to readability-extractor binary"
},
"NODE_BINARY": {
"type": "string",
"default": "node",
"description": "Path to Node.js binary"
},
"READABILITY_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for Readability in seconds"
},
"READABILITY_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["READABILITY_DEFAULT_ARGS"],
"description": "Default Readability arguments"
},
"READABILITY_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["READABILITY_EXTRA_ARGS"],
"description": "Extra arguments to append to Readability command"
}
}
}

View File

@@ -8,8 +8,8 @@ Output: Creates readability/ directory with content.html, content.txt, article.j
Environment variables:
READABILITY_BINARY: Path to readability-extractor binary
READABILITY_TIMEOUT: Timeout in seconds (default: 60)
# Fallback to ARCHIVING_CONFIG values if READABILITY_* not set:
READABILITY_ARGS: Default Readability arguments (JSON array)
READABILITY_ARGS_EXTRA: Extra arguments to append (JSON array)
TIMEOUT: Fallback timeout
Note: Requires readability-extractor from https://github.com/ArchiveBox/readability-extractor
@@ -44,6 +44,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
"""Parse a JSON array from environment variable."""
val = get_env(name, '')
if not val:
return default if default is not None else []
try:
result = json.loads(val)
if isinstance(result, list):
return [str(item) for item in result]
return default if default is not None else []
except json.JSONDecodeError:
return default if default is not None else []
def find_html_source() -> str | None:
"""Find HTML content from other extractors in the snapshot directory."""
# Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
@@ -73,6 +87,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
timeout = get_env_int('READABILITY_TIMEOUT') or get_env_int('TIMEOUT', 60)
readability_args = get_env_array('READABILITY_ARGS', [])
readability_args_extra = get_env_array('READABILITY_ARGS_EXTRA', [])
# Find HTML source
html_source = find_html_source()
@@ -84,7 +100,7 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
try:
# Run readability-extractor (outputs JSON by default)
cmd = [binary, html_source]
cmd = [binary, *readability_args, *readability_args_extra, html_source]
result = subprocess.run(cmd, capture_output=True, timeout=timeout)
if result.returncode != 0:

View File

@@ -3,25 +3,32 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SEARCH_BACKEND_RIPGREP_BINARY": {
"RIPGREP_BINARY": {
"type": "string",
"default": "rg",
"x-aliases": ["RIPGREP_BINARY"],
"description": "Path to ripgrep binary"
},
"SEARCH_BACKEND_RIPGREP_IGNORE_EXTENSIONS": {
"type": "string",
"default": "css,js,orig,svg",
"x-aliases": ["RIPGREP_IGNORE_EXTENSIONS"],
"description": "Comma-separated file extensions to ignore"
},
"SEARCH_BACKEND_RIPGREP_TIMEOUT": {
"RIPGREP_TIMEOUT": {
"type": "integer",
"default": 90,
"minimum": 5,
"x-fallback": "TIMEOUT",
"x-aliases": ["SEARCH_BACKEND_TIMEOUT"],
"description": "Search timeout in seconds"
},
"RIPGREP_ARGS": {
"type": "array",
"items": { "type": "string" },
"default": ["--files-with-matches", "--no-messages", "--ignore-case"],
"x-aliases": ["RIPGREP_DEFAULT_ARGS"],
"description": "Default ripgrep arguments"
},
"RIPGREP_ARGS_EXTRA": {
"type": "array",
"items": { "type": "string" },
"default": [],
"x-aliases": ["RIPGREP_EXTRA_ARGS"],
"description": "Extra arguments to append to ripgrep command"
}
}
}

View File

@@ -6,10 +6,12 @@ using ripgrep (rg). This is simpler but slower for large archives.
Environment variables:
RIPGREP_BINARY: Path to ripgrep binary (default: rg)
RIPGREP_IGNORE_EXTENSIONS: Comma-separated extensions to ignore (default: css,js,orig,svg)
SEARCH_BACKEND_TIMEOUT: Search timeout in seconds (default: 90)
RIPGREP_ARGS: Default ripgrep arguments (JSON array)
RIPGREP_ARGS_EXTRA: Extra arguments to append (JSON array)
RIPGREP_TIMEOUT: Search timeout in seconds (default: 90)
"""
import json
import os
import subprocess
import shutil
@@ -19,39 +21,57 @@ from typing import List, Iterable
from django.conf import settings
# Config with old var names for backwards compatibility
RIPGREP_BINARY = os.environ.get('RIPGREP_BINARY', 'rg').strip()
RIPGREP_IGNORE_EXTENSIONS = os.environ.get('RIPGREP_IGNORE_EXTENSIONS', 'css,js,orig,svg').strip()
SEARCH_BACKEND_TIMEOUT = int(os.environ.get('SEARCH_BACKEND_TIMEOUT', '90'))
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_int(name: str, default: int = 0) -> int:
try:
return int(get_env(name, str(default)))
except ValueError:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
"""Parse a JSON array from environment variable."""
val = get_env(name, '')
if not val:
return default if default is not None else []
try:
result = json.loads(val)
if isinstance(result, list):
return [str(item) for item in result]
return default if default is not None else []
except json.JSONDecodeError:
return default if default is not None else []
def search(query: str) -> List[str]:
"""Search for snapshots using ripgrep."""
rg_binary = shutil.which(RIPGREP_BINARY) or RIPGREP_BINARY
rg_binary = get_env('RIPGREP_BINARY', 'rg')
rg_binary = shutil.which(rg_binary) or rg_binary
if not rg_binary or not Path(rg_binary).exists():
raise RuntimeError(f'ripgrep binary not found ({RIPGREP_BINARY}). Install with: apt install ripgrep')
raise RuntimeError(f'ripgrep binary not found. Install with: apt install ripgrep')
timeout = get_env_int('RIPGREP_TIMEOUT', 90)
ripgrep_args = get_env_array('RIPGREP_ARGS', [])
ripgrep_args_extra = get_env_array('RIPGREP_ARGS_EXTRA', [])
archive_dir = Path(settings.ARCHIVE_DIR)
if not archive_dir.exists():
return []
# Build ignore pattern from config
ignore_pattern = f'*.{{{RIPGREP_IGNORE_EXTENSIONS}}}'
cmd = [
rg_binary,
f'--type-add=ignore:{ignore_pattern}',
'--type-not=ignore',
'--files-with-matches',
'--no-messages',
'--ignore-case',
*ripgrep_args,
*ripgrep_args_extra,
'--regexp',
query,
str(archive_dir),
]
try:
result = subprocess.run(cmd, capture_output=True, text=True, timeout=SEARCH_BACKEND_TIMEOUT)
result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
# Extract snapshot IDs from file paths
# Paths look like: archive/<snapshot_id>/<extractor>/file.txt

View File

@@ -15,11 +15,18 @@
"x-aliases": ["SINGLE_FILE_BINARY"],
"description": "Path to single-file binary"
},
"NODE_BINARY": {
"SINGLEFILE_NODE_BINARY": {
"type": "string",
"default": "node",
"x-fallback": "NODE_BINARY",
"description": "Path to Node.js binary"
},
"SINGLEFILE_CHROME_BINARY": {
"type": "string",
"default": "",
"x-fallback": "CHROME_BINARY",
"description": "Path to Chrome/Chromium binary"
},
"SINGLEFILE_TIMEOUT": {
"type": "integer",
"default": 60,
@@ -39,16 +46,25 @@
"x-fallback": "COOKIES_FILE",
"description": "Path to cookies file"
},
"SINGLEFILE_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"SINGLEFILE_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"default": ["--browser-headless"],
"x-aliases": ["SINGLEFILE_DEFAULT_ARGS"],
"description": "Default single-file arguments"
},
"SINGLEFILE_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for single-file"
"SINGLEFILE_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["SINGLEFILE_EXTRA_ARGS"],
"description": "Extra arguments to append to single-file command"
}
}
}

View File

@@ -6,24 +6,16 @@ Usage: on_Snapshot__singlefile.py --url=<url> --snapshot-id=<uuid>
Output: Writes singlefile.html to $PWD
Environment variables:
SINGLEFILE_BINARY: Path to SingleFile binary
SINGLEFILE_TIMEOUT: Timeout in seconds (default: 120)
SINGLEFILE_USER_AGENT: User agent string (optional)
SINGLEFILE_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
SINGLEFILE_COOKIES_FILE: Path to cookies file (optional)
SINGLEFILE_EXTRA_ARGS: Extra arguments for SingleFile (space-separated)
# Feature toggle
SAVE_SINGLEFILE: Enable SingleFile archiving (default: True)
# Chrome binary (SingleFile needs Chrome)
CHROME_BINARY: Path to Chrome/Chromium binary
# Fallback to ARCHIVING_CONFIG values if SINGLEFILE_* not set:
TIMEOUT: Fallback timeout
USER_AGENT: Fallback user agent
CHECK_SSL_VALIDITY: Fallback SSL check
COOKIES_FILE: Fallback cookies file
SINGLEFILE_ENABLED: Enable SingleFile archiving (default: True)
SINGLEFILE_BINARY: Path to SingleFile binary (default: single-file)
SINGLEFILE_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY)
SINGLEFILE_CHROME_BINARY: Path to Chrome binary (x-fallback: CHROME_BINARY)
SINGLEFILE_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
SINGLEFILE_USER_AGENT: User agent string (x-fallback: USER_AGENT)
SINGLEFILE_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
SINGLEFILE_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
SINGLEFILE_ARGS: Default SingleFile arguments (JSON array)
SINGLEFILE_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
import json
@@ -63,6 +55,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
"""Parse a JSON array from environment variable."""
val = get_env(name, '')
if not val:
return default if default is not None else []
try:
result = json.loads(val)
if isinstance(result, list):
return [str(item) for item in result]
return default if default is not None else []
except json.JSONDecodeError:
return default if default is not None else []
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
@@ -121,15 +127,16 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
# Get config from env (with SINGLEFILE_ prefix or fallback to ARCHIVING_CONFIG style)
# Get config from env (with SINGLEFILE_ prefix, x-fallback handled by config loader)
timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '')
check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', True) if get_env('SINGLEFILE_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '')
extra_args = get_env('SINGLEFILE_EXTRA_ARGS', '')
chrome = get_env('CHROME_BINARY', '')
singlefile_args = get_env_array('SINGLEFILE_ARGS', [])
singlefile_args_extra = get_env_array('SINGLEFILE_ARGS_EXTRA', [])
chrome = get_env('SINGLEFILE_CHROME_BINARY') or get_env('CHROME_BINARY', '')
cmd = [binary]
cmd = [binary, *singlefile_args]
# Try to use existing Chrome session via CDP
cdp_url = get_cdp_url()
@@ -142,11 +149,6 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
elif chrome:
cmd.extend(['--browser-executable-path', chrome])
# Common options
cmd.extend([
'--browser-headless',
])
# SSL handling
if not check_ssl:
cmd.append('--browser-ignore-insecure-certs')
@@ -157,8 +159,9 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
if cookies_file and Path(cookies_file).is_file():
cmd.extend(['--browser-cookies-file', cookies_file])
if extra_args:
cmd.extend(extra_args.split())
# Add extra args from config
if singlefile_args_extra:
cmd.extend(singlefile_args_extra)
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)

View File

@@ -9,18 +9,12 @@
"x-aliases": ["SAVE_WGET", "USE_WGET"],
"description": "Enable wget archiving"
},
"WGET_SAVE_WARC": {
"WGET_WARC_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_WARC"],
"x-aliases": ["SAVE_WARC", "WGET_SAVE_WARC"],
"description": "Save WARC archive file"
},
"WGET_SAVE_REQUISITES": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_WGET_REQUISITES"],
"description": "Download page requisites (CSS, JS, images)"
},
"WGET_BINARY": {
"type": "string",
"default": "wget",
@@ -39,25 +33,17 @@
"x-fallback": "USER_AGENT",
"description": "User agent string for wget"
},
"WGET_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"x-aliases": ["CHECK_SSL_VALIDITY"],
"description": "Whether to verify SSL certificates"
},
"WGET_COOKIES_FILE": {
"type": "string",
"default": "",
"x-fallback": "COOKIES_FILE",
"description": "Path to cookies file"
},
"WGET_RESTRICT_FILE_NAMES": {
"type": "string",
"default": "windows",
"enum": ["windows", "unix", "ascii", "nocontrol", "lowercase", "uppercase"],
"x-fallback": "RESTRICT_FILE_NAMES",
"description": "Filename restriction mode"
"WGET_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"WGET_ARGS": {
"type": "array",
@@ -70,14 +56,20 @@
"--backup-converted",
"--span-hosts",
"--no-parent",
"--page-requisites",
"--restrict-file-names=windows",
"--tries=2",
"-e", "robots=off"
],
"x-aliases": ["WGET_DEFAULT_ARGS"],
"description": "Default wget arguments"
},
"WGET_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for wget (space-separated)"
"WGET_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["WGET_EXTRA_ARGS"],
"description": "Extra arguments to append to wget command"
}
}
}

View File

@@ -6,25 +6,15 @@ Usage: on_Snapshot__wget.py --url=<url> --snapshot-id=<uuid>
Output: Downloads files to $PWD
Environment variables:
WGET_BINARY: Path to wget binary (optional, falls back to PATH)
WGET_TIMEOUT: Timeout in seconds (default: 60)
WGET_USER_AGENT: User agent string
WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
WGET_COOKIES_FILE: Path to cookies file (optional)
WGET_RESTRICT_FILE_NAMES: Filename restriction mode (default: windows)
WGET_EXTRA_ARGS: Extra arguments for wget (space-separated)
# Wget feature toggles
SAVE_WGET: Enable wget archiving (default: True)
SAVE_WARC: Save WARC file (default: True)
SAVE_WGET_REQUISITES: Download page requisites (default: True)
# Fallback to ARCHIVING_CONFIG values if WGET_* not set:
TIMEOUT: Fallback timeout
USER_AGENT: Fallback user agent
CHECK_SSL_VALIDITY: Fallback SSL check
COOKIES_FILE: Fallback cookies file
RESTRICT_FILE_NAMES: Fallback filename restriction
WGET_ENABLED: Enable wget archiving (default: True)
WGET_WARC_ENABLED: Save WARC file (default: True)
WGET_BINARY: Path to wget binary (default: wget)
WGET_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
WGET_USER_AGENT: User agent string (x-fallback: USER_AGENT)
WGET_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (x-fallback: CHECK_SSL_VALIDITY)
WGET_ARGS: Default wget arguments (JSON array)
WGET_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
import json
@@ -65,6 +55,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
"""Parse a JSON array from environment variable."""
val = get_env(name, '')
if not val:
return default if default is not None else []
try:
result = json.loads(val)
if isinstance(result, list):
return [str(item) for item in result]
return default if default is not None else []
except json.JSONDecodeError:
return default if default is not None else []
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
@@ -73,17 +77,6 @@ def has_staticfile_output() -> bool:
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
# Default wget args (from old WGET_CONFIG)
WGET_DEFAULT_ARGS = [
'--no-verbose',
'--adjust-extension',
'--convert-links',
'--force-directories',
'--backup-converted',
'--span-hosts',
'--no-parent',
'-e', 'robots=off',
]
def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
@@ -92,36 +85,28 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
# Get config from env (with WGET_ prefix or fallback to ARCHIVING_CONFIG style)
# Get config from env (with WGET_ prefix, x-fallback handled by config loader)
timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
user_agent = get_env('WGET_USER_AGENT') or get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', True) if get_env('WGET_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
cookies_file = get_env('WGET_COOKIES_FILE') or get_env('COOKIES_FILE', '')
restrict_names = get_env('WGET_RESTRICT_FILE_NAMES') or get_env('RESTRICT_FILE_NAMES', 'windows')
extra_args = get_env('WGET_EXTRA_ARGS', '')
wget_args = get_env_array('WGET_ARGS', [])
wget_args_extra = get_env_array('WGET_ARGS_EXTRA', [])
# Feature toggles
save_warc = get_env_bool('WGET_SAVE_WARC', True)
save_requisites = get_env_bool('WGET_SAVE_REQUISITES', True)
warc_enabled = get_env_bool('WGET_WARC_ENABLED', True)
# Build wget command (later options take precedence)
cmd = [
binary,
*WGET_DEFAULT_ARGS,
*wget_args,
f'--timeout={timeout}',
'--tries=2',
]
if user_agent:
cmd.append(f'--user-agent={user_agent}')
if restrict_names:
cmd.append(f'--restrict-file-names={restrict_names}')
if save_requisites:
cmd.append('--page-requisites')
if save_warc:
if warc_enabled:
warc_dir = Path('warc')
warc_dir.mkdir(exist_ok=True)
warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
@@ -135,8 +120,8 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
if not check_ssl:
cmd.extend(['--no-check-certificate', '--no-hsts'])
if extra_args:
cmd.extend(extra_args.split())
if wget_args_extra:
cmd.extend(wget_args_extra)
cmd.append(url)

View File

@@ -1,3 +1,3 @@
{"type": "Binary", "name": "yt-dlp", "binproviders": "pip,brew,apt,env"}
{"type": "Binary", "name": "yt-dlp", "binproviders": "pip,brew,apt,env", "overrides": {"pip": {"packages": "yt-dlp[default]"}}}
{"type": "Binary", "name": "node", "binproviders": "apt,brew,env", "overrides": {"apt": {"packages": ["nodejs"]}}}
{"type": "Binary", "name": "ffmpeg", "binproviders": "apt,brew,env"}

View File

@@ -6,15 +6,28 @@
"YTDLP_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["MEDIA_ENABLED", "SAVE_MEDIA", "USE_MEDIA", "USE_YTDLP", "FETCH_MEDIA", "SAVE_YTDLP"],
"x-aliases": [
"MEDIA_ENABLED",
"SAVE_MEDIA",
"USE_MEDIA",
"USE_YTDLP",
"FETCH_MEDIA",
"SAVE_YTDLP"
],
"description": "Enable video/audio downloading with yt-dlp"
},
"YTDLP_BINARY": {
"type": "string",
"default": "yt-dlp",
"x-aliases": ["MEDIA_BINARY", "YOUTUBEDL_BINARY", "YOUTUBE_DL_BINARY"],
"x-aliases": ["YOUTUBEDL_BINARY", "YOUTUBE_DL_BINARY"],
"description": "Path to yt-dlp binary"
},
"YTDLP_NODE_BINARY": {
"type": "string",
"default": "node",
"x-fallback": "NODE_BINARY",
"description": "Path to Node.js binary for yt-dlp JS runtime"
},
"YTDLP_TIMEOUT": {
"type": "integer",
"default": 3600,
@@ -23,6 +36,12 @@
"x-aliases": ["MEDIA_TIMEOUT"],
"description": "Timeout for yt-dlp downloads in seconds"
},
"YTDLP_COOKIES_FILE": {
"type": "string",
"default": "",
"x-fallback": "COOKIES_FILE",
"description": "Path to cookies file"
},
"YTDLP_MAX_SIZE": {
"type": "string",
"default": "750m",
@@ -34,15 +53,14 @@
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"x-aliases": ["MEDIA_CHECK_SSL_VALIDITY"],
"description": "Whether to verify SSL certificates"
},
"YTDLP_ARGS": {
"type": "array",
"items": {"type": "string"},
"items": { "type": "string" },
"default": [
"--restrict-filenames",
"--trim-filenames", "128",
"--trim-filenames=128",
"--write-description",
"--write-info-json",
"--write-thumbnail",
@@ -56,16 +74,19 @@
"--geo-bypass",
"--add-metadata",
"--no-progress",
"-o", "%(title)s.%(ext)s"
"--remote-components ejs:github",
"-o",
"%(title)s.%(ext)s"
],
"x-aliases": ["MEDIA_ARGS"],
"description": "Default yt-dlp arguments (override to customize behavior)"
"x-aliases": ["YTDLP_DEFAULT_ARGS"],
"description": "Default yt-dlp arguments"
},
"YTDLP_EXTRA_ARGS": {
"type": "string",
"default": "",
"x-aliases": ["MEDIA_EXTRA_ARGS"],
"description": "Extra arguments for yt-dlp (space-separated, appended after YTDLP_ARGS)"
"YTDLP_ARGS_EXTRA": {
"type": "array",
"items": { "type": "string" },
"default": [],
"x-aliases": ["YTDLP_EXTRA_ARGS"],
"description": "Extra arguments to append to yt-dlp command"
}
}
}

View File

@@ -3,24 +3,18 @@
Download video/audio from a URL using yt-dlp.
Usage: on_Snapshot__ytdlp.py --url=<url> --snapshot-id=<uuid>
Output: Downloads video/audio files to $PWD/ytdlp/
Output: Downloads video/audio files to $PWD
Environment variables:
YTDLP_BINARY: Path to yt-dlp binary
YTDLP_TIMEOUT: Timeout in seconds (default: 3600 for large downloads)
YTDLP_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
YTDLP_ARGS: JSON array of yt-dlp arguments (overrides defaults)
YTDLP_EXTRA_ARGS: Extra arguments for yt-dlp (space-separated, appended)
YTDLP_MAX_SIZE: Maximum file size (default: 750m)
# Feature toggles (with backwards-compatible aliases)
YTDLP_ENABLED: Enable yt-dlp extraction (default: True)
SAVE_YTDLP: Alias for YTDLP_ENABLED
MEDIA_ENABLED: Backwards-compatible alias for YTDLP_ENABLED
# Fallback to ARCHIVING_CONFIG values if YTDLP_* not set:
TIMEOUT: Fallback timeout
CHECK_SSL_VALIDITY: Fallback SSL check
YTDLP_BINARY: Path to yt-dlp binary (default: yt-dlp)
YTDLP_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY)
YTDLP_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
YTDLP_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
YTDLP_MAX_SIZE: Maximum file size (default: 750m)
YTDLP_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
YTDLP_ARGS: Default yt-dlp arguments (JSON array)
YTDLP_ARGS_EXTRA: Extra arguments to append (JSON array)
"""
import json
@@ -32,11 +26,6 @@ from pathlib import Path
import rich_click as click
# Extractor metadata
PLUGIN_NAME = 'ytdlp'
BIN_NAME = 'yt-dlp'
BIN_PROVIDERS = 'pip,apt,brew,env'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
@@ -59,6 +48,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Read *name* from the environment and parse it as a JSON array of strings.

    Falls back to *default* (or an empty list when *default* is None) if the
    variable is unset, is not valid JSON, or does not decode to a JSON list.
    Every element of a successfully-decoded list is coerced to ``str``.
    """
    fallback = [] if default is None else default
    raw = get_env(name, '')
    if not raw:
        return fallback
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # Malformed JSON is treated the same as "not set" — best-effort config.
        return fallback
    if not isinstance(parsed, list):
        return fallback
    return [str(entry) for entry in parsed]
STATICFILE_DIR = '../staticfile'
def has_staticfile_output() -> bool:
@@ -67,69 +70,41 @@ def has_staticfile_output() -> bool:
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
# Default yt-dlp args (can be overridden via YTDLP_ARGS env var).
# NOTE(review): flag meanings below are per the yt-dlp CLI docs — confirm
# against the pinned yt-dlp version if behavior matters.
YTDLP_DEFAULT_ARGS = [
    '--restrict-filenames',      # sanitize output filenames (ASCII, no shell-unsafe chars)
    '--trim-filenames', '128',   # cap filename length to stay under filesystem limits
    '--write-description',       # save the media description to a sidecar file
    '--write-info-json',         # save full metadata as .info.json
    '--write-thumbnail',         # save the thumbnail image
    '--write-sub',               # download available subtitles
    '--write-auto-subs',         # also download auto-generated subtitles
    '--convert-subs=srt',        # normalize subtitles to SRT
    '--yes-playlist',            # if the URL is a playlist, download the whole playlist
    '--continue',                # resume partially-downloaded files
    '--no-abort-on-error',       # keep processing remaining items after a failure
    '--ignore-errors',           # treat per-item download errors as non-fatal
    '--geo-bypass',              # attempt to bypass geographic restrictions
    '--add-metadata',            # embed metadata into the downloaded media file
    '--no-progress',             # no progress bar (cleaner non-interactive logs)
    '-o', '%(title)s.%(ext)s',   # output filename template: title + extension
]
def get_ytdlp_args() -> list[str]:
    """Return the yt-dlp argument list to use.

    When the YTDLP_ARGS environment variable holds a JSON array, that array
    (each element coerced to ``str``) replaces the defaults; on any parse
    failure, a non-list value, or an unset variable, YTDLP_DEFAULT_ARGS wins.
    """
    raw = get_env('YTDLP_ARGS', '')
    if not raw:
        return YTDLP_DEFAULT_ARGS
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # Unparseable override — silently fall back to the built-in defaults.
        return YTDLP_DEFAULT_ARGS
    if isinstance(parsed, list):
        return [str(arg) for arg in parsed]
    return YTDLP_DEFAULT_ARGS
def save_ytdlp(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
Download video/audio using yt-dlp.
Returns: (success, output_path, error_message)
"""
# Get config from env (YTDLP_* primary, MEDIA_* as fallback via aliases)
timeout = get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
extra_args = get_env('YTDLP_EXTRA_ARGS', '')
max_size = get_env('YTDLP_MAX_SIZE', '') or get_env('MEDIA_MAX_SIZE', '750m')
# Get config from env (with YTDLP_ prefix, x-fallback handled by config loader)
timeout = get_env_int('YTDLP_TIMEOUT') or get_env_int('TIMEOUT', 3600)
check_ssl = get_env_bool('YTDLP_CHECK_SSL_VALIDITY', True) if get_env('YTDLP_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
cookies_file = get_env('YTDLP_COOKIES_FILE') or get_env('COOKIES_FILE', '')
max_size = get_env('YTDLP_MAX_SIZE', '750m')
node_binary = get_env('YTDLP_NODE_BINARY') or get_env('NODE_BINARY', 'node')
ytdlp_args = get_env_array('YTDLP_ARGS', [])
ytdlp_args_extra = get_env_array('YTDLP_ARGS_EXTRA', [])
# Output directory is current directory (hook already runs in output dir)
output_dir = Path(OUTPUT_DIR)
output_dir = Path('.')
# Build command using configurable YTDLP_ARGS (later options take precedence)
# Build command (later options take precedence)
cmd = [
binary,
*get_ytdlp_args(),
# Format with max_size limit (appended after YTDLP_ARGS so it can be overridden by YTDLP_EXTRA_ARGS)
*ytdlp_args,
# Format with max_size limit (appended after YTDLP_ARGS so it can be overridden by YTDLP_ARGS_EXTRA)
f'--format=(bv*+ba/b)[filesize<={max_size}][filesize_approx<=?{max_size}]/(bv*+ba/b)',
f'--js-runtimes=node:{node_binary}',
]
if not check_ssl:
cmd.append('--no-check-certificate')
if extra_args:
cmd.extend(extra_args.split())
if cookies_file and Path(cookies_file).is_file():
cmd.extend(['--cookies', cookies_file])
if ytdlp_args_extra:
cmd.extend(ytdlp_args_extra)
cmd.append(url)
@@ -193,9 +168,8 @@ def main(url: str, snapshot_id: str):
"""Download video/audio from a URL using yt-dlp."""
try:
# Check if yt-dlp downloading is enabled (YTDLP_ENABLED primary, MEDIA_ENABLED fallback)
ytdlp_enabled = get_env_bool('YTDLP_ENABLED', True) and get_env_bool('MEDIA_ENABLED', True)
if not ytdlp_enabled:
# Check if yt-dlp downloading is enabled
if not get_env_bool('YTDLP_ENABLED', True):
print('Skipping ytdlp (YTDLP_ENABLED=False)', file=sys.stderr)
# Temporary failure (config disabled) - NO JSONL emission
sys.exit(0)