mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-03 14:27:55 +10:00
Merge remote-tracking branch 'origin/dev' into claude/add-max-url-attempts-oBHCD
This commit is contained in:
54
.github/workflows/test-parallel.yml
vendored
54
.github/workflows/test-parallel.yml
vendored
@@ -86,57 +86,33 @@ jobs:
|
||||
python-version: ${{ matrix.python }}
|
||||
architecture: x64
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@v4
|
||||
with:
|
||||
version: "latest"
|
||||
|
||||
- name: Set up Node JS
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20.10.0
|
||||
node-version: 22
|
||||
|
||||
- name: Get pip cache dir
|
||||
id: pip-cache
|
||||
run: |
|
||||
echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Cache pip
|
||||
- name: Cache uv
|
||||
uses: actions/cache@v3
|
||||
id: cache-pip
|
||||
with:
|
||||
path: ${{ steps.pip-cache.outputs.dir }}
|
||||
key: ${{ runner.os }}-${{ matrix.python }}-venv-${{ hashFiles('setup.py') }}
|
||||
path: ~/.cache/uv
|
||||
key: ${{ runner.os }}-${{ matrix.python }}-uv-${{ hashFiles('pyproject.toml', 'uv.lock') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-${{ matrix.python }}-venv-
|
||||
${{ runner.os }}-${{ matrix.python }}-uv-
|
||||
|
||||
- uses: awalsh128/cache-apt-pkgs-action@latest
|
||||
with:
|
||||
packages: ripgrep build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 python3-minimal gnupg2 curl wget python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps
|
||||
version: 1.0
|
||||
packages: git ripgrep build-essential python3-dev python3-setuptools libssl-dev libldap2-dev libsasl2-dev zlib1g-dev libatomic1 python3-minimal gnupg2 curl wget python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps
|
||||
version: 1.1
|
||||
|
||||
- name: Install pip dependencies
|
||||
- name: Install dependencies with uv
|
||||
run: |
|
||||
python -m pip install --upgrade pip setuptools wheel pytest bottle build
|
||||
python -m pip install -r requirements.txt
|
||||
python -m pip install -e .[sonic,ldap]
|
||||
|
||||
- name: Get npm cache dir
|
||||
id: npm-cache
|
||||
run: |
|
||||
echo "dir=$GITHUB_WORKSPACE/node_modules" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Cache npm
|
||||
uses: actions/cache@v3
|
||||
id: cache-npm
|
||||
with:
|
||||
path: ${{ steps.npm-cache.outputs.dir }}
|
||||
key: ${{ runner.os }}-node_modules-${{ hashFiles('package-lock.json') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-node_modules
|
||||
|
||||
- name: Install npm requirements
|
||||
run: |
|
||||
npm install
|
||||
echo "SINGLEFILE_BINARY=$GITHUB_WORKSPACE/node_modules/.bin/single-file" >> $GITHUB_ENV
|
||||
echo "READABILITY_BINARY=$GITHUB_WORKSPACE/node_modules/.bin/readability-extractor" >> $GITHUB_ENV
|
||||
echo "MERCURY_BINARY=$GITHUB_WORKSPACE/node_modules/.bin/mercury-parser" >> $GITHUB_ENV
|
||||
uv sync --dev --all-extras
|
||||
|
||||
- name: Run test - ${{ matrix.test.name }}
|
||||
run: |
|
||||
python -m pytest -xvs "${{ matrix.test.path }}" --basetemp=tests/out --ignore=archivebox/pkgs
|
||||
uv run pytest -xvs "${{ matrix.test.path }}" --basetemp=tests/out --ignore=archivebox/pkgs
|
||||
|
||||
@@ -26,9 +26,7 @@ from archivebox.misc.system import get_dir_size, atomic_write
|
||||
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
|
||||
from archivebox.misc.hashing import get_dir_info
|
||||
from archivebox.hooks import (
|
||||
EXTRACTOR_INDEXING_PRECEDENCE,
|
||||
get_plugins, get_plugin_name, get_plugin_icon,
|
||||
DEFAULT_PLUGIN_ICONS,
|
||||
)
|
||||
from archivebox.base_models.models import (
|
||||
ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
|
||||
@@ -1931,16 +1929,6 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
)
|
||||
|
||||
|
||||
class ArchiveResultManager(models.Manager):
|
||||
def indexable(self, sorted: bool = True):
|
||||
INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE]
|
||||
qs = self.get_queryset().filter(plugin__in=INDEXABLE_METHODS, status='succeeded')
|
||||
if sorted:
|
||||
precedence = [When(plugin=method, then=Value(p)) for method, p in EXTRACTOR_INDEXING_PRECEDENCE]
|
||||
qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000), output_field=IntegerField())).order_by('indexing_precedence')
|
||||
return qs
|
||||
|
||||
|
||||
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
|
||||
class StatusChoices(models.TextChoices):
|
||||
QUEUED = 'queued', 'Queued'
|
||||
@@ -2000,8 +1988,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
state_field_name = 'status'
|
||||
active_state = StatusChoices.STARTED
|
||||
|
||||
objects = ArchiveResultManager()
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
app_label = 'core'
|
||||
verbose_name = 'Archive Result'
|
||||
|
||||
@@ -619,20 +619,6 @@ def is_parser_plugin(plugin: str) -> bool:
|
||||
return name.startswith('parse_') and name.endswith('_urls')
|
||||
|
||||
|
||||
# Precedence order for search indexing (lower number = higher priority)
|
||||
# Used to select which plugin's output to use for full-text search
|
||||
# Plugin names here should match the part after the numeric prefix
|
||||
# e.g., '31_readability' -> 'readability'
|
||||
EXTRACTOR_INDEXING_PRECEDENCE = [
|
||||
('readability', 1),
|
||||
('mercury', 2),
|
||||
('htmltotext', 3),
|
||||
('singlefile', 4),
|
||||
('dom', 5),
|
||||
('wget', 6),
|
||||
]
|
||||
|
||||
|
||||
def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
|
||||
"""
|
||||
Get the list of enabled plugins based on config and available hooks.
|
||||
@@ -960,25 +946,6 @@ DEFAULT_TEMPLATES = {
|
||||
''',
|
||||
}
|
||||
|
||||
# Default icons for known extractor plugins (emoji or short HTML)
|
||||
DEFAULT_PLUGIN_ICONS = {
|
||||
'screenshot': '📷',
|
||||
'pdf': '📄',
|
||||
'singlefile': '📦',
|
||||
'dom': '🌐',
|
||||
'wget': '📥',
|
||||
'media': '🎬',
|
||||
'git': '📂',
|
||||
'readability': '📖',
|
||||
'mercury': '☿️',
|
||||
'favicon': '⭐',
|
||||
'title': '📝',
|
||||
'headers': '📋',
|
||||
'archive_org': '🏛️',
|
||||
'htmltotext': '📃',
|
||||
'warc': '🗄️',
|
||||
}
|
||||
|
||||
|
||||
def get_plugin_template(plugin: str, template_name: str, fallback: bool = True) -> Optional[str]:
|
||||
"""
|
||||
@@ -1018,10 +985,7 @@ def get_plugin_template(plugin: str, template_name: str, fallback: bool = True)
|
||||
|
||||
def get_plugin_icon(plugin: str) -> str:
|
||||
"""
|
||||
Get the icon for a plugin.
|
||||
|
||||
First checks for plugin-provided icon.html template,
|
||||
then falls back to DEFAULT_PLUGIN_ICONS.
|
||||
Get the icon for a plugin from its icon.html template.
|
||||
|
||||
Args:
|
||||
plugin: Plugin name (e.g., 'screenshot', '15_singlefile')
|
||||
@@ -1029,15 +993,13 @@ def get_plugin_icon(plugin: str) -> str:
|
||||
Returns:
|
||||
Icon HTML/emoji string.
|
||||
"""
|
||||
base_name = get_plugin_name(plugin)
|
||||
|
||||
# Try plugin-provided icon template
|
||||
icon_template = get_plugin_template(plugin, 'icon', fallback=False)
|
||||
if icon_template:
|
||||
return icon_template.strip()
|
||||
|
||||
# Fall back to default icon
|
||||
return DEFAULT_PLUGIN_ICONS.get(base_name, '📁')
|
||||
# Fall back to generic folder icon
|
||||
return '📁'
|
||||
|
||||
|
||||
def get_all_plugin_icons() -> Dict[str, str]:
|
||||
|
||||
@@ -9,10 +9,10 @@
|
||||
"x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"],
|
||||
"description": "Path to Chrome/Chromium binary"
|
||||
},
|
||||
"NODE_BINARY": {
|
||||
"CHROME_NODE_BINARY": {
|
||||
"type": "string",
|
||||
"default": "node",
|
||||
"x-aliases": ["NODEJS_BINARY"],
|
||||
"x-fallback": "NODE_BINARY",
|
||||
"description": "Path to Node.js binary (for Puppeteer)"
|
||||
},
|
||||
"CHROME_TIMEOUT": {
|
||||
@@ -50,16 +50,19 @@
|
||||
"x-fallback": "USER_AGENT",
|
||||
"description": "User agent string for Chrome"
|
||||
},
|
||||
"CHROME_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra command-line arguments for Chrome (space-separated)"
|
||||
"CHROME_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["CHROME_DEFAULT_ARGS"],
|
||||
"description": "Default Chrome command-line arguments"
|
||||
},
|
||||
"CHROME_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"description": "Whether to verify SSL certificates"
|
||||
"CHROME_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["CHROME_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to Chrome command"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,12 +21,6 @@
|
||||
"default": "",
|
||||
"x-fallback": "USER_AGENT",
|
||||
"description": "User agent string"
|
||||
},
|
||||
"FAVICON_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"description": "Whether to verify SSL certificates"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,21 +27,25 @@
|
||||
"enum": ["jsonl", "warc", "mbox", "maildir", "mh", "mmdf", "babyl"],
|
||||
"description": "Output format for forum downloads"
|
||||
},
|
||||
"FORUMDL_TEXTIFY": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "Convert HTML content to plaintext (keep false to preserve HTML)"
|
||||
},
|
||||
"FORUMDL_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"description": "Whether to verify SSL certificates"
|
||||
},
|
||||
"FORUMDL_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra arguments for forum-dl (space-separated)"
|
||||
"FORUMDL_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["FORUMDL_DEFAULT_ARGS"],
|
||||
"description": "Default forum-dl arguments"
|
||||
},
|
||||
"FORUMDL_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["FORUMDL_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to forum-dl command"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,19 +6,13 @@ Usage: on_Snapshot__forumdl.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Downloads forum content to $PWD/
|
||||
|
||||
Environment variables:
|
||||
FORUMDL_BINARY: Path to forum-dl binary
|
||||
FORUMDL_TIMEOUT: Timeout in seconds (default: 3600 for large forums)
|
||||
FORUMDL_ENABLED: Enable forum downloading (default: True)
|
||||
FORUMDL_BINARY: Path to forum-dl binary (default: forum-dl)
|
||||
FORUMDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
|
||||
FORUMDL_OUTPUT_FORMAT: Output format (default: jsonl)
|
||||
FORUMDL_TEXTIFY: Convert HTML to plaintext (default: False - keeps HTML)
|
||||
FORUMDL_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
|
||||
FORUMDL_EXTRA_ARGS: Extra arguments for forum-dl (space-separated)
|
||||
|
||||
# Forum-dl feature toggles
|
||||
SAVE_FORUMDL: Enable forum-dl forum extraction (default: True)
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if FORUMDL_* not set:
|
||||
TIMEOUT: Fallback timeout
|
||||
CHECK_SSL_VALIDITY: Fallback SSL check
|
||||
FORUMDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
|
||||
FORUMDL_ARGS: Default forum-dl arguments (JSON array)
|
||||
FORUMDL_ARGS_EXTRA: Extra arguments to append (JSON array)
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -78,6 +72,20 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
return default
|
||||
|
||||
|
||||
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
|
||||
"""Parse a JSON array from environment variable."""
|
||||
val = get_env(name, '')
|
||||
if not val:
|
||||
return default if default is not None else []
|
||||
try:
|
||||
result = json.loads(val)
|
||||
if isinstance(result, list):
|
||||
return [str(item) for item in result]
|
||||
return default if default is not None else []
|
||||
except json.JSONDecodeError:
|
||||
return default if default is not None else []
|
||||
|
||||
|
||||
|
||||
def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
"""
|
||||
@@ -85,11 +93,11 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
# Get config from env
|
||||
timeout = get_env_int('TIMEOUT', 3600)
|
||||
check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
|
||||
textify = get_env_bool('FORUMDL_TEXTIFY', False)
|
||||
extra_args = get_env('FORUMDL_EXTRA_ARGS', '')
|
||||
# Get config from env (with FORUMDL_ prefix, x-fallback handled by config loader)
|
||||
timeout = get_env_int('FORUMDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
|
||||
check_ssl = get_env_bool('FORUMDL_CHECK_SSL_VALIDITY', True) if get_env('FORUMDL_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
|
||||
forumdl_args = get_env_array('FORUMDL_ARGS', [])
|
||||
forumdl_args_extra = get_env_array('FORUMDL_ARGS_EXTRA', [])
|
||||
output_format = get_env('FORUMDL_OUTPUT_FORMAT', 'jsonl')
|
||||
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
@@ -108,16 +116,13 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
output_file = output_dir / f'forum.{output_format}'
|
||||
|
||||
# Build command
|
||||
cmd = [binary, '-f', output_format, '-o', str(output_file)]
|
||||
|
||||
if textify:
|
||||
cmd.append('--textify')
|
||||
cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)]
|
||||
|
||||
if not check_ssl:
|
||||
cmd.append('--no-check-certificate')
|
||||
|
||||
if extra_args:
|
||||
cmd.extend(extra_args.split())
|
||||
if forumdl_args_extra:
|
||||
cmd.extend(forumdl_args_extra)
|
||||
|
||||
cmd.append(url)
|
||||
|
||||
|
||||
@@ -21,6 +21,12 @@
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for gallery downloads in seconds"
|
||||
},
|
||||
"GALLERYDL_COOKIES_FILE": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "COOKIES_FILE",
|
||||
"description": "Path to cookies file"
|
||||
},
|
||||
"GALLERYDL_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
@@ -34,12 +40,15 @@
|
||||
"--write-metadata",
|
||||
"--write-info-json"
|
||||
],
|
||||
"x-aliases": ["GALLERYDL_DEFAULT_ARGS"],
|
||||
"description": "Default gallery-dl arguments"
|
||||
},
|
||||
"GALLERYDL_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra arguments for gallery-dl (space-separated)"
|
||||
"GALLERYDL_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["GALLERYDL_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to gallery-dl command"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,20 +6,13 @@ Usage: on_Snapshot__gallerydl.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Downloads gallery images to $PWD/gallerydl/
|
||||
|
||||
Environment variables:
|
||||
GALLERYDL_BINARY: Path to gallery-dl binary
|
||||
GALLERYDL_TIMEOUT: Timeout in seconds (default: 3600 for large galleries)
|
||||
GALLERYDL_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
|
||||
GALLERYDL_EXTRA_ARGS: Extra arguments for gallery-dl (space-separated)
|
||||
COOKIES_FILE: Path to cookies file for authentication
|
||||
|
||||
# Gallery-dl feature toggles
|
||||
USE_GALLERYDL: Enable gallery-dl gallery extraction (default: True)
|
||||
SAVE_GALLERYDL: Alias for USE_GALLERYDL
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if GALLERYDL_* not set:
|
||||
GALLERYDL_TIMEOUT: Fallback timeout for gallery downloads
|
||||
TIMEOUT: Fallback timeout
|
||||
CHECK_SSL_VALIDITY: Fallback SSL check
|
||||
GALLERYDL_ENABLED: Enable gallery-dl gallery extraction (default: True)
|
||||
GALLERYDL_BINARY: Path to gallery-dl binary (default: gallery-dl)
|
||||
GALLERYDL_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
|
||||
GALLERYDL_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
|
||||
GALLERYDL_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
|
||||
GALLERYDL_ARGS: Default gallery-dl arguments (JSON array)
|
||||
GALLERYDL_ARGS_EXTRA: Extra arguments to append (JSON array)
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -58,6 +51,20 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
return default
|
||||
|
||||
|
||||
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
|
||||
"""Parse a JSON array from environment variable."""
|
||||
val = get_env(name, '')
|
||||
if not val:
|
||||
return default if default is not None else []
|
||||
try:
|
||||
result = json.loads(val)
|
||||
if isinstance(result, list):
|
||||
return [str(item) for item in result]
|
||||
return default if default is not None else []
|
||||
except json.JSONDecodeError:
|
||||
return default if default is not None else []
|
||||
|
||||
|
||||
STATICFILE_DIR = '../staticfile'
|
||||
|
||||
def has_staticfile_output() -> bool:
|
||||
@@ -66,35 +73,27 @@ def has_staticfile_output() -> bool:
|
||||
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
|
||||
|
||||
|
||||
# Default gallery-dl args
|
||||
def get_gallerydl_default_args() -> list[str]:
|
||||
"""Build default gallery-dl arguments."""
|
||||
return [
|
||||
'--write-metadata',
|
||||
'--write-info-json',
|
||||
]
|
||||
|
||||
|
||||
def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
"""
|
||||
Download gallery using gallery-dl.
|
||||
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
# Get config from env
|
||||
timeout = get_env_int('TIMEOUT', 3600)
|
||||
check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
|
||||
extra_args = get_env('GALLERYDL_EXTRA_ARGS', '')
|
||||
cookies_file = get_env('COOKIES_FILE', '')
|
||||
# Get config from env (with GALLERYDL_ prefix, x-fallback handled by config loader)
|
||||
timeout = get_env_int('GALLERYDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
|
||||
check_ssl = get_env_bool('GALLERYDL_CHECK_SSL_VALIDITY', True) if get_env('GALLERYDL_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
|
||||
gallerydl_args = get_env_array('GALLERYDL_ARGS', [])
|
||||
gallerydl_args_extra = get_env_array('GALLERYDL_ARGS_EXTRA', [])
|
||||
cookies_file = get_env('GALLERYDL_COOKIES_FILE') or get_env('COOKIES_FILE', '')
|
||||
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
|
||||
# Build command (later options take precedence)
|
||||
# Build command
|
||||
# Use -D for exact directory (flat structure) instead of -d (nested structure)
|
||||
cmd = [
|
||||
binary,
|
||||
*get_gallerydl_default_args(),
|
||||
*gallerydl_args,
|
||||
'-D', str(output_dir),
|
||||
]
|
||||
|
||||
@@ -104,8 +103,8 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
if cookies_file and Path(cookies_file).exists():
|
||||
cmd.extend(['-C', cookies_file])
|
||||
|
||||
if extra_args:
|
||||
cmd.extend(extra_args.split())
|
||||
if gallerydl_args_extra:
|
||||
cmd.extend(gallerydl_args_extra)
|
||||
|
||||
cmd.append(url)
|
||||
|
||||
|
||||
@@ -26,16 +26,19 @@
|
||||
"default": "github.com,gitlab.com,bitbucket.org,gist.github.com,codeberg.org,gitea.com,git.sr.ht",
|
||||
"description": "Comma-separated list of domains to treat as git repositories"
|
||||
},
|
||||
"GIT_CLONE_DEPTH": {
|
||||
"type": "integer",
|
||||
"default": 1,
|
||||
"minimum": 0,
|
||||
"description": "Depth of git clone (0 for full history, 1 for shallow)"
|
||||
"GIT_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": ["clone", "--depth=1", "--recursive"],
|
||||
"x-aliases": ["GIT_DEFAULT_ARGS"],
|
||||
"description": "Default git arguments"
|
||||
},
|
||||
"GIT_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra arguments for git clone"
|
||||
"GIT_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["GIT_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to git command"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,8 @@ Output: Clones repository to $PWD/repo
|
||||
Environment variables:
|
||||
GIT_BINARY: Path to git binary
|
||||
GIT_TIMEOUT: Timeout in seconds (default: 120)
|
||||
GIT_ARGS: Extra arguments for git clone (space-separated)
|
||||
GIT_ARGS: Default git arguments (JSON array, default: ["clone", "--depth=1", "--recursive"])
|
||||
GIT_ARGS_EXTRA: Extra arguments to append (JSON array, default: [])
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if GIT_* not set:
|
||||
TIMEOUT: Fallback timeout
|
||||
@@ -41,6 +42,20 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
return default
|
||||
|
||||
|
||||
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
|
||||
"""Parse a JSON array from environment variable."""
|
||||
val = get_env(name, '')
|
||||
if not val:
|
||||
return default if default is not None else []
|
||||
try:
|
||||
result = json.loads(val)
|
||||
if isinstance(result, list):
|
||||
return [str(item) for item in result]
|
||||
return default if default is not None else []
|
||||
except json.JSONDecodeError:
|
||||
return default if default is not None else []
|
||||
|
||||
|
||||
def is_git_url(url: str) -> bool:
|
||||
"""Check if URL looks like a git repository."""
|
||||
git_patterns = [
|
||||
@@ -61,19 +76,10 @@ def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
timeout = get_env_int('GIT_TIMEOUT') or get_env_int('TIMEOUT', 120)
|
||||
extra_args = get_env('GIT_ARGS')
|
||||
git_args = get_env_array('GIT_ARGS', [])
|
||||
git_args_extra = get_env_array('GIT_ARGS_EXTRA', [])
|
||||
|
||||
cmd = [
|
||||
binary,
|
||||
'clone',
|
||||
'--depth=1',
|
||||
'--recursive',
|
||||
]
|
||||
|
||||
if extra_args:
|
||||
cmd.extend(extra_args.split())
|
||||
|
||||
cmd.extend([url, OUTPUT_DIR])
|
||||
cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR]
|
||||
|
||||
try:
|
||||
result = subprocess.run(cmd, capture_output=True, timeout=timeout)
|
||||
|
||||
46
archivebox/plugins/infiniscroll/config.json
Normal file
46
archivebox/plugins/infiniscroll/config.json
Normal file
@@ -0,0 +1,46 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required_plugins": ["chrome"],
|
||||
"properties": {
|
||||
"INFINISCROLL_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_INFINISCROLL", "USE_INFINISCROLL"],
|
||||
"description": "Enable infinite scroll page expansion"
|
||||
},
|
||||
"INFINISCROLL_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 120,
|
||||
"minimum": 10,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Maximum timeout for scrolling in seconds"
|
||||
},
|
||||
"INFINISCROLL_SCROLL_DELAY": {
|
||||
"type": "integer",
|
||||
"default": 2000,
|
||||
"minimum": 500,
|
||||
"description": "Delay between scrolls in milliseconds"
|
||||
},
|
||||
"INFINISCROLL_SCROLL_DISTANCE": {
|
||||
"type": "integer",
|
||||
"default": 1600,
|
||||
"minimum": 100,
|
||||
"description": "Distance to scroll per step in pixels"
|
||||
},
|
||||
"INFINISCROLL_SCROLL_LIMIT": {
|
||||
"type": "integer",
|
||||
"default": 10,
|
||||
"minimum": 1,
|
||||
"maximum": 100,
|
||||
"description": "Maximum number of scroll steps"
|
||||
},
|
||||
"INFINISCROLL_MIN_HEIGHT": {
|
||||
"type": "integer",
|
||||
"default": 16000,
|
||||
"minimum": 1000,
|
||||
"description": "Minimum page height to scroll to in pixels"
|
||||
}
|
||||
}
|
||||
}
|
||||
267
archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js
Executable file
267
archivebox/plugins/infiniscroll/on_Snapshot__45_infiniscroll.js
Executable file
@@ -0,0 +1,267 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Scroll the page down to trigger infinite scroll / lazy loading.
|
||||
*
|
||||
* Scrolls down 1 page at a time, up to INFINISCROLL_SCROLL_LIMIT times,
|
||||
* ensuring at least INFINISCROLL_MIN_HEIGHT (default 16,000px) is reached.
|
||||
* Stops early if no new content loads after a scroll.
|
||||
*
|
||||
* Usage: on_Snapshot__45_infiniscroll.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: JSONL with scroll stats (no files created)
|
||||
*
|
||||
* Environment variables:
|
||||
* INFINISCROLL_ENABLED: Enable/disable (default: true)
|
||||
* INFINISCROLL_TIMEOUT: Max timeout in seconds (default: 120)
|
||||
* INFINISCROLL_SCROLL_DELAY: Delay between scrolls in ms (default: 2000)
|
||||
* INFINISCROLL_SCROLL_DISTANCE: Pixels per scroll (default: 1600)
|
||||
* INFINISCROLL_SCROLL_LIMIT: Max scroll iterations (default: 10)
|
||||
* INFINISCROLL_MIN_HEIGHT: Min page height to reach in px (default: 16000)
|
||||
*/
|
||||
|
||||
function getEnv(name, defaultValue = '') {
|
||||
return (process.env[name] || defaultValue).trim();
|
||||
}
|
||||
|
||||
function getEnvBool(name, defaultValue = false) {
|
||||
const val = getEnv(name, '').toLowerCase();
|
||||
if (['true', '1', 'yes', 'on'].includes(val)) return true;
|
||||
if (['false', '0', 'no', 'off'].includes(val)) return false;
|
||||
return defaultValue;
|
||||
}
|
||||
|
||||
function getEnvInt(name, defaultValue = 0) {
|
||||
const val = parseInt(getEnv(name, String(defaultValue)), 10);
|
||||
return isNaN(val) ? defaultValue : val;
|
||||
}
|
||||
|
||||
// Check if infiniscroll is enabled BEFORE requiring puppeteer
|
||||
if (!getEnvBool('INFINISCROLL_ENABLED', true)) {
|
||||
console.error('Skipping infiniscroll (INFINISCROLL_ENABLED=False)');
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const PLUGIN_NAME = 'infiniscroll';
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
function parseArgs() {
|
||||
const args = {};
|
||||
process.argv.slice(2).forEach(arg => {
|
||||
if (arg.startsWith('--')) {
|
||||
const [key, ...valueParts] = arg.slice(2).split('=');
|
||||
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
|
||||
}
|
||||
});
|
||||
return args;
|
||||
}
|
||||
|
||||
function getCdpUrl() {
|
||||
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
|
||||
if (fs.existsSync(cdpFile)) {
|
||||
return fs.readFileSync(cdpFile, 'utf8').trim();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
function getPageId() {
|
||||
const targetIdFile = path.join(CHROME_SESSION_DIR, 'target_id.txt');
|
||||
if (fs.existsSync(targetIdFile)) {
|
||||
return fs.readFileSync(targetIdFile, 'utf8').trim();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
async function waitForChromeTabLoaded(timeoutMs = 60000) {
|
||||
const navigationFile = path.join(CHROME_SESSION_DIR, 'navigation.json');
|
||||
const startTime = Date.now();
|
||||
|
||||
while (Date.now() - startTime < timeoutMs) {
|
||||
if (fs.existsSync(navigationFile)) {
|
||||
return true;
|
||||
}
|
||||
await new Promise(resolve => setTimeout(resolve, 100));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
function sleep(ms) {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
async function scrollDown(page, options = {}) {
|
||||
const {
|
||||
timeout = 120000,
|
||||
scrollDelay = 2000,
|
||||
scrollDistance = 1600,
|
||||
scrollLimit = 10,
|
||||
minHeight = 16000,
|
||||
} = options;
|
||||
|
||||
const startTime = Date.now();
|
||||
const startingHeight = await page.evaluate(() => document.body.scrollHeight);
|
||||
let lastHeight = startingHeight;
|
||||
let scrollCount = 0;
|
||||
let scrollPosition = 0;
|
||||
|
||||
// Scroll to top first
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, left: 0, behavior: 'smooth' });
|
||||
});
|
||||
await sleep(500);
|
||||
|
||||
while (scrollCount < scrollLimit) {
|
||||
// Check timeout
|
||||
const elapsed = Date.now() - startTime;
|
||||
if (elapsed >= timeout) {
|
||||
console.error(`Timeout reached after ${scrollCount} scrolls`);
|
||||
break;
|
||||
}
|
||||
|
||||
scrollPosition = (scrollCount + 1) * scrollDistance;
|
||||
console.error(`Scrolling down ${scrollCount + 1}x ${scrollDistance}px... (${scrollPosition}/${lastHeight})`);
|
||||
|
||||
await page.evaluate((yOffset) => {
|
||||
window.scrollTo({ top: yOffset, left: 0, behavior: 'smooth' });
|
||||
}, scrollPosition);
|
||||
|
||||
scrollCount++;
|
||||
await sleep(scrollDelay);
|
||||
|
||||
// Check if new content was added (infinite scroll detection)
|
||||
const newHeight = await page.evaluate(() => document.body.scrollHeight);
|
||||
const addedPx = newHeight - lastHeight;
|
||||
|
||||
if (addedPx > 0) {
|
||||
console.error(`Detected infini-scrolling: ${lastHeight}+${addedPx} => ${newHeight}`);
|
||||
} else if (scrollPosition >= newHeight + scrollDistance) {
|
||||
// Reached the bottom
|
||||
if (scrollCount > 2) {
|
||||
console.error(`Reached bottom of page at ${newHeight}px`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
lastHeight = newHeight;
|
||||
|
||||
// Check if we've reached minimum height and can stop
|
||||
if (lastHeight >= minHeight && scrollPosition >= lastHeight) {
|
||||
console.error(`Reached minimum height target (${minHeight}px)`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Scroll to absolute bottom
|
||||
if (scrollPosition < lastHeight) {
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: document.body.scrollHeight, left: 0, behavior: 'smooth' });
|
||||
});
|
||||
await sleep(scrollDelay);
|
||||
}
|
||||
|
||||
// Scroll back to top
|
||||
console.error(`Reached bottom of page at ${lastHeight}px, scrolling back to top...`);
|
||||
await page.evaluate(() => {
|
||||
window.scrollTo({ top: 0, left: 0, behavior: 'smooth' });
|
||||
});
|
||||
await sleep(scrollDelay);
|
||||
|
||||
const totalElapsed = Date.now() - startTime;
|
||||
|
||||
return {
|
||||
scrollCount,
|
||||
finalHeight: lastHeight,
|
||||
startingHeight,
|
||||
elapsedMs: totalElapsed,
|
||||
};
|
||||
}
|
||||
|
||||
// Entry point: connect to the Chrome session started by the chrome plugin,
// locate this snapshot's tab, run the infinite-scroll loop, and report an
// ArchiveResult record as JSONL on stdout (progress/errors go to stderr).
async function main() {
    const die = (msg) => {
        console.error(msg);
        process.exit(1);
    };

    const args = parseArgs();
    const { url, snapshot_id: snapshotId } = args;

    if (!url || !snapshotId) {
        die('Usage: on_Snapshot__45_infiniscroll.js --url=<url> --snapshot-id=<uuid>');
    }

    // Tunables come from INFINISCROLL_* env vars; timeout is in seconds, delays in ms
    const options = {
        timeout: getEnvInt('INFINISCROLL_TIMEOUT', 120) * 1000,
        scrollDelay: getEnvInt('INFINISCROLL_SCROLL_DELAY', 2000),
        scrollDistance: getEnvInt('INFINISCROLL_SCROLL_DISTANCE', 1600),
        scrollLimit: getEnvInt('INFINISCROLL_SCROLL_LIMIT', 10),
        minHeight: getEnvInt('INFINISCROLL_MIN_HEIGHT', 16000),
    };

    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
        die('ERROR: Chrome CDP URL not found (chrome plugin must run first)');
    }

    // chrome_navigate must have finished loading the page before we scroll
    if (!(await waitForChromeTabLoaded(60000))) {
        die('ERROR: Page not loaded after 60s (chrome_navigate must complete first)');
    }

    let browser = null;
    try {
        browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });

        const pages = await browser.pages();
        if (pages.length === 0) {
            throw new Error('No pages found in browser');
        }

        // Prefer the tab matching the recorded target ID; fall back to the newest tab.
        // NOTE(review): _targetId is a private puppeteer field — may break across versions.
        const targetId = getPageId();
        let page = targetId
            ? pages.find(p => p.target()?._targetId === targetId)
            : null;
        if (!page) {
            page = pages[pages.length - 1];
        }

        console.error(`Starting infinite scroll on ${url}`);
        const result = await scrollDown(page, options);

        // Disconnect (not close) so the shared Chrome session stays alive for other hooks
        browser.disconnect();

        // Human-readable summary, e.g. "scrolled to 32,000px (+16,000px new content) over 12.3s"
        const elapsedSec = (result.elapsedMs / 1000).toFixed(1);
        const finalHeightStr = result.finalHeight.toLocaleString();
        const addedHeight = result.finalHeight - result.startingHeight;
        const addedStr = addedHeight > 0 ? `+${addedHeight.toLocaleString()}px new content` : 'no new content';
        const outputStr = `scrolled to ${finalHeightStr}px (${addedStr}) over ${elapsedSec}s`;

        console.error(`Success: ${outputStr}`);
        console.log(JSON.stringify({
            type: 'ArchiveResult',
            status: 'succeeded',
            output_str: outputStr,
        }));
        process.exit(0);

    } catch (e) {
        if (browser) browser.disconnect();
        console.error(`ERROR: ${e.name}: ${e.message}`);
        process.exit(1);
    }
}
|
||||
|
||||
// Top-level error guard: any unhandled rejection from main() is fatal.
main().catch(err => {
    console.error(`Fatal error: ${err.message}`);
    process.exit(1);
});
|
||||
352
archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
Normal file
352
archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
Normal file
@@ -0,0 +1,352 @@
|
||||
"""
|
||||
Integration tests for infiniscroll plugin
|
||||
|
||||
Tests verify:
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via chrome validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
4. INFINISCROLL_ENABLED=False skips without JSONL
|
||||
5. Fails gracefully when no chrome session exists
|
||||
6. Full integration test: scrolls page and outputs stats
|
||||
7. Config options work (scroll limit, min height)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import subprocess
|
||||
import time
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Locations of the plugin hooks under test, resolved relative to this file.
# The infiniscroll/navigate hooks are matched by glob so their numeric
# ordering prefix can change without breaking the tests.
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
TEST_URL = 'https://www.singsing.movie/'
|
||||
|
||||
|
||||
def get_node_modules_dir():
    """Resolve the node_modules directory used for NODE_PATH in tests.

    Prefers an explicit NODE_PATH from the environment; otherwise derives
    the path from LIB_DIR (env var, falling back to STORAGE_CONFIG.LIB_DIR).
    """
    node_path = os.environ.get('NODE_PATH')
    if node_path:
        return Path(node_path)
    # Fall back to <LIB_DIR>/npm/node_modules computed from ArchiveBox config
    from archivebox.config.common import STORAGE_CONFIG
    lib_dir = os.environ.get('LIB_DIR') or str(STORAGE_CONFIG.LIB_DIR)
    return Path(lib_dir) / 'npm' / 'node_modules'
|
||||
|
||||
|
||||
NODE_MODULES_DIR = get_node_modules_dir()
|
||||
|
||||
|
||||
def get_test_env():
    """Return a copy of the current environment with NODE_PATH pointing at the test node_modules."""
    return {**os.environ, 'NODE_PATH': str(NODE_MODULES_DIR)}
|
||||
|
||||
|
||||
def test_hook_script_exists():
    """The on_Snapshot infiniscroll hook script must be discoverable on disk."""
    hook = INFINISCROLL_HOOK
    assert hook is not None, "Infiniscroll hook not found"
    assert hook.exists(), f"Hook not found: {hook}"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
    """Verify dependencies are available via abx-pkg after hook installation."""
    from abx_pkg import Binary, EnvProvider, BinProviderOverrides

    EnvProvider.model_rebuild()

    # Node.js must resolve to a real binary on PATH for the hook to run at all
    node_binary = Binary(name='node', binproviders=[EnvProvider()])
    loaded = node_binary.load()
    assert loaded and loaded.abspath, "Node.js required for infiniscroll plugin"
|
||||
|
||||
|
||||
def test_config_infiniscroll_disabled_skips():
    """Test that INFINISCROLL_ENABLED=False exits without emitting JSONL."""
    with tempfile.TemporaryDirectory() as workdir:
        env = get_test_env()
        env['INFINISCROLL_ENABLED'] = 'False'

        proc = subprocess.run(
            ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-disabled'],
            cwd=Path(workdir),
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )

        assert proc.returncode == 0, f"Should exit 0 when feature disabled: {proc.stderr}"
        assert 'Skipping' in proc.stderr or 'False' in proc.stderr, "Should log skip reason to stderr"

        # No ArchiveResult JSONL should be produced when the plugin is disabled
        jsonl_lines = [ln for ln in proc.stdout.strip().split('\n') if ln.strip().startswith('{')]
        assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, got: {jsonl_lines}"
|
||||
|
||||
|
||||
def test_fails_gracefully_without_chrome_session():
    """Test that hook fails gracefully when no chrome session exists."""
    with tempfile.TemporaryDirectory() as workdir:
        proc = subprocess.run(
            ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=test-no-chrome'],
            cwd=Path(workdir),
            capture_output=True,
            text=True,
            env=get_test_env(),
            timeout=30,
        )

        # Without a running chrome session the hook must exit non-zero
        assert proc.returncode != 0, "Should fail when no chrome session exists"
        # Error could be about chrome/CDP not found, or puppeteer module missing
        stderr_lc = proc.stderr.lower()
        assert any(marker in stderr_lc for marker in ('chrome', 'cdp', 'puppeteer', 'module')), \
            f"Should mention chrome/CDP/puppeteer in error: {proc.stderr}"
|
||||
|
||||
|
||||
def setup_chrome_session(tmpdir):
    """Helper to set up Chrome session with tab and navigation.

    Launches Chrome at crawl scope via the chrome plugin's launch hook,
    opens a tab bound to the crawl, navigates it to TEST_URL, and returns
    (launch_process, chrome_pid, snapshot_chrome_dir) for the caller to
    clean up with cleanup_chrome().
    """
    base_dir = Path(tmpdir)
    crawl_dir = base_dir / 'crawl'
    crawl_dir.mkdir()
    chrome_dir = crawl_dir / 'chrome'

    env = get_test_env()
    env['CHROME_HEADLESS'] = 'true'

    # Launch Chrome at crawl level (long-running background hook)
    launch_proc = subprocess.Popen(
        ['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-infiniscroll'],
        cwd=str(crawl_dir),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env,
    )

    # Poll up to 15s for the CDP URL file that signals Chrome is ready
    cdp_file = chrome_dir / 'cdp_url.txt'
    for _ in range(15):
        if launch_proc.poll() is not None:
            stdout, stderr = launch_proc.communicate()
            raise RuntimeError(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
        if cdp_file.exists():
            break
        time.sleep(1)

    if not cdp_file.exists():
        raise RuntimeError("Chrome CDP URL not found after 15s")

    chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())

    # Create snapshot directory structure mirroring what the orchestrator builds
    snapshot_dir = base_dir / 'snapshot'
    snapshot_dir.mkdir()
    snapshot_chrome_dir = snapshot_dir / 'chrome'
    snapshot_chrome_dir.mkdir()

    # Open a tab bound to this crawl's Chrome session
    tab_env = {**env, 'CRAWL_OUTPUT_DIR': str(crawl_dir)}
    tab_result = subprocess.run(
        ['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll', '--crawl-id=test-infiniscroll'],
        cwd=str(snapshot_chrome_dir),
        capture_output=True,
        text=True,
        timeout=60,
        env=tab_env,
    )
    if tab_result.returncode != 0:
        raise RuntimeError(f"Tab creation failed: {tab_result.stderr}")

    # Navigate the tab to the target URL
    nav_result = subprocess.run(
        ['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
        cwd=str(snapshot_chrome_dir),
        capture_output=True,
        text=True,
        timeout=120,
        env=env,
    )
    if nav_result.returncode != 0:
        raise RuntimeError(f"Navigation failed: {nav_result.stderr}")

    return launch_proc, chrome_pid, snapshot_chrome_dir
|
||||
|
||||
|
||||
def cleanup_chrome(chrome_launch_process, chrome_pid):
    """Helper to clean up Chrome processes.

    First asks the launch hook process to exit (SIGTERM, 5s grace period),
    then force-kills the Chrome browser process itself. All failures are
    deliberately ignored: either process may already have exited.
    """
    try:
        chrome_launch_process.send_signal(signal.SIGTERM)
        chrome_launch_process.wait(timeout=5)
    except Exception:
        # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
        # can still propagate out of test teardown.
        pass
    try:
        os.kill(chrome_pid, signal.SIGKILL)
    except OSError:
        # Process already gone (ProcessLookupError) or not killable — fine.
        pass
|
||||
|
||||
|
||||
def test_scrolls_page_and_outputs_stats():
    """Integration test: scroll page and verify JSONL output format."""
    with tempfile.TemporaryDirectory() as tmpdir:
        chrome_launch_process = None
        chrome_pid = None
        try:
            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)

            # Create infiniscroll output directory (sibling to chrome)
            infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
            infiniscroll_dir.mkdir()

            # Tighten the config so the test completes quickly
            env = get_test_env()
            env.update({
                'INFINISCROLL_SCROLL_LIMIT': '3',
                'INFINISCROLL_SCROLL_DELAY': '500',
                'INFINISCROLL_MIN_HEIGHT': '1000',
            })

            proc = subprocess.run(
                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
                cwd=str(infiniscroll_dir),
                capture_output=True,
                text=True,
                timeout=60,
                env=env,
            )

            assert proc.returncode == 0, f"Infiniscroll failed: {proc.stderr}\nStdout: {proc.stdout}"

            # Find the first ArchiveResult record in the JSONL stdout
            result_json = None
            for raw_line in proc.stdout.strip().split('\n'):
                candidate = raw_line.strip()
                if not candidate.startswith('{'):
                    continue
                try:
                    record = json.loads(candidate)
                except json.JSONDecodeError:
                    continue
                if record.get('type') == 'ArchiveResult':
                    result_json = record
                    break

            assert result_json is not None, f"Should have ArchiveResult JSONL output. Stdout: {proc.stdout}"
            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

            # Verify output_str format: "scrolled to X,XXXpx (+Y,YYYpx new content) over Z.Zs"
            output_str = result_json.get('output_str', '')
            assert output_str.startswith('scrolled to'), f"output_str should start with 'scrolled to': {output_str}"
            assert 'px' in output_str, f"output_str should contain pixel count: {output_str}"
            assert re.search(r'over \d+(\.\d+)?s', output_str), f"output_str should contain duration: {output_str}"

            # The hook is side-effect free: it must not write any files
            output_files = list(infiniscroll_dir.iterdir())
            assert len(output_files) == 0, f"Should not create any files, but found: {output_files}"

        finally:
            if chrome_launch_process and chrome_pid:
                cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
|
||||
|
||||
def test_config_scroll_limit_honored():
    """Test that INFINISCROLL_SCROLL_LIMIT config is respected."""
    with tempfile.TemporaryDirectory() as tmpdir:
        chrome_launch_process = None
        chrome_pid = None
        try:
            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)

            infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
            infiniscroll_dir.mkdir()

            # Cap at 2 scrolls, with a MIN_HEIGHT high enough that the
            # scroll limit (not the height target) is what stops the loop.
            env = get_test_env()
            env.update({
                'INFINISCROLL_SCROLL_LIMIT': '2',
                'INFINISCROLL_SCROLL_DELAY': '500',
                'INFINISCROLL_MIN_HEIGHT': '100000',
            })

            proc = subprocess.run(
                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'],
                cwd=str(infiniscroll_dir),
                capture_output=True,
                text=True,
                timeout=60,
                env=env,
            )

            assert proc.returncode == 0, f"Infiniscroll failed: {proc.stderr}"

            # Locate the ArchiveResult record in the JSONL stdout
            result_json = None
            for raw_line in proc.stdout.strip().split('\n'):
                candidate = raw_line.strip()
                if not candidate.startswith('{'):
                    continue
                try:
                    record = json.loads(candidate)
                except json.JSONDecodeError:
                    continue
                if record.get('type') == 'ArchiveResult':
                    result_json = record
                    break

            assert result_json is not None, "Should have JSONL output"
            output_str = result_json.get('output_str', '')

            # Verify output format and that it completed (scroll limit enforced internally)
            assert output_str.startswith('scrolled to'), f"Should have valid output_str: {output_str}"
            assert result_json['status'] == 'succeeded', f"Should succeed with scroll limit: {result_json}"

        finally:
            if chrome_launch_process and chrome_pid:
                cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
|
||||
|
||||
def test_config_timeout_honored():
    """Test that INFINISCROLL_TIMEOUT config is respected."""
    with tempfile.TemporaryDirectory() as tmpdir:
        chrome_launch_process = None
        chrome_pid = None
        try:
            chrome_launch_process, chrome_pid, snapshot_chrome_dir = setup_chrome_session(tmpdir)

            infiniscroll_dir = snapshot_chrome_dir.parent / 'infiniscroll'
            infiniscroll_dir.mkdir()

            # 3s timeout vs 2s per-scroll delay and a high scroll limit /
            # height target: the timeout is what must end the run.
            env = get_test_env()
            env.update({
                'INFINISCROLL_TIMEOUT': '3',
                'INFINISCROLL_SCROLL_DELAY': '2000',
                'INFINISCROLL_SCROLL_LIMIT': '100',
                'INFINISCROLL_MIN_HEIGHT': '100000',
            })

            start_time = time.time()
            proc = subprocess.run(
                ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'],
                cwd=str(infiniscroll_dir),
                capture_output=True,
                text=True,
                timeout=30,
                env=env,
            )
            elapsed = time.time() - start_time

            # Should complete within reasonable time (timeout + buffer)
            assert elapsed < 15, f"Should respect timeout, took {elapsed:.1f}s"
            assert proc.returncode == 0, f"Should complete even with timeout: {proc.stderr}"

        finally:
            if chrome_launch_process and chrome_pid:
                cleanup_chrome(chrome_launch_process, chrome_pid)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this test module directly: `python test_infiniscroll.py`
    pytest.main([__file__, '-v'])
|
||||
@@ -15,17 +15,26 @@
|
||||
"x-aliases": ["POSTLIGHT_PARSER_BINARY"],
|
||||
"description": "Path to Mercury/Postlight parser binary"
|
||||
},
|
||||
"NODE_BINARY": {
|
||||
"type": "string",
|
||||
"default": "node",
|
||||
"description": "Path to Node.js binary"
|
||||
},
|
||||
"MERCURY_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 30,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for Mercury in seconds"
|
||||
},
|
||||
"MERCURY_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["MERCURY_DEFAULT_ARGS"],
|
||||
"description": "Default Mercury parser arguments"
|
||||
},
|
||||
"MERCURY_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["MERCURY_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to Mercury parser command"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,8 +8,8 @@ Output: Creates mercury/ directory with content.html, content.txt, article.json
|
||||
Environment variables:
|
||||
MERCURY_BINARY: Path to postlight-parser binary
|
||||
MERCURY_TIMEOUT: Timeout in seconds (default: 60)
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if MERCURY_* not set:
|
||||
MERCURY_ARGS: Default Mercury arguments (JSON array)
|
||||
MERCURY_ARGS_EXTRA: Extra arguments to append (JSON array)
|
||||
TIMEOUT: Fallback timeout
|
||||
|
||||
Note: Requires postlight-parser: npm install -g @postlight/parser
|
||||
@@ -51,6 +51,20 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
return default
|
||||
|
||||
|
||||
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
|
||||
"""Parse a JSON array from environment variable."""
|
||||
val = get_env(name, '')
|
||||
if not val:
|
||||
return default if default is not None else []
|
||||
try:
|
||||
result = json.loads(val)
|
||||
if isinstance(result, list):
|
||||
return [str(item) for item in result]
|
||||
return default if default is not None else []
|
||||
except json.JSONDecodeError:
|
||||
return default if default is not None else []
|
||||
|
||||
|
||||
def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
"""
|
||||
Extract article using Mercury Parser.
|
||||
@@ -58,13 +72,15 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
timeout = get_env_int('MERCURY_TIMEOUT') or get_env_int('TIMEOUT', 60)
|
||||
mercury_args = get_env_array('MERCURY_ARGS', [])
|
||||
mercury_args_extra = get_env_array('MERCURY_ARGS_EXTRA', [])
|
||||
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
|
||||
try:
|
||||
# Get text version
|
||||
cmd_text = [binary, url, '--format=text']
|
||||
cmd_text = [binary, *mercury_args, *mercury_args_extra, url, '--format=text']
|
||||
result_text = subprocess.run(cmd_text, capture_output=True, timeout=timeout)
|
||||
|
||||
if result_text.returncode != 0:
|
||||
@@ -84,7 +100,7 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
(output_dir / 'content.txt').write_text(text_content, encoding='utf-8')
|
||||
|
||||
# Get HTML version
|
||||
cmd_html = [binary, url, '--format=html']
|
||||
cmd_html = [binary, *mercury_args, *mercury_args_extra, url, '--format=html']
|
||||
result_html = subprocess.run(cmd_html, capture_output=True, timeout=timeout)
|
||||
|
||||
try:
|
||||
|
||||
@@ -21,10 +21,19 @@
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for paper downloads in seconds"
|
||||
},
|
||||
"PAPERSDL_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra arguments for papers-dl (space-separated)"
|
||||
"PAPERSDL_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": ["fetch"],
|
||||
"x-aliases": ["PAPERSDL_DEFAULT_ARGS"],
|
||||
"description": "Default papers-dl arguments"
|
||||
},
|
||||
"PAPERSDL_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["PAPERSDL_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to papers-dl command"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,8 @@ Output: Downloads paper PDFs to $PWD/
|
||||
Environment variables:
|
||||
PAPERSDL_BINARY: Path to papers-dl binary
|
||||
PAPERSDL_TIMEOUT: Timeout in seconds (default: 300 for paper downloads)
|
||||
PAPERSDL_EXTRA_ARGS: Extra arguments for papers-dl (space-separated)
|
||||
PAPERSDL_ARGS: Default papers-dl arguments (JSON array, default: ["fetch"])
|
||||
PAPERSDL_ARGS_EXTRA: Extra arguments to append (JSON array)
|
||||
|
||||
# papers-dl feature toggles
|
||||
SAVE_PAPERSDL: Enable papers-dl paper extraction (default: True)
|
||||
@@ -54,6 +55,20 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
return default
|
||||
|
||||
|
||||
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
|
||||
"""Parse a JSON array from environment variable."""
|
||||
val = get_env(name, '')
|
||||
if not val:
|
||||
return default if default is not None else []
|
||||
try:
|
||||
result = json.loads(val)
|
||||
if isinstance(result, list):
|
||||
return [str(item) for item in result]
|
||||
return default if default is not None else []
|
||||
except json.JSONDecodeError:
|
||||
return default if default is not None else []
|
||||
|
||||
|
||||
def extract_doi_from_url(url: str) -> str | None:
|
||||
"""Extract DOI from common paper URLs."""
|
||||
# Match DOI pattern in URL
|
||||
@@ -72,7 +87,8 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
"""
|
||||
# Get config from env
|
||||
timeout = get_env_int('TIMEOUT', 300)
|
||||
extra_args = get_env('PAPERSDL_EXTRA_ARGS', '')
|
||||
papersdl_args = get_env_array('PAPERSDL_ARGS', [])
|
||||
papersdl_args_extra = get_env_array('PAPERSDL_ARGS_EXTRA', [])
|
||||
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
@@ -85,11 +101,11 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
else:
|
||||
identifier = doi
|
||||
|
||||
# Build command - papers-dl fetch <identifier> -o <output_dir>
|
||||
cmd = [binary, 'fetch', identifier, '-o', str(output_dir)]
|
||||
# Build command - papers-dl <args> <identifier> -o <output_dir>
|
||||
cmd = [binary, *papersdl_args, identifier, '-o', str(output_dir)]
|
||||
|
||||
if extra_args:
|
||||
cmd.extend(extra_args.split())
|
||||
if papersdl_args_extra:
|
||||
cmd.extend(papersdl_args_extra)
|
||||
|
||||
try:
|
||||
result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
|
||||
|
||||
@@ -14,17 +14,26 @@
|
||||
"default": "readability-extractor",
|
||||
"description": "Path to readability-extractor binary"
|
||||
},
|
||||
"NODE_BINARY": {
|
||||
"type": "string",
|
||||
"default": "node",
|
||||
"description": "Path to Node.js binary"
|
||||
},
|
||||
"READABILITY_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 30,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for Readability in seconds"
|
||||
},
|
||||
"READABILITY_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["READABILITY_DEFAULT_ARGS"],
|
||||
"description": "Default Readability arguments"
|
||||
},
|
||||
"READABILITY_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["READABILITY_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to Readability command"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,8 +8,8 @@ Output: Creates readability/ directory with content.html, content.txt, article.j
|
||||
Environment variables:
|
||||
READABILITY_BINARY: Path to readability-extractor binary
|
||||
READABILITY_TIMEOUT: Timeout in seconds (default: 60)
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if READABILITY_* not set:
|
||||
READABILITY_ARGS: Default Readability arguments (JSON array)
|
||||
READABILITY_ARGS_EXTRA: Extra arguments to append (JSON array)
|
||||
TIMEOUT: Fallback timeout
|
||||
|
||||
Note: Requires readability-extractor from https://github.com/ArchiveBox/readability-extractor
|
||||
@@ -44,6 +44,20 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
return default
|
||||
|
||||
|
||||
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
|
||||
"""Parse a JSON array from environment variable."""
|
||||
val = get_env(name, '')
|
||||
if not val:
|
||||
return default if default is not None else []
|
||||
try:
|
||||
result = json.loads(val)
|
||||
if isinstance(result, list):
|
||||
return [str(item) for item in result]
|
||||
return default if default is not None else []
|
||||
except json.JSONDecodeError:
|
||||
return default if default is not None else []
|
||||
|
||||
|
||||
def find_html_source() -> str | None:
|
||||
"""Find HTML content from other extractors in the snapshot directory."""
|
||||
# Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
|
||||
@@ -73,6 +87,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
timeout = get_env_int('READABILITY_TIMEOUT') or get_env_int('TIMEOUT', 60)
|
||||
readability_args = get_env_array('READABILITY_ARGS', [])
|
||||
readability_args_extra = get_env_array('READABILITY_ARGS_EXTRA', [])
|
||||
|
||||
# Find HTML source
|
||||
html_source = find_html_source()
|
||||
@@ -84,7 +100,7 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
|
||||
try:
|
||||
# Run readability-extractor (outputs JSON by default)
|
||||
cmd = [binary, html_source]
|
||||
cmd = [binary, *readability_args, *readability_args_extra, html_source]
|
||||
result = subprocess.run(cmd, capture_output=True, timeout=timeout)
|
||||
|
||||
if result.returncode != 0:
|
||||
|
||||
@@ -3,25 +3,32 @@
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SEARCH_BACKEND_RIPGREP_BINARY": {
|
||||
"RIPGREP_BINARY": {
|
||||
"type": "string",
|
||||
"default": "rg",
|
||||
"x-aliases": ["RIPGREP_BINARY"],
|
||||
"description": "Path to ripgrep binary"
|
||||
},
|
||||
"SEARCH_BACKEND_RIPGREP_IGNORE_EXTENSIONS": {
|
||||
"type": "string",
|
||||
"default": "css,js,orig,svg",
|
||||
"x-aliases": ["RIPGREP_IGNORE_EXTENSIONS"],
|
||||
"description": "Comma-separated file extensions to ignore"
|
||||
},
|
||||
"SEARCH_BACKEND_RIPGREP_TIMEOUT": {
|
||||
"RIPGREP_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 90,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"x-aliases": ["SEARCH_BACKEND_TIMEOUT"],
|
||||
"description": "Search timeout in seconds"
|
||||
},
|
||||
"RIPGREP_ARGS": {
|
||||
"type": "array",
|
||||
"items": { "type": "string" },
|
||||
"default": ["--files-with-matches", "--no-messages", "--ignore-case"],
|
||||
"x-aliases": ["RIPGREP_DEFAULT_ARGS"],
|
||||
"description": "Default ripgrep arguments"
|
||||
},
|
||||
"RIPGREP_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": { "type": "string" },
|
||||
"default": [],
|
||||
"x-aliases": ["RIPGREP_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to ripgrep command"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,10 +6,12 @@ using ripgrep (rg). This is simpler but slower for large archives.
|
||||
|
||||
Environment variables:
|
||||
RIPGREP_BINARY: Path to ripgrep binary (default: rg)
|
||||
RIPGREP_IGNORE_EXTENSIONS: Comma-separated extensions to ignore (default: css,js,orig,svg)
|
||||
SEARCH_BACKEND_TIMEOUT: Search timeout in seconds (default: 90)
|
||||
RIPGREP_ARGS: Default ripgrep arguments (JSON array)
|
||||
RIPGREP_ARGS_EXTRA: Extra arguments to append (JSON array)
|
||||
RIPGREP_TIMEOUT: Search timeout in seconds (default: 90)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import shutil
|
||||
@@ -19,39 +21,57 @@ from typing import List, Iterable
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
# Config with old var names for backwards compatibility
|
||||
RIPGREP_BINARY = os.environ.get('RIPGREP_BINARY', 'rg').strip()
|
||||
RIPGREP_IGNORE_EXTENSIONS = os.environ.get('RIPGREP_IGNORE_EXTENSIONS', 'css,js,orig,svg').strip()
|
||||
SEARCH_BACKEND_TIMEOUT = int(os.environ.get('SEARCH_BACKEND_TIMEOUT', '90'))
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
|
||||
try:
|
||||
return int(get_env(name, str(default)))
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
|
||||
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
    """Parse a JSON array from environment variable.

    Returns *default* (or [] when default is None) if the variable is
    unset, empty, not valid JSON, or not a JSON array. List items are
    coerced to str.
    """
    fallback = [] if default is None else default
    raw = get_env(name, '')
    if not raw:
        return fallback
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return fallback
    if isinstance(parsed, list):
        return [str(item) for item in parsed]
    return fallback
|
||||
|
||||
|
||||
def search(query: str) -> List[str]:
|
||||
"""Search for snapshots using ripgrep."""
|
||||
rg_binary = shutil.which(RIPGREP_BINARY) or RIPGREP_BINARY
|
||||
rg_binary = get_env('RIPGREP_BINARY', 'rg')
|
||||
rg_binary = shutil.which(rg_binary) or rg_binary
|
||||
if not rg_binary or not Path(rg_binary).exists():
|
||||
raise RuntimeError(f'ripgrep binary not found ({RIPGREP_BINARY}). Install with: apt install ripgrep')
|
||||
raise RuntimeError(f'ripgrep binary not found. Install with: apt install ripgrep')
|
||||
|
||||
timeout = get_env_int('RIPGREP_TIMEOUT', 90)
|
||||
ripgrep_args = get_env_array('RIPGREP_ARGS', [])
|
||||
ripgrep_args_extra = get_env_array('RIPGREP_ARGS_EXTRA', [])
|
||||
|
||||
archive_dir = Path(settings.ARCHIVE_DIR)
|
||||
if not archive_dir.exists():
|
||||
return []
|
||||
|
||||
# Build ignore pattern from config
|
||||
ignore_pattern = f'*.{{{RIPGREP_IGNORE_EXTENSIONS}}}'
|
||||
|
||||
cmd = [
|
||||
rg_binary,
|
||||
f'--type-add=ignore:{ignore_pattern}',
|
||||
'--type-not=ignore',
|
||||
'--files-with-matches',
|
||||
'--no-messages',
|
||||
'--ignore-case',
|
||||
*ripgrep_args,
|
||||
*ripgrep_args_extra,
|
||||
'--regexp',
|
||||
query,
|
||||
str(archive_dir),
|
||||
]
|
||||
|
||||
try:
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=SEARCH_BACKEND_TIMEOUT)
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
|
||||
|
||||
# Extract snapshot IDs from file paths
|
||||
# Paths look like: archive/<snapshot_id>/<extractor>/file.txt
|
||||
|
||||
@@ -15,11 +15,18 @@
|
||||
"x-aliases": ["SINGLE_FILE_BINARY"],
|
||||
"description": "Path to single-file binary"
|
||||
},
|
||||
"NODE_BINARY": {
|
||||
"SINGLEFILE_NODE_BINARY": {
|
||||
"type": "string",
|
||||
"default": "node",
|
||||
"x-fallback": "NODE_BINARY",
|
||||
"description": "Path to Node.js binary"
|
||||
},
|
||||
"SINGLEFILE_CHROME_BINARY": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "CHROME_BINARY",
|
||||
"description": "Path to Chrome/Chromium binary"
|
||||
},
|
||||
"SINGLEFILE_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
@@ -39,16 +46,25 @@
|
||||
"x-fallback": "COOKIES_FILE",
|
||||
"description": "Path to cookies file"
|
||||
},
|
||||
"SINGLEFILE_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"description": "Whether to verify SSL certificates"
|
||||
},
|
||||
"SINGLEFILE_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"default": ["--browser-headless"],
|
||||
"x-aliases": ["SINGLEFILE_DEFAULT_ARGS"],
|
||||
"description": "Default single-file arguments"
|
||||
},
|
||||
"SINGLEFILE_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra arguments for single-file"
|
||||
"SINGLEFILE_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["SINGLEFILE_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to single-file command"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,24 +6,16 @@ Usage: on_Snapshot__singlefile.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Writes singlefile.html to $PWD
|
||||
|
||||
Environment variables:
|
||||
SINGLEFILE_BINARY: Path to SingleFile binary
|
||||
SINGLEFILE_TIMEOUT: Timeout in seconds (default: 120)
|
||||
SINGLEFILE_USER_AGENT: User agent string (optional)
|
||||
SINGLEFILE_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
|
||||
SINGLEFILE_COOKIES_FILE: Path to cookies file (optional)
|
||||
SINGLEFILE_EXTRA_ARGS: Extra arguments for SingleFile (space-separated)
|
||||
|
||||
# Feature toggle
|
||||
SAVE_SINGLEFILE: Enable SingleFile archiving (default: True)
|
||||
|
||||
# Chrome binary (SingleFile needs Chrome)
|
||||
CHROME_BINARY: Path to Chrome/Chromium binary
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if SINGLEFILE_* not set:
|
||||
TIMEOUT: Fallback timeout
|
||||
USER_AGENT: Fallback user agent
|
||||
CHECK_SSL_VALIDITY: Fallback SSL check
|
||||
COOKIES_FILE: Fallback cookies file
|
||||
SINGLEFILE_ENABLED: Enable SingleFile archiving (default: True)
|
||||
SINGLEFILE_BINARY: Path to SingleFile binary (default: single-file)
|
||||
SINGLEFILE_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY)
|
||||
SINGLEFILE_CHROME_BINARY: Path to Chrome binary (x-fallback: CHROME_BINARY)
|
||||
SINGLEFILE_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
|
||||
SINGLEFILE_USER_AGENT: User agent string (x-fallback: USER_AGENT)
|
||||
SINGLEFILE_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
|
||||
SINGLEFILE_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
|
||||
SINGLEFILE_ARGS: Default SingleFile arguments (JSON array)
|
||||
SINGLEFILE_ARGS_EXTRA: Extra arguments to append (JSON array)
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -63,6 +55,20 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
return default
|
||||
|
||||
|
||||
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
|
||||
"""Parse a JSON array from environment variable."""
|
||||
val = get_env(name, '')
|
||||
if not val:
|
||||
return default if default is not None else []
|
||||
try:
|
||||
result = json.loads(val)
|
||||
if isinstance(result, list):
|
||||
return [str(item) for item in result]
|
||||
return default if default is not None else []
|
||||
except json.JSONDecodeError:
|
||||
return default if default is not None else []
|
||||
|
||||
|
||||
STATICFILE_DIR = '../staticfile'
|
||||
|
||||
def has_staticfile_output() -> bool:
|
||||
@@ -121,15 +127,16 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
# Get config from env (with SINGLEFILE_ prefix or fallback to ARCHIVING_CONFIG style)
|
||||
# Get config from env (with SINGLEFILE_ prefix, x-fallback handled by config loader)
|
||||
timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
|
||||
user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '')
|
||||
check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
|
||||
check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', True) if get_env('SINGLEFILE_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
|
||||
cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '')
|
||||
extra_args = get_env('SINGLEFILE_EXTRA_ARGS', '')
|
||||
chrome = get_env('CHROME_BINARY', '')
|
||||
singlefile_args = get_env_array('SINGLEFILE_ARGS', [])
|
||||
singlefile_args_extra = get_env_array('SINGLEFILE_ARGS_EXTRA', [])
|
||||
chrome = get_env('SINGLEFILE_CHROME_BINARY') or get_env('CHROME_BINARY', '')
|
||||
|
||||
cmd = [binary]
|
||||
cmd = [binary, *singlefile_args]
|
||||
|
||||
# Try to use existing Chrome session via CDP
|
||||
cdp_url = get_cdp_url()
|
||||
@@ -142,11 +149,6 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
elif chrome:
|
||||
cmd.extend(['--browser-executable-path', chrome])
|
||||
|
||||
# Common options
|
||||
cmd.extend([
|
||||
'--browser-headless',
|
||||
])
|
||||
|
||||
# SSL handling
|
||||
if not check_ssl:
|
||||
cmd.append('--browser-ignore-insecure-certs')
|
||||
@@ -157,8 +159,9 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
if cookies_file and Path(cookies_file).is_file():
|
||||
cmd.extend(['--browser-cookies-file', cookies_file])
|
||||
|
||||
if extra_args:
|
||||
cmd.extend(extra_args.split())
|
||||
# Add extra args from config
|
||||
if singlefile_args_extra:
|
||||
cmd.extend(singlefile_args_extra)
|
||||
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
|
||||
@@ -9,18 +9,12 @@
|
||||
"x-aliases": ["SAVE_WGET", "USE_WGET"],
|
||||
"description": "Enable wget archiving"
|
||||
},
|
||||
"WGET_SAVE_WARC": {
|
||||
"WGET_WARC_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_WARC"],
|
||||
"x-aliases": ["SAVE_WARC", "WGET_SAVE_WARC"],
|
||||
"description": "Save WARC archive file"
|
||||
},
|
||||
"WGET_SAVE_REQUISITES": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_WGET_REQUISITES"],
|
||||
"description": "Download page requisites (CSS, JS, images)"
|
||||
},
|
||||
"WGET_BINARY": {
|
||||
"type": "string",
|
||||
"default": "wget",
|
||||
@@ -39,25 +33,17 @@
|
||||
"x-fallback": "USER_AGENT",
|
||||
"description": "User agent string for wget"
|
||||
},
|
||||
"WGET_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"x-aliases": ["CHECK_SSL_VALIDITY"],
|
||||
"description": "Whether to verify SSL certificates"
|
||||
},
|
||||
"WGET_COOKIES_FILE": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "COOKIES_FILE",
|
||||
"description": "Path to cookies file"
|
||||
},
|
||||
"WGET_RESTRICT_FILE_NAMES": {
|
||||
"type": "string",
|
||||
"default": "windows",
|
||||
"enum": ["windows", "unix", "ascii", "nocontrol", "lowercase", "uppercase"],
|
||||
"x-fallback": "RESTRICT_FILE_NAMES",
|
||||
"description": "Filename restriction mode"
|
||||
"WGET_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"description": "Whether to verify SSL certificates"
|
||||
},
|
||||
"WGET_ARGS": {
|
||||
"type": "array",
|
||||
@@ -70,14 +56,20 @@
|
||||
"--backup-converted",
|
||||
"--span-hosts",
|
||||
"--no-parent",
|
||||
"--page-requisites",
|
||||
"--restrict-file-names=windows",
|
||||
"--tries=2",
|
||||
"-e", "robots=off"
|
||||
],
|
||||
"x-aliases": ["WGET_DEFAULT_ARGS"],
|
||||
"description": "Default wget arguments"
|
||||
},
|
||||
"WGET_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra arguments for wget (space-separated)"
|
||||
"WGET_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"x-aliases": ["WGET_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to wget command"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,25 +6,15 @@ Usage: on_Snapshot__wget.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Downloads files to $PWD
|
||||
|
||||
Environment variables:
|
||||
WGET_BINARY: Path to wget binary (optional, falls back to PATH)
|
||||
WGET_TIMEOUT: Timeout in seconds (default: 60)
|
||||
WGET_USER_AGENT: User agent string
|
||||
WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
|
||||
WGET_COOKIES_FILE: Path to cookies file (optional)
|
||||
WGET_RESTRICT_FILE_NAMES: Filename restriction mode (default: windows)
|
||||
WGET_EXTRA_ARGS: Extra arguments for wget (space-separated)
|
||||
|
||||
# Wget feature toggles
|
||||
SAVE_WGET: Enable wget archiving (default: True)
|
||||
SAVE_WARC: Save WARC file (default: True)
|
||||
SAVE_WGET_REQUISITES: Download page requisites (default: True)
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if WGET_* not set:
|
||||
TIMEOUT: Fallback timeout
|
||||
USER_AGENT: Fallback user agent
|
||||
CHECK_SSL_VALIDITY: Fallback SSL check
|
||||
COOKIES_FILE: Fallback cookies file
|
||||
RESTRICT_FILE_NAMES: Fallback filename restriction
|
||||
WGET_ENABLED: Enable wget archiving (default: True)
|
||||
WGET_WARC_ENABLED: Save WARC file (default: True)
|
||||
WGET_BINARY: Path to wget binary (default: wget)
|
||||
WGET_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
|
||||
WGET_USER_AGENT: User agent string (x-fallback: USER_AGENT)
|
||||
WGET_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
|
||||
WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (x-fallback: CHECK_SSL_VALIDITY)
|
||||
WGET_ARGS: Default wget arguments (JSON array)
|
||||
WGET_ARGS_EXTRA: Extra arguments to append (JSON array)
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -65,6 +55,20 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
return default
|
||||
|
||||
|
||||
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
|
||||
"""Parse a JSON array from environment variable."""
|
||||
val = get_env(name, '')
|
||||
if not val:
|
||||
return default if default is not None else []
|
||||
try:
|
||||
result = json.loads(val)
|
||||
if isinstance(result, list):
|
||||
return [str(item) for item in result]
|
||||
return default if default is not None else []
|
||||
except json.JSONDecodeError:
|
||||
return default if default is not None else []
|
||||
|
||||
|
||||
STATICFILE_DIR = '../staticfile'
|
||||
|
||||
def has_staticfile_output() -> bool:
|
||||
@@ -73,17 +77,6 @@ def has_staticfile_output() -> bool:
|
||||
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
|
||||
|
||||
|
||||
# Default wget args (from old WGET_CONFIG)
|
||||
WGET_DEFAULT_ARGS = [
|
||||
'--no-verbose',
|
||||
'--adjust-extension',
|
||||
'--convert-links',
|
||||
'--force-directories',
|
||||
'--backup-converted',
|
||||
'--span-hosts',
|
||||
'--no-parent',
|
||||
'-e', 'robots=off',
|
||||
]
|
||||
|
||||
|
||||
def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
@@ -92,36 +85,28 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
# Get config from env (with WGET_ prefix or fallback to ARCHIVING_CONFIG style)
|
||||
# Get config from env (with WGET_ prefix, x-fallback handled by config loader)
|
||||
timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
|
||||
user_agent = get_env('WGET_USER_AGENT') or get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
|
||||
check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
|
||||
check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', True) if get_env('WGET_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
|
||||
cookies_file = get_env('WGET_COOKIES_FILE') or get_env('COOKIES_FILE', '')
|
||||
restrict_names = get_env('WGET_RESTRICT_FILE_NAMES') or get_env('RESTRICT_FILE_NAMES', 'windows')
|
||||
extra_args = get_env('WGET_EXTRA_ARGS', '')
|
||||
wget_args = get_env_array('WGET_ARGS', [])
|
||||
wget_args_extra = get_env_array('WGET_ARGS_EXTRA', [])
|
||||
|
||||
# Feature toggles
|
||||
save_warc = get_env_bool('WGET_SAVE_WARC', True)
|
||||
save_requisites = get_env_bool('WGET_SAVE_REQUISITES', True)
|
||||
warc_enabled = get_env_bool('WGET_WARC_ENABLED', True)
|
||||
|
||||
# Build wget command (later options take precedence)
|
||||
cmd = [
|
||||
binary,
|
||||
*WGET_DEFAULT_ARGS,
|
||||
*wget_args,
|
||||
f'--timeout={timeout}',
|
||||
'--tries=2',
|
||||
]
|
||||
|
||||
if user_agent:
|
||||
cmd.append(f'--user-agent={user_agent}')
|
||||
|
||||
if restrict_names:
|
||||
cmd.append(f'--restrict-file-names={restrict_names}')
|
||||
|
||||
if save_requisites:
|
||||
cmd.append('--page-requisites')
|
||||
|
||||
if save_warc:
|
||||
if warc_enabled:
|
||||
warc_dir = Path('warc')
|
||||
warc_dir.mkdir(exist_ok=True)
|
||||
warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
|
||||
@@ -135,8 +120,8 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
if not check_ssl:
|
||||
cmd.extend(['--no-check-certificate', '--no-hsts'])
|
||||
|
||||
if extra_args:
|
||||
cmd.extend(extra_args.split())
|
||||
if wget_args_extra:
|
||||
cmd.extend(wget_args_extra)
|
||||
|
||||
cmd.append(url)
|
||||
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
{"type": "Binary", "name": "yt-dlp", "binproviders": "pip,brew,apt,env"}
|
||||
{"type": "Binary", "name": "yt-dlp", "binproviders": "pip,brew,apt,env", "overrides": {"pip": {"packages": "yt-dlp[default]"}}}
|
||||
{"type": "Binary", "name": "node", "binproviders": "apt,brew,env", "overrides": {"apt": {"packages": ["nodejs"]}}}
|
||||
{"type": "Binary", "name": "ffmpeg", "binproviders": "apt,brew,env"}
|
||||
|
||||
@@ -6,15 +6,28 @@
|
||||
"YTDLP_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["MEDIA_ENABLED", "SAVE_MEDIA", "USE_MEDIA", "USE_YTDLP", "FETCH_MEDIA", "SAVE_YTDLP"],
|
||||
"x-aliases": [
|
||||
"MEDIA_ENABLED",
|
||||
"SAVE_MEDIA",
|
||||
"USE_MEDIA",
|
||||
"USE_YTDLP",
|
||||
"FETCH_MEDIA",
|
||||
"SAVE_YTDLP"
|
||||
],
|
||||
"description": "Enable video/audio downloading with yt-dlp"
|
||||
},
|
||||
"YTDLP_BINARY": {
|
||||
"type": "string",
|
||||
"default": "yt-dlp",
|
||||
"x-aliases": ["MEDIA_BINARY", "YOUTUBEDL_BINARY", "YOUTUBE_DL_BINARY"],
|
||||
"x-aliases": ["YOUTUBEDL_BINARY", "YOUTUBE_DL_BINARY"],
|
||||
"description": "Path to yt-dlp binary"
|
||||
},
|
||||
"YTDLP_NODE_BINARY": {
|
||||
"type": "string",
|
||||
"default": "node",
|
||||
"x-fallback": "NODE_BINARY",
|
||||
"description": "Path to Node.js binary for yt-dlp JS runtime"
|
||||
},
|
||||
"YTDLP_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 3600,
|
||||
@@ -23,6 +36,12 @@
|
||||
"x-aliases": ["MEDIA_TIMEOUT"],
|
||||
"description": "Timeout for yt-dlp downloads in seconds"
|
||||
},
|
||||
"YTDLP_COOKIES_FILE": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "COOKIES_FILE",
|
||||
"description": "Path to cookies file"
|
||||
},
|
||||
"YTDLP_MAX_SIZE": {
|
||||
"type": "string",
|
||||
"default": "750m",
|
||||
@@ -34,15 +53,14 @@
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"x-aliases": ["MEDIA_CHECK_SSL_VALIDITY"],
|
||||
"description": "Whether to verify SSL certificates"
|
||||
},
|
||||
"YTDLP_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"items": { "type": "string" },
|
||||
"default": [
|
||||
"--restrict-filenames",
|
||||
"--trim-filenames", "128",
|
||||
"--trim-filenames=128",
|
||||
"--write-description",
|
||||
"--write-info-json",
|
||||
"--write-thumbnail",
|
||||
@@ -56,16 +74,19 @@
|
||||
"--geo-bypass",
|
||||
"--add-metadata",
|
||||
"--no-progress",
|
||||
"-o", "%(title)s.%(ext)s"
|
||||
"--remote-components ejs:github",
|
||||
"-o",
|
||||
"%(title)s.%(ext)s"
|
||||
],
|
||||
"x-aliases": ["MEDIA_ARGS"],
|
||||
"description": "Default yt-dlp arguments (override to customize behavior)"
|
||||
"x-aliases": ["YTDLP_DEFAULT_ARGS"],
|
||||
"description": "Default yt-dlp arguments"
|
||||
},
|
||||
"YTDLP_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-aliases": ["MEDIA_EXTRA_ARGS"],
|
||||
"description": "Extra arguments for yt-dlp (space-separated, appended after YTDLP_ARGS)"
|
||||
"YTDLP_ARGS_EXTRA": {
|
||||
"type": "array",
|
||||
"items": { "type": "string" },
|
||||
"default": [],
|
||||
"x-aliases": ["YTDLP_EXTRA_ARGS"],
|
||||
"description": "Extra arguments to append to yt-dlp command"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,24 +3,18 @@
|
||||
Download video/audio from a URL using yt-dlp.
|
||||
|
||||
Usage: on_Snapshot__ytdlp.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Downloads video/audio files to $PWD/ytdlp/
|
||||
Output: Downloads video/audio files to $PWD
|
||||
|
||||
Environment variables:
|
||||
YTDLP_BINARY: Path to yt-dlp binary
|
||||
YTDLP_TIMEOUT: Timeout in seconds (default: 3600 for large downloads)
|
||||
YTDLP_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
|
||||
YTDLP_ARGS: JSON array of yt-dlp arguments (overrides defaults)
|
||||
YTDLP_EXTRA_ARGS: Extra arguments for yt-dlp (space-separated, appended)
|
||||
YTDLP_MAX_SIZE: Maximum file size (default: 750m)
|
||||
|
||||
# Feature toggles (with backwards-compatible aliases)
|
||||
YTDLP_ENABLED: Enable yt-dlp extraction (default: True)
|
||||
SAVE_YTDLP: Alias for YTDLP_ENABLED
|
||||
MEDIA_ENABLED: Backwards-compatible alias for YTDLP_ENABLED
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if YTDLP_* not set:
|
||||
TIMEOUT: Fallback timeout
|
||||
CHECK_SSL_VALIDITY: Fallback SSL check
|
||||
YTDLP_BINARY: Path to yt-dlp binary (default: yt-dlp)
|
||||
YTDLP_NODE_BINARY: Path to Node.js binary (x-fallback: NODE_BINARY)
|
||||
YTDLP_TIMEOUT: Timeout in seconds (x-fallback: TIMEOUT)
|
||||
YTDLP_COOKIES_FILE: Path to cookies file (x-fallback: COOKIES_FILE)
|
||||
YTDLP_MAX_SIZE: Maximum file size (default: 750m)
|
||||
YTDLP_CHECK_SSL_VALIDITY: Whether to verify SSL certs (x-fallback: CHECK_SSL_VALIDITY)
|
||||
YTDLP_ARGS: Default yt-dlp arguments (JSON array)
|
||||
YTDLP_ARGS_EXTRA: Extra arguments to append (JSON array)
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -32,11 +26,6 @@ from pathlib import Path
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
PLUGIN_NAME = 'ytdlp'
|
||||
BIN_NAME = 'yt-dlp'
|
||||
BIN_PROVIDERS = 'pip,apt,brew,env'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
@@ -59,6 +48,20 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
return default
|
||||
|
||||
|
||||
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
|
||||
"""Parse a JSON array from environment variable."""
|
||||
val = get_env(name, '')
|
||||
if not val:
|
||||
return default if default is not None else []
|
||||
try:
|
||||
result = json.loads(val)
|
||||
if isinstance(result, list):
|
||||
return [str(item) for item in result]
|
||||
return default if default is not None else []
|
||||
except json.JSONDecodeError:
|
||||
return default if default is not None else []
|
||||
|
||||
|
||||
STATICFILE_DIR = '../staticfile'
|
||||
|
||||
def has_staticfile_output() -> bool:
|
||||
@@ -67,69 +70,41 @@ def has_staticfile_output() -> bool:
|
||||
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
|
||||
|
||||
|
||||
# Default yt-dlp args (can be overridden via YTDLP_ARGS env var)
|
||||
YTDLP_DEFAULT_ARGS = [
|
||||
'--restrict-filenames',
|
||||
'--trim-filenames', '128',
|
||||
'--write-description',
|
||||
'--write-info-json',
|
||||
'--write-thumbnail',
|
||||
'--write-sub',
|
||||
'--write-auto-subs',
|
||||
'--convert-subs=srt',
|
||||
'--yes-playlist',
|
||||
'--continue',
|
||||
'--no-abort-on-error',
|
||||
'--ignore-errors',
|
||||
'--geo-bypass',
|
||||
'--add-metadata',
|
||||
'--no-progress',
|
||||
'-o', '%(title)s.%(ext)s',
|
||||
]
|
||||
|
||||
|
||||
def get_ytdlp_args() -> list[str]:
|
||||
"""Get yt-dlp arguments from YTDLP_ARGS env var or use defaults."""
|
||||
ytdlp_args_str = get_env('YTDLP_ARGS', '')
|
||||
if ytdlp_args_str:
|
||||
try:
|
||||
# Try to parse as JSON array
|
||||
args = json.loads(ytdlp_args_str)
|
||||
if isinstance(args, list):
|
||||
return [str(arg) for arg in args]
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
return YTDLP_DEFAULT_ARGS
|
||||
|
||||
|
||||
def save_ytdlp(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
"""
|
||||
Download video/audio using yt-dlp.
|
||||
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
# Get config from env (YTDLP_* primary, MEDIA_* as fallback via aliases)
|
||||
timeout = get_env_int('TIMEOUT', 3600)
|
||||
check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
|
||||
extra_args = get_env('YTDLP_EXTRA_ARGS', '')
|
||||
max_size = get_env('YTDLP_MAX_SIZE', '') or get_env('MEDIA_MAX_SIZE', '750m')
|
||||
# Get config from env (with YTDLP_ prefix, x-fallback handled by config loader)
|
||||
timeout = get_env_int('YTDLP_TIMEOUT') or get_env_int('TIMEOUT', 3600)
|
||||
check_ssl = get_env_bool('YTDLP_CHECK_SSL_VALIDITY', True) if get_env('YTDLP_CHECK_SSL_VALIDITY') else get_env_bool('CHECK_SSL_VALIDITY', True)
|
||||
cookies_file = get_env('YTDLP_COOKIES_FILE') or get_env('COOKIES_FILE', '')
|
||||
max_size = get_env('YTDLP_MAX_SIZE', '750m')
|
||||
node_binary = get_env('YTDLP_NODE_BINARY') or get_env('NODE_BINARY', 'node')
|
||||
ytdlp_args = get_env_array('YTDLP_ARGS', [])
|
||||
ytdlp_args_extra = get_env_array('YTDLP_ARGS_EXTRA', [])
|
||||
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
output_dir = Path('.')
|
||||
|
||||
# Build command using configurable YTDLP_ARGS (later options take precedence)
|
||||
# Build command (later options take precedence)
|
||||
cmd = [
|
||||
binary,
|
||||
*get_ytdlp_args(),
|
||||
# Format with max_size limit (appended after YTDLP_ARGS so it can be overridden by YTDLP_EXTRA_ARGS)
|
||||
*ytdlp_args,
|
||||
# Format with max_size limit (appended after YTDLP_ARGS so it can be overridden by YTDLP_ARGS_EXTRA)
|
||||
f'--format=(bv*+ba/b)[filesize<={max_size}][filesize_approx<=?{max_size}]/(bv*+ba/b)',
|
||||
f'--js-runtimes=node:{node_binary}',
|
||||
]
|
||||
|
||||
if not check_ssl:
|
||||
cmd.append('--no-check-certificate')
|
||||
|
||||
if extra_args:
|
||||
cmd.extend(extra_args.split())
|
||||
if cookies_file and Path(cookies_file).is_file():
|
||||
cmd.extend(['--cookies', cookies_file])
|
||||
|
||||
if ytdlp_args_extra:
|
||||
cmd.extend(ytdlp_args_extra)
|
||||
|
||||
cmd.append(url)
|
||||
|
||||
@@ -193,9 +168,8 @@ def main(url: str, snapshot_id: str):
|
||||
"""Download video/audio from a URL using yt-dlp."""
|
||||
|
||||
try:
|
||||
# Check if yt-dlp downloading is enabled (YTDLP_ENABLED primary, MEDIA_ENABLED fallback)
|
||||
ytdlp_enabled = get_env_bool('YTDLP_ENABLED', True) and get_env_bool('MEDIA_ENABLED', True)
|
||||
if not ytdlp_enabled:
|
||||
# Check if yt-dlp downloading is enabled
|
||||
if not get_env_bool('YTDLP_ENABLED', True):
|
||||
print('Skipping ytdlp (YTDLP_ENABLED=False)', file=sys.stderr)
|
||||
# Temporary failure (config disabled) - NO JSONL emission
|
||||
sys.exit(0)
|
||||
|
||||
Reference in New Issue
Block a user