make plugin config more consistent

This commit is contained in:
Nick Sweeting
2025-12-29 13:11:34 -08:00
parent 8d76b2b0c6
commit 967c5d53e0
23 changed files with 452 additions and 339 deletions

View File

@@ -14,17 +14,26 @@
"default": "readability-extractor",
"description": "Path to readability-extractor binary"
},
"NODE_BINARY": {
"type": "string",
"default": "node",
"description": "Path to Node.js binary"
},
"READABILITY_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for Readability in seconds"
},
"READABILITY_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["READABILITY_DEFAULT_ARGS"],
"description": "Default Readability arguments"
},
"READABILITY_ARGS_EXTRA": {
"type": "array",
"items": {"type": "string"},
"default": [],
"x-aliases": ["READABILITY_EXTRA_ARGS"],
"description": "Extra arguments to append to Readability command"
}
}
}

View File

@@ -8,8 +8,8 @@ Output: Creates readability/ directory with content.html, content.txt, article.j
Environment variables:
READABILITY_BINARY: Path to readability-extractor binary
READABILITY_TIMEOUT: Timeout in seconds (default: 60)
# Fallback to ARCHIVING_CONFIG values if READABILITY_* not set:
READABILITY_ARGS: Default Readability arguments (JSON array)
READABILITY_ARGS_EXTRA: Extra arguments to append (JSON array)
TIMEOUT: Fallback timeout
Note: Requires readability-extractor from https://github.com/ArchiveBox/readability-extractor
@@ -44,6 +44,20 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
"""Parse a JSON array from environment variable."""
val = get_env(name, '')
if not val:
return default if default is not None else []
try:
result = json.loads(val)
if isinstance(result, list):
return [str(item) for item in result]
return default if default is not None else []
except json.JSONDecodeError:
return default if default is not None else []
def find_html_source() -> str | None:
"""Find HTML content from other extractors in the snapshot directory."""
# Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
@@ -73,6 +87,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
timeout = get_env_int('READABILITY_TIMEOUT') or get_env_int('TIMEOUT', 60)
readability_args = get_env_array('READABILITY_ARGS', [])
readability_args_extra = get_env_array('READABILITY_ARGS_EXTRA', [])
# Find HTML source
html_source = find_html_source()
@@ -84,7 +100,7 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
try:
# Run readability-extractor (outputs JSON by default)
cmd = [binary, html_source]
cmd = [binary, *readability_args, *readability_args_extra, html_source]
result = subprocess.run(cmd, capture_output=True, timeout=timeout)
if result.returncode != 0: