make plugin config more consistent

2026-04-05 23:37:58 +10:00 · 2025-12-29 13:11:34 -08:00
parent 8d76b2b0c6
commit 967c5d53e0
23 changed files with 452 additions and 339 deletions
--- a/archivebox/plugins/readability/config.json
+++ b/archivebox/plugins/readability/config.json
@@ -14,17 +14,26 @@
      "default": "readability-extractor",
      "description": "Path to readability-extractor binary"
    },
-    "NODE_BINARY": {
-      "type": "string",
-      "default": "node",
-      "description": "Path to Node.js binary"
-    },
    "READABILITY_TIMEOUT": {
      "type": "integer",
      "default": 30,
      "minimum": 5,
      "x-fallback": "TIMEOUT",
      "description": "Timeout for Readability in seconds"
+    },
+    "READABILITY_ARGS": {
+      "type": "array",
+      "items": {"type": "string"},
+      "default": [],
+      "x-aliases": ["READABILITY_DEFAULT_ARGS"],
+      "description": "Default Readability arguments"
+    },
+    "READABILITY_ARGS_EXTRA": {
+      "type": "array",
+      "items": {"type": "string"},
+      "default": [],
+      "x-aliases": ["READABILITY_EXTRA_ARGS"],
+      "description": "Extra arguments to append to Readability command"
    }
  }
 }
--- a/archivebox/plugins/readability/on_Snapshot__55_readability.py
+++ b/archivebox/plugins/readability/on_Snapshot__55_readability.py
@@ -8,8 +8,8 @@ Output: Creates readability/ directory with content.html, content.txt, article.j
 Environment variables:
    READABILITY_BINARY: Path to readability-extractor binary
    READABILITY_TIMEOUT: Timeout in seconds (default: 60)
-
-    # Fallback to ARCHIVING_CONFIG values if READABILITY_* not set:
+    READABILITY_ARGS: Default Readability arguments (JSON array)
+    READABILITY_ARGS_EXTRA: Extra arguments to append (JSON array)
    TIMEOUT: Fallback timeout

 Note: Requires readability-extractor from https://github.com/ArchiveBox/readability-extractor
@@ -44,6 +44,20 @@ def get_env_int(name: str, default: int = 0) -> int:
        return default


+def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
+    """Read env var `name` as a JSON list of strings; else `default` (or [])."""
+    fallback = [] if default is None else default
+    raw = get_env(name, '')
+    if not raw:
+        return fallback
+    try:
+        parsed = json.loads(raw)
+    except json.JSONDecodeError:
+        return fallback
+    # Valid JSON that is not a list (object, number, string, ...) is treated as unset.
+    return [str(entry) for entry in parsed] if isinstance(parsed, list) else fallback
+
+
 def find_html_source() -> str | None:
    """Find HTML content from other extractors in the snapshot directory."""
    # Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
@@ -73,6 +87,8 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
    Returns: (success, output_path, error_message)
    """
    timeout = get_env_int('READABILITY_TIMEOUT') or get_env_int('TIMEOUT', 60)
+    readability_args = get_env_array('READABILITY_ARGS', [])
+    readability_args_extra = get_env_array('READABILITY_ARGS_EXTRA', [])

    # Find HTML source
    html_source = find_html_source()
@@ -84,7 +100,7 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:

    try:
        # Run readability-extractor (outputs JSON by default)
-        cmd = [binary, html_source]
+        cmd = [binary, *readability_args, *readability_args_extra, html_source]
        result = subprocess.run(cmd, capture_output=True, timeout=timeout)

        if result.returncode != 0: