bump version

2026-04-06 07:47:53 +10:00 · 2026-01-19 01:11:59 -08:00
parent c7b2217cd6
commit 1cb2d5070e
8 changed files with 43 additions and 33 deletions
--- a/archivebox/plugins/custom/on_Binary__14_custom_install.py
+++ b/archivebox/plugins/custom/on_Binary__14_custom_install.py
@@ -44,12 +44,10 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c
        result = subprocess.run(
            custom_cmd,
            shell=True,
-            capture_output=True,
-            text=True,
            timeout=600,  # 10 minute timeout for custom installs
        )
        if result.returncode != 0:
-            click.echo(f"Custom install failed: {result.stderr}", err=True)
+            click.echo(f"Custom install failed (exit={result.returncode})", err=True)
            sys.exit(1)
    except subprocess.TimeoutExpired:
        click.echo("Custom install timed out", err=True)
--- a/archivebox/plugins/git/on_Snapshot__05_git.bg.py
+++ b/archivebox/plugins/git/on_Snapshot__05_git.bg.py
@@ -82,13 +82,12 @@ def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
    cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR]

    try:
-        result = subprocess.run(cmd, capture_output=True, timeout=timeout)
+        result = subprocess.run(cmd, timeout=timeout)

        if result.returncode == 0 and Path(OUTPUT_DIR).is_dir():
            return True, OUTPUT_DIR, ''
        else:
-            stderr = result.stderr.decode('utf-8', errors='replace')
-            return False, None, f'git clone failed: {stderr[:200]}'
+            return False, None, f'git clone failed (exit={result.returncode})'

    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
--- a/archivebox/plugins/mercury/on_Snapshot__57_mercury.py
+++ b/archivebox/plugins/mercury/on_Snapshot__57_mercury.py
@@ -81,11 +81,10 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
    try:
        # Get text version
        cmd_text = [binary, *mercury_args, *mercury_args_extra, url, '--format=text']
-        result_text = subprocess.run(cmd_text, capture_output=True, timeout=timeout)
+        result_text = subprocess.run(cmd_text, stdout=subprocess.PIPE, timeout=timeout, text=True)

        if result_text.returncode != 0:
-            stderr = result_text.stderr.decode('utf-8', errors='replace')
-            return False, None, f'postlight-parser failed: {stderr[:200]}'
+            return False, None, f'postlight-parser failed (exit={result_text.returncode})'

        try:
            text_json = json.loads(result_text.stdout)
@@ -101,7 +100,7 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:

        # Get HTML version
        cmd_html = [binary, *mercury_args, *mercury_args_extra, url, '--format=html']
-        result_html = subprocess.run(cmd_html, capture_output=True, timeout=timeout)
+        result_html = subprocess.run(cmd_html, stdout=subprocess.PIPE, timeout=timeout, text=True)

        try:
            html_json = json.loads(result_html.stdout)
--- a/archivebox/plugins/pip/on_Binary__11_pip_install.py
+++ b/archivebox/plugins/pip/on_Binary__11_pip_install.py
@@ -62,8 +62,6 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, override
            subprocess.run(
                [preferred_python, '-m', 'venv', str(pip_venv_path), '--upgrade-deps'],
                check=True,
-                capture_output=True,
-                text=True,
            )
        except Exception:
            # Fall back to PipProvider-managed venv creation
--- a/archivebox/plugins/readability/on_Snapshot__56_readability.py
+++ b/archivebox/plugins/readability/on_Snapshot__56_readability.py
@@ -107,11 +107,10 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
    try:
        # Run readability-extractor (outputs JSON by default)
        cmd = [binary, *readability_args, *readability_args_extra, html_source]
-        result = subprocess.run(cmd, capture_output=True, timeout=timeout)
+        result = subprocess.run(cmd, stdout=subprocess.PIPE, timeout=timeout, text=True)

        if result.returncode != 0:
-            stderr = result.stderr.decode('utf-8', errors='replace')
-            return False, None, f'readability-extractor failed: {stderr[:200]}'
+            return False, None, f'readability-extractor failed (exit={result.returncode})'

        # Parse JSON output
        try:
--- a/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py
+++ b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py
@@ -23,6 +23,7 @@ import json
 import os
 import subprocess
 import sys
+import threading
 import time
 from urllib.request import urlopen
 from pathlib import Path
@@ -200,18 +201,44 @@ def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
    cmd.extend([url, str(output_path)])

    try:
-        result = subprocess.run(cmd, capture_output=True, timeout=timeout)
+        output_lines: list[str] = []
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+            bufsize=1,
+        )
+
+        def _read_output() -> None:
+            if not process.stdout:
+                return
+            for line in process.stdout:
+                output_lines.append(line)
+                sys.stderr.write(line)
+
+        reader = threading.Thread(target=_read_output, daemon=True)
+        reader.start()
+
+        try:
+            process.wait(timeout=timeout)
+        except subprocess.TimeoutExpired:
+            process.kill()
+            reader.join(timeout=1)
+            return False, None, f'Timed out after {timeout} seconds'
+
+        reader.join(timeout=1)
+        combined_output = ''.join(output_lines)

        if output_path.exists() and output_path.stat().st_size > 0:
            return True, str(output_path), ''
        else:
-            stderr = result.stderr.decode('utf-8', errors='replace')
-            stdout = result.stdout.decode('utf-8', errors='replace')
+            stderr = combined_output
            if 'ERR_NAME_NOT_RESOLVED' in stderr:
                return False, None, 'DNS resolution failed'
            if 'ERR_CONNECTION_REFUSED' in stderr:
                return False, None, 'Connection refused'
-            detail = (stderr or stdout).strip()
+            detail = (stderr or '').strip()
            if len(detail) > 2000:
                detail = detail[:2000]
            cmd_preview = list(cmd)
--- a/archivebox/plugins/wget/on_Snapshot__06_wget.bg.py
+++ b/archivebox/plugins/wget/on_Snapshot__06_wget.bg.py
@@ -144,7 +144,6 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
    try:
        result = subprocess.run(
            cmd,
-            capture_output=True,
            timeout=timeout * 2,  # Allow extra time for large downloads
        )

@@ -155,18 +154,9 @@ def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
        ]

        if not downloaded_files:
-            stderr = result.stderr.decode('utf-8', errors='replace')
-            stdout = result.stdout.decode('utf-8', errors='replace')
-            combined = stderr + stdout
-
-            if '403' in combined or 'Forbidden' in combined:
-                return False, None, '403 Forbidden (try changing USER_AGENT)'
-            elif '404' in combined or 'Not Found' in combined:
-                return False, None, '404 Not Found'
-            elif '500' in combined:
-                return False, None, '500 Internal Server Error'
-            else:
-                return False, None, f'No files downloaded: {stderr[:200]}'
+            if result.returncode != 0:
+                return False, None, f'wget failed (exit={result.returncode})'
+            return False, None, 'No files downloaded'

        # Find main HTML file
        html_files = [
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "archivebox"
-version = "0.9.2"
+version = "0.9.3"
 requires-python = ">=3.13"
 description = "Self-hosted internet archiving solution."
 authors = [{name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"}]