continue renaming extractor to plugin, add plan for hook concurrency, add chrome kill helper script

2026-04-04 23:07:56 +10:00 · 2025-12-28 05:29:24 -08:00
parent d2e65cfd38
commit 4ccb0863bb
53 changed files with 456 additions and 493 deletions
--- a/archivebox/misc/logging_util.py
+++ b/archivebox/misc/logging_util.py
@@ -522,7 +522,7 @@ def log_worker_event(
    pid: Optional[int] = None,
    worker_id: Optional[str] = None,
    url: Optional[str] = None,
-    extractor: Optional[str] = None,
+    plugin: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    error: Optional[Exception] = None,
 ) -> None:
@@ -534,9 +534,9 @@ def log_worker_event(
        event: Event name (Starting, Completed, Failed, etc.)
        indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker, 3=ArchiveResultWorker)
        pid: Process ID
-        worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, extractor for ArchiveResultWorker)
+        worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, plugin for ArchiveResultWorker)
        url: URL being processed (for SnapshotWorker/ArchiveResultWorker)
-        extractor: Extractor name (for ArchiveResultWorker)
+        plugin: Plugin name (for ArchiveResultWorker)
        metadata: Dict of metadata to show in curly braces
        error: Exception if event is an error
    """
@@ -544,7 +544,7 @@ def log_worker_event(

    from rich.markup import escape

-    # Build worker identifier (without URL/extractor)
+    # Build worker identifier (without URL/plugin)
    worker_parts = [worker_type]
    # Don't add pid/worker_id for DB operations (they happen in whatever process is running)
    if pid and worker_type != 'DB':
@@ -556,12 +556,12 @@ def log_worker_event(
    worker_label_base = worker_parts[0]
    worker_bracket_content = ", ".join(worker_parts[1:]) if len(worker_parts) > 1 else None

-    # Build URL/extractor display (shown AFTER the label, outside brackets)
+    # Build URL/plugin display (shown AFTER the label, outside brackets)
    url_extractor_parts = []
    if url:
        url_extractor_parts.append(f'url: {escape(url)}')
-    if extractor:
-        url_extractor_parts.append(f'extractor: {escape(extractor)}')
+    if plugin:
+        url_extractor_parts.append(f'extractor: {escape(plugin)}')

    url_extractor_str = ' | '.join(url_extractor_parts) if url_extractor_parts else ''

@@ -623,7 +623,7 @@ def log_worker_event(

    text.append(f' {event}{error_str}', style=color)

-    # Add URL/extractor info first (more important)
+    # Add URL/plugin info first (more important)
    if url_extractor_str:
        text.append(f' | {url_extractor_str}')

--- a/archivebox/misc/process_utils.py
+++ b/archivebox/misc/process_utils.py
@@ -1,14 +1,9 @@
 """
-Cross-platform process validation utilities using psutil.
+Process validation using psutil and filesystem mtime.

-Uses filesystem mtime as a "password" to validate PIDs haven't been reused.
-Since filesystem mtimes can be set arbitrarily, but process start times cannot,
-we can detect PID reuse by comparing:
-  - PID file mtime (set to process start time when we launched it)
-  - Actual process start time (from psutil)
-
-If they match (within tolerance), it's our process.
-If they don't match, the PID was reused by a different process.
+Uses mtime as a "password": each PID file's mtime is set to its process's start time.
+A file's mtime can be set to any value, but a process's start time cannot be chosen,
+so a mismatch between the two means the PID has been reused by another process.
 """

 __package__ = 'archivebox.misc'
@@ -20,245 +15,70 @@ from typing import Optional

 try:
    import psutil
+    PSUTIL_AVAILABLE = True
 except ImportError:
-    psutil = None
+    PSUTIL_AVAILABLE = False


-def get_process_info(pid: int) -> Optional[dict]:
-    """
-    Get process information using psutil.
-
-    Args:
-        pid: Process ID
-
-    Returns:
-        Dict with 'start_time', 'cmdline', 'name', 'status' or None if not found
-    """
-    if psutil is None:
-        return None
+def validate_pid_file(pid_file: Path, cmd_file: Optional[Path] = None, tolerance: float = 5.0) -> bool:
+    """Validate a PID file via its mtime and optional cmd.sh. Returns True only if the process is ours; always False without psutil."""
+    if not PSUTIL_AVAILABLE or not pid_file.exists():
+        return False

    try:
+        pid = int(pid_file.read_text().strip())
        proc = psutil.Process(pid)
-        return {
-            'start_time': proc.create_time(),  # Unix epoch seconds
-            'cmdline': proc.cmdline(),
-            'name': proc.name(),
-            'status': proc.status(),
-        }
-    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
-        return None

+        # Check mtime matches process start time
+        if abs(pid_file.stat().st_mtime - proc.create_time()) > tolerance:
+            return False  # PID reused

-def validate_pid_file(
-    pid_file: Path,
-    cmd_file: Optional[Path] = None,
-    tolerance_seconds: float = 5.0
-) -> bool:
-    """
-    Validate PID file using mtime as "password".
-
-    Returns True only if ALL checks pass:
-    1. PID file exists and contains valid integer
-    2. Process with that PID exists
-    3. File mtime matches process start time (within tolerance)
-    4. If cmd_file provided, process cmdline contains expected args
-
-    Args:
-        pid_file: Path to .pid file
-        cmd_file: Optional path to cmd.sh for command validation
-        tolerance_seconds: Allowed difference between mtime and start time
-
-    Returns:
-        True if PID is validated, False if reused/invalid
-    """
-    if psutil is None:
-        # Fallback: just check if process exists (no validation)
-        return _validate_pid_file_without_psutil(pid_file)
-
-    # Check PID file exists
-    if not pid_file.exists():
-        return False
-
-    # Read PID
-    try:
-        pid = int(pid_file.read_text().strip())
-    except (ValueError, OSError):
-        return False
-
-    # Get process info
-    proc_info = get_process_info(pid)
-    if proc_info is None:
-        return False  # Process doesn't exist
-
-    # Check mtime matches process start time
-    try:
-        file_mtime = pid_file.stat().st_mtime
-    except OSError:
-        return False
-
-    proc_start_time = proc_info['start_time']
-    time_diff = abs(file_mtime - proc_start_time)
-
-    if time_diff > tolerance_seconds:
-        # PID was reused by different process
-        return False
-
-    # Validate command if provided
-    if cmd_file and cmd_file.exists():
-        try:
-            expected_cmd = cmd_file.read_text().strip()
-            actual_cmdline = ' '.join(proc_info['cmdline'])
-
-            # Check for key indicators (chrome, debug port, etc.)
-            # This is a heuristic - just checks if critical args are present
-            if '--remote-debugging-port' in expected_cmd:
-                if '--remote-debugging-port' not in actual_cmdline:
+        # Validate command if provided
+        if cmd_file and cmd_file.exists():
+            cmd = cmd_file.read_text()
+            cmdline = ' '.join(proc.cmdline())
+            if '--remote-debugging-port' in cmd and '--remote-debugging-port' not in cmdline:
+                return False
+            if ('chrome' in cmd.lower() or 'chromium' in cmd.lower()):
+                if 'chrome' not in proc.name().lower() and 'chromium' not in proc.name().lower():
                    return False

-            if 'chrome' in expected_cmd.lower() or 'chromium' in expected_cmd.lower():
-                proc_name_lower = proc_info['name'].lower()
-                if 'chrome' not in proc_name_lower and 'chromium' not in proc_name_lower:
-                    return False
-
-        except OSError:
-            pass  # Can't validate command, but other checks passed
-
-    return True
-
-
-def _validate_pid_file_without_psutil(pid_file: Path) -> bool:
-    """
-    Fallback validation when psutil not available.
-    Only checks if process exists, no validation.
-    """
-    if not pid_file.exists():
-        return False
-
-    try:
-        pid = int(pid_file.read_text().strip())
-        os.kill(pid, 0)  # Signal 0 = check existence
        return True
-    except (OSError, ValueError, ProcessLookupError):
+    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess, ValueError, OSError):
        return False


 def write_pid_file_with_mtime(pid_file: Path, pid: int, start_time: float):
-    """
-    Write PID file and set mtime to process start time.
-
-    This creates a "password" that can be validated later to ensure
-    the PID hasn't been reused by a different process.
-
-    Args:
-        pid_file: Path to .pid file to create
-        pid: Process ID to write
-        start_time: Process start time as Unix epoch seconds
-    """
+    """Write PID file and set mtime to process start time."""
    pid_file.write_text(str(pid))
-
-    # Set both atime and mtime to process start time
    try:
        os.utime(pid_file, (start_time, start_time))
    except OSError:
-        # If we can't set mtime, file is still written
-        # Validation will be less reliable but won't break
-        pass
+        pass  # best effort: if mtime can't be set, validation compares against the file's write time instead


 def write_cmd_file(cmd_file: Path, cmd: list[str]):
-    """
-    Write command script for validation.
-
-    Args:
-        cmd_file: Path to cmd.sh to create
-        cmd: Command list (e.g., ['chrome', '--remote-debugging-port=9222', ...])
-    """
-    # Shell escape arguments with spaces or special chars
-    def shell_escape(arg: str) -> str:
-        if ' ' in arg or '"' in arg or "'" in arg or '$' in arg:
-            # Escape double quotes and wrap in double quotes
-            return f'"{arg.replace(chr(34), chr(92) + chr(34))}"'
-        return arg
-
-    escaped_cmd = [shell_escape(arg) for arg in cmd]
-    script = '#!/bin/bash\n' + ' '.join(escaped_cmd) + '\n'
+    """Write shell command script."""
+    def escape(arg: str) -> str:
+        return f'"{arg.replace(chr(34), chr(92)+chr(34))}"' if any(c in arg for c in ' "$') else arg

+    script = '#!/bin/bash\n' + ' '.join(escape(arg) for arg in cmd) + '\n'
    cmd_file.write_text(script)
    try:
        cmd_file.chmod(0o755)
    except OSError:
-        pass  # Best effort
+        pass


-def safe_kill_process(
-    pid_file: Path,
-    cmd_file: Optional[Path] = None,
-    signal_num: int = 15,  # SIGTERM
-    validate: bool = True
-) -> bool:
-    """
-    Safely kill a process with validation.
-
-    Args:
-        pid_file: Path to .pid file
-        cmd_file: Optional path to cmd.sh for validation
-        signal_num: Signal to send (default SIGTERM=15)
-        validate: If True, validate process identity before killing
-
-    Returns:
-        True if process was killed, False if not found or validation failed
-    """
-    if not pid_file.exists():
+def safe_kill_process(pid_file: Path, cmd_file: Optional[Path] = None, signal_num: int = 15) -> bool:
+    """Kill process after validation. Returns True if killed."""
+    if not validate_pid_file(pid_file, cmd_file):
+        pid_file.unlink(missing_ok=True)  # Remove PID file that failed validation (missing process, reused PID, or psutil unavailable)
        return False

-    # Validate process identity first
-    if validate:
-        if not validate_pid_file(pid_file, cmd_file):
-            # PID reused by different process, don't kill
-            # Clean up stale PID file
-            try:
-                pid_file.unlink()
-            except OSError:
-                pass
-            return False
-
-    # Read PID and kill
    try:
        pid = int(pid_file.read_text().strip())
        os.kill(pid, signal_num)
        return True
    except (OSError, ValueError, ProcessLookupError):
        return False
-
-
-def cleanup_stale_pid_files(directory: Path, cmd_file_name: str = 'cmd.sh') -> int:
-    """
-    Remove stale PID files from directory.
-
-    A PID file is stale if:
-    - Process no longer exists, OR
-    - Process exists but validation fails (PID reused)
-
-    Args:
-        directory: Directory to scan for *.pid files
-        cmd_file_name: Name of command file for validation (default: cmd.sh)
-
-    Returns:
-        Number of stale PID files removed
-    """
-    if not directory.exists():
-        return 0
-
-    removed = 0
-    for pid_file in directory.glob('**/*.pid'):
-        cmd_file = pid_file.parent / cmd_file_name
-
-        # Check if valid
-        if not validate_pid_file(pid_file, cmd_file):
-            try:
-                pid_file.unlink()
-                removed += 1
-            except OSError:
-                pass
-
-    return removed
--- a/archivebox/misc/shell_welcome_message.py
+++ b/archivebox/misc/shell_welcome_message.py
@@ -53,5 +53,5 @@ if __name__ == '__main__':
    prnt('    add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink]                                                                        [grey53]# add ? after anything to get help[/]')
    prnt('    add("https://example.com/some/new/url")                                     [grey53]# call CLI methods from the shell[/]')
    prnt('    snap = Snapshot.objects.filter(url__contains="https://example.com").last()  [grey53]# query for individual snapshots[/]')
-    prnt('    snap.archiveresult_set.all()                                                [grey53]# see extractor results[/]')
+    prnt('    snap.archiveresult_set.all()                                                [grey53]# see extractor plugin results[/]')
    prnt('    bool(re.compile(CONFIG.URL_DENYLIST).search("https://example.com/abc.exe")) [grey53]# test out a config change[/]')