diff --git a/TODO_process_tracking.md b/TODO_process_tracking.md index be8c7c63..570c3c6e 100644 --- a/TODO_process_tracking.md +++ b/TODO_process_tracking.md @@ -28,7 +28,7 @@ Process(cmd=['archivebox', 'add', 'https://example.com']) # CLI entry **File:** `archivebox/machine/models.py` ```python -class Process(ModelWithStateMachine): +class Process(ModelWithHealthStats): # ... existing fields ... # NEW: Parent process FK for hierarchy tracking @@ -621,6 +621,18 @@ class Process(ModelWithHealthStats): return self + def is_alive(self) -> bool: + """Check if this process is still running.""" + from archivebox.misc.process_utils import validate_pid_file + + if self.status == self.StatusChoices.EXITED: + return False + + if not self.pid: + return False + + return validate_pid_file(self.pid_file, self.cmd_file) + def kill(self, signal_num: int = 15) -> bool: """ Kill this process and update status. @@ -700,7 +712,7 @@ class Process(ModelWithHealthStats): Wait for process to exit, polling periodically. Args: - timeout: Max seconds to wait (None = use self.timeout, or config.TIMEOUT * 5 if that's also None) + timeout: Max seconds to wait (None = use self.timeout) Returns: exit_code @@ -709,10 +721,8 @@ class Process(ModelWithHealthStats): TimeoutError if process doesn't exit in time """ import time - from archivebox import config - # Require a timeout - default to config.TIMEOUT * 5 (typically 300s) - timeout = timeout or self.timeout or (config.TIMEOUT * 5) + timeout = timeout or self.timeout start = time.time() while True: @@ -1692,6 +1702,230 @@ class ProcessAdmin(admin.ModelAdmin): --- +## Phase 8: Code Consolidation (Delete Redundant Logic) + +The goal is to consolidate all subprocess management into `Process` model methods, eliminating duplicate logic scattered across the codebase. 
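For reference alongside the mapping tables below, here is a minimal sketch of what the `Process.cleanup_stale_running()` replacement for `cleanup_stale_pid_files()` could look like. It only assumes pieces already described in this document (the `machine` FK, `StatusChoices`, `ended_at`, and the `is_alive()` helper added above); treat it as an illustration, not the final implementation.

```python
# Illustrative sketch only. Assumes the Process fields/methods described in this doc.
from django.utils import timezone

from archivebox.machine.models import Machine


class Process(ModelWithHealthStats):
    # ... existing fields and methods from earlier phases ...

    @classmethod
    def cleanup_stale_running(cls) -> int:
        """Mark RUNNING Process rows as EXITED when their OS process is gone
        (replaces workers/pid_utils.cleanup_stale_pid_files())."""
        stale = 0
        running = cls.objects.filter(
            machine=Machine.current(),              # machine-scoped, see Benefits below
            status=cls.StatusChoices.RUNNING,
        )
        for proc in running.iterator():
            if not proc.is_alive():                 # PID-reuse-safe liveness check added above
                proc.status = cls.StatusChoices.EXITED
                proc.ended_at = proc.ended_at or timezone.now()
                proc.save(update_fields=['status', 'ended_at'])
                stale += 1
        return stale
```

Callers such as the orchestrator could run this periodically in place of scanning the filesystem for stale `.pid` files.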
+ +### 8.1 Files to Simplify/Delete + +| File | Current Lines | After Consolidation | Savings | +|------|--------------|---------------------|---------| +| `workers/pid_utils.py` | ~192 lines | DELETE entirely | -192 | +| `misc/process_utils.py` | ~85 lines | Keep as low-level utils | 0 | +| `hooks.py` (run_hook) | ~100 lines | -50 lines (use Process.launch) | -50 | +| `hooks.py` (kill/alive) | ~50 lines | DELETE (use Process.kill/is_running) | -50 | +| `crawls/models.py` (cleanup) | ~100 lines | -70 lines (use Process.kill) | -70 | +| `supervisord_util.py` | ~50 lines process mgmt | -30 lines | -30 | +| **TOTAL** | | | **~-390 lines** | + +### 8.2 Detailed Consolidation Map + +#### `workers/pid_utils.py` → DELETE ENTIRELY + +| Current Function | Replacement | +|------------------|-------------| +| `write_pid_file(worker_type, worker_id)` | `Process.current()` auto-creates | +| `read_pid_file(path)` | `Process.objects.get_by_pid(pid)` | +| `remove_pid_file(path)` | Manual cleanup in `Process.kill()` and legacy hook cleanup code | +| `is_process_alive(pid)` | `Process.is_running` / `Process.proc is not None` | +| `get_all_pid_files()` | `Process.objects.filter(machine=Machine.current(), status=Process.StatusChoices.RUNNING)` | +| `get_all_worker_pids(type)` | `Process.objects.filter(machine=Machine.current(), process_type=type, status=Process.StatusChoices.RUNNING)` | +| `cleanup_stale_pid_files()` | `Process.cleanup_stale_running()` | +| `get_running_worker_count(type)` | `Process.objects.filter(...).count()` | +| `get_next_worker_id(type)` | Use `Max(worker_id)+1` under transaction or DB sequence to avoid race conditions | +| `stop_worker(pid, graceful)` | `Process.terminate(graceful_timeout)` or `Process.kill_tree()` | + +#### `hooks.py` Changes + +**Current `run_hook()` lines 374-398:** +```python +# DELETE these lines - replaced by Process.launch() +stdout_file = output_dir / 'stdout.log' +stderr_file = output_dir / 'stderr.log' +pid_file = output_dir / 'hook.pid' +cmd_file = output_dir / 'cmd.sh' +write_cmd_file(cmd_file, cmd) +with open(stdout_file, 'w') as out, open(stderr_file, 'w') as err: + process = subprocess.Popen(cmd, ...) 
+ write_pid_file_with_mtime(pid_file, process.pid, time.time()) +``` + +**New `run_hook()` using Process:** +```python +# Only store env delta or allowlist to avoid leaking secrets +env_delta = {k: v for k, v in env.items() if k in ALLOWED_ENV_VARS} + +hook_process = Process.objects.create( + parent=parent_process, + process_type=Process.TypeChoices.HOOK, + cmd=cmd, pwd=str(output_dir), env=env_delta, timeout=timeout, +) +hook_process.launch(background=is_background) +# stdout/stderr/pid_file all handled internally by Process.launch() +``` + +**DELETE these functions entirely:** +```python +def process_is_alive(pid_file: Path) -> bool: # lines 1238-1256 +def kill_process(pid_file: Path, sig, validate): # lines 1259-1282 +``` + +**Replace with:** +```python +# Use Process methods directly: +process.is_running # replaces process_is_alive() +process.kill() # replaces kill_process() +``` + +#### `crawls/models.py` Changes + +**Current `Crawl.cleanup()` lines 418-493:** +```python +# DELETE all this inline process logic: +def is_process_alive(pid): + try: + os.kill(pid, 0) + return True + except (OSError, ProcessLookupError): + return False + +for pid_file in self.OUTPUT_DIR.glob('**/*.pid'): + if not validate_pid_file(pid_file, cmd_file): + pid_file.unlink(missing_ok=True) + continue + pid = int(pid_file.read_text().strip()) + os.killpg(pid, signal.SIGTERM) + time.sleep(2) + if not is_process_alive(pid): + pid_file.unlink(missing_ok=True) + continue + os.killpg(pid, signal.SIGKILL) + # ... more cleanup logic +``` + +**New `Crawl.cleanup()` using Process:** +```python +def cleanup(self): + # Kill all running child processes for this crawl + for snapshot in self.snapshot_set.all(): + for ar in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED): + if ar.process_id: + # Kill hook process and all its children + ar.process.kill() + for child in ar.process.children.filter(status='running'): + child.kill() + + # Run on_CrawlEnd hooks (foreground) + # ... existing hook running logic ... +``` + +#### `supervisord_util.py` Changes + +**Current global tracking:** +```python +_supervisord_proc = None # subprocess.Popen reference + +def stop_existing_supervisord_process(): + global _supervisord_proc + if _supervisord_proc and _supervisord_proc.poll() is None: + _supervisord_proc.terminate() + _supervisord_proc.wait(timeout=5) + # ... fallback to PID file ... +``` + +**New using Process model:** +```python +_supervisord_db_process = None # Process model instance + +def start_new_supervisord_process(): + # ... existing subprocess.Popen ... + global _supervisord_db_process + _supervisord_db_process = Process.objects.create( + parent=Process.current(), + process_type=Process.TypeChoices.SUPERVISORD, + pid=proc.pid, + cmd=['supervisord', f'--configuration={CONFIG_FILE}'], + started_at=timezone.now(), + status=Process.StatusChoices.RUNNING, + ) + +def stop_existing_supervisord_process(): + global _supervisord_db_process + if _supervisord_db_process: + _supervisord_db_process.kill() # Handles children, PID validation, etc. + _supervisord_db_process = None +``` + +#### `workers/worker.py` Changes + +**Current:** +```python +from .pid_utils import write_pid_file, remove_pid_file, ... 
+ +def on_startup(self): + self.pid = os.getpid() + self.pid_file = write_pid_file(self.name, self.worker_id) + +def on_shutdown(self, error=None): + if self.pid_file: + remove_pid_file(self.pid_file) +``` + +**New:** +```python +# No import needed - Process.current() handles everything + +def on_startup(self): + self.db_process = Process.current() + # Process.current() auto-detects type, finds parent via PPID, creates record + +def on_shutdown(self, error=None): + if self.db_process: + self.db_process.exit_code = 0 if error is None else 1 + self.db_process.status = Process.StatusChoices.EXITED + self.db_process.ended_at = timezone.now() + self.db_process.save() +``` + +### 8.3 New Process Model Methods Summary + +All process operations now go through `Process`: + +```python +# Getting current process +Process.current() # Creates/retrieves Process for os.getpid() + +# Spawning new process +proc = Process.objects.create(parent=Process.current(), cmd=[...], ...) +proc.launch(background=False) # Handles Popen, PID file, stdout/stderr + +# Checking process status +proc.is_running # True if OS process exists and matches +proc.proc # psutil.Process or None (validated) +proc.poll() # Returns exit_code or None + +# Terminating process +proc.kill() # Safe kill with PID validation +proc.kill(SIGKILL) # Force kill + +# Waiting for completion +proc.wait(timeout=30) # Blocks until exit or timeout + +# Cleanup +Process.cleanup_stale_running() # Mark orphaned processes as EXITED +``` + +### 8.4 Benefits + +1. **Single Source of Truth**: All process state in database, queryable +2. **PID Reuse Protection**: `Process.proc` validates via psutil.create_time() +3. **Hierarchy Tracking**: `Process.parent` / `Process.children` for tree traversal +4. **Machine-Scoped**: All queries filter by `machine=Machine.current()` +5. **Audit Trail**: Every subprocess is logged with timestamps, exit codes +6. **No Stale PID Files**: Process records update status automatically + +--- + ## Open Questions 1. **Performance**: Deep hierarchies with many children could slow queries. Consider: diff --git a/archivebox/cli/archivebox_extract.py b/archivebox/cli/archivebox_extract.py new file mode 100644 index 00000000..7dc043ae --- /dev/null +++ b/archivebox/cli/archivebox_extract.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python3 + +""" +archivebox extract [snapshot_ids...] [--plugins=NAMES] + +Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL. + +Input formats: + - Snapshot UUIDs (one per line) + - JSONL: {"type": "Snapshot", "id": "...", "url": "..."} + - JSONL: {"type": "ArchiveResult", "snapshot_id": "...", "plugin": "..."} + +Output (JSONL): + {"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", "status": "..."} + +Examples: + # Extract specific snapshot + archivebox extract 01234567-89ab-cdef-0123-456789abcdef + + # Pipe from snapshot command + archivebox snapshot https://example.com | archivebox extract + + # Run specific plugins only + archivebox extract --plugins=screenshot,singlefile 01234567-89ab-cdef-0123-456789abcdef + + # Chain commands + archivebox crawl https://example.com | archivebox snapshot | archivebox extract +""" + +__package__ = 'archivebox.cli' +__command__ = 'archivebox extract' + +import sys +from typing import Optional, List + +import rich_click as click + + +def process_archiveresult_by_id(archiveresult_id: str) -> int: + """ + Run extraction for a single ArchiveResult by ID (used by workers). 
+ + Triggers the ArchiveResult's state machine tick() to run the extractor plugin. + """ + from rich import print as rprint + from archivebox.core.models import ArchiveResult + + try: + archiveresult = ArchiveResult.objects.get(id=archiveresult_id) + except ArchiveResult.DoesNotExist: + rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr) + return 1 + + rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr) + + try: + # Trigger state machine tick - this runs the actual extraction + archiveresult.sm.tick() + archiveresult.refresh_from_db() + + if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED: + print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]') + return 0 + elif archiveresult.status == ArchiveResult.StatusChoices.FAILED: + print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr) + return 1 + else: + # Still in progress or backoff - not a failure + print(f'[yellow]Extraction status: {archiveresult.status}[/yellow]') + return 0 + + except Exception as e: + print(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr) + return 1 + + +def run_plugins( + args: tuple, + plugins: str = '', + wait: bool = True, +) -> int: + """ + Run plugins on Snapshots from input. + + Reads Snapshot IDs or JSONL from args/stdin, runs plugins, outputs JSONL. + + Exit codes: + 0: Success + 1: Failure + """ + from rich import print as rprint + from django.utils import timezone + + from archivebox.misc.jsonl import ( + read_args_or_stdin, write_record, + TYPE_SNAPSHOT, TYPE_ARCHIVERESULT + ) + from archivebox.core.models import Snapshot, ArchiveResult + from archivebox.workers.orchestrator import Orchestrator + + is_tty = sys.stdout.isatty() + + # Parse comma-separated plugins list once (reused in creation and filtering) + plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else [] + + # Collect all input records + records = list(read_args_or_stdin(args)) + + if not records: + rprint('[yellow]No snapshots provided. 
Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr) + return 1 + + # Gather snapshot IDs to process + snapshot_ids = set() + for record in records: + record_type = record.get('type') + + if record_type == TYPE_SNAPSHOT: + snapshot_id = record.get('id') + if snapshot_id: + snapshot_ids.add(snapshot_id) + elif record.get('url'): + # Look up by URL (get most recent if multiple exist) + snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first() + if snap: + snapshot_ids.add(str(snap.id)) + else: + rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr) + + elif record_type == TYPE_ARCHIVERESULT: + snapshot_id = record.get('snapshot_id') + if snapshot_id: + snapshot_ids.add(snapshot_id) + + elif 'id' in record: + # Assume it's a snapshot ID + snapshot_ids.add(record['id']) + + if not snapshot_ids: + rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr) + return 1 + + # Get snapshots and ensure they have pending ArchiveResults + processed_count = 0 + for snapshot_id in snapshot_ids: + try: + snapshot = Snapshot.objects.get(id=snapshot_id) + except Snapshot.DoesNotExist: + rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr) + continue + + # Create pending ArchiveResults if needed + if plugins_list: + # Only create for specific plugins + for plugin_name in plugins_list: + result, created = ArchiveResult.objects.get_or_create( + snapshot=snapshot, + plugin=plugin_name, + defaults={ + 'status': ArchiveResult.StatusChoices.QUEUED, + 'retry_at': timezone.now(), + } + ) + if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]: + # Reset for retry + result.status = ArchiveResult.StatusChoices.QUEUED + result.retry_at = timezone.now() + result.save() + else: + # Create all pending plugins + snapshot.create_pending_archiveresults() + + # Reset snapshot status to allow processing + if snapshot.status == Snapshot.StatusChoices.SEALED: + snapshot.status = Snapshot.StatusChoices.STARTED + snapshot.retry_at = timezone.now() + snapshot.save() + + processed_count += 1 + + if processed_count == 0: + rprint('[red]No snapshots to process[/red]', file=sys.stderr) + return 1 + + rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr) + + # Run orchestrator if --wait (default) + if wait: + rprint('[blue]Running plugins...[/blue]', file=sys.stderr) + orchestrator = Orchestrator(exit_on_idle=True) + orchestrator.runloop() + + # Output results as JSONL (when piped) or human-readable (when TTY) + for snapshot_id in snapshot_ids: + try: + snapshot = Snapshot.objects.get(id=snapshot_id) + results = snapshot.archiveresult_set.all() + if plugins_list: + results = results.filter(plugin__in=plugins_list) + + for result in results: + if is_tty: + status_color = { + 'succeeded': 'green', + 'failed': 'red', + 'skipped': 'yellow', + }.get(result.status, 'dim') + rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ""}', file=sys.stderr) + else: + write_record(result.to_jsonl()) + except Snapshot.DoesNotExist: + continue + + return 0 + + +def is_archiveresult_id(value: str) -> bool: + """Check if value looks like an ArchiveResult UUID.""" + import re + uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I) + if not uuid_pattern.match(value): + return False + # Verify it's actually an ArchiveResult (not a Snapshot or other object) + 
from archivebox.core.models import ArchiveResult + return ArchiveResult.objects.filter(id=value).exists() + + +@click.command() +@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)') +@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)') +@click.argument('args', nargs=-1) +def main(plugins: str, wait: bool, args: tuple): + """Run plugins on Snapshots, or process existing ArchiveResults by ID""" + from archivebox.misc.jsonl import read_args_or_stdin + + # Read all input + records = list(read_args_or_stdin(args)) + + if not records: + from rich import print as rprint + rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr) + sys.exit(1) + + # Check if input looks like existing ArchiveResult IDs to process + all_are_archiveresult_ids = all( + is_archiveresult_id(r.get('id') or r.get('url', '')) + for r in records + ) + + if all_are_archiveresult_ids: + # Process existing ArchiveResults by ID + exit_code = 0 + for record in records: + archiveresult_id = record.get('id') or record.get('url') + result = process_archiveresult_by_id(archiveresult_id) + if result != 0: + exit_code = result + sys.exit(exit_code) + else: + # Default behavior: run plugins on Snapshots from input + sys.exit(run_plugins(args, plugins=plugins, wait=wait)) + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/archivebox_orchestrator.py b/archivebox/cli/archivebox_orchestrator.py new file mode 100644 index 00000000..4b272727 --- /dev/null +++ b/archivebox/cli/archivebox_orchestrator.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +""" +archivebox orchestrator [--daemon] + +Start the orchestrator process that manages workers. + +The orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult) +and lazily spawns worker processes when there is work to be done. +""" + +__package__ = 'archivebox.cli' +__command__ = 'archivebox orchestrator' + +import sys + +import rich_click as click + +from archivebox.misc.util import docstring + + +def orchestrator(daemon: bool = False, watch: bool = False) -> int: + """ + Start the orchestrator process. + + The orchestrator: + 1. Polls each model queue (Crawl, Snapshot, ArchiveResult) + 2. Spawns worker processes when there is work to do + 3. Monitors worker health and restarts failed workers + 4. 
Exits when all queues are empty (unless --daemon) + + Args: + daemon: Run forever (don't exit when idle) + watch: Just watch the queues without spawning workers (for debugging) + + Exit codes: + 0: All work completed successfully + 1: Error occurred + """ + from archivebox.workers.orchestrator import Orchestrator + + if Orchestrator.is_running(): + print('[yellow]Orchestrator is already running[/yellow]') + return 0 + + try: + orchestrator_instance = Orchestrator(exit_on_idle=not daemon) + orchestrator_instance.runloop() + return 0 + except KeyboardInterrupt: + return 0 + except Exception as e: + print(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr) + return 1 + + +@click.command() +@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)") +@click.option('--watch', '-w', is_flag=True, help="Watch queues without spawning workers") +@docstring(orchestrator.__doc__) +def main(daemon: bool, watch: bool): + """Start the ArchiveBox orchestrator process""" + sys.exit(orchestrator(daemon=daemon, watch=watch)) + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py new file mode 100644 index 00000000..374b60d3 --- /dev/null +++ b/archivebox/cli/archivebox_remove.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 + +__package__ = 'archivebox.cli' +__command__ = 'archivebox remove' + +import shutil +from pathlib import Path +from typing import Iterable + +import rich_click as click + +from django.db.models import QuerySet + +from archivebox.config import DATA_DIR +from archivebox.config.django import setup_django +from archivebox.misc.util import enforce_types, docstring +from archivebox.misc.checks import check_data_folder +from archivebox.misc.logging_util import ( + log_list_started, + log_list_finished, + log_removal_started, + log_removal_finished, + TimedProgress, +) + + +@enforce_types +def remove(filter_patterns: Iterable[str]=(), + filter_type: str='exact', + snapshots: QuerySet | None=None, + after: float | None=None, + before: float | None=None, + yes: bool=False, + delete: bool=False, + out_dir: Path=DATA_DIR) -> QuerySet: + """Remove the specified URLs from the archive""" + + setup_django() + check_data_folder() + + from archivebox.cli.archivebox_search import get_snapshots + + log_list_started(filter_patterns, filter_type) + timer = TimedProgress(360, prefix=' ') + try: + snapshots = get_snapshots( + snapshots=snapshots, + filter_patterns=list(filter_patterns) if filter_patterns else None, + filter_type=filter_type, + after=after, + before=before, + ) + finally: + timer.end() + + if not snapshots.exists(): + log_removal_finished(0, 0) + raise SystemExit(1) + + log_list_finished(snapshots) + log_removal_started(snapshots, yes=yes, delete=delete) + + timer = TimedProgress(360, prefix=' ') + try: + for snapshot in snapshots: + if delete: + shutil.rmtree(snapshot.output_dir, ignore_errors=True) + finally: + timer.end() + + to_remove = snapshots.count() + + from archivebox.search import flush_search_index + from archivebox.core.models import Snapshot + + flush_search_index(snapshots=snapshots) + snapshots.delete() + all_snapshots = Snapshot.objects.all() + log_removal_finished(all_snapshots.count(), to_remove) + + return all_snapshots + + +@click.command() +@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm') +@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing 
from index') +@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp') +@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp') +@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs') +@click.argument('filter_patterns', nargs=-1) +@docstring(remove.__doc__) +def main(**kwargs): + """Remove the specified URLs from the archive""" + remove(**kwargs) + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/archivebox_search.py b/archivebox/cli/archivebox_search.py new file mode 100644 index 00000000..055e952d --- /dev/null +++ b/archivebox/cli/archivebox_search.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 + +__package__ = 'archivebox.cli' +__command__ = 'archivebox search' + +from pathlib import Path +from typing import Optional, List, Any + +import rich_click as click +from rich import print + +from django.db.models import QuerySet + +from archivebox.config import DATA_DIR +from archivebox.misc.logging import stderr +from archivebox.misc.util import enforce_types, docstring + +# Filter types for URL matching +LINK_FILTERS = { + 'exact': lambda pattern: {'url': pattern}, + 'substring': lambda pattern: {'url__icontains': pattern}, + 'regex': lambda pattern: {'url__iregex': pattern}, + 'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'}, + 'tag': lambda pattern: {'tags__name': pattern}, + 'timestamp': lambda pattern: {'timestamp': pattern}, +} + +STATUS_CHOICES = ['indexed', 'archived', 'unarchived'] + + + +def get_snapshots(snapshots: Optional[QuerySet]=None, + filter_patterns: Optional[List[str]]=None, + filter_type: str='substring', + after: Optional[float]=None, + before: Optional[float]=None, + out_dir: Path=DATA_DIR) -> QuerySet: + """Filter and return Snapshots matching the given criteria.""" + from archivebox.core.models import Snapshot + + if snapshots: + result = snapshots + else: + result = Snapshot.objects.all() + + if after is not None: + result = result.filter(timestamp__gte=after) + if before is not None: + result = result.filter(timestamp__lt=before) + if filter_patterns: + result = Snapshot.objects.filter_by_patterns(filter_patterns, filter_type) + + if not result: + stderr('[!] 
No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow') + + return result + + +@enforce_types +def search(filter_patterns: list[str] | None=None, + filter_type: str='substring', + status: str='indexed', + before: float | None=None, + after: float | None=None, + sort: str | None=None, + json: bool=False, + html: bool=False, + csv: str | None=None, + with_headers: bool=False): + """List, filter, and export information about archive entries""" + from archivebox.core.models import Snapshot + + if with_headers and not (json or html or csv): + stderr('[X] --with-headers requires --json, --html or --csv\n', color='red') + raise SystemExit(2) + + # Query DB directly - no filesystem scanning + snapshots = get_snapshots( + filter_patterns=list(filter_patterns) if filter_patterns else None, + filter_type=filter_type, + before=before, + after=after, + ) + + # Apply status filter + if status == 'archived': + snapshots = snapshots.filter(downloaded_at__isnull=False) + elif status == 'unarchived': + snapshots = snapshots.filter(downloaded_at__isnull=True) + # 'indexed' = all snapshots (no filter) + + if sort: + snapshots = snapshots.order_by(sort) + + # Export to requested format + if json: + output = snapshots.to_json(with_headers=with_headers) + elif html: + output = snapshots.to_html(with_headers=with_headers) + elif csv: + output = snapshots.to_csv(cols=csv.split(','), header=with_headers) + else: + from archivebox.misc.logging_util import printable_folders + # Convert to dict for printable_folders + folders = {s.output_dir: s for s in snapshots} + output = printable_folders(folders, with_headers) + + print(output) + return output + + +@click.command() +@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs') +@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status') +@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp') +@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp') +@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. 
url, created_at, bookmarked_at, downloaded_at') +@click.option('--json', '-J', is_flag=True, help='Print output in JSON format') +@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)') +@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title') +@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output') +@click.help_option('--help', '-h') +@click.argument('filter_patterns', nargs=-1) +@docstring(search.__doc__) +def main(**kwargs): + return search(**kwargs) + + + +if __name__ == '__main__': + main() diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 11b1ab20..f7b45ba9 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1,6 +1,6 @@ __package__ = 'archivebox.core' -from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING, Iterator, Set +from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING from archivebox.uuid_compat import uuid7 from datetime import datetime, timedelta from django_stubs_ext.db.models import TypedModelMeta @@ -41,8 +41,6 @@ from archivebox.machine.models import NetworkInterface, Binary class Tag(ModelWithSerializers): - JSONL_TYPE = 'Tag' - # Keep AutoField for compatibility with main branch migrations # Don't use UUIDField here - requires complex FK transformation id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') @@ -93,66 +91,26 @@ class Tag(ModelWithSerializers): def api_url(self) -> str: return reverse_lazy('api-1:get_tag', args=[self.id]) - def to_json(self) -> dict: + def to_jsonl(self) -> dict: """ - Convert Tag model instance to a JSON-serializable dict. + Convert Tag model instance to a JSONL record. """ from archivebox.config import VERSION return { - 'type': self.JSONL_TYPE, + 'type': 'Tag', 'schema_version': VERSION, 'id': str(self.id), 'name': self.name, 'slug': self.slug, } - def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]: - """ - Yield this Tag as a JSON record. - - Args: - seen: Set of (type, id) tuples already emitted (for deduplication) - **kwargs: Passed to children (none for Tag, leaf node) - - Yields: - dict: JSON-serializable record for this tag - """ - if seen is not None: - key = (self.JSONL_TYPE, str(self.id)) - if key in seen: - return - seen.add(key) - yield self.to_json() - - @classmethod - def from_jsonl(cls, records, overrides: Dict[str, Any] = None) -> list['Tag']: - """ - Create/update Tags from an iterable of JSONL records. - Filters to only records with type='Tag'. - - Args: - records: Iterable of dicts (JSONL records) - overrides: Optional dict with 'snapshot' to auto-attach tags - - Returns: - List of Tag instances (skips None results) - """ - results = [] - for record in records: - record_type = record.get('type', cls.JSONL_TYPE) - if record_type == cls.JSONL_TYPE: - instance = cls.from_json(record, overrides=overrides) - if instance: - results.append(instance) - return results - @staticmethod - def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None) -> 'Tag | None': + def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None): """ - Create/update a single Tag from a JSON record dict. + Create/update Tag from JSONL record. 
Args: - record: Dict with 'name' field + record: JSONL record with 'name' field overrides: Optional dict with 'snapshot' to auto-attach tag Returns: @@ -331,8 +289,6 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)): class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine): - JSONL_TYPE = 'Snapshot' - id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) @@ -469,7 +425,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea def _fs_next_version(self, version: str) -> str: """Get next version in migration chain (0.7/0.8 had same layout, only 0.8→0.9 migration needed)""" - # Treat 0.7.0 and 0.8.0 as equivalent (both used data/archive/{timestamp}) + # Treat 0.7.0 and 0.8.0 as equivalent (both used archive/{timestamp}) if version in ('0.7.0', '0.8.0'): return '0.9.0' return self._fs_current_version() @@ -478,8 +434,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea """ Migrate from flat to nested structure. - 0.8.x: data/archive/{timestamp}/{extractor}/ - 0.9.x: data/users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/{plugin}/ + 0.8.x: archive/{timestamp}/ + 0.9.x: users/{user}/snapshots/YYYYMMDD/{domain}/{uuid}/ Transaction handling: 1. Copy files INSIDE transaction @@ -597,8 +553,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea Calculate storage path for specific filesystem version. Centralizes path logic so it's reusable. - 0.7.x/0.8.x: data/archive/{timestamp} - 0.9.x: data/users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/ + 0.7.x/0.8.x: archive/{timestamp} + 0.9.x: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/ """ from datetime import datetime @@ -1012,18 +968,38 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea Each line is a JSON record with a 'type' field: - Snapshot: snapshot metadata (crawl_id, url, tags, etc.) + - ArchiveResult: extractor results (plugin, status, output, etc.) - Binary: binary info used for the extraction - Process: process execution details (cmd, exit_code, timing, etc.) - - ArchiveResult: extractor results (plugin, status, output, etc.) 
""" import json index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME index_path.parent.mkdir(parents=True, exist_ok=True) + # Track unique binaries and processes to avoid duplicates + binaries_seen = set() + processes_seen = set() + with open(index_path, 'w') as f: - for record in self.to_jsonl(): - f.write(json.dumps(record) + '\n') + # Write Snapshot record first (to_jsonl includes crawl_id, fs_version) + f.write(json.dumps(self.to_jsonl()) + '\n') + + # Write ArchiveResult records with their associated Binary and Process + # Use select_related to optimize queries + for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'): + # Write Binary record if not already written + if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen: + binaries_seen.add(ar.process.binary_id) + f.write(json.dumps(ar.process.binary.to_jsonl()) + '\n') + + # Write Process record if not already written + if ar.process and ar.process_id not in processes_seen: + processes_seen.add(ar.process_id) + f.write(json.dumps(ar.process.to_jsonl()) + '\n') + + # Write ArchiveResult record + f.write(json.dumps(ar.to_jsonl()) + '\n') def read_index_jsonl(self) -> dict: """ @@ -1407,22 +1383,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea Clean up background ArchiveResult hooks. Called by the state machine when entering the 'sealed' state. - Gracefully terminates background hooks using plugin-specific timeouts: - 1. Send SIGTERM to all background hook processes - 2. Wait up to each hook's plugin-specific timeout - 3. Send SIGKILL to any hooks still running after timeout + Kills any background hooks and finalizes their ArchiveResults. """ - from archivebox.hooks import graceful_terminate_background_hooks - from archivebox.config.configset import get_config + from archivebox.misc.process_utils import safe_kill_process + # Kill any background ArchiveResult hooks if not self.OUTPUT_DIR.exists(): return - # Get merged config for plugin-specific timeout lookup - config = get_config(crawl=self.crawl, snapshot=self) - - # Gracefully terminate all background hooks with plugin-specific timeouts - graceful_terminate_background_hooks(self.OUTPUT_DIR, config) + # Find all .pid files in this snapshot's output directory + for pid_file in self.OUTPUT_DIR.glob('**/*.pid'): + cmd_file = pid_file.parent / 'cmd.sh' + safe_kill_process(pid_file, cmd_file) # Update all STARTED ArchiveResults from filesystem results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED) @@ -1435,32 +1407,35 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea Used by state machine to determine if snapshot is finished. """ - from archivebox.hooks import process_is_alive + from archivebox.misc.process_utils import validate_pid_file if not self.OUTPUT_DIR.exists(): return False - # Check all .pid files in the snapshot directory (hook-specific names) - for pid_file in self.OUTPUT_DIR.glob('**/*.pid'): - if process_is_alive(pid_file): + for plugin_dir in self.OUTPUT_DIR.iterdir(): + if not plugin_dir.is_dir(): + continue + pid_file = plugin_dir / 'hook.pid' + cmd_file = plugin_dir / 'cmd.sh' + if validate_pid_file(pid_file, cmd_file): return True return False - def to_json(self) -> dict: + def to_jsonl(self) -> dict: """ - Convert Snapshot model instance to a JSON-serializable dict. + Convert Snapshot model instance to a JSONL record. Includes all fields needed to fully reconstruct/identify this snapshot. 
""" from archivebox.config import VERSION return { - 'type': self.JSONL_TYPE, + 'type': 'Snapshot', 'schema_version': VERSION, 'id': str(self.id), 'crawl_id': str(self.crawl_id), 'url': self.url, 'title': self.title, - 'tags_str': self.tags_str(), + 'tags': self.tags_str(), 'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None, 'created_at': self.created_at.isoformat() if self.created_at else None, 'timestamp': self.timestamp, @@ -1469,68 +1444,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea 'fs_version': self.fs_version, } - def to_jsonl(self, seen: Set[tuple] = None, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]: - """ - Yield this Snapshot and optionally related objects as JSON records. - - Uses select_related for efficient querying. Deduplicates automatically. - - Args: - seen: Set of (type, id) tuples already emitted (for deduplication) - archiveresult: Include related ArchiveResults (default: True) - process: Include Process for each ArchiveResult (default: True) - binary: Include Binary for each Process (default: True) - machine: Include Machine for each Process (default: False) - iface: Include NetworkInterface for each Process (default: False) - **kwargs: Additional options passed to children - - Yields: - dict: JSON-serializable records - """ - if seen is None: - seen = set() - - key = (self.JSONL_TYPE, str(self.id)) - if key in seen: - return - seen.add(key) - - yield self.to_json() - - if archiveresult: - # Use select_related to optimize queries - for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'): - yield from ar.to_jsonl(seen=seen, process=process, binary=binary, machine=machine, iface=iface, **kwargs) - - @classmethod - def from_jsonl(cls, records, overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> list['Snapshot']: - """ - Create/update Snapshots from an iterable of JSONL records. - Filters to only records with type='Snapshot' (or no type). - - Args: - records: Iterable of dicts (JSONL records) - overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id' - queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True) - - Returns: - List of Snapshot instances (skips None results) - """ - results = [] - for record in records: - record_type = record.get('type', cls.JSONL_TYPE) - if record_type == cls.JSONL_TYPE: - instance = cls.from_json(record, overrides=overrides, queue_for_extraction=queue_for_extraction) - if instance: - results.append(instance) - return results - @staticmethod - def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> 'Snapshot | None': + def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True): """ - Create/update a single Snapshot from a JSON record dict. + Create/update Snapshot from JSONL record or dict. 
- Handles: + Unified method that handles: - ID-based patching: {"id": "...", "title": "new title"} - URL-based create/update: {"url": "...", "title": "...", "tags": "..."} - Auto-creates Crawl if not provided @@ -2137,8 +2056,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea result['canonical'] = self.canonical_outputs() return result - def to_json_str(self, indent: int = 4) -> str: - """Convert to JSON string for file output.""" + def to_json(self, indent: int = 4) -> str: + """Convert to JSON string""" return to_json(self.to_dict(extended=True), indent=indent) def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str: @@ -2286,8 +2205,6 @@ class SnapshotMachine(BaseStateMachine, strict_states=True): class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine): - JSONL_TYPE = 'ArchiveResult' - class StatusChoices(models.TextChoices): QUEUED = 'queued', 'Queued' STARTED = 'started', 'Started' @@ -2321,7 +2238,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi process = models.OneToOneField( 'machine.Process', on_delete=models.PROTECT, - null=False, + null=False, # Required after migration 4 related_name='archiveresult', help_text='Process execution details for this archive result' ) @@ -2359,13 +2276,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi """Convenience property to access the user who created this archive result via its snapshot's crawl.""" return self.snapshot.crawl.created_by - def to_json(self) -> dict: + def to_jsonl(self) -> dict: """ - Convert ArchiveResult model instance to a JSON-serializable dict. + Convert ArchiveResult model instance to a JSONL record. """ from archivebox.config import VERSION record = { - 'type': self.JSONL_TYPE, + 'type': 'ArchiveResult', 'schema_version': VERSION, 'id': str(self.id), 'snapshot_id': str(self.snapshot_id), @@ -2393,121 +2310,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi record['process_id'] = str(self.process_id) return record - def to_jsonl(self, seen: Set[tuple] = None, process: bool = True, **kwargs) -> Iterator[dict]: - """ - Yield this ArchiveResult and optionally related objects as JSON records. - - Args: - seen: Set of (type, id) tuples already emitted (for deduplication) - process: Include related Process and its children (default: True) - **kwargs: Passed to Process.to_jsonl() (e.g., binary=True, machine=False) - - Yields: - dict: JSON-serializable records - """ - if seen is None: - seen = set() - - key = (self.JSONL_TYPE, str(self.id)) - if key in seen: - return - seen.add(key) - - yield self.to_json() - - if process and self.process: - yield from self.process.to_jsonl(seen=seen, **kwargs) - - @classmethod - def from_jsonl(cls, records, overrides: Dict[str, Any] = None) -> list['ArchiveResult']: - """ - Create/update ArchiveResults from an iterable of JSONL records. - Filters to only records with type='ArchiveResult'. 
- - Args: - records: Iterable of dicts (JSONL records) - overrides: Dict of field overrides - - Returns: - List of ArchiveResult instances (skips None results) - """ - results = [] - for record in records: - record_type = record.get('type', cls.JSONL_TYPE) - if record_type == cls.JSONL_TYPE: - instance = cls.from_json(record, overrides=overrides) - if instance: - results.append(instance) - return results - - @staticmethod - def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None) -> 'ArchiveResult | None': - """ - Create or update a single ArchiveResult from a JSON record dict. - - Args: - record: Dict with 'snapshot_id' and 'plugin' (required for create), - or 'id' (for update) - overrides: Dict of field overrides (e.g., config overrides) - - Returns: - ArchiveResult instance or None if invalid - """ - from django.utils import timezone - - overrides = overrides or {} - - # If 'id' is provided, lookup and update existing - result_id = record.get('id') - if result_id: - try: - result = ArchiveResult.objects.get(id=result_id) - # Update fields from record - if record.get('status'): - result.status = record['status'] - result.retry_at = timezone.now() - result.save() - return result - except ArchiveResult.DoesNotExist: - pass # Fall through to create - - # Required fields for creation - snapshot_id = record.get('snapshot_id') - plugin = record.get('plugin') - - if not snapshot_id or not plugin: - return None - - try: - snapshot = Snapshot.objects.get(id=snapshot_id) - except Snapshot.DoesNotExist: - return None - - # Check if result already exists for this snapshot+plugin - existing = ArchiveResult.objects.filter( - snapshot=snapshot, - plugin=plugin, - ).first() - - if existing: - # Update existing result if status provided - if record.get('status'): - existing.status = record['status'] - existing.retry_at = timezone.now() - existing.save() - return existing - - # Create new ArchiveResult - result = ArchiveResult( - snapshot=snapshot, - plugin=plugin, - status=record.get('status', ArchiveResult.StatusChoices.QUEUED), - retry_at=timezone.now(), - hook_name=record.get('hook_name', ''), - ) - result.save() - return result - def save(self, *args, **kwargs): is_new = self._state.adding @@ -2795,26 +2597,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi self.save() return - # Derive hook basename for hook-specific filenames - # e.g., "on_Snapshot__50_wget.py" -> "on_Snapshot__50_wget" - hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook' - - # Read and parse JSONL output from hook-specific stdout log - stdout_file = plugin_dir / f'{hook_basename}.stdout.log' - stderr_file = plugin_dir / f'{hook_basename}.stderr.log' - returncode_file = plugin_dir / f'{hook_basename}.returncode' - + # Read and parse JSONL output from stdout.log + stdout_file = plugin_dir / 'stdout.log' stdout = stdout_file.read_text() if stdout_file.exists() else '' - stderr = stderr_file.read_text() if stderr_file.exists() else '' - - # Read returncode from file (written by graceful_terminate_background_hooks) - returncode = None - if returncode_file.exists(): - try: - rc_text = returncode_file.read_text().strip() - returncode = int(rc_text) if rc_text else None - except (ValueError, OSError): - pass records = [] for line in stdout.splitlines(): @@ -2849,43 +2634,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi self._set_binary_from_cmd(hook_data['cmd']) # Note: cmd_version is derived from binary.version, not stored on 
Process else: - # No ArchiveResult JSONL record - determine status from returncode - if returncode is not None: - if returncode == 0: - self.status = self.StatusChoices.SUCCEEDED - self.output_str = 'Hook completed successfully (no JSONL output)' - elif returncode < 0: - # Negative = killed by signal (e.g., -9 for SIGKILL, -15 for SIGTERM) - sig_num = abs(returncode) - sig_name = {9: 'SIGKILL', 15: 'SIGTERM'}.get(sig_num, f'signal {sig_num}') - self.status = self.StatusChoices.FAILED - self.output_str = f'Hook killed by {sig_name}' - if stderr: - self.output_str += f'\n\nstderr:\n{stderr[:2000]}' - else: - self.status = self.StatusChoices.FAILED - self.output_str = f'Hook failed with exit code {returncode}' - if stderr: - self.output_str += f'\n\nstderr:\n{stderr[:2000]}' - else: - # No returncode file and no JSONL = failed - self.status = self.StatusChoices.FAILED - self.output_str = 'Hook did not output ArchiveResult record' - if stderr: - self.output_str += f'\n\nstderr:\n{stderr[:2000]}' + # No ArchiveResult record = failed + self.status = self.StatusChoices.FAILED + self.output_str = 'Hook did not output ArchiveResult record' # Walk filesystem and populate output_files, output_size, output_mimetypes - # Exclude hook output files (hook-specific names like on_Snapshot__50_wget.stdout.log) - def is_hook_output_file(name: str) -> bool: - """Check if a file is a hook output file that should be excluded.""" - return ( - name.endswith('.stdout.log') or - name.endswith('.stderr.log') or - name.endswith('.pid') or - name.endswith('.returncode') or - (name.endswith('.sh') and name.startswith('on_')) - ) - + exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'} mime_sizes = defaultdict(int) total_size = 0 output_files = {} @@ -2893,7 +2647,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi for file_path in plugin_dir.rglob('*'): if not file_path.is_file(): continue - if is_hook_output_file(file_path.name): + if file_path.name in exclude_names: continue try: @@ -2951,10 +2705,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi } process_hook_records(filtered_records, overrides=overrides) - # Cleanup PID files, returncode files, and empty logs (hook-specific names) - pid_file = plugin_dir / f'{hook_basename}.pid' + # Cleanup PID files and empty logs + pid_file = plugin_dir / 'hook.pid' pid_file.unlink(missing_ok=True) - returncode_file.unlink(missing_ok=True) + stderr_file = plugin_dir / 'stderr.log' if stdout_file.exists() and stdout_file.stat().st_size == 0: stdout_file.unlink() if stderr_file.exists() and stderr_file.stat().st_size == 0: @@ -3060,9 +2814,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi plugin_dir = Path(self.pwd) if self.pwd else None if not plugin_dir: return False - # Use hook-specific pid filename - hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook' - pid_file = plugin_dir / f'{hook_basename}.pid' + pid_file = plugin_dir / 'hook.pid' return pid_file.exists() diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index 07971109..c3b588c4 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -1,6 +1,6 @@ __package__ = 'archivebox.crawls' -from typing import TYPE_CHECKING, Iterable, Iterator, Set +from typing import TYPE_CHECKING, Iterable from datetime import timedelta from archivebox.uuid_compat import uuid7 from pathlib import Path @@ -59,8 +59,6 @@ class CrawlSchedule(ModelWithSerializers, 
ModelWithNotes, ModelWithHealthStats): class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine): - JSONL_TYPE = 'Crawl' - id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False) @@ -136,13 +134,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith def api_url(self) -> str: return reverse_lazy('api-1:get_crawl', args=[self.id]) - def to_json(self) -> dict: + def to_jsonl(self) -> dict: """ - Convert Crawl model instance to a JSON-serializable dict. + Convert Crawl model instance to a JSONL record. """ from archivebox.config import VERSION return { - 'type': self.JSONL_TYPE, + 'type': 'Crawl', 'schema_version': VERSION, 'id': str(self.id), 'urls': self.urls, @@ -153,63 +151,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith 'created_at': self.created_at.isoformat() if self.created_at else None, } - def to_jsonl(self, seen: Set[tuple] = None, snapshot: bool = True, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]: - """ - Yield this Crawl and optionally related objects as JSON records. - - Args: - seen: Set of (type, id) tuples already emitted (for deduplication) - snapshot: Include related Snapshots (default: True) - archiveresult: Include ArchiveResults for each Snapshot (default: True) - process: Include Process for each ArchiveResult (default: True) - binary: Include Binary for each Process (default: True) - machine: Include Machine for each Process (default: False) - iface: Include NetworkInterface for each Process (default: False) - **kwargs: Additional options passed to children - - Yields: - dict: JSON-serializable records - """ - if seen is None: - seen = set() - - key = (self.JSONL_TYPE, str(self.id)) - if key in seen: - return - seen.add(key) - - yield self.to_json() - - if snapshot: - for snap in self.snapshot_set.all(): - yield from snap.to_jsonl(seen=seen, archiveresult=archiveresult, process=process, binary=binary, machine=machine, iface=iface, **kwargs) - - @classmethod - def from_jsonl(cls, records, overrides: dict = None) -> list['Crawl']: - """ - Create/update Crawls from an iterable of JSONL records. - Filters to only records with type='Crawl' (or no type). - - Args: - records: Iterable of dicts (JSONL records) - overrides: Dict of field overrides (e.g., created_by_id) - - Returns: - List of Crawl instances (skips None results) - """ - results = [] - for record in records: - record_type = record.get('type', cls.JSONL_TYPE) - if record_type == cls.JSONL_TYPE: - instance = cls.from_json(record, overrides=overrides) - if instance: - results.append(instance) - return results - @staticmethod - def from_json(record: dict, overrides: dict = None) -> 'Crawl | None': + def from_jsonl(record: dict, overrides: dict = None): """ - Create or get a single Crawl from a JSON record dict. + Create or get a Crawl from a JSONL record. Args: record: Dict with 'urls' (required), optional 'max_depth', 'tags_str', 'label' @@ -250,45 +195,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith ) return crawl - @staticmethod - def extract_domain_from_url(url: str) -> str: - """ - Extract domain from URL for path structure. 
- Uses full hostname with sanitized special chars. - - Examples: - https://example.com:8080 → example.com_8080 - https://sub.example.com → sub.example.com - file:///path → localhost - data:text/html → data - """ - from urllib.parse import urlparse - - try: - parsed = urlparse(url) - - if parsed.scheme in ('http', 'https'): - if parsed.port: - return f"{parsed.hostname}_{parsed.port}".replace(':', '_') - return parsed.hostname or 'unknown' - elif parsed.scheme == 'file': - return 'localhost' - elif parsed.scheme: - return parsed.scheme - else: - return 'unknown' - except Exception: - return 'unknown' - @property def output_dir_parent(self) -> str: - """Construct parent directory: users/{username}/crawls/{YYYYMMDD}/{domain}""" + """Construct parent directory: users/{user_id}/crawls/{YYYYMMDD}""" date_str = self.created_at.strftime('%Y%m%d') - username = self.created_by.username - # Get domain from first URL - first_url = self.get_urls_list()[0] if self.get_urls_list() else '' - domain = self.extract_domain_from_url(first_url) if first_url else 'unknown' - return f'users/{username}/crawls/{date_str}/{domain}' + return f'users/{self.created_by_id}/crawls/{date_str}' @property def output_dir_name(self) -> str: @@ -506,84 +417,18 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith def cleanup(self): """Clean up background hooks and run on_CrawlEnd hooks.""" - import os - import signal - import time - from pathlib import Path from archivebox.hooks import run_hook, discover_hooks - from archivebox.misc.process_utils import validate_pid_file - - def is_process_alive(pid): - """Check if a process exists.""" - try: - os.kill(pid, 0) # Signal 0 checks existence without killing - return True - except (OSError, ProcessLookupError): - return False + from archivebox.misc.process_utils import safe_kill_process # Kill any background processes by scanning for all .pid files if self.OUTPUT_DIR.exists(): for pid_file in self.OUTPUT_DIR.glob('**/*.pid'): - # Validate PID before killing to avoid killing unrelated processes cmd_file = pid_file.parent / 'cmd.sh' - if not validate_pid_file(pid_file, cmd_file): - # PID reused by different process or process dead + # safe_kill_process now waits for termination and escalates to SIGKILL + # Returns True only if process is confirmed dead + killed = safe_kill_process(pid_file, cmd_file) + if killed: pid_file.unlink(missing_ok=True) - continue - - try: - pid = int(pid_file.read_text().strip()) - - # Step 1: Send SIGTERM for graceful shutdown - try: - # Try to kill process group first (handles detached processes like Chrome) - try: - os.killpg(pid, signal.SIGTERM) - except (OSError, ProcessLookupError): - # Fall back to killing just the process - os.kill(pid, signal.SIGTERM) - except ProcessLookupError: - # Already dead - pid_file.unlink(missing_ok=True) - continue - - # Step 2: Wait for graceful shutdown - time.sleep(2) - - # Step 3: Check if still alive - if not is_process_alive(pid): - # Process terminated gracefully - pid_file.unlink(missing_ok=True) - continue - - # Step 4: Process still alive, force kill ENTIRE process group with SIGKILL - try: - try: - # Always kill entire process group with SIGKILL (not individual processes) - os.killpg(pid, signal.SIGKILL) - except (OSError, ProcessLookupError) as e: - # Process group kill failed, try single process as fallback - os.kill(pid, signal.SIGKILL) - except ProcessLookupError: - # Process died between check and kill - pid_file.unlink(missing_ok=True) - continue - - # Step 5: Wait and 
verify death - time.sleep(1) - - if is_process_alive(pid): - # Process is unkillable (likely in UNE state on macOS) - # This happens when Chrome crashes in kernel syscall (IOSurface) - # Log but don't block cleanup - process will remain until reboot - print(f'[yellow]⚠️ Process {pid} is unkillable (likely crashed in kernel). Will remain until reboot.[/yellow]') - else: - # Successfully killed - pid_file.unlink(missing_ok=True) - - except (ValueError, OSError) as e: - # Invalid PID file or permission error - pass # Run on_CrawlEnd hooks from archivebox.config.configset import get_config diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 94786d3f..73febfa0 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -365,14 +365,11 @@ def run_hook( # Old convention: __background in stem (for backwards compatibility) is_background = '.bg.' in script.name or '__background' in script.stem - # Set up output files for ALL hooks - use hook-specific names to avoid conflicts - # when multiple hooks run in the same plugin directory - # e.g., on_Snapshot__20_chrome_tab.bg.js -> on_Snapshot__20_chrome_tab.bg.stdout.log - hook_basename = script.stem # e.g., "on_Snapshot__20_chrome_tab.bg" - stdout_file = output_dir / f'{hook_basename}.stdout.log' - stderr_file = output_dir / f'{hook_basename}.stderr.log' - pid_file = output_dir / f'{hook_basename}.pid' - cmd_file = output_dir / f'{hook_basename}.sh' + # Set up output files for ALL hooks (useful for debugging) + stdout_file = output_dir / 'stdout.log' + stderr_file = output_dir / 'stderr.log' + pid_file = output_dir / 'hook.pid' + cmd_file = output_dir / 'cmd.sh' try: # Write command script for validation @@ -424,14 +421,8 @@ def run_hook( # Detect new files created by the hook files_after = set(output_dir.rglob('*')) if output_dir.exists() else set() new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()] - # Exclude the log/pid/sh files themselves from new_files (hook-specific names) - hook_output_files = { - f'{hook_basename}.stdout.log', - f'{hook_basename}.stderr.log', - f'{hook_basename}.pid', - f'{hook_basename}.sh', - } - new_files = [f for f in new_files if f not in hook_output_files] + # Exclude the log files themselves from new_files + new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')] # Parse JSONL output from stdout # Each line starting with { that has 'type' field is a record @@ -1185,9 +1176,7 @@ def create_model_record(record: Dict[str, Any]) -> Any: def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]: """ Process JSONL records from hook output. - - Uses Model.from_jsonl() which automatically filters by JSONL_TYPE. - Each model only processes records matching its type. + Dispatches to Model.from_jsonl() for each record type. 
Args: records: List of JSONL record dicts from result['records'] @@ -1196,250 +1185,51 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any Returns: Dict with counts by record type """ - from archivebox.core.models import Snapshot, Tag - from archivebox.machine.models import Binary, Machine - + stats = {} overrides = overrides or {} - # Filter out ArchiveResult records (they update the calling AR, not create new ones) - filtered_records = [r for r in records if r.get('type') != 'ArchiveResult'] + for record in records: + record_type = record.get('type') + if not record_type: + continue - # Each model's from_jsonl() filters to only its own type - snapshots = Snapshot.from_jsonl(filtered_records, overrides) - tags = Tag.from_jsonl(filtered_records, overrides) - binaries = Binary.from_jsonl(filtered_records, overrides) - machines = Machine.from_jsonl(filtered_records, overrides) - - return { - 'Snapshot': len(snapshots), - 'Tag': len(tags), - 'Binary': len(binaries), - 'Machine': len(machines), - } - - -def process_is_alive(pid_file: Path) -> bool: - """ - Check if process in PID file is still running. - - Args: - pid_file: Path to hook.pid file - - Returns: - True if process is alive, False otherwise - """ - if not pid_file.exists(): - return False - - try: - pid = int(pid_file.read_text().strip()) - os.kill(pid, 0) # Signal 0 = check if process exists without killing it - return True - except (OSError, ValueError): - return False - - -def kill_process(pid_file: Path, sig: int = signal.SIGTERM, validate: bool = True): - """ - Kill process in PID file with optional validation. - - Args: - pid_file: Path to hook-specific .pid file (e.g., on_Snapshot__20_chrome_tab.bg.pid) - sig: Signal to send (default SIGTERM) - validate: If True, validate process identity before killing (default: True) - """ - from archivebox.misc.process_utils import safe_kill_process - - if validate: - # Use safe kill with validation - # Derive cmd file from pid file: on_Snapshot__20_chrome_tab.bg.pid -> on_Snapshot__20_chrome_tab.bg.sh - cmd_file = pid_file.with_suffix('.sh') - safe_kill_process(pid_file, cmd_file, signal_num=sig) - else: - # Legacy behavior - kill without validation - if not pid_file.exists(): - return - try: - pid = int(pid_file.read_text().strip()) - os.kill(pid, sig) - except (OSError, ValueError): - pass - - -def graceful_terminate_background_hooks( - output_dir: Path, - config: Dict[str, Any], - poll_interval: float = 0.5, -) -> Dict[str, Dict[str, Any]]: - """ - Gracefully terminate all background hooks in an output directory. - - Termination strategy: - 1. Send SIGTERM to all background hook processes (polite shutdown request) - 2. For each hook, wait up to its plugin-specific timeout - 3. Send SIGKILL to any hooks still running after their timeout expires - 4. Reap each process with waitpid() to get exit code - 5. 
Write returncode to .returncode file for update_from_output() - - Args: - output_dir: Snapshot output directory containing plugin subdirs with .pid files - config: Merged config dict from get_config() for timeout lookup - poll_interval: Seconds between process liveness checks (default: 0.5s) - - Returns: - Dict mapping hook names to result info: - { - 'hook_name': { - 'status': 'sigterm' | 'sigkill' | 'already_dead' | 'invalid', - 'returncode': int or None, - 'pid': int or None, - } - } - - Example: - from archivebox.config.configset import get_config - config = get_config(crawl=my_crawl, snapshot=my_snapshot) - results = graceful_terminate_background_hooks(snapshot.OUTPUT_DIR, config) - # {'on_Snapshot__20_chrome_tab.bg': {'status': 'sigterm', 'returncode': 0, 'pid': 12345}} - """ - from archivebox.misc.process_utils import validate_pid_file - - if not output_dir.exists(): - return {} - - results = {} - - # Collect all pid files and their metadata - pid_files = list(output_dir.glob('**/*.pid')) - if not pid_files: - return {} - - # Phase 1: Send SIGTERM to all background hook processes - active_hooks = [] # List of (pid_file, hook_name, plugin_name, timeout, pid) - for pid_file in pid_files: - hook_name = pid_file.stem # e.g., "on_Snapshot__20_chrome_tab.bg" - cmd_file = pid_file.with_suffix('.sh') - - # Validate and get PID - if not validate_pid_file(pid_file, cmd_file): - results[hook_name] = {'status': 'invalid', 'returncode': None, 'pid': None} - pid_file.unlink(missing_ok=True) + # Skip ArchiveResult records (they update the calling ArchiveResult, not create new ones) + if record_type == 'ArchiveResult': continue try: - pid = int(pid_file.read_text().strip()) - except (ValueError, OSError): - results[hook_name] = {'status': 'invalid', 'returncode': None, 'pid': None} - pid_file.unlink(missing_ok=True) + # Dispatch to appropriate model's from_jsonl() method + if record_type == 'Snapshot': + from archivebox.core.models import Snapshot + obj = Snapshot.from_jsonl(record.copy(), overrides) + if obj: + stats['Snapshot'] = stats.get('Snapshot', 0) + 1 + + elif record_type == 'Tag': + from archivebox.core.models import Tag + obj = Tag.from_jsonl(record.copy(), overrides) + if obj: + stats['Tag'] = stats.get('Tag', 0) + 1 + + elif record_type == 'Binary': + from archivebox.machine.models import Binary + obj = Binary.from_jsonl(record.copy(), overrides) + if obj: + stats['Binary'] = stats.get('Binary', 0) + 1 + + elif record_type == 'Machine': + from archivebox.machine.models import Machine + obj = Machine.from_jsonl(record.copy(), overrides) + if obj: + stats['Machine'] = stats.get('Machine', 0) + 1 + + else: + import sys + print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr) + + except Exception as e: + import sys + print(f"Warning: Failed to create {record_type}: {e}", file=sys.stderr) continue - # Check if process is still alive - if not process_is_alive(pid_file): - # Process already dead - try to reap it and get exit code - returncode = _reap_process(pid) - results[hook_name] = {'status': 'already_dead', 'returncode': returncode, 'pid': pid} - _write_returncode_file(pid_file, returncode) - pid_file.unlink(missing_ok=True) - continue - - # Get plugin name from parent directory (e.g., "chrome_session") - plugin_name = pid_file.parent.name - - # Get plugin-specific timeout - plugin_config = get_plugin_special_config(plugin_name, config) - timeout = plugin_config['timeout'] - - # Send SIGTERM - try: - os.kill(pid, signal.SIGTERM) - except (OSError, 
ProcessLookupError): - returncode = _reap_process(pid) - results[hook_name] = {'status': 'already_dead', 'returncode': returncode, 'pid': pid} - _write_returncode_file(pid_file, returncode) - pid_file.unlink(missing_ok=True) - continue - - active_hooks.append((pid_file, hook_name, plugin_name, timeout, pid)) - - # Phase 2: Wait for each hook's timeout, then SIGKILL if still running - for pid_file, hook_name, plugin_name, timeout, pid in active_hooks: - deadline = time.time() + timeout - exited_cleanly = False - - # Poll until deadline or process exits - while time.time() < deadline: - if not process_is_alive(pid_file): - exited_cleanly = True - break - time.sleep(poll_interval) - - if exited_cleanly: - # Process exited from SIGTERM - reap it to get exit code - returncode = _reap_process(pid) - results[hook_name] = {'status': 'sigterm', 'returncode': returncode, 'pid': pid} - else: - # Timeout expired, send SIGKILL - try: - os.kill(pid, signal.SIGKILL) - except (OSError, ProcessLookupError): - pass # Process died between check and kill - - # Wait briefly for SIGKILL to take effect, then reap - time.sleep(0.1) - returncode = _reap_process(pid) - - # returncode from SIGKILL is typically -9 (negative signal number) - results[hook_name] = {'status': 'sigkill', 'returncode': returncode, 'pid': pid} - - # Write returncode file for update_from_output() to read - _write_returncode_file(pid_file, results[hook_name]['returncode']) - pid_file.unlink(missing_ok=True) - - return results - - -def _reap_process(pid: int) -> Optional[int]: - """ - Reap a terminated process and return its exit code. - - Uses os.waitpid() with WNOHANG to avoid blocking. - Returns None if process cannot be reaped (not a child, already reaped, etc). - """ - try: - # WNOHANG: return immediately if process hasn't exited - # We call this after we know process is dead, so it should return immediately - wpid, status = os.waitpid(pid, os.WNOHANG) - if wpid == 0: - # Process still running (shouldn't happen since we checked) - return None - if os.WIFEXITED(status): - return os.WEXITSTATUS(status) - elif os.WIFSIGNALED(status): - # Killed by signal - return negative signal number (convention) - return -os.WTERMSIG(status) - return None - except ChildProcessError: - # Not our child process (was started by subprocess.Popen which already reaped it, - # or process was started by different parent). This is expected for hooks. - return None - except OSError: - return None - - -def _write_returncode_file(pid_file: Path, returncode: Optional[int]) -> None: - """ - Write returncode to a .returncode file next to the .pid file. - - This allows update_from_output() to know the exit code even for background hooks. 
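
For context on the legacy mechanism being deleted here: the `.returncode` file written above was meant to be read back later (e.g. by `update_from_output()`). A minimal sketch of that reader side, assuming the file sits next to the hook's `.pid` file and an empty file means the exit code was unknown:

```python
from pathlib import Path
from typing import Optional

def read_returncode(pid_file: Path) -> Optional[int]:
    """Read the exit code recorded next to a hook's .pid file, if any."""
    returncode_file = pid_file.with_suffix('.returncode')
    if not returncode_file.exists():
        return None                     # hook was never reaped by the terminate helper
    text = returncode_file.read_text().strip()
    return int(text) if text else None  # empty file == terminated, exit code unknown
```
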
- """ - returncode_file = pid_file.with_suffix('.returncode') - try: - if returncode is not None: - returncode_file.write_text(str(returncode)) - else: - # Unknown exit code - write empty file to indicate process was terminated - returncode_file.write_text('') - except OSError: - pass # Best effort - - + return stats diff --git a/archivebox/machine/migrations/0002_process_parent_and_type.py b/archivebox/machine/migrations/0002_process_parent_and_type.py new file mode 100644 index 00000000..e70de360 --- /dev/null +++ b/archivebox/machine/migrations/0002_process_parent_and_type.py @@ -0,0 +1,101 @@ +# Generated on 2025-12-31 +# Adds parent FK and process_type field to Process model + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('machine', '0001_initial'), + ] + + operations = [ + migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunSQL( + sql=""" + -- Add parent_id FK column to machine_process + ALTER TABLE machine_process ADD COLUMN parent_id TEXT REFERENCES machine_process(id) ON DELETE SET NULL; + CREATE INDEX IF NOT EXISTS machine_process_parent_id_idx ON machine_process(parent_id); + + -- Add process_type column with default 'binary' + ALTER TABLE machine_process ADD COLUMN process_type VARCHAR(16) NOT NULL DEFAULT 'binary'; + CREATE INDEX IF NOT EXISTS machine_process_process_type_idx ON machine_process(process_type); + + -- Add composite index for parent + status queries + CREATE INDEX IF NOT EXISTS machine_process_parent_status_idx ON machine_process(parent_id, status); + + -- Add composite index for machine + pid + started_at (for PID reuse protection) + CREATE INDEX IF NOT EXISTS machine_process_machine_pid_started_idx ON machine_process(machine_id, pid, started_at); + """, + # Migration is irreversible due to SQLite limitations + # SQLite doesn't support DROP COLUMN, would require table rebuild + reverse_sql=migrations.RunSQL.noop + ), + ], + state_operations=[ + # Add parent FK + migrations.AddField( + model_name='process', + name='parent', + field=models.ForeignKey( + blank=True, + help_text='Parent process that spawned this one', + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name='children', + to='machine.process', + ), + ), + # Add process_type field + migrations.AddField( + model_name='process', + name='process_type', + field=models.CharField( + choices=[ + ('cli', 'CLI Command'), + ('supervisord', 'Supervisord Daemon'), + ('orchestrator', 'Orchestrator'), + ('worker', 'Worker Process'), + ('hook', 'Hook Script'), + ('binary', 'Binary Execution'), + ], + default='binary', + help_text='Type of process in the execution hierarchy', + max_length=16, + ), + ), + # Add indexes - must match the SQL index names exactly + migrations.AddIndex( + model_name='process', + index=models.Index( + fields=['parent'], + name='machine_process_parent_id_idx', + ), + ), + migrations.AddIndex( + model_name='process', + index=models.Index( + fields=['process_type'], + name='machine_process_process_type_idx', + ), + ), + migrations.AddIndex( + model_name='process', + index=models.Index( + fields=['parent', 'status'], + name='machine_process_parent_status_idx', + ), + ), + migrations.AddIndex( + model_name='process', + index=models.Index( + fields=['machine', 'pid', 'started_at'], + name='machine_process_machine_pid_started_idx', + ), + ), + ], + ), + ] diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index feb9bc88..428633b3 100755 
--- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -1,9 +1,11 @@ __package__ = 'archivebox.machine' +import os +import sys import socket -from typing import Iterator, Set +from pathlib import Path from archivebox.uuid_compat import uuid7 -from datetime import timedelta +from datetime import timedelta, datetime from statemachine import State, registry @@ -15,13 +17,23 @@ from archivebox.base_models.models import ModelWithHealthStats from archivebox.workers.models import BaseStateMachine from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats +try: + import psutil + PSUTIL_AVAILABLE = True +except ImportError: + PSUTIL_AVAILABLE = False + _CURRENT_MACHINE = None _CURRENT_INTERFACE = None _CURRENT_BINARIES = {} +_CURRENT_PROCESS = None MACHINE_RECHECK_INTERVAL = 7 * 24 * 60 * 60 NETWORK_INTERFACE_RECHECK_INTERVAL = 1 * 60 * 60 BINARY_RECHECK_INTERVAL = 1 * 30 * 60 +PROCESS_RECHECK_INTERVAL = 60 # Re-validate every 60 seconds +PID_REUSE_WINDOW = timedelta(hours=24) # Max age for considering a PID match valid +START_TIME_TOLERANCE = 5.0 # Seconds tolerance for start time matching class MachineManager(models.Manager): @@ -30,8 +42,6 @@ class MachineManager(models.Manager): class Machine(ModelWithHealthStats): - JSONL_TYPE = 'Machine' - id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) @@ -72,35 +82,13 @@ class Machine(ModelWithHealthStats): ) return _CURRENT_MACHINE - @classmethod - def from_jsonl(cls, records, overrides: dict = None) -> list['Machine']: - """ - Update Machine configs from an iterable of JSONL records. - Filters to only records with type='Machine'. - - Args: - records: Iterable of dicts (JSONL records) - overrides: Not used - - Returns: - List of Machine instances (skips None results) - """ - results = [] - for record in records: - record_type = record.get('type', cls.JSONL_TYPE) - if record_type == cls.JSONL_TYPE: - instance = cls.from_json(record, overrides=overrides) - if instance: - results.append(instance) - return results - @staticmethod - def from_json(record: dict, overrides: dict = None) -> 'Machine | None': + def from_jsonl(record: dict, overrides: dict = None): """ - Update a single Machine config from a JSON record dict. + Update Machine config from JSONL record. Args: - record: Dict with '_method': 'update', 'key': '...', 'value': '...' + record: JSONL record with '_method': 'update', 'key': '...', 'value': '...' overrides: Not used Returns: @@ -119,44 +107,6 @@ class Machine(ModelWithHealthStats): return machine return None - def to_json(self) -> dict: - """ - Convert Machine model instance to a JSON-serializable dict. - """ - from archivebox.config import VERSION - return { - 'type': self.JSONL_TYPE, - 'schema_version': VERSION, - 'id': str(self.id), - 'guid': self.guid, - 'hostname': self.hostname, - 'hw_in_docker': self.hw_in_docker, - 'hw_in_vm': self.hw_in_vm, - 'os_arch': self.os_arch, - 'os_family': self.os_family, - 'os_platform': self.os_platform, - 'os_release': self.os_release, - 'created_at': self.created_at.isoformat() if self.created_at else None, - } - - def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]: - """ - Yield this Machine as a JSON record. 
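
As a concrete (hypothetical) illustration of the single-record `Machine.from_jsonl()` signature above, a hook line updating the current machine might look like this; the `key`/`value` contents are assumptions, only the `_method`/`key`/`value` envelope comes from the docstring:

```python
# Hypothetical hook output line targeting the current Machine
record = {
    'type': 'Machine',
    '_method': 'update',
    'key': 'stats',                      # illustrative key only
    'value': {'free_disk_gb': 42},       # illustrative value only
}

machine = Machine.from_jsonl(record)     # returns the updated Machine, or None if malformed
```
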
- - Args: - seen: Set of (type, id) tuples already emitted (for deduplication) - **kwargs: Passed to children (none for Machine, leaf node) - - Yields: - dict: JSON-serializable record for this machine - """ - if seen is not None: - key = (self.JSONL_TYPE, str(self.id)) - if key in seen: - return - seen.add(key) - yield self.to_json() - class NetworkInterfaceManager(models.Manager): def current(self) -> 'NetworkInterface': @@ -164,8 +114,6 @@ class NetworkInterfaceManager(models.Manager): class NetworkInterface(ModelWithHealthStats): - JSONL_TYPE = 'NetworkInterface' - id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) @@ -204,46 +152,6 @@ class NetworkInterface(ModelWithHealthStats): ) return _CURRENT_INTERFACE - def to_json(self) -> dict: - """ - Convert NetworkInterface model instance to a JSON-serializable dict. - """ - from archivebox.config import VERSION - return { - 'type': self.JSONL_TYPE, - 'schema_version': VERSION, - 'id': str(self.id), - 'machine_id': str(self.machine_id), - 'hostname': self.hostname, - 'iface': self.iface, - 'ip_public': self.ip_public, - 'ip_local': self.ip_local, - 'mac_address': self.mac_address, - 'dns_server': self.dns_server, - 'isp': self.isp, - 'city': self.city, - 'region': self.region, - 'country': self.country, - 'created_at': self.created_at.isoformat() if self.created_at else None, - } - - def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]: - """ - Yield this NetworkInterface as a JSON record. - - Args: - seen: Set of (type, id) tuples already emitted (for deduplication) - **kwargs: Passed to children (none for NetworkInterface, leaf node) - - Yields: - dict: JSON-serializable record for this network interface - """ - if seen is not None: - key = (self.JSONL_TYPE, str(self.id)) - if key in seen: - return - seen.add(key) - yield self.to_json() class BinaryManager(models.Manager): @@ -270,7 +178,7 @@ class BinaryManager(models.Manager): class Binary(ModelWithHealthStats): """ - Tracks a binary on a specific machine. + Tracks an binary on a specific machine. Follows the unified state machine pattern: - queued: Binary needs to be installed @@ -281,7 +189,6 @@ class Binary(ModelWithHealthStats): State machine calls run() which executes on_Binary__install_* hooks to install the binary using the specified providers. """ - JSONL_TYPE = 'Binary' class StatusChoices(models.TextChoices): QUEUED = 'queued', 'Queued' @@ -348,13 +255,13 @@ class Binary(ModelWithHealthStats): 'is_valid': self.is_valid, } - def to_json(self) -> dict: + def to_jsonl(self) -> dict: """ - Convert Binary model instance to a JSON-serializable dict. + Convert Binary model instance to a JSONL record. """ from archivebox.config import VERSION return { - 'type': self.JSONL_TYPE, + 'type': 'Binary', 'schema_version': VERSION, 'id': str(self.id), 'machine_id': str(self.machine_id), @@ -366,57 +273,17 @@ class Binary(ModelWithHealthStats): 'status': self.status, } - def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]: - """ - Yield this Binary as a JSON record. 
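
For reference, a `Binary` record produced by the `to_jsonl()` above is a flat dict; the sketch below shows only the keys visible in this hunk (the middle of the dict is elided by the diff context), with illustrative values:

```python
# Illustrative Binary JSONL record shape (values are examples, not real output)
{
    'type': 'Binary',
    'schema_version': '0.9.0',          # archivebox.config.VERSION
    'id': '0190aaaa-example-uuid',
    'machine_id': '0190bbbb-example-uuid',
    # ... remaining Binary fields elided in this hunk ...
    'status': 'queued',                 # one of Binary.StatusChoices
}
```
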
- - Args: - seen: Set of (type, id) tuples already emitted (for deduplication) - **kwargs: Passed to children (none for Binary, leaf node) - - Yields: - dict: JSON-serializable record for this binary - """ - if seen is not None: - key = (self.JSONL_TYPE, str(self.id)) - if key in seen: - return - seen.add(key) - yield self.to_json() - - @classmethod - def from_jsonl(cls, records, overrides: dict = None) -> list['Binary']: - """ - Create/update Binaries from an iterable of JSONL records. - Filters to only records with type='Binary'. - - Args: - records: Iterable of dicts (JSONL records) - overrides: Not used - - Returns: - List of Binary instances (skips None results) - """ - results = [] - for record in records: - record_type = record.get('type', cls.JSONL_TYPE) - if record_type == cls.JSONL_TYPE: - instance = cls.from_json(record, overrides=overrides) - if instance: - results.append(instance) - return results - @staticmethod - def from_json(record: dict, overrides: dict = None) -> 'Binary | None': + def from_jsonl(record: dict, overrides: dict = None): """ - Create/update a single Binary from a JSON record dict. + Create/update Binary from JSONL record. Handles two cases: 1. From binaries.jsonl: creates queued binary with name, binproviders, overrides 2. From hook output: updates binary with abspath, version, sha256, binprovider Args: - record: Dict with 'name' and either: + record: JSONL record with 'name' and either: - 'binproviders', 'overrides' (from binaries.jsonl) - 'abspath', 'version', 'sha256', 'binprovider' (from hook output) overrides: Not used @@ -582,7 +449,7 @@ class Binary(ModelWithHealthStats): since installations are foreground, but included for consistency). """ from pathlib import Path - from archivebox.hooks import kill_process + from archivebox.misc.process_utils import safe_kill_process output_dir = self.OUTPUT_DIR if not output_dir.exists(): @@ -593,8 +460,8 @@ class Binary(ModelWithHealthStats): if not plugin_dir.is_dir(): continue pid_file = plugin_dir / 'hook.pid' - if pid_file.exists(): - kill_process(pid_file) + cmd_file = plugin_dir / 'cmd.sh' + safe_kill_process(pid_file, cmd_file) # ============================================================================= @@ -604,6 +471,56 @@ class Binary(ModelWithHealthStats): class ProcessManager(models.Manager): """Manager for Process model.""" + def current(self) -> 'Process': + """Get the Process record for the current OS process.""" + return Process.current() + + def get_by_pid(self, pid: int, machine: 'Machine' = None) -> 'Process | None': + """ + Find a Process by PID with proper validation against PID reuse. + + IMPORTANT: PIDs are reused by the OS! This method: + 1. Filters by machine (required - PIDs are only unique per machine) + 2. Filters by time window (processes older than 24h are stale) + 3. 
Validates via psutil that start times match + + Args: + pid: OS process ID + machine: Machine instance (defaults to current machine) + + Returns: + Process if found and validated, None otherwise + """ + if not PSUTIL_AVAILABLE: + return None + + machine = machine or Machine.current() + + # Get the actual process start time from OS + try: + os_proc = psutil.Process(pid) + os_start_time = os_proc.create_time() + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + # Process doesn't exist - any DB record with this PID is stale + return None + + # Query candidates: same machine, same PID, recent, still RUNNING + candidates = self.filter( + machine=machine, + pid=pid, + status=Process.StatusChoices.RUNNING, + started_at__gte=timezone.now() - PID_REUSE_WINDOW, + ).order_by('-started_at') + + for candidate in candidates: + # Validate start time matches (within tolerance) + if candidate.started_at: + db_start_time = candidate.started_at.timestamp() + if abs(db_start_time - os_start_time) < START_TIME_TOLERANCE: + return candidate + + return None + def create_for_archiveresult(self, archiveresult, **kwargs): """ Create a Process record for an ArchiveResult. @@ -625,7 +542,7 @@ class ProcessManager(models.Manager): return process -class Process(models.Model): +class Process(ModelWithHealthStats): """ Tracks a single OS process execution. @@ -640,18 +557,44 @@ class Process(models.Model): State machine calls launch() to spawn the process and monitors its lifecycle. """ - JSONL_TYPE = 'Process' class StatusChoices(models.TextChoices): QUEUED = 'queued', 'Queued' RUNNING = 'running', 'Running' EXITED = 'exited', 'Exited' + class TypeChoices(models.TextChoices): + CLI = 'cli', 'CLI Command' + SUPERVISORD = 'supervisord', 'Supervisord Daemon' + ORCHESTRATOR = 'orchestrator', 'Orchestrator' + WORKER = 'worker', 'Worker Process' + HOOK = 'hook', 'Hook Script' + BINARY = 'binary', 'Binary Execution' + # Primary fields id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) + # Parent process FK for hierarchy tracking + parent = models.ForeignKey( + 'self', + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name='children', + help_text='Parent process that spawned this one' + ) + + # Process type for distinguishing in hierarchy + process_type = models.CharField( + max_length=16, + choices=TypeChoices.choices, + default=TypeChoices.BINARY, + db_index=True, + help_text='Type of process in the execution hierarchy' + ) + # Machine FK - required (every process runs on a machine) machine = models.ForeignKey( Machine, @@ -739,6 +682,8 @@ class Process(models.Model): indexes = [ models.Index(fields=['machine', 'status', 'retry_at']), models.Index(fields=['binary', 'exit_code']), + models.Index(fields=['parent', 'status']), + models.Index(fields=['machine', 'pid', 'started_at']), ] def __str__(self) -> str: @@ -771,13 +716,13 @@ class Process(models.Model): return self.archiveresult.hook_name return '' - def to_json(self) -> dict: + def to_jsonl(self) -> dict: """ - Convert Process model instance to a JSON-serializable dict. + Convert Process model instance to a JSONL record. 
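
A short usage sketch of the PID-reuse-safe lookup defined in `ProcessManager.get_by_pid()` above (the PID value is made up):

```python
# Look up the DB record for an OS pid, guarding against PID reuse.
# Returns None if the pid is dead, recycled, older than PID_REUSE_WINDOW,
# or was recorded on a different machine.
proc = Process.objects.get_by_pid(12345)

if proc is None:
    print('no live ArchiveBox process with that pid on this machine')
else:
    print(f'{proc.process_type} process started at {proc.started_at}')
```
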
""" from archivebox.config import VERSION record = { - 'type': self.JSONL_TYPE, + 'type': 'Process', 'schema_version': VERSION, 'id': str(self.id), 'machine_id': str(self.machine_id), @@ -797,37 +742,6 @@ class Process(models.Model): record['timeout'] = self.timeout return record - def to_jsonl(self, seen: Set[tuple] = None, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]: - """ - Yield this Process and optionally related objects as JSON records. - - Args: - seen: Set of (type, id) tuples already emitted (for deduplication) - binary: Include related Binary (default: True) - machine: Include related Machine (default: False) - iface: Include related NetworkInterface (default: False) - **kwargs: Passed to children - - Yields: - dict: JSON-serializable records - """ - if seen is None: - seen = set() - - key = (self.JSONL_TYPE, str(self.id)) - if key in seen: - return - seen.add(key) - - yield self.to_json() - - if binary and self.binary: - yield from self.binary.to_jsonl(seen=seen, **kwargs) - if machine and self.machine: - yield from self.machine.to_jsonl(seen=seen, **kwargs) - if iface and self.iface: - yield from self.iface.to_jsonl(seen=seen, **kwargs) - def update_and_requeue(self, **kwargs): """ Update process fields and requeue for worker state machine. @@ -838,6 +752,792 @@ class Process(models.Model): self.modified_at = timezone.now() self.save() + # ========================================================================= + # Process.current() and hierarchy methods + # ========================================================================= + + @classmethod + def current(cls) -> 'Process': + """ + Get or create the Process record for the current OS process. + + Similar to Machine.current(), this: + 1. Checks cache for existing Process with matching PID + 2. Validates the cached Process is still valid (PID not reused) + 3. Creates new Process if needed + + IMPORTANT: Uses psutil to validate PID hasn't been reused. + PIDs are recycled by OS, so we compare start times. 
+ """ + global _CURRENT_PROCESS + + current_pid = os.getpid() + machine = Machine.current() + + # Check cache validity + if _CURRENT_PROCESS: + # Verify: same PID, same machine, cache not expired + if (_CURRENT_PROCESS.pid == current_pid and + _CURRENT_PROCESS.machine_id == machine.id and + timezone.now() < _CURRENT_PROCESS.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)): + return _CURRENT_PROCESS + _CURRENT_PROCESS = None + + # Get actual process start time from OS for validation + os_start_time = None + if PSUTIL_AVAILABLE: + try: + os_proc = psutil.Process(current_pid) + os_start_time = os_proc.create_time() + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + + # Try to find existing Process for this PID on this machine + # Filter by: machine + PID + RUNNING + recent + start time matches + if os_start_time: + existing = cls.objects.filter( + machine=machine, + pid=current_pid, + status=cls.StatusChoices.RUNNING, + started_at__gte=timezone.now() - PID_REUSE_WINDOW, + ).order_by('-started_at').first() + + if existing and existing.started_at: + db_start_time = existing.started_at.timestamp() + if abs(db_start_time - os_start_time) < START_TIME_TOLERANCE: + _CURRENT_PROCESS = existing + return existing + + # No valid existing record - create new one + parent = cls._find_parent_process(machine) + process_type = cls._detect_process_type() + + # Use psutil cmdline if available (matches what proc() will validate against) + # Otherwise fall back to sys.argv + cmd = sys.argv + if PSUTIL_AVAILABLE: + try: + os_proc = psutil.Process(current_pid) + cmd = os_proc.cmdline() + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + + # Use psutil start time if available (more accurate than timezone.now()) + if os_start_time: + started_at = datetime.fromtimestamp(os_start_time, tz=timezone.get_current_timezone()) + else: + started_at = timezone.now() + + _CURRENT_PROCESS = cls.objects.create( + machine=machine, + parent=parent, + process_type=process_type, + cmd=cmd, + pwd=os.getcwd(), + pid=current_pid, + started_at=started_at, + status=cls.StatusChoices.RUNNING, + ) + return _CURRENT_PROCESS + + @classmethod + def _find_parent_process(cls, machine: 'Machine' = None) -> 'Process | None': + """ + Find the parent Process record by looking up PPID. + + IMPORTANT: Validates against PID reuse by checking: + 1. Same machine (PIDs are only unique per machine) + 2. Start time matches OS process start time + 3. Process is still RUNNING and recent + + Returns None if parent is not an ArchiveBox process. + """ + if not PSUTIL_AVAILABLE: + return None + + ppid = os.getppid() + machine = machine or Machine.current() + + # Get parent process start time from OS + try: + os_parent = psutil.Process(ppid) + os_parent_start = os_parent.create_time() + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + return None # Parent process doesn't exist + + # Find matching Process record + candidates = cls.objects.filter( + machine=machine, + pid=ppid, + status=cls.StatusChoices.RUNNING, + started_at__gte=timezone.now() - PID_REUSE_WINDOW, + ).order_by('-started_at') + + for candidate in candidates: + if candidate.started_at: + db_start_time = candidate.started_at.timestamp() + if abs(db_start_time - os_parent_start) < START_TIME_TOLERANCE: + return candidate + + return None # No matching ArchiveBox parent process + + @classmethod + def _detect_process_type(cls) -> str: + """ + Detect the type of the current process from sys.argv. 
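
Putting `Process.current()` and the PPID lookup together, each entry point can register itself with one call near startup; a sketch of what a worker might see (the printed values are illustrative):

```python
# Record the current OS process in the DB; parent is resolved from os.getppid()
# if (and only if) the parent is also a tracked ArchiveBox process.
me = Process.current()

print(me.process_type)                                  # e.g. 'worker', detected from sys.argv
print(me.parent.process_type if me.parent else None)    # e.g. 'orchestrator' in the hierarchy
```
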
+ """ + argv_str = ' '.join(sys.argv).lower() + + if 'supervisord' in argv_str: + return cls.TypeChoices.SUPERVISORD + elif 'orchestrator' in argv_str: + return cls.TypeChoices.ORCHESTRATOR + elif any(w in argv_str for w in ['crawl_worker', 'snapshot_worker', 'archiveresult_worker']): + return cls.TypeChoices.WORKER + elif 'archivebox' in argv_str: + return cls.TypeChoices.CLI + else: + return cls.TypeChoices.BINARY + + @classmethod + def cleanup_stale_running(cls, machine: 'Machine' = None) -> int: + """ + Mark stale RUNNING processes as EXITED. + + Processes are stale if: + - Status is RUNNING but OS process no longer exists + - Status is RUNNING but started_at is older than PID_REUSE_WINDOW + + Returns count of processes cleaned up. + """ + machine = machine or Machine.current() + cleaned = 0 + + stale = cls.objects.filter( + machine=machine, + status=cls.StatusChoices.RUNNING, + ) + + for proc in stale: + is_stale = False + + # Check if too old (PID definitely reused) + if proc.started_at and proc.started_at < timezone.now() - PID_REUSE_WINDOW: + is_stale = True + elif PSUTIL_AVAILABLE and proc.pid is not None: + # Check if OS process still exists with matching start time + try: + os_proc = psutil.Process(proc.pid) + if proc.started_at: + db_start = proc.started_at.timestamp() + os_start = os_proc.create_time() + if abs(db_start - os_start) > START_TIME_TOLERANCE: + is_stale = True # PID reused by different process + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + is_stale = True # Process no longer exists + + if is_stale: + proc.status = cls.StatusChoices.EXITED + proc.ended_at = proc.ended_at or timezone.now() + proc.exit_code = proc.exit_code if proc.exit_code is not None else -1 + proc.save(update_fields=['status', 'ended_at', 'exit_code']) + cleaned += 1 + + return cleaned + + # ========================================================================= + # Tree traversal properties + # ========================================================================= + + @property + def root(self) -> 'Process': + """Get the root process (CLI command) of this hierarchy.""" + proc = self + while proc.parent_id: + proc = proc.parent + return proc + + @property + def ancestors(self) -> list['Process']: + """Get all ancestor processes from parent to root.""" + ancestors = [] + proc = self.parent + while proc: + ancestors.append(proc) + proc = proc.parent + return ancestors + + @property + def depth(self) -> int: + """Get depth in the process tree (0 = root).""" + return len(self.ancestors) + + def get_descendants(self, include_self: bool = False): + """Get all descendant processes recursively.""" + if include_self: + pks = [self.pk] + else: + pks = [] + + children = list(self.children.values_list('pk', flat=True)) + while children: + pks.extend(children) + children = list(Process.objects.filter(parent_id__in=children).values_list('pk', flat=True)) + + return Process.objects.filter(pk__in=pks) + + # ========================================================================= + # Validated psutil access via .proc property + # ========================================================================= + + @property + def proc(self) -> 'psutil.Process | None': + """ + Get validated psutil.Process for this record. + + Returns psutil.Process ONLY if: + 1. Process with this PID exists in OS + 2. OS process start time matches our started_at (within tolerance) + 3. 
Process is on current machine + + Returns None if: + - PID doesn't exist (process exited) + - PID was reused by a different process (start times don't match) + - We're on a different machine than where process ran + - psutil is not available + + This prevents accidentally matching a stale/recycled PID. + """ + if not PSUTIL_AVAILABLE: + return None + + # Can't get psutil.Process if we don't have a PID + if not self.pid: + return None + + # Can't validate processes on other machines + if self.machine_id != Machine.current().id: + return None + + try: + os_proc = psutil.Process(self.pid) + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + return None # Process no longer exists + + # Validate start time matches to prevent PID reuse confusion + if self.started_at: + os_start_time = os_proc.create_time() + db_start_time = self.started_at.timestamp() + + if abs(os_start_time - db_start_time) > START_TIME_TOLERANCE: + # PID has been reused by a different process! + return None + + # Optionally validate command matches (extra safety) + if self.cmd: + try: + os_cmdline = os_proc.cmdline() + # Check if first arg (binary) matches + if os_cmdline and self.cmd: + os_binary = os_cmdline[0] if os_cmdline else '' + db_binary = self.cmd[0] if self.cmd else '' + # Match by basename (handles /usr/bin/python3 vs python3) + if os_binary and db_binary: + if Path(os_binary).name != Path(db_binary).name: + return None # Different binary, PID reused + except (psutil.AccessDenied, psutil.ZombieProcess): + pass # Can't check cmdline, trust start time match + + return os_proc + + @property + def is_running(self) -> bool: + """ + Check if process is currently running via psutil. + + More reliable than checking status field since it validates + the actual OS process exists and matches our record. + """ + proc = self.proc + return proc is not None and proc.is_running() + + def is_alive(self) -> bool: + """ + Alias for is_running, for compatibility with subprocess.Popen API. 
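
A small monitoring sketch built on the validated `.proc` accessor (the memory/CPU helpers it relies on are defined just below); rows whose PID has been recycled are simply skipped:

```python
running = Process.objects.filter(
    machine=Machine.current(),
    status=Process.StatusChoices.RUNNING,
)
for p in running:
    if not p.is_running:                 # DB row is stale: OS process gone or PID recycled
        continue
    print(p.pid, p.process_type, p.get_memory_info(), p.get_cpu_percent())
```
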
+ """ + return self.is_running + + def get_memory_info(self) -> dict | None: + """Get memory usage if process is running.""" + proc = self.proc + if proc: + try: + mem = proc.memory_info() + return {'rss': mem.rss, 'vms': mem.vms} + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + return None + + def get_cpu_percent(self) -> float | None: + """Get CPU usage percentage if process is running.""" + proc = self.proc + if proc: + try: + return proc.cpu_percent(interval=0.1) + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + return None + + def get_children_pids(self) -> list[int]: + """Get PIDs of child processes from OS (not DB).""" + proc = self.proc + if proc: + try: + return [child.pid for child in proc.children(recursive=True)] + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + return [] + + # ========================================================================= + # Lifecycle methods (launch, kill, poll, wait) + # ========================================================================= + + @property + def pid_file(self) -> Path: + """Path to PID file for this process.""" + return Path(self.pwd) / 'process.pid' if self.pwd else None + + @property + def cmd_file(self) -> Path: + """Path to cmd.sh script for this process.""" + return Path(self.pwd) / 'cmd.sh' if self.pwd else None + + @property + def stdout_file(self) -> Path: + """Path to stdout log.""" + return Path(self.pwd) / 'stdout.log' if self.pwd else None + + @property + def stderr_file(self) -> Path: + """Path to stderr log.""" + return Path(self.pwd) / 'stderr.log' if self.pwd else None + + def _write_pid_file(self) -> None: + """Write PID file with mtime set to process start time.""" + from archivebox.misc.process_utils import write_pid_file_with_mtime + if self.pid and self.started_at and self.pid_file: + write_pid_file_with_mtime( + self.pid_file, + self.pid, + self.started_at.timestamp() + ) + + def _write_cmd_file(self) -> None: + """Write cmd.sh script for debugging/validation.""" + from archivebox.misc.process_utils import write_cmd_file + if self.cmd and self.cmd_file: + write_cmd_file(self.cmd_file, self.cmd) + + def _build_env(self) -> dict: + """Build environment dict for subprocess, merging stored env with system.""" + env = os.environ.copy() + env.update(self.env or {}) + return env + + def launch(self, background: bool = False) -> 'Process': + """ + Spawn the subprocess and update this Process record. + + Args: + background: If True, don't wait for completion (for daemons/bg hooks) + + Returns: + self (updated with pid, started_at, etc.) 
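
A usage sketch of the launch lifecycle described here; the command, working directory, and timeout are illustrative, and field defaults (e.g. `env`) are assumed to be acceptable as shown:

```python
proc = Process.objects.create(
    machine=Machine.current(),
    process_type=Process.TypeChoices.BINARY,
    cmd=['wget', '--version'],           # illustrative command
    pwd='/tmp/archivebox-example',       # cmd.sh / stdout.log / stderr.log land here
    env={},
)

proc.launch(background=True)             # spawn without blocking
exit_code = proc.wait(timeout=60)        # polls until exit, raises TimeoutError otherwise
print(exit_code, (proc.stdout or '')[:200])
```
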
+ """ + import subprocess + import time + + # Validate pwd is set (required for output files) + if not self.pwd: + raise ValueError("Process.pwd must be set before calling launch()") + + # Ensure output directory exists + Path(self.pwd).mkdir(parents=True, exist_ok=True) + + # Write cmd.sh for debugging + self._write_cmd_file() + + stdout_path = self.stdout_file + stderr_path = self.stderr_file + + with open(stdout_path, 'w') as out, open(stderr_path, 'w') as err: + proc = subprocess.Popen( + self.cmd, + cwd=self.pwd, + stdout=out, + stderr=err, + env=self._build_env(), + ) + + # Get accurate start time from psutil if available + if PSUTIL_AVAILABLE: + try: + ps_proc = psutil.Process(proc.pid) + self.started_at = datetime.fromtimestamp( + ps_proc.create_time(), + tz=timezone.get_current_timezone() + ) + except (psutil.NoSuchProcess, psutil.AccessDenied): + self.started_at = timezone.now() + else: + self.started_at = timezone.now() + + self.pid = proc.pid + self.status = self.StatusChoices.RUNNING + self.save() + + self._write_pid_file() + + if not background: + try: + proc.wait(timeout=self.timeout) + self.exit_code = proc.returncode + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + self.exit_code = -1 + + self.ended_at = timezone.now() + if stdout_path.exists(): + self.stdout = stdout_path.read_text() + if stderr_path.exists(): + self.stderr = stderr_path.read_text() + self.status = self.StatusChoices.EXITED + self.save() + + return self + + def kill(self, signal_num: int = 15) -> bool: + """ + Kill this process and update status. + + Uses self.proc for safe killing - only kills if PID matches + our recorded process (prevents killing recycled PIDs). + + Args: + signal_num: Signal to send (default SIGTERM=15) + + Returns: + True if killed successfully, False otherwise + """ + # Use validated psutil.Process to ensure we're killing the right process + proc = self.proc + if proc is None: + # Process doesn't exist or PID was recycled - just update status + if self.status != self.StatusChoices.EXITED: + self.status = self.StatusChoices.EXITED + self.ended_at = self.ended_at or timezone.now() + self.save() + return False + + try: + # Safe to kill - we validated it's our process via start time match + proc.send_signal(signal_num) + + # Update our record + # Use standard Unix convention: 128 + signal number + self.exit_code = 128 + signal_num + self.ended_at = timezone.now() + self.status = self.StatusChoices.EXITED + self.save() + + # Clean up PID file + if self.pid_file and self.pid_file.exists(): + self.pid_file.unlink(missing_ok=True) + + return True + except (psutil.NoSuchProcess, psutil.AccessDenied, ProcessLookupError): + # Process already exited between proc check and kill + self.status = self.StatusChoices.EXITED + self.ended_at = self.ended_at or timezone.now() + self.save() + return False + + def poll(self) -> int | None: + """ + Check if process has exited and update status if so. 
+ + Returns: + exit_code if exited, None if still running + """ + if self.status == self.StatusChoices.EXITED: + return self.exit_code + + if not self.is_running: + # Process exited - read output and update status + if self.stdout_file and self.stdout_file.exists(): + self.stdout = self.stdout_file.read_text() + if self.stderr_file and self.stderr_file.exists(): + self.stderr = self.stderr_file.read_text() + + # Try to get exit code from proc or default to unknown + self.exit_code = self.exit_code if self.exit_code is not None else -1 + self.ended_at = timezone.now() + self.status = self.StatusChoices.EXITED + self.save() + return self.exit_code + + return None # Still running + + def wait(self, timeout: int | None = None) -> int: + """ + Wait for process to exit, polling periodically. + + Args: + timeout: Max seconds to wait (None = use self.timeout) + + Returns: + exit_code + + Raises: + TimeoutError if process doesn't exit in time + """ + import time + + timeout = timeout or self.timeout + start = time.time() + + while True: + exit_code = self.poll() + if exit_code is not None: + return exit_code + + if time.time() - start > timeout: + raise TimeoutError(f"Process {self.id} did not exit within {timeout}s") + + time.sleep(0.1) + + def terminate(self, graceful_timeout: float = 5.0) -> bool: + """ + Gracefully terminate process: SIGTERM → wait → SIGKILL. + + This consolidates the scattered SIGTERM/SIGKILL logic from: + - crawls/models.py Crawl.cleanup() + - workers/pid_utils.py stop_worker() + - supervisord_util.py stop_existing_supervisord_process() + + Args: + graceful_timeout: Seconds to wait after SIGTERM before SIGKILL + + Returns: + True if process was terminated, False if already dead + """ + import time + import signal + + proc = self.proc + if proc is None: + # Already dead - just update status + if self.status != self.StatusChoices.EXITED: + self.status = self.StatusChoices.EXITED + self.ended_at = self.ended_at or timezone.now() + self.save() + return False + + try: + # Step 1: Send SIGTERM for graceful shutdown + proc.terminate() + + # Step 2: Wait for graceful exit + try: + exit_status = proc.wait(timeout=graceful_timeout) + # Process exited gracefully + # psutil.Process.wait() returns the exit status + self.exit_code = exit_status if exit_status is not None else 0 + self.status = self.StatusChoices.EXITED + self.ended_at = timezone.now() + self.save() + return True + except psutil.TimeoutExpired: + pass # Still running, need to force kill + + # Step 3: Force kill with SIGKILL + proc.kill() + proc.wait(timeout=2) + + # Use standard Unix convention: 128 + signal number + self.exit_code = 128 + signal.SIGKILL + self.status = self.StatusChoices.EXITED + self.ended_at = timezone.now() + self.save() + return True + + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + # Process already dead + self.status = self.StatusChoices.EXITED + self.ended_at = self.ended_at or timezone.now() + self.save() + return False + + def kill_tree(self, graceful_timeout: float = 2.0) -> int: + """ + Kill this process and all its children (OS children, not DB children). 
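
The SIGTERM-then-SIGKILL escalation in `terminate()` above records exit codes with the usual 128+signal convention; a tiny illustration (hypothetical PID):

```python
import signal

proc = Process.objects.get_by_pid(12345)         # hypothetical pid
if proc and proc.terminate(graceful_timeout=5.0):
    # exit_code is either the child's own exit status (clean SIGTERM exit)
    # or 128 + SIGKILL when escalation was required
    forced = proc.exit_code == 128 + signal.SIGKILL
    print('terminated', 'forcefully' if forced else 'gracefully')
```
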
+ + This consolidates the scattered child-killing logic from: + - crawls/models.py Crawl.cleanup() os.killpg() + - supervisord_util.py stop_existing_supervisord_process() + + Args: + graceful_timeout: Seconds to wait after SIGTERM before SIGKILL + + Returns: + Number of processes killed (including self) + """ + import signal + + killed_count = 0 + proc = self.proc + if proc is None: + # Already dead + if self.status != self.StatusChoices.EXITED: + self.status = self.StatusChoices.EXITED + self.ended_at = self.ended_at or timezone.now() + self.save() + return 0 + + try: + # Get all children before killing parent + children = proc.children(recursive=True) + + # Kill children first (reverse order - deepest first) + for child in reversed(children): + try: + child.terminate() + except (psutil.NoSuchProcess, psutil.AccessDenied): + # Child already dead or we don't have permission - continue + pass + + # Wait briefly for children to exit + gone, alive = psutil.wait_procs(children, timeout=graceful_timeout) + killed_count += len(gone) + + # Force kill remaining children + for child in alive: + try: + child.kill() + killed_count += 1 + except (psutil.NoSuchProcess, psutil.AccessDenied): + # Child exited or we don't have permission - continue + pass + + # Now kill self + if self.terminate(graceful_timeout=graceful_timeout): + killed_count += 1 + + return killed_count + + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + # Process tree already dead + self.status = self.StatusChoices.EXITED + self.ended_at = self.ended_at or timezone.now() + self.save() + return killed_count + + def kill_children_db(self) -> int: + """ + Kill all DB-tracked child processes (via parent FK). + + Different from kill_tree() which uses OS children. + This kills processes created via Process.create(parent=self). + + Returns: + Number of child Process records killed + """ + killed = 0 + for child in self.children.filter(status=self.StatusChoices.RUNNING): + if child.terminate(): + killed += 1 + return killed + + # ========================================================================= + # Class methods for querying processes + # ========================================================================= + + @classmethod + def get_running(cls, process_type: str = None, machine: 'Machine' = None) -> 'QuerySet[Process]': + """ + Get all running processes, optionally filtered by type. + + Replaces: + - workers/pid_utils.py get_all_worker_pids() + - workers/orchestrator.py get_total_worker_count() + + Args: + process_type: Filter by TypeChoices (e.g., 'worker', 'hook') + machine: Filter by machine (defaults to current) + + Returns: + QuerySet of running Process records + """ + machine = machine or Machine.current() + qs = cls.objects.filter( + machine=machine, + status=cls.StatusChoices.RUNNING, + ) + if process_type: + qs = qs.filter(process_type=process_type) + return qs + + @classmethod + def get_running_count(cls, process_type: str = None, machine: 'Machine' = None) -> int: + """ + Get count of running processes. + + Replaces: + - workers/pid_utils.py get_running_worker_count() + """ + return cls.get_running(process_type=process_type, machine=machine).count() + + @classmethod + def stop_all(cls, process_type: str = None, machine: 'Machine' = None, graceful: bool = True) -> int: + """ + Stop all running processes of a given type. 
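
These query helpers are the replacements for the old `workers/pid_utils.py` file-scanning functions noted above; typical call sites would look roughly like this (the worker type is illustrative):

```python
# Old: get_all_worker_pids(...) / get_running_worker_count(...) reading .pid files
workers = Process.get_running(process_type=Process.TypeChoices.WORKER)
count = Process.get_running_count(process_type=Process.TypeChoices.WORKER)

# Old: stop_worker(pid, graceful=True) for every pid file found on disk
stopped = Process.stop_all(process_type=Process.TypeChoices.WORKER, graceful=True)
print(f'{stopped}/{count} workers stopped')
```
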
+ + Args: + process_type: Filter by TypeChoices + machine: Filter by machine + graceful: If True, use terminate() (SIGTERM→SIGKILL), else kill() + + Returns: + Number of processes stopped + """ + stopped = 0 + for proc in cls.get_running(process_type=process_type, machine=machine): + if graceful: + if proc.terminate(): + stopped += 1 + else: + if proc.kill(): + stopped += 1 + return stopped + + @classmethod + def get_next_worker_id(cls, process_type: str = 'worker', machine: 'Machine' = None) -> int: + """ + Get the next available worker ID for spawning new workers. + + Replaces workers/pid_utils.py get_next_worker_id(). + Simply returns count of running workers of this type. + + Args: + process_type: Worker type to count + machine: Machine to scope query + + Returns: + Next available worker ID (0-indexed) + """ + return cls.get_running_count(process_type=process_type, machine=machine) + # ============================================================================= # Binary State Machine diff --git a/archivebox/misc/process_utils.py b/archivebox/misc/process_utils.py index 9d3fe52d..15fa861e 100644 --- a/archivebox/misc/process_utils.py +++ b/archivebox/misc/process_utils.py @@ -70,15 +70,54 @@ def write_cmd_file(cmd_file: Path, cmd: list[str]): pass -def safe_kill_process(pid_file: Path, cmd_file: Optional[Path] = None, signal_num: int = 15) -> bool: - """Kill process after validation. Returns True if killed.""" +def safe_kill_process(pid_file: Path, cmd_file: Optional[Path] = None, signal_num: int = 15, timeout: float = 3.0) -> bool: + """ + Kill process after validation, with graceful wait and SIGKILL escalation. + + Returns True only if process is confirmed dead (either already dead or killed successfully). + """ + import time + import signal + if not validate_pid_file(pid_file, cmd_file): pid_file.unlink(missing_ok=True) # Clean stale file - return False + return True # Process already dead, consider it killed try: pid = int(pid_file.read_text().strip()) - os.kill(pid, signal_num) - return True - except (OSError, ValueError, ProcessLookupError): + + # Send initial signal (SIGTERM by default) + try: + os.kill(pid, signal_num) + except ProcessLookupError: + # Process already dead + return True + + # Wait for process to terminate gracefully + start_time = time.time() + while time.time() - start_time < timeout: + try: + os.kill(pid, 0) # Check if process still exists + time.sleep(0.1) + except ProcessLookupError: + # Process terminated + return True + + # Process didn't terminate, escalate to SIGKILL + try: + os.kill(pid, signal.SIGKILL) + time.sleep(0.5) # Brief wait after SIGKILL + # Verify it's dead + try: + os.kill(pid, 0) + # Process still alive after SIGKILL - this is unusual + return False + except ProcessLookupError: + # Process finally dead + return True + except ProcessLookupError: + # Process died between timeout and SIGKILL + return True + + except (OSError, ValueError): return False diff --git a/archivebox/plugins/captcha2/config.json b/archivebox/plugins/captcha2/config.json new file mode 100644 index 00000000..ba1a1383 --- /dev/null +++ b/archivebox/plugins/captcha2/config.json @@ -0,0 +1,21 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "additionalProperties": false, + "required_plugins": ["chrome"], + "properties": { + "CAPTCHA2_ENABLED": { + "type": "boolean", + "default": true, + "x-aliases": ["USE_CAPTCHA2"], + "description": "Enable Captcha2 browser extension for CAPTCHA solving" + }, + "CAPTCHA2_TIMEOUT": { + "type": "integer", + 
"default": 60, + "minimum": 5, + "x-fallback": "TIMEOUT", + "description": "Timeout for CAPTCHA solving in seconds" + } + } +} diff --git a/archivebox/plugins/captcha2/on_Crawl__01_captcha2.js b/archivebox/plugins/captcha2/on_Crawl__01_captcha2.js new file mode 100755 index 00000000..45fb8956 --- /dev/null +++ b/archivebox/plugins/captcha2/on_Crawl__01_captcha2.js @@ -0,0 +1,121 @@ +#!/usr/bin/env node +/** + * 2Captcha Extension Plugin + * + * Installs and configures the 2captcha Chrome extension for automatic + * CAPTCHA solving during page archiving. + * + * Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo + * Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer + * + * Priority: 01 (early) - Must install before Chrome session starts at Crawl level + * Hook: on_Crawl (runs once per crawl, not per snapshot) + * + * Requirements: + * - API_KEY_2CAPTCHA environment variable must be set + * - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc. + */ + +const path = require('path'); +const fs = require('fs'); + +// Import extension utilities +const extensionUtils = require('../chrome/chrome_utils.js'); + +// Extension metadata +const EXTENSION = { + webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', + name: 'captcha2', +}; + +// Get extensions directory from environment or use default +const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR || + path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions'); + +/** + * Install and configure the 2captcha extension + */ +async function installCaptchaExtension() { + console.log('[*] Installing 2captcha extension...'); + + // Install the extension + const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR); + + if (!extension) { + console.error('[❌] Failed to install 2captcha extension'); + return null; + } + + // Check if API key is configured + const apiKey = process.env.API_KEY_2CAPTCHA; + if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') { + console.warn('[⚠️] 2captcha extension installed but API_KEY_2CAPTCHA not configured'); + console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving'); + } else { + console.log('[+] 2captcha extension installed and API key configured'); + } + + return extension; +} + +/** + * Note: 2captcha configuration is now handled by chrome plugin + * during first-time browser setup to avoid repeated configuration on every snapshot. + * The API key is injected via chrome.storage API once per browser session. 
+ */ + +/** + * Main entry point - install extension before archiving + */ +async function main() { + // Check if extension is already cached + const cacheFile = path.join(EXTENSIONS_DIR, 'captcha2.extension.json'); + + if (fs.existsSync(cacheFile)) { + try { + const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8')); + const manifestPath = path.join(cached.unpacked_path, 'manifest.json'); + + if (fs.existsSync(manifestPath)) { + console.log('[*] 2captcha extension already installed (using cache)'); + return cached; + } + } catch (e) { + // Cache file corrupted, re-install + console.warn('[⚠️] Extension cache corrupted, re-installing...'); + } + } + + // Install extension + const extension = await installCaptchaExtension(); + + // Export extension metadata for chrome plugin to load + if (extension) { + // Write extension info to a cache file that chrome plugin can read + await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true }); + await fs.promises.writeFile( + cacheFile, + JSON.stringify(extension, null, 2) + ); + console.log(`[+] Extension metadata written to ${cacheFile}`); + } + + return extension; +} + +// Export functions for use by other plugins +module.exports = { + EXTENSION, + installCaptchaExtension, +}; + +// Run if executed directly +if (require.main === module) { + main().then(() => { + console.log('[✓] 2captcha extension setup complete'); + process.exit(0); + }).catch(err => { + console.error('[❌] 2captcha extension setup failed:', err); + process.exit(1); + }); +} diff --git a/archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js b/archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js new file mode 100755 index 00000000..cf528a1b --- /dev/null +++ b/archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js @@ -0,0 +1,279 @@ +#!/usr/bin/env node +/** + * 2Captcha Extension Configuration + * + * Configures the 2captcha extension with API key after Crawl-level Chrome session starts. + * Runs once per crawl to inject API key into extension storage. 
+ * + * Priority: 11 (after chrome_launch at 20) + * Hook: on_Crawl (runs once per crawl, not per snapshot) + * + * Requirements: + * - API_KEY_2CAPTCHA environment variable must be set + * - chrome plugin must have loaded extensions (extensions.json must exist) + */ + +const path = require('path'); +const fs = require('fs'); +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR); +const puppeteer = require('puppeteer-core'); + +// Get crawl's chrome directory from environment variable set by hooks.py +function getCrawlChromeSessionDir() { + const crawlOutputDir = process.env.CRAWL_OUTPUT_DIR || ''; + if (!crawlOutputDir) { + return null; + } + return path.join(crawlOutputDir, 'chrome'); +} + +const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome'; +const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.captcha2_configured'); + +// Get environment variable with default +function getEnv(name, defaultValue = '') { + return (process.env[name] || defaultValue).trim(); +} + +// Parse command line arguments +function parseArgs() { + const args = {}; + process.argv.slice(2).forEach(arg => { + if (arg.startsWith('--')) { + const [key, ...valueParts] = arg.slice(2).split('='); + args[key.replace(/-/g, '_')] = valueParts.join('=') || true; + } + }); + return args; +} + +async function configure2Captcha() { + // Check if already configured in this session + if (fs.existsSync(CONFIG_MARKER)) { + console.error('[*] 2captcha already configured in this browser session'); + return { success: true, skipped: true }; + } + + // Check if API key is set + const apiKey = getEnv('API_KEY_2CAPTCHA'); + if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') { + console.warn('[⚠️] 2captcha extension loaded but API_KEY_2CAPTCHA not configured'); + console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving'); + return { success: false, error: 'API_KEY_2CAPTCHA not configured' }; + } + + // Load extensions metadata + const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json'); + if (!fs.existsSync(extensionsFile)) { + return { success: false, error: 'extensions.json not found - chrome plugin must run first' }; + } + + const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8')); + const captchaExt = extensions.find(ext => ext.name === 'captcha2'); + + if (!captchaExt) { + console.error('[*] 2captcha extension not installed, skipping configuration'); + return { success: true, skipped: true }; + } + + console.error('[*] Configuring 2captcha extension with API key...'); + + try { + // Connect to the existing Chrome session via CDP + const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt'); + if (!fs.existsSync(cdpFile)) { + return { success: false, error: 'CDP URL not found - chrome plugin must run first' }; + } + + const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim(); + const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl }); + + try { + // Method 1: Try to inject via extension background page + if (captchaExt.target && captchaExt.target_ctx) { + console.error('[*] Attempting to configure via extension background page...'); + + // Reconnect to the browser to get fresh target context + const targets = await browser.targets(); + const extTarget = targets.find(t => + t.url().startsWith(`chrome-extension://${captchaExt.id}`) + ); + + if (extTarget) { + const extContext = await extTarget.worker() || await extTarget.page(); + + if (extContext) { + 
await extContext.evaluate((key) => { + // Try all common storage patterns + if (typeof chrome !== 'undefined' && chrome.storage) { + chrome.storage.local.set({ + apiKey: key, + api_key: key, + '2captcha_apikey': key, + apikey: key, + 'solver-api-key': key, + }); + chrome.storage.sync.set({ + apiKey: key, + api_key: key, + '2captcha_apikey': key, + apikey: key, + 'solver-api-key': key, + }); + } + + // Also try localStorage as fallback + if (typeof localStorage !== 'undefined') { + localStorage.setItem('apiKey', key); + localStorage.setItem('2captcha_apikey', key); + localStorage.setItem('solver-api-key', key); + } + }, apiKey); + + console.error('[+] 2captcha API key configured successfully via background page'); + + // Mark as configured + fs.writeFileSync(CONFIG_MARKER, new Date().toISOString()); + + return { success: true, method: 'background_page' }; + } + } + } + + // Method 2: Try to configure via options page + console.error('[*] Attempting to configure via options page...'); + const optionsUrl = `chrome-extension://${captchaExt.id}/options.html`; + const configPage = await browser.newPage(); + + try { + await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 }); + + const configured = await configPage.evaluate((key) => { + // Try to find API key input field + const selectors = [ + 'input[name*="apikey" i]', + 'input[id*="apikey" i]', + 'input[name*="api-key" i]', + 'input[id*="api-key" i]', + 'input[name*="key" i]', + 'input[placeholder*="api" i]', + 'input[type="text"]', + ]; + + for (const selector of selectors) { + const input = document.querySelector(selector); + if (input) { + input.value = key; + input.dispatchEvent(new Event('input', { bubbles: true })); + input.dispatchEvent(new Event('change', { bubbles: true })); + + // Try to find and click save button + const saveSelectors = [ + 'button[type="submit"]', + 'input[type="submit"]', + 'button:contains("Save")', + 'button:contains("Apply")', + ]; + + for (const btnSel of saveSelectors) { + const btn = document.querySelector(btnSel); + if (btn) { + btn.click(); + break; + } + } + + // Also save to storage + if (typeof chrome !== 'undefined' && chrome.storage) { + chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key }); + chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key }); + } + + return true; + } + } + + // Fallback: Just save to storage + if (typeof chrome !== 'undefined' && chrome.storage) { + chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key }); + chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key }); + return true; + } + + return false; + }, apiKey); + + await configPage.close(); + + if (configured) { + console.error('[+] 2captcha API key configured successfully via options page'); + + // Mark as configured + fs.writeFileSync(CONFIG_MARKER, new Date().toISOString()); + + return { success: true, method: 'options_page' }; + } + } catch (e) { + console.warn(`[⚠️] Failed to configure via options page: ${e.message}`); + try { + await configPage.close(); + } catch (e2) {} + } + + return { success: false, error: 'Could not configure via any method' }; + } finally { + browser.disconnect(); + } + } catch (e) { + return { success: false, error: `${e.name}: ${e.message}` }; + } +} + +async function main() { + const args = parseArgs(); + const url = args.url; + const snapshotId = args.snapshot_id; + + if (!url || !snapshotId) { + console.error('Usage: on_Snapshot__21_captcha2_config.js --url= --snapshot-id='); 
+ process.exit(1); + } + + const startTs = new Date(); + let status = 'failed'; + let error = ''; + + try { + const result = await configure2Captcha(); + + if (result.skipped) { + status = 'skipped'; + } else if (result.success) { + status = 'succeeded'; + } else { + status = 'failed'; + error = result.error || 'Configuration failed'; + } + } catch (e) { + error = `${e.name}: ${e.message}`; + status = 'failed'; + } + + const endTs = new Date(); + const duration = (endTs - startTs) / 1000; + + if (error) { + console.error(`ERROR: ${error}`); + } + + // Config hooks don't emit JSONL - they're utility hooks for setup + // Exit code indicates success/failure + + process.exit(status === 'succeeded' || status === 'skipped' ? 0 : 1); +} + +main().catch(e => { + console.error(`Fatal error: ${e.message}`); + process.exit(1); +}); diff --git a/archivebox/plugins/captcha2/templates/icon.html b/archivebox/plugins/captcha2/templates/icon.html new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/plugins/captcha2/tests/test_captcha2.py b/archivebox/plugins/captcha2/tests/test_captcha2.py new file mode 100644 index 00000000..bc08a072 --- /dev/null +++ b/archivebox/plugins/captcha2/tests/test_captcha2.py @@ -0,0 +1,184 @@ +""" +Unit tests for captcha2 plugin + +Tests invoke the plugin hooks as external processes and verify outputs/side effects. +""" + +import json +import os +import subprocess +import tempfile +from pathlib import Path + +import pytest + + +PLUGIN_DIR = Path(__file__).parent.parent +INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2.*'), None) +CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2_config.*'), None) + + +def test_install_script_exists(): + """Verify install script exists""" + assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}" + + +def test_config_script_exists(): + """Verify config script exists""" + assert CONFIG_SCRIPT.exists(), f"Config script not found: {CONFIG_SCRIPT}" + + +def test_extension_metadata(): + """Test that captcha2 extension has correct metadata""" + with tempfile.TemporaryDirectory() as tmpdir: + env = os.environ.copy() + env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions") + + # Just check the script can be loaded + result = subprocess.run( + ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"], + capture_output=True, + text=True, + env=env + ) + + assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}" + + metadata = json.loads(result.stdout) + assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo" + assert metadata["name"] == "captcha2" + + +def test_install_creates_cache(): + """Test that install creates extension cache""" + with tempfile.TemporaryDirectory() as tmpdir: + ext_dir = Path(tmpdir) / "chrome_extensions" + ext_dir.mkdir(parents=True) + + env = os.environ.copy() + env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) + env["API_KEY_2CAPTCHA"] = "test_api_key" + + # Run install script + result = subprocess.run( + ["node", str(INSTALL_SCRIPT)], + capture_output=True, + text=True, + env=env, + timeout=60 + ) + + # Check output mentions installation + assert "[*] Installing 2captcha extension" in result.stdout or "[*] 2captcha extension already installed" in result.stdout + + # Check cache file was created + cache_file = ext_dir / "captcha2.extension.json" + assert cache_file.exists(), "Cache file should be created" + + # Verify cache content + cache_data = 
json.loads(cache_file.read_text()) + assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo" + assert cache_data["name"] == "captcha2" + assert "unpacked_path" in cache_data + assert "version" in cache_data + + +def test_install_twice_uses_cache(): + """Test that running install twice uses existing cache on second run""" + with tempfile.TemporaryDirectory() as tmpdir: + ext_dir = Path(tmpdir) / "chrome_extensions" + ext_dir.mkdir(parents=True) + + env = os.environ.copy() + env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) + env["API_KEY_2CAPTCHA"] = "test_api_key" + + # First install - downloads the extension + result1 = subprocess.run( + ["node", str(INSTALL_SCRIPT)], + capture_output=True, + text=True, + env=env, + timeout=60 + ) + assert result1.returncode == 0, f"First install failed: {result1.stderr}" + + # Verify cache was created + cache_file = ext_dir / "captcha2.extension.json" + assert cache_file.exists(), "Cache file should exist after first install" + + # Second install - should use cache + result2 = subprocess.run( + ["node", str(INSTALL_SCRIPT)], + capture_output=True, + text=True, + env=env, + timeout=30 + ) + assert result2.returncode == 0, f"Second install failed: {result2.stderr}" + + # Second run should mention cache reuse + assert "already installed" in result2.stdout or "cache" in result2.stdout.lower() or result2.returncode == 0 + + +def test_install_warns_without_api_key(): + """Test that install warns when API key not configured""" + with tempfile.TemporaryDirectory() as tmpdir: + ext_dir = Path(tmpdir) / "chrome_extensions" + ext_dir.mkdir(parents=True) + + env = os.environ.copy() + env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) + # Don't set API_KEY_2CAPTCHA + + # Run install script + result = subprocess.run( + ["node", str(INSTALL_SCRIPT)], + capture_output=True, + text=True, + env=env, + timeout=60 + ) + + # Should warn about missing API key + combined_output = result.stdout + result.stderr + assert "API_KEY_2CAPTCHA not configured" in combined_output or "Set API_KEY_2CAPTCHA" in combined_output + + +def test_install_success_with_api_key(): + """Test that install succeeds when API key is configured""" + with tempfile.TemporaryDirectory() as tmpdir: + ext_dir = Path(tmpdir) / "chrome_extensions" + ext_dir.mkdir(parents=True) + + env = os.environ.copy() + env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) + env["API_KEY_2CAPTCHA"] = "test_valid_api_key_123" + + # Run install script + result = subprocess.run( + ["node", str(INSTALL_SCRIPT)], + capture_output=True, + text=True, + env=env, + timeout=60 + ) + + # Should mention API key configured + combined_output = result.stdout + result.stderr + assert "API key configured" in combined_output or "API_KEY_2CAPTCHA" in combined_output + + +def test_config_script_structure(): + """Test that config script has proper structure""" + # Verify the script exists and contains expected markers + script_content = CONFIG_SCRIPT.read_text() + + # Should mention configuration marker file + assert "CONFIG_MARKER" in script_content or "captcha2_configured" in script_content + + # Should mention API key + assert "API_KEY_2CAPTCHA" in script_content + + # Should have main function or be executable + assert "async function" in script_content or "main" in script_content diff --git a/archivebox/plugins/chrome/on_Crawl__00_chrome_install.py b/archivebox/plugins/chrome/on_Crawl__00_chrome_install.py new file mode 100644 index 00000000..4c6bbbdd --- /dev/null +++ b/archivebox/plugins/chrome/on_Crawl__00_chrome_install.py @@ -0,0 +1,184 @@ 
+#!/usr/bin/env python3 +""" +Install hook for Chrome/Chromium and puppeteer-core. + +Runs at crawl start to install/find Chromium and puppeteer-core. +Outputs JSONL for Binary and Machine config updates. +Respects CHROME_BINARY env var for custom binary paths. +Uses `npx @puppeteer/browsers install chromium@latest` and parses output. + +NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for +--load-extension and --disable-extensions-except flags, which are needed for +loading unpacked extensions in headless mode. +""" + +import os +import sys +import json +import subprocess +from pathlib import Path + + +def get_chrome_version(binary_path: str) -> str | None: + """Get Chrome/Chromium version string.""" + try: + result = subprocess.run( + [binary_path, '--version'], + capture_output=True, + text=True, + timeout=5 + ) + if result.returncode == 0: + return result.stdout.strip() + except Exception: + pass + return None + + +def install_puppeteer_core() -> bool: + """Install puppeteer-core to NODE_MODULES_DIR if not present.""" + node_modules_dir = os.environ.get('NODE_MODULES_DIR', '').strip() + if not node_modules_dir: + # No isolated node_modules, skip (will use global) + return True + + node_modules_path = Path(node_modules_dir) + if (node_modules_path / 'puppeteer-core').exists(): + return True + + # Get npm prefix from NODE_MODULES_DIR (parent of node_modules) + npm_prefix = node_modules_path.parent + + try: + print(f"[*] Installing puppeteer-core to {npm_prefix}...", file=sys.stderr) + result = subprocess.run( + ['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core', '@puppeteer/browsers'], + capture_output=True, + text=True, + timeout=60 + ) + if result.returncode == 0: + print(f"[+] puppeteer-core installed", file=sys.stderr) + return True + else: + print(f"[!] Failed to install puppeteer-core: {result.stderr}", file=sys.stderr) + return False + except Exception as e: + print(f"[!] Failed to install puppeteer-core: {e}", file=sys.stderr) + return False + + +def install_chromium() -> dict | None: + """Install Chromium using @puppeteer/browsers and parse output for binary path. + + Output format: "chromium@ " + e.g.: "chromium@1563294 /Users/x/.cache/puppeteer/chromium/.../Chromium" + + Note: npx is fast when chromium is already cached - it returns the path without re-downloading. + """ + try: + print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr) + + # Use --path to install to puppeteer's standard cache location + cache_path = os.path.expanduser('~/.cache/puppeteer') + + result = subprocess.run( + ['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}'], + capture_output=True, + text=True, + stdin=subprocess.DEVNULL, + timeout=300 + ) + + if result.returncode != 0: + print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr) + return None + + # Parse output: "chromium@1563294 /path/to/Chromium" + output = result.stdout.strip() + parts = output.split(' ', 1) + if len(parts) != 2: + print(f"[!] Failed to parse install output: {output}", file=sys.stderr) + return None + + version_str = parts[0] # "chromium@1563294" + binary_path = parts[1].strip() + + if not binary_path or not os.path.exists(binary_path): + print(f"[!] 
Binary not found at: {binary_path}", file=sys.stderr) + return None + + # Extract version number + version = version_str.split('@')[1] if '@' in version_str else None + + print(f"[+] Chromium installed: {binary_path}", file=sys.stderr) + + return { + 'name': 'chromium', + 'abspath': binary_path, + 'version': version, + 'binprovider': 'puppeteer', + } + + except subprocess.TimeoutExpired: + print("[!] Chromium install timed out", file=sys.stderr) + except FileNotFoundError: + print("[!] npx not found - is Node.js installed?", file=sys.stderr) + except Exception as e: + print(f"[!] Failed to install Chromium: {e}", file=sys.stderr) + + return None + + +def main(): + # Install puppeteer-core if NODE_MODULES_DIR is set + install_puppeteer_core() + + # Check if CHROME_BINARY is already set and valid + configured_binary = os.environ.get('CHROME_BINARY', '').strip() + if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK): + version = get_chrome_version(configured_binary) + print(json.dumps({ + 'type': 'Binary', + 'name': 'chromium', + 'abspath': configured_binary, + 'version': version, + 'binprovider': 'env', + })) + sys.exit(0) + + # Install/find Chromium via puppeteer + result = install_chromium() + + if result and result.get('abspath'): + print(json.dumps({ + 'type': 'Binary', + 'name': result['name'], + 'abspath': result['abspath'], + 'version': result['version'], + 'binprovider': result['binprovider'], + })) + + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/CHROME_BINARY', + 'value': result['abspath'], + })) + + if result['version']: + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/CHROMIUM_VERSION', + 'value': result['version'], + })) + + sys.exit(0) + else: + print("Chromium binary not found", file=sys.stderr) + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/chrome/on_Crawl__10_chrome_validate_config.py b/archivebox/plugins/chrome/on_Crawl__10_chrome_validate_config.py new file mode 100644 index 00000000..7aa8639c --- /dev/null +++ b/archivebox/plugins/chrome/on_Crawl__10_chrome_validate_config.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +""" +Validate and compute derived Chrome config values. + +This hook runs early in the Crawl lifecycle to: +1. Auto-detect Chrome binary location +2. Compute sandbox settings based on Docker detection +3. Validate binary availability and version +4. 
Set computed env vars for subsequent hooks + +Output: + - COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env + - Binary JSONL records to stdout when binaries are found +""" + +import json +import os +import sys + +from abx_pkg import Binary, EnvProvider + + +# Chrome binary search order +CHROME_BINARY_NAMES = [ + 'chromium', + 'chromium-browser', + 'google-chrome', + 'google-chrome-stable', + 'chrome', +] + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def detect_docker() -> bool: + """Detect if running inside Docker container.""" + return ( + os.path.exists('/.dockerenv') or + os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes') or + os.path.exists('/run/.containerenv') + ) + + +def find_chrome_binary(configured: str, provider: EnvProvider) -> Binary | None: + """Find Chrome binary using abx-pkg, checking configured path first.""" + # Try configured binary first + if configured: + try: + binary = Binary(name=configured, binproviders=[provider]).load() + if binary.abspath: + return binary + except Exception: + pass + + # Search common names + for name in CHROME_BINARY_NAMES: + try: + binary = Binary(name=name, binproviders=[provider]).load() + if binary.abspath: + return binary + except Exception: + continue + + return None + + +def output_binary(binary: Binary, name: str): + """Output Binary JSONL record to stdout.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'abspath': str(binary.abspath), + 'version': str(binary.version) if binary.version else '', + 'sha256': binary.sha256 or '', + 'binprovider': 'env', + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def main(): + warnings = [] + errors = [] + computed = {} + + # Get config values + chrome_binary = get_env('CHROME_BINARY', 'chromium') + chrome_sandbox = get_env_bool('CHROME_SANDBOX', True) + screenshot_enabled = get_env_bool('SCREENSHOT_ENABLED', True) + pdf_enabled = get_env_bool('PDF_ENABLED', True) + dom_enabled = get_env_bool('DOM_ENABLED', True) + + # Compute USE_CHROME (derived from extractor enabled flags) + use_chrome = screenshot_enabled or pdf_enabled or dom_enabled + computed['USE_CHROME'] = str(use_chrome).lower() + + # Detect Docker and adjust sandbox + in_docker = detect_docker() + computed['IN_DOCKER'] = str(in_docker).lower() + + if in_docker and chrome_sandbox: + warnings.append( + "Running in Docker with CHROME_SANDBOX=true. " + "Chrome may fail to start. Consider setting CHROME_SANDBOX=false." + ) + # Auto-disable sandbox in Docker unless explicitly set + if not get_env('CHROME_SANDBOX'): + computed['CHROME_SANDBOX'] = 'false' + + # Find Chrome binary using abx-pkg + provider = EnvProvider() + if use_chrome: + chrome = find_chrome_binary(chrome_binary, provider) + if not chrome or not chrome.abspath: + errors.append( + f"Chrome binary not found (tried: {chrome_binary}). " + "Install Chrome/Chromium or set CHROME_BINARY path." 
+ ) + computed['CHROME_BINARY'] = '' + else: + computed['CHROME_BINARY'] = str(chrome.abspath) + computed['CHROME_VERSION'] = str(chrome.version) if chrome.version else 'unknown' + + # Output Binary JSONL record for Chrome + output_binary(chrome, name='chrome') + + # Check Node.js for Puppeteer + node_binary_name = get_env('NODE_BINARY', 'node') + try: + node = Binary(name=node_binary_name, binproviders=[provider]).load() + node_path = str(node.abspath) if node.abspath else '' + except Exception: + node = None + node_path = '' + + if use_chrome and not node_path: + errors.append( + f"Node.js not found (tried: {node_binary_name}). " + "Install Node.js or set NODE_BINARY path for Puppeteer." + ) + else: + computed['NODE_BINARY'] = node_path + if node and node.abspath: + # Output Binary JSONL record for Node + output_binary(node, name='node') + + # Output computed values + for key, value in computed.items(): + print(f"COMPUTED:{key}={value}") + + for warning in warnings: + print(f"WARNING:{warning}", file=sys.stderr) + + for error in errors: + print(f"ERROR:{error}", file=sys.stderr) + + sys.exit(1 if errors else 0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js new file mode 100644 index 00000000..c2d62775 --- /dev/null +++ b/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js @@ -0,0 +1,245 @@ +#!/usr/bin/env node +/** + * Launch a shared Chromium browser session for the entire crawl. + * + * This runs once per crawl and keeps Chromium alive for all snapshots to share. + * Each snapshot creates its own tab via on_Snapshot__20_chrome_tab.bg.js. + * + * NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for + * --load-extension and --disable-extensions-except flags. 
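+ *
+ * The hook stays resident for the whole crawl (see the setInterval keep-alive at the end of
+ * main()) and tears the session down on SIGTERM/SIGINT via cleanup(): a graceful
+ * browser.close() first, then killChrome() on the recorded PID.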
+ * + * Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id= --source-url= + * Output: Creates chrome/ directory under crawl output dir with: + * - cdp_url.txt: WebSocket URL for CDP connection + * - chrome.pid: Chromium process ID (for cleanup) + * - port.txt: Debug port number + * - extensions.json: Loaded extensions metadata + * + * Environment variables: + * NODE_MODULES_DIR: Path to node_modules directory for module resolution + * CHROME_BINARY: Path to Chromium binary (falls back to auto-detection) + * CHROME_RESOLUTION: Page resolution (default: 1440,2000) + * CHROME_HEADLESS: Run in headless mode (default: true) + * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true) + * CHROME_EXTENSIONS_DIR: Directory containing Chrome extensions + */ + +// Add NODE_MODULES_DIR to module resolution paths if set +if (process.env.NODE_MODULES_DIR) { + module.paths.unshift(process.env.NODE_MODULES_DIR); +} + +const fs = require('fs'); +const path = require('path'); +const puppeteer = require('puppeteer-core'); +const { + findChromium, + launchChromium, + killChrome, + getEnv, + writePidWithMtime, +} = require('./chrome_utils.js'); + +// Extractor metadata +const PLUGIN_NAME = 'chrome_launch'; +const OUTPUT_DIR = 'chrome'; + +// Global state for cleanup +let chromePid = null; +let browserInstance = null; + +// Parse command line arguments +function parseArgs() { + const args = {}; + process.argv.slice(2).forEach((arg) => { + if (arg.startsWith('--')) { + const [key, ...valueParts] = arg.slice(2).split('='); + args[key.replace(/-/g, '_')] = valueParts.join('=') || true; + } + }); + return args; +} + +// Cleanup handler for SIGTERM +async function cleanup() { + console.error('[*] Cleaning up Chrome session...'); + + // Try graceful browser close first + if (browserInstance) { + try { + console.error('[*] Closing browser gracefully...'); + await browserInstance.close(); + browserInstance = null; + console.error('[+] Browser closed gracefully'); + } catch (e) { + console.error(`[!] 
Graceful close failed: ${e.message}`); + } + } + + // Kill Chrome process + if (chromePid) { + await killChrome(chromePid, OUTPUT_DIR); + } + + process.exit(0); +} + +// Register signal handlers +process.on('SIGTERM', cleanup); +process.on('SIGINT', cleanup); + +async function main() { + const args = parseArgs(); + const crawlId = args.crawl_id; + + try { + const binary = findChromium(); + if (!binary) { + console.error('ERROR: Chromium binary not found'); + console.error('DEPENDENCY_NEEDED=chromium'); + console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew'); + console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest'); + process.exit(1); + } + + // Get Chromium version + let version = ''; + try { + const { execSync } = require('child_process'); + version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }) + .trim() + .slice(0, 64); + } catch (e) {} + + console.error(`[*] Using browser: ${binary}`); + if (version) console.error(`[*] Version: ${version}`); + + // Load installed extensions + const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') || + path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions'); + + const installedExtensions = []; + const extensionPaths = []; + if (fs.existsSync(extensionsDir)) { + const files = fs.readdirSync(extensionsDir); + for (const file of files) { + if (file.endsWith('.extension.json')) { + try { + const extPath = path.join(extensionsDir, file); + const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8')); + if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) { + installedExtensions.push(extData); + extensionPaths.push(extData.unpacked_path); + console.error(`[*] Loading extension: ${extData.name || file}`); + } + } catch (e) { + console.warn(`[!] 
Skipping invalid extension cache: ${file}`); + } + } + } + } + + if (installedExtensions.length > 0) { + console.error(`[+] Found ${installedExtensions.length} extension(s) to load`); + } + + // Write hook's own PID + const hookStartTime = Date.now() / 1000; + if (!fs.existsSync(OUTPUT_DIR)) { + fs.mkdirSync(OUTPUT_DIR, { recursive: true }); + } + writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime); + + // Launch Chromium using consolidated function + const result = await launchChromium({ + binary, + outputDir: OUTPUT_DIR, + extensionPaths, + }); + + if (!result.success) { + console.error(`ERROR: ${result.error}`); + process.exit(1); + } + + chromePid = result.pid; + const cdpUrl = result.cdpUrl; + + // Write extensions metadata + if (installedExtensions.length > 0) { + fs.writeFileSync( + path.join(OUTPUT_DIR, 'extensions.json'), + JSON.stringify(installedExtensions, null, 2) + ); + } + + // Connect puppeteer for extension verification + console.error(`[*] Connecting puppeteer to CDP...`); + const browser = await puppeteer.connect({ + browserWSEndpoint: cdpUrl, + defaultViewport: null, + }); + browserInstance = browser; + + // Verify extensions loaded + if (extensionPaths.length > 0) { + await new Promise(r => setTimeout(r, 3000)); + + const targets = browser.targets(); + console.error(`[*] All browser targets (${targets.length}):`); + for (const t of targets) { + console.error(` - ${t.type()}: ${t.url().slice(0, 80)}`); + } + + const extTargets = targets.filter(t => + t.url().startsWith('chrome-extension://') || + t.type() === 'service_worker' || + t.type() === 'background_page' + ); + + // Filter out built-in extensions + const builtinIds = [ + 'nkeimhogjdpnpccoofpliimaahmaaome', + 'fignfifoniblkonapihmkfakmlgkbkcf', + 'ahfgeienlihckogmohjhadlkjgocpleb', + 'mhjfbmdgcfjbbpaeojofohoefgiehjai', + ]; + const customExtTargets = extTargets.filter(t => { + const url = t.url(); + if (!url.startsWith('chrome-extension://')) return false; + const extId = url.split('://')[1].split('/')[0]; + return !builtinIds.includes(extId); + }); + + console.error(`[+] Found ${customExtTargets.length} custom extension target(s)`); + + for (const target of customExtTargets) { + const url = target.url(); + const extId = url.split('://')[1].split('/')[0]; + console.error(`[+] Extension loaded: ${extId} (${target.type()})`); + } + + if (customExtTargets.length === 0 && extensionPaths.length > 0) { + console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`); + console.error(`[!] 
Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`); + } + } + + console.error(`[+] Chromium session started for crawl ${crawlId}`); + console.error(`[+] CDP URL: ${cdpUrl}`); + console.error(`[+] PID: ${chromePid}`); + + // Stay alive to handle cleanup on SIGTERM + console.log('[*] Chromium launch hook staying alive to handle cleanup...'); + setInterval(() => {}, 1000000); + + } catch (e) { + console.error(`ERROR: ${e.name}: ${e.message}`); + process.exit(1); + } +} + +main().catch((e) => { + console.error(`Fatal error: ${e.message}`); + process.exit(1); +}); diff --git a/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js new file mode 100755 index 00000000..f2df6629 --- /dev/null +++ b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js @@ -0,0 +1,115 @@ +#!/usr/bin/env node +/** + * I Still Don't Care About Cookies Extension Plugin + * + * Installs and configures the "I still don't care about cookies" Chrome extension + * for automatic cookie consent banner dismissal during page archiving. + * + * Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm + * + * Priority: 02 (early) - Must install before Chrome session starts at Crawl level + * Hook: on_Crawl (runs once per crawl, not per snapshot) + * + * This extension automatically: + * - Dismisses cookie consent popups + * - Removes cookie banners + * - Accepts necessary cookies to proceed with browsing + * - Works on thousands of websites out of the box + */ + +const path = require('path'); +const fs = require('fs'); + +// Import extension utilities +const extensionUtils = require('../chrome/chrome_utils.js'); + +// Extension metadata +const EXTENSION = { + webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm', + name: 'istilldontcareaboutcookies', +}; + +// Get extensions directory from environment or use default +const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR || + path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions'); + +/** + * Install the I Still Don't Care About Cookies extension + */ +async function installCookiesExtension() { + console.log('[*] Installing I Still Don\'t Care About Cookies extension...'); + + // Install the extension + const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR); + + if (!extension) { + console.error('[❌] Failed to install I Still Don\'t Care About Cookies extension'); + return null; + } + + console.log('[+] I Still Don\'t Care About Cookies extension installed'); + console.log('[+] Cookie banners will be automatically dismissed during archiving'); + + return extension; +} + +/** + * Note: This extension works out of the box with no configuration needed. + * It automatically detects and dismisses cookie banners on page load. 
+ */ + +/** + * Main entry point - install extension before archiving + */ +async function main() { + // Check if extension is already cached + const cacheFile = path.join(EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json'); + + if (fs.existsSync(cacheFile)) { + try { + const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8')); + const manifestPath = path.join(cached.unpacked_path, 'manifest.json'); + + if (fs.existsSync(manifestPath)) { + console.log('[*] I Still Don\'t Care About Cookies extension already installed (using cache)'); + return cached; + } + } catch (e) { + // Cache file corrupted, re-install + console.warn('[⚠️] Extension cache corrupted, re-installing...'); + } + } + + // Install extension + const extension = await installCookiesExtension(); + + // Export extension metadata for chrome plugin to load + if (extension) { + // Write extension info to a cache file that chrome plugin can read + await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true }); + await fs.promises.writeFile( + cacheFile, + JSON.stringify(extension, null, 2) + ); + console.log(`[+] Extension metadata written to ${cacheFile}`); + } + + return extension; +} + +// Export functions for use by other plugins +module.exports = { + EXTENSION, + installCookiesExtension, +}; + +// Run if executed directly +if (require.main === module) { + main().then(() => { + console.log('[✓] I Still Don\'t Care About Cookies extension setup complete'); + process.exit(0); + }).catch(err => { + console.error('[❌] I Still Don\'t Care About Cookies extension setup failed:', err); + process.exit(1); + }); +} diff --git a/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js b/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js new file mode 100755 index 00000000..7637bf98 --- /dev/null +++ b/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js @@ -0,0 +1,268 @@ +#!/usr/bin/env node +/** + * SingleFile Extension Plugin + * + * Installs and uses the SingleFile Chrome extension for archiving complete web pages. + * Falls back to single-file-cli if the extension is not available. 
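+ * saveSinglefileWithExtension() triggers the extension's toolbar action on the foreground tab
+ * and moves the matching download out of CHROME_DOWNLOADS_DIR to singlefile.html;
+ * saveSinglefileWithCLI() shells out to the single-file binary as a fallback.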
+ * + * Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle + * + * Priority: 04 (early) - Must install before Chrome session starts at Crawl level + * Hook: on_Crawl (runs once per crawl, not per snapshot) + * + * This extension automatically: + * - Saves complete web pages as single HTML files + * - Inlines all resources (CSS, JS, images, fonts) + * - Preserves page fidelity better than wget/curl + * - Works with SPAs and dynamically loaded content + */ + +const path = require('path'); +const fs = require('fs'); +const { promisify } = require('util'); +const { exec } = require('child_process'); + +const execAsync = promisify(exec); + +// Import extension utilities +const extensionUtils = require('../chrome/chrome_utils.js'); + +// Extension metadata +const EXTENSION = { + webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle', + name: 'singlefile', +}; + +// Get extensions directory from environment or use default +const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR || + path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions'); + +const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR || + path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads'); + +const OUTPUT_DIR = '.'; +const OUTPUT_FILE = 'singlefile.html'; + +/** + * Install the SingleFile extension + */ +async function installSinglefileExtension() { + console.log('[*] Installing SingleFile extension...'); + + // Install the extension + const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR); + + if (!extension) { + console.error('[❌] Failed to install SingleFile extension'); + return null; + } + + console.log('[+] SingleFile extension installed'); + console.log('[+] Web pages will be saved as single HTML files'); + + return extension; +} + +/** + * Wait for a specified amount of time + */ +function wait(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); +} + +/** + * Save a page using the SingleFile extension + * + * @param {Object} page - Puppeteer page object + * @param {Object} extension - Extension metadata with dispatchAction method + * @param {Object} options - Additional options + * @returns {Promise} - Path to saved file or null on failure + */ +async function saveSinglefileWithExtension(page, extension, options = {}) { + if (!extension || !extension.version) { + throw new Error('SingleFile extension not found or not loaded'); + } + + const url = await page.url(); + + // Check for unsupported URL schemes + const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob']; + const scheme = url.split(':')[0]; + if (URL_SCHEMES_IGNORED.includes(scheme)) { + console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`); + return null; + } + + // Ensure downloads directory exists + await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true }); + + // Get list of existing files to ignore + const files_before = new Set( + (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)) + .filter(fn => fn.endsWith('.html')) + ); + + // Output directory is current directory (hook already runs in output dir) + const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE); + + console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`); + + // Bring page to front (extension action button acts on foreground tab) + await page.bringToFront(); + + // Trigger the extension's action (toolbar button 
click) + await extension.dispatchAction(); + + // Wait for file to appear in downloads directory + const check_delay = 3000; // 3 seconds + const max_tries = 10; + let files_new = []; + + for (let attempt = 0; attempt < max_tries; attempt++) { + await wait(check_delay); + + const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)) + .filter(fn => fn.endsWith('.html')); + + files_new = files_after.filter(file => !files_before.has(file)); + + if (files_new.length === 0) { + continue; + } + + // Find the matching file by checking if it contains the URL in the HTML header + for (const file of files_new) { + const dl_path = path.join(CHROME_DOWNLOADS_DIR, file); + const dl_text = await fs.promises.readFile(dl_path, 'utf-8'); + const dl_header = dl_text.split('meta charset')[0]; + + if (dl_header.includes(`url: ${url}`)) { + console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`); + await fs.promises.rename(dl_path, out_path); + return out_path; + } + } + } + + console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`); + console.warn(`[⚠️] New files found: ${files_new.join(', ')}`); + return null; +} + +/** + * Save a page using single-file-cli (fallback method) + * + * @param {string} url - URL to archive + * @param {Object} options - Additional options + * @returns {Promise} - Path to saved file or null on failure + */ +async function saveSinglefileWithCLI(url, options = {}) { + console.log('[*] Falling back to single-file-cli...'); + + // Find single-file binary + let binary = null; + try { + const { stdout } = await execAsync('which single-file'); + binary = stdout.trim(); + } catch (err) { + console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli'); + return null; + } + + // Output directory is current directory (hook already runs in output dir) + const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE); + + // Build command + const cmd = [ + binary, + '--browser-headless', + url, + out_path, + ]; + + // Add optional args + if (options.userAgent) { + cmd.splice(2, 0, '--browser-user-agent', options.userAgent); + } + if (options.cookiesFile && fs.existsSync(options.cookiesFile)) { + cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile); + } + if (options.ignoreSSL) { + cmd.splice(2, 0, '--browser-ignore-insecure-certs'); + } + + // Execute + try { + const timeout = options.timeout || 120000; + await execAsync(cmd.join(' '), { timeout }); + + if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) { + console.log(`[+] SingleFile saved via CLI: ${out_path}`); + return out_path; + } + + console.error('[❌] SingleFile CLI completed but no output file found'); + return null; + } catch (err) { + console.error(`[❌] SingleFile CLI error: ${err.message}`); + return null; + } +} + +/** + * Main entry point - install extension before archiving + */ +async function main() { + // Check if extension is already cached + const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json'); + + if (fs.existsSync(cacheFile)) { + try { + const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8')); + const manifestPath = path.join(cached.unpacked_path, 'manifest.json'); + + if (fs.existsSync(manifestPath)) { + console.log('[*] SingleFile extension already installed (using cache)'); + return cached; + } + } catch (e) { + // Cache file corrupted, re-install + console.warn('[⚠️] Extension cache corrupted, re-installing...'); + } + } + + // Install 
extension + const extension = await installSinglefileExtension(); + + // Export extension metadata for chrome plugin to load + if (extension) { + // Write extension info to a cache file that chrome plugin can read + await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true }); + await fs.promises.writeFile( + cacheFile, + JSON.stringify(extension, null, 2) + ); + console.log(`[+] Extension metadata written to ${cacheFile}`); + } + + return extension; +} + +// Export functions for use by other plugins +module.exports = { + EXTENSION, + installSinglefileExtension, + saveSinglefileWithExtension, + saveSinglefileWithCLI, +}; + +// Run if executed directly +if (require.main === module) { + main().then(() => { + console.log('[✓] SingleFile extension setup complete'); + process.exit(0); + }).catch(err => { + console.error('[❌] SingleFile extension setup failed:', err); + process.exit(1); + }); +} diff --git a/archivebox/plugins/ublock/on_Crawl__03_ublock.js b/archivebox/plugins/ublock/on_Crawl__03_ublock.js new file mode 100755 index 00000000..b8a0219c --- /dev/null +++ b/archivebox/plugins/ublock/on_Crawl__03_ublock.js @@ -0,0 +1,116 @@ +#!/usr/bin/env node +/** + * uBlock Origin Extension Plugin + * + * Installs and configures the uBlock Origin Chrome extension for ad blocking + * and privacy protection during page archiving. + * + * Extension: https://chromewebstore.google.com/detail/cjpalhdlnbpafiamejdnhcphjbkeiagm + * + * Priority: 03 (early) - Must install before Chrome session starts at Crawl level + * Hook: on_Crawl (runs once per crawl, not per snapshot) + * + * This extension automatically: + * - Blocks ads, trackers, and malware domains + * - Reduces page load time and bandwidth usage + * - Improves privacy during archiving + * - Removes clutter from archived pages + * - Uses efficient blocking with filter lists + */ + +const path = require('path'); +const fs = require('fs'); + +// Import extension utilities +const extensionUtils = require('../chrome/chrome_utils.js'); + +// Extension metadata +const EXTENSION = { + webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm', + name: 'ublock', +}; + +// Get extensions directory from environment or use default +const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR || + path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions'); + +/** + * Install the uBlock Origin extension + */ +async function installUblockExtension() { + console.log('[*] Installing uBlock Origin extension...'); + + // Install the extension + const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR); + + if (!extension) { + console.error('[❌] Failed to install uBlock Origin extension'); + return null; + } + + console.log('[+] uBlock Origin extension installed'); + console.log('[+] Ads and trackers will be blocked during archiving'); + + return extension; +} + +/** + * Note: uBlock Origin works automatically with default filter lists. + * No configuration needed - blocks ads, trackers, and malware domains out of the box. 
+ */ + +/** + * Main entry point - install extension before archiving + */ +async function main() { + // Check if extension is already cached + const cacheFile = path.join(EXTENSIONS_DIR, 'ublock.extension.json'); + + if (fs.existsSync(cacheFile)) { + try { + const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8')); + const manifestPath = path.join(cached.unpacked_path, 'manifest.json'); + + if (fs.existsSync(manifestPath)) { + console.log('[*] uBlock Origin extension already installed (using cache)'); + return cached; + } + } catch (e) { + // Cache file corrupted, re-install + console.warn('[⚠️] Extension cache corrupted, re-installing...'); + } + } + + // Install extension + const extension = await installUblockExtension(); + + // Export extension metadata for chrome plugin to load + if (extension) { + // Write extension info to a cache file that chrome plugin can read + await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true }); + await fs.promises.writeFile( + cacheFile, + JSON.stringify(extension, null, 2) + ); + console.log(`[+] Extension metadata written to ${cacheFile}`); + } + + return extension; +} + +// Export functions for use by other plugins +module.exports = { + EXTENSION, + installUblockExtension, +}; + +// Run if executed directly +if (require.main === module) { + main().then(() => { + console.log('[✓] uBlock Origin extension setup complete'); + process.exit(0); + }).catch(err => { + console.error('[❌] uBlock Origin extension setup failed:', err); + process.exit(1); + }); +} diff --git a/archivebox/plugins/wget/on_Crawl__10_wget_validate_config.py b/archivebox/plugins/wget/on_Crawl__10_wget_validate_config.py new file mode 100644 index 00000000..d3116ed3 --- /dev/null +++ b/archivebox/plugins/wget/on_Crawl__10_wget_validate_config.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +""" +Validate and compute derived wget config values. + +This hook runs early in the Crawl lifecycle to: +1. Validate config values with warnings (not hard errors) +2. Compute derived values (USE_WGET from WGET_ENABLED) +3. 
Check binary availability and version + +Output: + - COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env + - Binary JSONL records to stdout when binaries are found +""" + +import json +import os +import shutil +import subprocess +import sys + +from abx_pkg import Binary, EnvProvider + + +# Read config from environment (already validated by JSONSchema) +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + +def get_env_int(name: str, default: int = 0) -> int: + try: + return int(get_env(name, str(default))) + except ValueError: + return default + + +def output_binary(binary: Binary, name: str): + """Output Binary JSONL record to stdout.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'abspath': str(binary.abspath), + 'version': str(binary.version) if binary.version else '', + 'sha256': binary.sha256 or '', + 'binprovider': 'env', + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def main(): + warnings = [] + errors = [] + computed = {} + + # Get config values + wget_enabled = get_env_bool('WGET_ENABLED', True) + wget_save_warc = get_env_bool('WGET_SAVE_WARC', True) + wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60) + wget_binary = get_env('WGET_BINARY', 'wget') + + # Compute derived values (USE_WGET for backward compatibility) + use_wget = wget_enabled + computed['USE_WGET'] = str(use_wget).lower() + + # Validate timeout with warning (not error) + if use_wget and wget_timeout < 20: + warnings.append( + f"WGET_TIMEOUT={wget_timeout} is very low. " + "wget may fail to archive sites if set to less than ~20 seconds. " + "Consider setting WGET_TIMEOUT=60 or higher." + ) + + # Check binary availability using abx-pkg + provider = EnvProvider() + try: + binary = Binary(name=wget_binary, binproviders=[provider]).load() + binary_path = str(binary.abspath) if binary.abspath else '' + except Exception: + binary = None + binary_path = '' + + if not binary_path: + if use_wget: + errors.append(f"WGET_BINARY={wget_binary} not found. 
Install wget or set WGET_ENABLED=false.") + computed['WGET_BINARY'] = '' + else: + computed['WGET_BINARY'] = binary_path + wget_version = str(binary.version) if binary.version else 'unknown' + computed['WGET_VERSION'] = wget_version + + # Output Binary JSONL record + output_binary(binary, name='wget') + + # Check for compression support + if computed.get('WGET_BINARY'): + try: + result = subprocess.run( + [computed['WGET_BINARY'], '--compression=auto', '--help'], + capture_output=True, timeout=5 + ) + computed['WGET_AUTO_COMPRESSION'] = 'true' if result.returncode == 0 else 'false' + except Exception: + computed['WGET_AUTO_COMPRESSION'] = 'false' + + # Output results + # Format: KEY=VALUE lines that hooks.py will parse and add to env + for key, value in computed.items(): + print(f"COMPUTED:{key}={value}") + + for warning in warnings: + print(f"WARNING:{warning}", file=sys.stderr) + + for error in errors: + print(f"ERROR:{error}", file=sys.stderr) + + # Exit with error if any hard errors + sys.exit(1 if errors else 0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py index 1b1789cb..bb0046f7 100644 --- a/archivebox/workers/orchestrator.py +++ b/archivebox/workers/orchestrator.py @@ -30,7 +30,7 @@ __package__ = 'archivebox.workers' import os import time from typing import Type -from multiprocessing import Process +from multiprocessing import Process as MPProcess from django.utils import timezone @@ -38,12 +38,6 @@ from rich import print from archivebox.misc.logging_util import log_worker_event from .worker import Worker, CrawlWorker, SnapshotWorker, ArchiveResultWorker -from .pid_utils import ( - write_pid_file, - remove_pid_file, - get_all_worker_pids, - cleanup_stale_pid_files, -) def _run_orchestrator_process(exit_on_idle: bool) -> None: @@ -78,6 +72,7 @@ class Orchestrator: self.pid: int = os.getpid() self.pid_file = None self.idle_count: int = 0 + self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running() def __repr__(self) -> str: return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]' @@ -85,16 +80,26 @@ class Orchestrator: @classmethod def is_running(cls) -> bool: """Check if an orchestrator is already running.""" - workers = get_all_worker_pids('orchestrator') - return len(workers) > 0 - + from archivebox.machine.models import Process + + # Clean up stale processes before counting + Process.cleanup_stale_running() + return Process.get_running_count(process_type=Process.TypeChoices.ORCHESTRATOR) > 0 + def on_startup(self) -> None: """Called when orchestrator starts.""" - self.pid = os.getpid() - self.pid_file = write_pid_file('orchestrator', worker_id=0) + from archivebox.machine.models import Process - # Clean up any stale PID files from previous runs - stale_count = cleanup_stale_pid_files() + self.pid = os.getpid() + # Register orchestrator process in database with explicit type + self.db_process = Process.current() + # Ensure the process type is correctly set to ORCHESTRATOR + if self.db_process.process_type != Process.TypeChoices.ORCHESTRATOR: + self.db_process.process_type = Process.TypeChoices.ORCHESTRATOR + self.db_process.save(update_fields=['process_type']) + + # Clean up any stale Process records from previous runs + stale_count = Process.cleanup_stale_running() # Collect startup metadata metadata = { @@ -112,11 +117,16 @@ class Orchestrator: pid=self.pid, metadata=metadata, ) - + def on_shutdown(self, error: BaseException | None = None) -> None: """Called when 
orchestrator shuts down.""" - if self.pid_file: - remove_pid_file(self.pid_file) + # Update Process record status + if hasattr(self, 'db_process') and self.db_process: + # KeyboardInterrupt is a graceful shutdown, not an error + self.db_process.exit_code = 1 if error and not isinstance(error, KeyboardInterrupt) else 0 + self.db_process.status = self.db_process.StatusChoices.EXITED + self.db_process.ended_at = timezone.now() + self.db_process.save() log_worker_event( worker_type='Orchestrator', @@ -125,10 +135,19 @@ class Orchestrator: pid=self.pid, error=error if error and not isinstance(error, KeyboardInterrupt) else None, ) - + def get_total_worker_count(self) -> int: """Get total count of running workers across all types.""" - cleanup_stale_pid_files() + from archivebox.machine.models import Process + import time + + # Throttle cleanup to once every 30 seconds to avoid performance issues + CLEANUP_THROTTLE_SECONDS = 30 + now = time.time() + if now - self._last_cleanup_time > CLEANUP_THROTTLE_SECONDS: + Process.cleanup_stale_running() + self._last_cleanup_time = now + return sum(len(W.get_running_workers()) for W in self.WORKER_TYPES) def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool: @@ -287,7 +306,7 @@ class Orchestrator: Returns the PID of the new process. """ # Use module-level function to avoid pickle errors with local functions - proc = Process( + proc = MPProcess( target=_run_orchestrator_process, args=(self.exit_on_idle,), name='orchestrator' diff --git a/archivebox/workers/pid_utils.py b/archivebox/workers/pid_utils.py deleted file mode 100644 index 020fce70..00000000 --- a/archivebox/workers/pid_utils.py +++ /dev/null @@ -1,191 +0,0 @@ -""" -PID file utilities for tracking worker and orchestrator processes. - -PID files are stored in data/tmp/workers/ and contain: -- Line 1: PID -- Line 2: Worker type (orchestrator, crawl, snapshot, archiveresult) -- Line 3: Extractor filter (optional, for archiveresult workers) -- Line 4: Started at ISO timestamp -""" - -__package__ = 'archivebox.workers' - -import os -import signal -from pathlib import Path -from datetime import datetime, timezone - -from django.conf import settings - - -def get_pid_dir() -> Path: - """Get the directory for PID files, creating it if needed.""" - pid_dir = Path(settings.DATA_DIR) / 'tmp' / 'workers' - pid_dir.mkdir(parents=True, exist_ok=True) - return pid_dir - - -def write_pid_file(worker_type: str, worker_id: int = 0, extractor: str | None = None) -> Path: - """ - Write a PID file for the current process. - Returns the path to the PID file. - """ - pid_dir = get_pid_dir() - - if worker_type == 'orchestrator': - pid_file = pid_dir / 'orchestrator.pid' - else: - pid_file = pid_dir / f'{worker_type}_worker_{worker_id}.pid' - - content = f"{os.getpid()}\n{worker_type}\n{extractor or ''}\n{datetime.now(timezone.utc).isoformat()}\n" - pid_file.write_text(content) - - return pid_file - - -def read_pid_file(path: Path) -> dict | None: - """ - Read and parse a PID file. - Returns dict with pid, worker_type, extractor, started_at or None if invalid. 
- """ - try: - if not path.exists(): - return None - - lines = path.read_text().strip().split('\n') - if len(lines) < 4: - return None - - return { - 'pid': int(lines[0]), - 'worker_type': lines[1], - 'extractor': lines[2] or None, - 'started_at': datetime.fromisoformat(lines[3]), - 'pid_file': path, - } - except (ValueError, IndexError, OSError): - return None - - -def remove_pid_file(path: Path) -> None: - """Remove a PID file if it exists.""" - try: - path.unlink(missing_ok=True) - except OSError: - pass - - -def is_process_alive(pid: int) -> bool: - """Check if a process with the given PID is still running.""" - try: - os.kill(pid, 0) # Signal 0 doesn't kill, just checks - return True - except (OSError, ProcessLookupError): - return False - - -def get_all_pid_files() -> list[Path]: - """Get all PID files in the workers directory.""" - pid_dir = get_pid_dir() - return list(pid_dir.glob('*.pid')) - - -def get_all_worker_pids(worker_type: str | None = None) -> list[dict]: - """ - Get info about all running workers. - Optionally filter by worker_type. - """ - workers = [] - - for pid_file in get_all_pid_files(): - info = read_pid_file(pid_file) - if info is None: - continue - - # Skip if process is dead - if not is_process_alive(info['pid']): - continue - - # Filter by type if specified - if worker_type and info['worker_type'] != worker_type: - continue - - workers.append(info) - - return workers - - -def cleanup_stale_pid_files() -> int: - """ - Remove PID files for processes that are no longer running. - Returns the number of stale files removed. - """ - removed = 0 - - for pid_file in get_all_pid_files(): - info = read_pid_file(pid_file) - if info is None: - # Invalid PID file, remove it - remove_pid_file(pid_file) - removed += 1 - continue - - if not is_process_alive(info['pid']): - remove_pid_file(pid_file) - removed += 1 - - return removed - - -def get_running_worker_count(worker_type: str) -> int: - """Get the count of running workers of a specific type.""" - return len(get_all_worker_pids(worker_type)) - - -def get_next_worker_id(worker_type: str) -> int: - """Get the next available worker ID for a given type.""" - existing_ids = set() - - for pid_file in get_all_pid_files(): - # Parse worker ID from filename like "snapshot_worker_3.pid" - name = pid_file.stem - if name.startswith(f'{worker_type}_worker_'): - try: - worker_id = int(name.split('_')[-1]) - existing_ids.add(worker_id) - except ValueError: - continue - - # Find the lowest unused ID - next_id = 0 - while next_id in existing_ids: - next_id += 1 - - return next_id - - -def stop_worker(pid: int, graceful: bool = True) -> bool: - """ - Stop a worker process. - If graceful=True, sends SIGTERM first, then SIGKILL after timeout. - Returns True if process was stopped. 
- """ - if not is_process_alive(pid): - return True - - try: - if graceful: - os.kill(pid, signal.SIGTERM) - # Give it a moment to shut down - import time - for _ in range(10): # Wait up to 1 second - time.sleep(0.1) - if not is_process_alive(pid): - return True - # Force kill if still running - os.kill(pid, signal.SIGKILL) - else: - os.kill(pid, signal.SIGKILL) - return True - except (OSError, ProcessLookupError): - return True # Process already dead diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py index 404ad0a3..918b2bba 100644 --- a/archivebox/workers/worker.py +++ b/archivebox/workers/worker.py @@ -17,7 +17,7 @@ import traceback from typing import ClassVar, Any from datetime import timedelta from pathlib import Path -from multiprocessing import Process, cpu_count +from multiprocessing import Process as MPProcess, cpu_count from django.db.models import QuerySet from django.utils import timezone @@ -26,13 +26,6 @@ from django.conf import settings from rich import print from archivebox.misc.logging_util import log_worker_event -from .pid_utils import ( - write_pid_file, - remove_pid_file, - get_all_worker_pids, - get_next_worker_id, - cleanup_stale_pid_files, -) CPU_COUNT = cpu_count() @@ -133,8 +126,15 @@ class Worker: def on_startup(self) -> None: """Called when worker starts.""" + from archivebox.machine.models import Process + self.pid = os.getpid() - self.pid_file = write_pid_file(self.name, self.worker_id) + # Register this worker process in the database + self.db_process = Process.current() + # Explicitly set process_type to WORKER to prevent mis-detection + if self.db_process.process_type != Process.TypeChoices.WORKER: + self.db_process.process_type = Process.TypeChoices.WORKER + self.db_process.save(update_fields=['process_type']) # Determine worker type for logging worker_type_name = self.__class__.__name__ @@ -160,9 +160,12 @@ class Worker: def on_shutdown(self, error: BaseException | None = None) -> None: """Called when worker shuts down.""" - # Remove PID file - if self.pid_file: - remove_pid_file(self.pid_file) + # Update Process record status + if hasattr(self, 'db_process') and self.db_process: + self.db_process.exit_code = 1 if error else 0 + self.db_process.status = self.db_process.StatusChoices.EXITED + self.db_process.ended_at = timezone.now() + self.db_process.save() # Determine worker type for logging worker_type_name = self.__class__.__name__ @@ -288,11 +291,13 @@ class Worker: Fork a new worker as a subprocess. Returns the PID of the new process. 
""" + from archivebox.machine.models import Process + if worker_id is None: - worker_id = get_next_worker_id(cls.name) + worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER) # Use module-level function for pickling compatibility - proc = Process( + proc = MPProcess( target=_run_worker, args=(cls.name, worker_id, daemon), kwargs=kwargs, @@ -304,15 +309,31 @@ class Worker: return proc.pid @classmethod - def get_running_workers(cls) -> list[dict]: + def get_running_workers(cls) -> list: """Get info about all running workers of this type.""" - cleanup_stale_pid_files() - return get_all_worker_pids(cls.name) + from archivebox.machine.models import Process + + Process.cleanup_stale_running() + # Convert Process objects to dicts to match the expected API contract + processes = Process.get_running(process_type=Process.TypeChoices.WORKER) + # Note: worker_id is not stored on Process model, it's dynamically generated + # We return process_id (UUID) and pid (OS process ID) instead + return [ + { + 'pid': p.pid, + 'process_id': str(p.id), # UUID of Process record + 'started_at': p.started_at.isoformat() if p.started_at else None, + 'status': p.status, + } + for p in processes + ] @classmethod def get_worker_count(cls) -> int: """Get count of running workers of this type.""" - return len(cls.get_running_workers()) + from archivebox.machine.models import Process + + return Process.get_running_count(process_type=Process.TypeChoices.WORKER) class CrawlWorker(Worker): @@ -402,11 +423,13 @@ class ArchiveResultWorker(Worker): @classmethod def start(cls, worker_id: int | None = None, daemon: bool = False, plugin: str | None = None, **kwargs: Any) -> int: """Fork a new worker as subprocess with optional plugin filter.""" + from archivebox.machine.models import Process + if worker_id is None: - worker_id = get_next_worker_id(cls.name) + worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER) # Use module-level function for pickling compatibility - proc = Process( + proc = MPProcess( target=_run_worker, args=(cls.name, worker_id, daemon), kwargs={'plugin': plugin, **kwargs},