Files
ArchiveBox/archivebox/plugins/chrome_session/on_CrawlEnd__99_chrome_cleanup.py
2025-12-25 03:59:51 -08:00

142 lines
4.1 KiB
Python

#!/usr/bin/env python3
"""
Clean up Chrome browser session at the end of a crawl.
This runs after all snapshots in a crawl have been processed to terminate
the shared Chrome session that was started by on_Crawl__10_chrome_session.js.
Usage: on_CrawlEnd__99_chrome_cleanup.py --crawl-id=<uuid>
Output: Terminates the crawl's Chrome process
"""
import json
import os
import signal
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
# Name reported in the 'extractor' field of the RESULT_JSON line printed by main().
EXTRACTOR_NAME = 'chrome_cleanup'
# Relative directory (resolved against the current working directory) that
# cleanup_crawl_chrome() checks for the Chrome 'pid.txt' file.
CHROME_SESSION_DIR = 'chrome_session'
def get_env(name: str, default: str = '') -> str:
    """Return environment variable *name* with surrounding whitespace removed.

    When the variable is not set, *default* is used instead (and is stripped
    the same way).
    """
    raw_value = os.getenv(name, default)
    return raw_value.strip()
def _chrome_pid_alive(pid: int) -> bool:
    """Return True if a signal-0 probe indicates that process *pid* still exists."""
    try:
        os.kill(pid, 0)  # Signal 0 only performs existence/permission checks
    except PermissionError:
        # EPERM: the process exists, we just are not allowed to signal it.
        return True
    except OSError:
        # ESRCH (or any other failure) is treated as "process is gone".
        return False
    return True


def cleanup_crawl_chrome() -> tuple[bool, str | None, str]:
    """
    Clean up Chrome session for the crawl.

    Reads the Chrome PID from ``pid.txt`` inside CHROME_SESSION_DIR (relative
    to the current working directory), sends SIGTERM, polls up to 20 times at
    0.1s intervals for the process to exit, and sends SIGKILL if it is still
    running after that.

    Returns: (success, output_info, error_message)
        success       -- False when the PID file is unreadable or holds an
                         invalid PID, or when SIGTERM fails for a reason other
                         than the process no longer existing.
        output_info   -- human-readable summary on success, None on failure.
        error_message -- '' on success, otherwise a description of the failure.
    """
    session_dir = Path(CHROME_SESSION_DIR)
    if not session_dir.exists():
        return True, 'No chrome_session directory found', ''

    pid_file = session_dir / 'pid.txt'
    killed = False

    if pid_file.exists():
        try:
            pid = int(pid_file.read_text().strip())
            if pid <= 0:
                # os.kill() gives PIDs <= 0 special meaning (0 = the caller's
                # whole process group, negative = other groups / every
                # process), so a corrupt pid.txt must never reach it.
                return False, None, f'Invalid PID in {pid_file}'

            # Try graceful termination first
            try:
                os.kill(pid, signal.SIGTERM)
            except ProcessLookupError:
                # Process is already dead, that's fine
                print(f'[*] Chrome process {pid} already terminated')
            except OSError as e:
                return False, None, f'Failed to kill Chrome PID {pid}: {e}'
            else:
                killed = True
                print(f'[*] Sent SIGTERM to Chrome PID {pid}')

                # Wait briefly for graceful shutdown
                for _ in range(20):
                    if not _chrome_pid_alive(pid):
                        print(f'[+] Chrome process {pid} terminated')
                        break
                    time.sleep(0.1)
                else:
                    # Force kill if still running
                    print('[!] Chrome still running, sending SIGKILL')
                    try:
                        os.kill(pid, signal.SIGKILL)
                    except ProcessLookupError:
                        # Exited between the last probe and the SIGKILL.
                        pass
                    except OSError as e:
                        # Stay best-effort, but do not hide the failure.
                        print(f'[!] Failed to SIGKILL Chrome PID {pid}: {e}', file=sys.stderr)
        except ValueError:
            return False, None, f'Invalid PID in {pid_file}'
        except Exception as e:
            return False, None, f'{type(e).__name__}: {e}'

    result_info = f'Crawl Chrome cleanup: PID {"killed" if killed else "not found or already terminated"}'
    return True, result_info, ''
@click.command()
@click.option('--crawl-id', required=True, help='Crawl UUID')
@click.option('--source-url', default='', help='Source URL (unused)')
def main(crawl_id: str, source_url: str):
    """Clean up shared Chrome browser session for crawl."""
    # source_url is accepted as an option but not used by this command.
    began = datetime.now(timezone.utc)

    # Defaults describe a failed run; they are overwritten once the cleanup
    # call returns.
    output, status, error = None, 'failed', ''

    try:
        ok, output, error = cleanup_crawl_chrome()
        if ok:
            status = 'succeeded'
            print(f'Crawl Chrome cleanup completed: {output}')
        else:
            status = 'failed'
    except Exception as exc:
        status = 'failed'
        error = f'{type(exc).__name__}: {exc}'

    # Timing information for the report below
    finished = datetime.now(timezone.utc)
    elapsed = (finished - began).total_seconds()
    began_iso = began.isoformat()
    finished_iso = finished.isoformat()

    # Line-oriented KEY=VALUE report on stdout (the error goes to stderr)
    print(f'START_TS={began_iso}')
    print(f'END_TS={finished_iso}')
    print(f'DURATION={elapsed:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Same information again as a single JSON document
    summary = dict(
        extractor=EXTRACTOR_NAME,
        crawl_id=crawl_id,
        status=status,
        start_ts=began_iso,
        end_ts=finished_iso,
        duration=round(elapsed, 2),
        output=output,
        error=error if error else None,
    )
    print('RESULT_JSON=' + json.dumps(summary))

    # Exit code mirrors the status: 0 on success, 1 otherwise
    exit_code = 0 if status == 'succeeded' else 1
    sys.exit(exit_code)
# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()