diff --git a/TODO_process_tracking.md b/TODO_process_tracking.md index c0bf3784..be8c7c63 100644 --- a/TODO_process_tracking.md +++ b/TODO_process_tracking.md @@ -621,18 +621,6 @@ class Process(ModelWithHealthStats): return self - def is_alive(self) -> bool: - """Check if this process is still running.""" - from archivebox.misc.process_utils import validate_pid_file - - if self.status == self.StatusChoices.EXITED: - return False - - if not self.pid: - return False - - return validate_pid_file(self.pid_file, self.cmd_file) - def kill(self, signal_num: int = 15) -> bool: """ Kill this process and update status. @@ -712,7 +700,7 @@ class Process(ModelWithHealthStats): Wait for process to exit, polling periodically. Args: - timeout: Max seconds to wait (None = use self.timeout) + timeout: Max seconds to wait (None = use self.timeout, or config.TIMEOUT * 5 if that's also None) Returns: exit_code @@ -721,8 +709,10 @@ class Process(ModelWithHealthStats): TimeoutError if process doesn't exit in time """ import time + from archivebox import config - timeout = timeout or self.timeout + # Require a timeout - default to config.TIMEOUT * 5 (typically 300s) + timeout = timeout or self.timeout or (config.TIMEOUT * 5) start = time.time() while True: diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py index 4130a2bc..00835ab7 100644 --- a/archivebox/config/configset.py +++ b/archivebox/config/configset.py @@ -120,6 +120,7 @@ class BaseConfigSet(BaseSettings): def get_config( scope: str = "global", defaults: Optional[Dict] = None, + persona: Any = None, user: Any = None, crawl: Any = None, snapshot: Any = None, @@ -131,14 +132,16 @@ def get_config( 1. Per-snapshot config (snapshot.config JSON field) 2. Per-crawl config (crawl.config JSON field) 3. Per-user config (user.config JSON field) - 4. Environment variables - 5. Config file (ArchiveBox.conf) - 6. Plugin schema defaults (config.json) - 7. Core config defaults + 4. 
Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.) + 5. Environment variables + 6. Config file (ArchiveBox.conf) + 7. Plugin schema defaults (config.json) + 8. Core config defaults Args: scope: Config scope ('global', 'crawl', 'snapshot', etc.) defaults: Default values to start with + persona: Persona object (provides derived paths like CHROME_USER_DATA_DIR) user: User object with config JSON field crawl: Crawl object with config JSON field snapshot: Snapshot object with config JSON field @@ -205,6 +208,10 @@ def get_config( except ImportError: pass + # Apply persona config overrides (includes derived paths like CHROME_USER_DATA_DIR) + if persona and hasattr(persona, "get_derived_config"): + config.update(persona.get_derived_config()) + # Apply user config overrides if user and hasattr(user, "config") and user.config: config.update(user.config) diff --git a/archivebox/misc/util.py b/archivebox/misc/util.py index 61354d80..c69c8c86 100644 --- a/archivebox/misc/util.py +++ b/archivebox/misc/util.py @@ -480,12 +480,39 @@ for url_str, num_urls in _test_url_strs.items(): def chrome_cleanup(): """ - Cleans up any state or runtime files that chrome leaves behind when killed by - a timeout or other error + Cleans up any state or runtime files that Chrome leaves behind when killed by + a timeout or other error. 
Handles: + - All persona chrome_user_data directories (via Persona.cleanup_chrome_all()) + - Explicit CHROME_USER_DATA_DIR from config + - Legacy Docker chromium path """ import os + from pathlib import Path from archivebox.config.permissions import IN_DOCKER - + + # Clean up all persona chrome directories using Persona class + try: + from archivebox.personas.models import Persona + + # Clean up all personas + Persona.cleanup_chrome_all() + + # Also clean up the active persona's explicit CHROME_USER_DATA_DIR if set + # (in case it's a custom path not under PERSONAS_DIR) + from archivebox.config.configset import get_config + config = get_config() + chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR') + if chrome_user_data_dir: + singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock' + if os.path.lexists(singleton_lock): + try: + singleton_lock.unlink() + except OSError: + pass + except Exception: + pass # Persona/config not available during early startup + + # Legacy Docker cleanup (for backwards compatibility) if IN_DOCKER: singleton_lock = "/home/archivebox/.config/chromium/SingletonLock" if os.path.lexists(singleton_lock): diff --git a/archivebox/personas/models.py b/archivebox/personas/models.py index 99f8ef87..470ec846 100644 --- a/archivebox/personas/models.py +++ b/archivebox/personas/models.py @@ -1,59 +1,155 @@ -# from django.db import models +""" +Persona management for ArchiveBox. -# from django.conf import settings +A Persona represents a browser profile/identity used for archiving. +Each persona has its own: +- Chrome user data directory (for cookies, localStorage, extensions, etc.) 
class Persona(ModelWithConfig):
    """
    Browser persona/profile for archiving sessions.

    A persona is identified by its unique ``name`` and owns the directory
    ``CONSTANTS.PERSONAS_DIR / name``. From that directory it derives:
    - CHROME_USER_DATA_DIR: ``<persona dir>/chrome_user_data``
    - CHROME_EXTENSIONS_DIR: ``<persona dir>/chrome_extensions``
    - COOKIES_FILE: ``<persona dir>/cookies.txt`` ('' when the file is missing)
    - config: persona-specific config overrides read from ``self.config``
      (presumably the JSON field supplied by ModelWithConfig - confirm)

    Usage:
        # Get persona and its derived config
        config = get_config(persona=crawl.persona, crawl=crawl, snapshot=snapshot)
        chrome_dir = config['CHROME_USER_DATA_DIR']

        # Or access directly from persona
        persona = Persona.objects.get(name='Default')
        persona.CHROME_USER_DATA_DIR  # -> str path to chrome_user_data
    """

    # Per-profile singleton files that cleanup_chrome() removes.
    _CHROME_SINGLETON_FILES = frozenset({'SingletonLock', 'SingletonSocket'})

    name = models.CharField(max_length=64, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)

    class Meta:
        app_label = 'personas'

    def __str__(self) -> str:
        return self.name

    @property
    def path(self) -> Path:
        """Path to this persona's directory under PERSONAS_DIR."""
        # Imported lazily, as in the rest of this module's config access.
        from archivebox.config.constants import CONSTANTS

        # NOTE(review): self.name is used verbatim as a path component; a name
        # containing '/' or '..' would resolve outside PERSONAS_DIR. Confirm
        # that names are validated wherever personas are created.
        return CONSTANTS.PERSONAS_DIR / self.name

    @property
    def CHROME_USER_DATA_DIR(self) -> str:
        """Derived path (as str) to the Chrome user data directory for this persona."""
        return str(self.path / 'chrome_user_data')

    @property
    def CHROME_EXTENSIONS_DIR(self) -> str:
        """Derived path (as str) to the Chrome extensions directory for this persona."""
        return str(self.path / 'chrome_extensions')

    @property
    def COOKIES_FILE(self) -> str:
        """Derived path (as str) to this persona's cookies.txt, or '' if it does not exist."""
        cookies_path = self.path / 'cookies.txt'
        return str(cookies_path) if cookies_path.exists() else ''

    def get_derived_config(self) -> dict:
        """
        Build a config dict from self.config with derived paths filled in.

        Returns:
            A new dict containing:
            - every key/value from self.config (treated as {} when falsy)
            - CHROME_USER_DATA_DIR and CHROME_EXTENSIONS_DIR, derived from the
              persona path unless the key is already present in self.config
            - COOKIES_FILE, only if not already present and cookies.txt exists
            - ACTIVE_PERSONA, always set to this persona's name
        """
        derived = dict(self.config or {})

        # Derived paths never override a key that is explicitly present.
        for key in ('CHROME_USER_DATA_DIR', 'CHROME_EXTENSIONS_DIR'):
            if key not in derived:
                derived[key] = getattr(self, key)

        if 'COOKIES_FILE' not in derived:
            # Read the property once: each access stats the filesystem, and a
            # second read could disagree with the first if the file vanishes.
            cookies_file = self.COOKIES_FILE
            if cookies_file:
                derived['COOKIES_FILE'] = cookies_file

        # Always set ACTIVE_PERSONA to this persona's name
        derived['ACTIVE_PERSONA'] = self.name

        return derived

    def ensure_dirs(self) -> None:
        """Create the persona directory and its chrome_user_data / chrome_extensions subdirectories."""
        persona_dir = self.path
        for directory in (
            persona_dir,
            persona_dir / 'chrome_user_data',
            persona_dir / 'chrome_extensions',
        ):
            directory.mkdir(parents=True, exist_ok=True)

    def cleanup_chrome(self) -> bool:
        """
        Remove Chrome singleton state files (SingletonLock, SingletonSocket)
        found anywhere under this persona's chrome_user_data directory.

        Returns:
            True if at least one file was removed, False if the directory does
            not exist or nothing was removed.
        """
        chrome_dir = self.path / 'chrome_user_data'
        if not chrome_dir.exists():
            return False

        cleaned = False
        # One recursive walk with a wildcard pattern, filtered by exact name.
        # The wildcard is deliberate: Chrome's singleton files are symlinks
        # that are usually dangling once Chrome has died, and a literal glob
        # component can be resolved with a symlink-following existence check
        # (depending on Python version), which would skip exactly the stale
        # files this method exists to remove.
        for candidate in chrome_dir.rglob('Singleton*'):
            if candidate.name not in self._CHROME_SINGLETON_FILES:
                continue
            try:
                candidate.unlink()
            except OSError:
                # Best-effort cleanup: the file may be held by a live Chrome
                # or may have been removed concurrently.
                continue
            cleaned = True

        return cleaned

    @classmethod
    def get_or_create_default(cls) -> 'Persona':
        """Return the persona named 'Default', creating it if it does not exist."""
        persona, _created = cls.objects.get_or_create(name='Default')
        return persona

    @classmethod
    def cleanup_chrome_all(cls) -> int:
        """Run cleanup_chrome() on every persona and return how many reported a cleanup."""
        return sum(1 for persona in cls.objects.all() if persona.cleanup_chrome())
/**
 * Get array environment variable (JSON array or comma-separated string).
 *
 * Parsing strategy:
 *   - If the (whitespace-trimmed) value starts with '[', parse it as a JSON array
 *   - Otherwise, or if JSON parsing fails, parse it as comma-separated values
 *
 * The JSON form exists so that arguments containing commas are not split.
 * For arguments with commas, use JSON format:
 *   CHROME_ARGS='["--user-data-dir=/path/with,comma", "--window-size=1440,900"]'
 *
 * Both branches return the same shape: an array of non-empty strings.
 * JSON members that are strings, numbers or booleans are converted with
 * String(); any other member (null, objects, nested arrays) is dropped so it
 * cannot reach the Chrome command line.
 *
 * @param {string} name - Environment variable name
 * @param {string[]} [defaultValue=[]] - Returned as-is when the variable is unset or empty
 * @returns {string[]} - Array of strings
 */
function getEnvArray(name, defaultValue = []) {
    // Assumes getEnv() returns a string (the fallback '' when unset) - confirm against its definition.
    const raw = getEnv(name, '');
    if (!raw) return defaultValue;

    // Tolerate leading/trailing whitespace so ' ["--a"]' is still treated as JSON.
    const text = raw.trim();

    if (text.startsWith('[')) {
        try {
            const parsed = JSON.parse(text);
            if (Array.isArray(parsed)) {
                return parsed
                    .filter(item => ['string', 'number', 'boolean'].includes(typeof item))
                    .map(String)
                    .filter(Boolean);
            }
        } catch (e) {
            console.error(`[!] Failed to parse ${name} as JSON array: ${e.message}`);
            // Fall through to comma-separated parsing
        }
    }

    // Parse as comma-separated values
    return text.split(',').map(s => s.trim()).filter(Boolean);
}
(auto-detected if not provided) * @param {string} [options.outputDir='chrome'] - Directory for output files + * @param {string} [options.userDataDir] - Chrome user data directory for persistent sessions * @param {string} [options.resolution='1440,2000'] - Window resolution * @param {boolean} [options.headless=true] - Run in headless mode + * @param {boolean} [options.sandbox=true] - Enable Chrome sandbox * @param {boolean} [options.checkSsl=true] - Check SSL certificates * @param {string[]} [options.extensionPaths=[]] - Paths to unpacked extensions * @param {boolean} [options.killZombies=true] - Kill zombie processes first @@ -281,8 +342,10 @@ async function launchChromium(options = {}) { const { binary = findChromium(), outputDir = 'chrome', + userDataDir = getEnv('CHROME_USER_DATA_DIR'), resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'), headless = getEnvBool('CHROME_HEADLESS', true), + sandbox = getEnvBool('CHROME_SANDBOX', true), checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)), extensionPaths = [], killZombies = true, @@ -304,41 +367,65 @@ async function launchChromium(options = {}) { fs.mkdirSync(outputDir, { recursive: true }); } + // Create user data directory if specified and doesn't exist + if (userDataDir) { + if (!fs.existsSync(userDataDir)) { + fs.mkdirSync(userDataDir, { recursive: true }); + console.error(`[*] Created user data directory: ${userDataDir}`); + } + // Clean up any stale SingletonLock file from previous crashed sessions + const singletonLock = path.join(userDataDir, 'SingletonLock'); + if (fs.existsSync(singletonLock)) { + try { + fs.unlinkSync(singletonLock); + console.error(`[*] Removed stale SingletonLock: ${singletonLock}`); + } catch (e) { + console.error(`[!] 
Failed to remove SingletonLock: ${e.message}`); + } + } + } + // Find a free port const debugPort = await findFreePort(); console.error(`[*] Using debug port: ${debugPort}`); - // Build Chrome arguments - const chromiumArgs = [ + // Get base Chrome args from config (static flags from CHROME_ARGS env var) + // These come from config.json defaults, merged by get_config() in Python + const baseArgs = getEnvArray('CHROME_ARGS', []); + + // Get extra user-provided args + const extraArgs = getEnvArray('CHROME_ARGS_EXTRA', []); + + // Build dynamic Chrome arguments (these must be computed at runtime) + const dynamicArgs = [ + // Remote debugging setup `--remote-debugging-port=${debugPort}`, '--remote-debugging-address=127.0.0.1', - '--no-sandbox', - '--disable-setuid-sandbox', + + // Sandbox settings (disable in Docker) + ...(sandbox ? [] : ['--no-sandbox', '--disable-setuid-sandbox']), + + // Docker-specific workarounds '--disable-dev-shm-usage', '--disable-gpu', - '--disable-sync', - '--no-first-run', - '--no-default-browser-check', - '--disable-default-apps', - '--disable-infobars', - '--disable-blink-features=AutomationControlled', - '--disable-component-update', - '--disable-domain-reliability', - '--disable-breakpad', - '--disable-background-networking', - '--disable-background-timer-throttling', - '--disable-backgrounding-occluded-windows', - '--disable-renderer-backgrounding', - '--disable-ipc-flooding-protection', - '--password-store=basic', - '--use-mock-keychain', - '--font-render-hinting=none', - '--force-color-profile=srgb', + + // Window size `--window-size=${width},${height}`, + + // User data directory (for persistent sessions with persona) + ...(userDataDir ? [`--user-data-dir=${userDataDir}`] : []), + + // Headless mode ...(headless ? ['--headless=new'] : []), + + // SSL certificate checking ...(checkSsl ? 
[] : ['--ignore-certificate-errors']), ]; + // Combine all args: base (from config) + dynamic (runtime) + extra (user overrides) + // Dynamic args come after base so they can override if needed + const chromiumArgs = [...baseArgs, ...dynamicArgs, ...extraArgs]; + // Add extension loading flags if (extensionPaths.length > 0) { const extPathsArg = extensionPaths.join(','); @@ -1231,6 +1318,7 @@ module.exports = { getEnv, getEnvBool, getEnvInt, + getEnvArray, parseResolution, // PID file management writePidWithMtime, diff --git a/archivebox/plugins/chrome/config.json b/archivebox/plugins/chrome/config.json index 4ff40faa..0bc9e754 100644 --- a/archivebox/plugins/chrome/config.json +++ b/archivebox/plugins/chrome/config.json @@ -42,7 +42,7 @@ "CHROME_USER_DATA_DIR": { "type": "string", "default": "", - "description": "Path to Chrome user data directory for persistent sessions" + "description": "Path to Chrome user data directory for persistent sessions (derived from ACTIVE_PERSONA if not set)" }, "CHROME_USER_AGENT": { "type": "string", @@ -53,16 +53,74 @@ "CHROME_ARGS": { "type": "array", "items": {"type": "string"}, - "default": [], + "default": [ + "--no-first-run", + "--no-default-browser-check", + "--disable-default-apps", + "--disable-sync", + "--disable-infobars", + "--disable-blink-features=AutomationControlled", + "--disable-component-update", + "--disable-domain-reliability", + "--disable-breakpad", + "--disable-client-side-phishing-detection", + "--disable-hang-monitor", + "--disable-speech-synthesis-api", + "--disable-speech-api", + "--disable-print-preview", + "--disable-notifications", + "--disable-desktop-notifications", + "--disable-popup-blocking", + "--disable-prompt-on-repost", + "--disable-external-intent-requests", + "--disable-session-crashed-bubble", + "--disable-search-engine-choice-screen", + "--disable-datasaver-prompt", + "--ash-no-nudges", + "--hide-crash-restore-bubble", + "--suppress-message-center-popups", + "--noerrdialogs", + 
"--no-pings", + "--silent-debugger-extension-api", + "--deny-permission-prompts", + "--safebrowsing-disable-auto-update", + "--metrics-recording-only", + "--password-store=basic", + "--use-mock-keychain", + "--disable-cookie-encryption", + "--font-render-hinting=none", + "--force-color-profile=srgb", + "--disable-partial-raster", + "--disable-skia-runtime-opts", + "--disable-2d-canvas-clip-aa", + "--enable-webgl", + "--hide-scrollbars", + "--export-tagged-pdf", + "--generate-pdf-document-outline", + "--disable-lazy-loading", + "--disable-renderer-backgrounding", + "--disable-background-networking", + "--disable-background-timer-throttling", + "--disable-backgrounding-occluded-windows", + "--disable-ipc-flooding-protection", + "--disable-extensions-http-throttling", + "--disable-field-trial-config", + "--disable-back-forward-cache", + "--autoplay-policy=no-user-gesture-required", + "--disable-gesture-requirement-for-media-playback", + "--lang=en-US,en;q=0.9", + "--log-level=2", + "--enable-logging=stderr" + ], "x-aliases": ["CHROME_DEFAULT_ARGS"], - "description": "Default Chrome command-line arguments" + "description": "Default Chrome command-line arguments (static flags only, dynamic args like --user-data-dir are added at runtime)" }, "CHROME_ARGS_EXTRA": { "type": "array", "items": {"type": "string"}, "default": [], "x-aliases": ["CHROME_EXTRA_ARGS"], - "description": "Extra arguments to append to Chrome command" + "description": "Extra arguments to append to Chrome command (for user customization)" }, "CHROME_PAGELOAD_TIMEOUT": { "type": "integer", diff --git a/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js index 6b7a6391..58cafca0 100644 --- a/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js +++ b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js @@ -117,6 +117,11 @@ async function main() { // Load installed extensions const extensionsDir = 
getEnv('CHROME_EXTENSIONS_DIR') || path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions'); + const userDataDir = getEnv('CHROME_USER_DATA_DIR'); + + if (userDataDir) { + console.error(`[*] Using user data dir: ${userDataDir}`); + } const installedExtensions = []; const extensionPaths = []; @@ -150,9 +155,11 @@ async function main() { } // Launch Chromium using consolidated function + // userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set const result = await launchChromium({ binary, outputDir: OUTPUT_DIR, + userDataDir, extensionPaths, });