From 877b5f91c29aa8ae025576c673f9af6da2afab65 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 31 Dec 2025 00:21:07 +0000 Subject: [PATCH] Derive CHROME_USER_DATA_DIR from ACTIVE_PERSONA in config system - Add _derive_persona_paths() in configset.py to automatically derive CHROME_USER_DATA_DIR and CHROME_EXTENSIONS_DIR from ACTIVE_PERSONA when not explicitly set. This allows plugins to use these paths without knowing about the persona system. - Update chrome_utils.js launchChromium() to accept userDataDir option and pass --user-data-dir to Chrome. Also cleans up SingletonLock before launch. - Update killZombieChrome() to clean up SingletonLock files from all persona chrome_user_data directories after killing zombies. - Update chrome_cleanup() in misc/util.py to handle persona-based user data directories when cleaning up stale Chrome state. - Simplify on_Crawl__20_chrome_launch.bg.js to use CHROME_USER_DATA_DIR and CHROME_EXTENSIONS_DIR from env (derived by get_config()). Config priority flow: ACTIVE_PERSONA=WorkAccount (set on crawl/snapshot) -> get_config() derives: CHROME_USER_DATA_DIR = PERSONAS_DIR/WorkAccount/chrome_user_data CHROME_EXTENSIONS_DIR = PERSONAS_DIR/WorkAccount/chrome_extensions -> hooks receive these as env vars without needing persona logic --- archivebox/config/configset.py | 46 +++++++++++++++++++ archivebox/misc/util.py | 44 ++++++++++++++++-- archivebox/plugins/chrome/chrome_utils.js | 46 +++++++++++++++++++ .../chrome/on_Crawl__20_chrome_launch.bg.js | 13 ++++-- 4 files changed, 143 insertions(+), 6 deletions(-) diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py index 4130a2bc..afc02c38 100644 --- a/archivebox/config/configset.py +++ b/archivebox/config/configset.py @@ -240,6 +240,52 @@ def get_config( except ImportError: pass + # Derive persona-based paths if not explicitly set + # This allows plugins to just use CHROME_USER_DATA_DIR without knowing about personas + config = _derive_persona_paths(config, CONSTANTS) + + return config + + +def _derive_persona_paths(config: Dict[str, Any], CONSTANTS: Any) -> Dict[str, Any]: + """ + Derive persona-specific paths from ACTIVE_PERSONA if not explicitly set. + + This runs after all config sources are merged, so plugins receive + the final resolved paths without needing to know about the persona system. + + Derived paths: + CHROME_USER_DATA_DIR <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_user_data + CHROME_EXTENSIONS_DIR <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_extensions + COOKIES_FILE <- PERSONAS_DIR / ACTIVE_PERSONA / cookies.txt (if exists) + """ + # Get active persona (defaults to "Default") + active_persona = config.get('ACTIVE_PERSONA') or config.get('DEFAULT_PERSONA') or 'Default' + + # Ensure ACTIVE_PERSONA is always set in config for downstream use + config['ACTIVE_PERSONA'] = active_persona + + # Get personas directory + personas_dir = CONSTANTS.PERSONAS_DIR + persona_dir = personas_dir / active_persona + + # Derive CHROME_USER_DATA_DIR if not explicitly set + chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR') + if not chrome_user_data_dir: + config['CHROME_USER_DATA_DIR'] = str(persona_dir / 'chrome_user_data') + + # Derive CHROME_EXTENSIONS_DIR if not explicitly set + chrome_extensions_dir = config.get('CHROME_EXTENSIONS_DIR') + if not chrome_extensions_dir: + config['CHROME_EXTENSIONS_DIR'] = str(persona_dir / 'chrome_extensions') + + # Derive COOKIES_FILE if not explicitly set and file exists + cookies_file = config.get('COOKIES_FILE') + if not cookies_file: + persona_cookies = persona_dir / 'cookies.txt' + if persona_cookies.exists(): + config['COOKIES_FILE'] = str(persona_cookies) + return config diff --git a/archivebox/misc/util.py b/archivebox/misc/util.py index 61354d80..423d187b 100644 --- a/archivebox/misc/util.py +++ b/archivebox/misc/util.py @@ -480,12 +480,50 @@ for url_str, num_urls in _test_url_strs.items(): def chrome_cleanup(): """ - Cleans up any state or runtime files that chrome leaves behind when killed by - a timeout or other error + Cleans up any state or runtime files that Chrome leaves behind when killed by + a timeout or other error. Handles: + - Persona-based chrome_user_data directories (from ACTIVE_PERSONA) + - Explicit CHROME_USER_DATA_DIR + - Legacy Docker chromium path """ import os + from pathlib import Path from archivebox.config.permissions import IN_DOCKER - + + # Clean up persona-based user data directories + try: + from archivebox.config.configset import get_config + from archivebox.config.constants import CONSTANTS + + config = get_config() + + # Clean up the active persona's chrome_user_data SingletonLock + chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR') + if chrome_user_data_dir: + singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock' + if singleton_lock.exists(): + try: + singleton_lock.unlink() + except OSError: + pass + + # Clean up all persona directories + personas_dir = CONSTANTS.PERSONAS_DIR + if personas_dir.exists(): + for persona_dir in personas_dir.iterdir(): + if not persona_dir.is_dir(): + continue + user_data_dir = persona_dir / 'chrome_user_data' + singleton_lock = user_data_dir / 'SingletonLock' + if singleton_lock.exists(): + try: + singleton_lock.unlink() + except OSError: + pass + except Exception: + pass # Config not available during early startup + + # Legacy Docker cleanup if IN_DOCKER: singleton_lock = "/home/archivebox/.config/chromium/SingletonLock" if os.path.lexists(singleton_lock): diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index d448923b..dda6612b 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -257,6 +257,31 @@ function killZombieChrome(dataDir = null) { console.error('[+] No zombies found'); } + // Clean up stale SingletonLock files from persona chrome_user_data directories + const personasDir = path.join(dataDir, 'personas'); + if (fs.existsSync(personasDir)) { + try { + const personas = fs.readdirSync(personasDir, { withFileTypes: true }); + for (const persona of personas) { + if (!persona.isDirectory()) continue; + + const userDataDir = path.join(personasDir, persona.name, 'chrome_user_data'); + const singletonLock = path.join(userDataDir, 'SingletonLock'); + + if (fs.existsSync(singletonLock)) { + try { + fs.unlinkSync(singletonLock); + console.error(`[+] Removed stale SingletonLock: ${singletonLock}`); + } catch (e) { + // Ignore - may be in use by active Chrome + } + } + } + } catch (e) { + // Ignore errors scanning personas directory + } + } + return killed; } @@ -270,6 +295,7 @@ function killZombieChrome(dataDir = null) { * @param {Object} options - Launch options * @param {string} [options.binary] - Chrome binary path (auto-detected if not provided) * @param {string} [options.outputDir='chrome'] - Directory for output files + * @param {string} [options.userDataDir] - Chrome user data directory for persistent sessions * @param {string} [options.resolution='1440,2000'] - Window resolution * @param {boolean} [options.headless=true] - Run in headless mode * @param {boolean} [options.checkSsl=true] - Check SSL certificates @@ -281,6 +307,7 @@ async function launchChromium(options = {}) { const { binary = findChromium(), outputDir = 'chrome', + userDataDir = getEnv('CHROME_USER_DATA_DIR'), resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'), headless = getEnvBool('CHROME_HEADLESS', true), checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)), @@ -304,6 +331,24 @@ async function launchChromium(options = {}) { fs.mkdirSync(outputDir, { recursive: true }); } + // Create user data directory if specified and doesn't exist + if (userDataDir) { + if (!fs.existsSync(userDataDir)) { + fs.mkdirSync(userDataDir, { recursive: true }); + console.error(`[*] Created user data directory: ${userDataDir}`); + } + // Clean up any stale SingletonLock file from previous crashed sessions + const singletonLock = path.join(userDataDir, 'SingletonLock'); + if (fs.existsSync(singletonLock)) { + try { + fs.unlinkSync(singletonLock); + console.error(`[*] Removed stale SingletonLock: ${singletonLock}`); + } catch (e) { + console.error(`[!] Failed to remove SingletonLock: ${e.message}`); + } + } + } + // Find a free port const debugPort = await findFreePort(); console.error(`[*] Using debug port: ${debugPort}`); @@ -335,6 +380,7 @@ async function launchChromium(options = {}) { '--font-render-hinting=none', '--force-color-profile=srgb', `--window-size=${width},${height}`, + ...(userDataDir ? [`--user-data-dir=${userDataDir}`] : []), ...(headless ? ['--headless=new'] : []), ...(checkSsl ? [] : ['--ignore-certificate-errors']), ]; diff --git a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js index c2d62775..ed264c95 100644 --- a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js +++ b/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js @@ -115,12 +115,17 @@ async function main() { if (version) console.error(`[*] Version: ${version}`); // Load installed extensions - const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') || - path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions'); + // CHROME_EXTENSIONS_DIR is derived from ACTIVE_PERSONA by get_config() in configset.py + const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR'); + const userDataDir = getEnv('CHROME_USER_DATA_DIR'); + + if (userDataDir) { + console.error(`[*] Using user data dir: ${userDataDir}`); + } const installedExtensions = []; const extensionPaths = []; - if (fs.existsSync(extensionsDir)) { + if (extensionsDir && fs.existsSync(extensionsDir)) { const files = fs.readdirSync(extensionsDir); for (const file of files) { if (file.endsWith('.extension.json')) { @@ -151,9 +156,11 @@ async function main() { writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime); // Launch Chromium using consolidated function + // userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set const result = await launchChromium({ binary, outputDir: OUTPUT_DIR, + userDataDir, extensionPaths, });