From 877b5f91c29aa8ae025576c673f9af6da2afab65 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 31 Dec 2025 00:21:07 +0000 Subject: [PATCH 1/7] Derive CHROME_USER_DATA_DIR from ACTIVE_PERSONA in config system - Add _derive_persona_paths() in configset.py to automatically derive CHROME_USER_DATA_DIR and CHROME_EXTENSIONS_DIR from ACTIVE_PERSONA when not explicitly set. This allows plugins to use these paths without knowing about the persona system. - Update chrome_utils.js launchChromium() to accept userDataDir option and pass --user-data-dir to Chrome. Also cleans up SingletonLock before launch. - Update killZombieChrome() to clean up SingletonLock files from all persona chrome_user_data directories after killing zombies. - Update chrome_cleanup() in misc/util.py to handle persona-based user data directories when cleaning up stale Chrome state. - Simplify on_Crawl__20_chrome_launch.bg.js to use CHROME_USER_DATA_DIR and CHROME_EXTENSIONS_DIR from env (derived by get_config()). Config priority flow: ACTIVE_PERSONA=WorkAccount (set on crawl/snapshot) -> get_config() derives: CHROME_USER_DATA_DIR = PERSONAS_DIR/WorkAccount/chrome_user_data CHROME_EXTENSIONS_DIR = PERSONAS_DIR/WorkAccount/chrome_extensions -> hooks receive these as env vars without needing persona logic --- archivebox/config/configset.py | 46 +++++++++++++++++++ archivebox/misc/util.py | 44 ++++++++++++++++-- archivebox/plugins/chrome/chrome_utils.js | 46 +++++++++++++++++++ .../chrome/on_Crawl__20_chrome_launch.bg.js | 13 ++++-- 4 files changed, 143 insertions(+), 6 deletions(-) diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py index 4130a2bc..afc02c38 100644 --- a/archivebox/config/configset.py +++ b/archivebox/config/configset.py @@ -240,6 +240,52 @@ def get_config( except ImportError: pass + # Derive persona-based paths if not explicitly set + # This allows plugins to just use CHROME_USER_DATA_DIR without knowing about personas + config = _derive_persona_paths(config, CONSTANTS) + + return config + + +def _derive_persona_paths(config: Dict[str, Any], CONSTANTS: Any) -> Dict[str, Any]: + """ + Derive persona-specific paths from ACTIVE_PERSONA if not explicitly set. + + This runs after all config sources are merged, so plugins receive + the final resolved paths without needing to know about the persona system. + + Derived paths: + CHROME_USER_DATA_DIR <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_user_data + CHROME_EXTENSIONS_DIR <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_extensions + COOKIES_FILE <- PERSONAS_DIR / ACTIVE_PERSONA / cookies.txt (if exists) + """ + # Get active persona (defaults to "Default") + active_persona = config.get('ACTIVE_PERSONA') or config.get('DEFAULT_PERSONA') or 'Default' + + # Ensure ACTIVE_PERSONA is always set in config for downstream use + config['ACTIVE_PERSONA'] = active_persona + + # Get personas directory + personas_dir = CONSTANTS.PERSONAS_DIR + persona_dir = personas_dir / active_persona + + # Derive CHROME_USER_DATA_DIR if not explicitly set + chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR') + if not chrome_user_data_dir: + config['CHROME_USER_DATA_DIR'] = str(persona_dir / 'chrome_user_data') + + # Derive CHROME_EXTENSIONS_DIR if not explicitly set + chrome_extensions_dir = config.get('CHROME_EXTENSIONS_DIR') + if not chrome_extensions_dir: + config['CHROME_EXTENSIONS_DIR'] = str(persona_dir / 'chrome_extensions') + + # Derive COOKIES_FILE if not explicitly set and file exists + cookies_file = config.get('COOKIES_FILE') + if not cookies_file: + persona_cookies = persona_dir / 'cookies.txt' + if persona_cookies.exists(): + config['COOKIES_FILE'] = str(persona_cookies) + return config diff --git a/archivebox/misc/util.py b/archivebox/misc/util.py index 61354d80..423d187b 100644 --- a/archivebox/misc/util.py +++ b/archivebox/misc/util.py @@ -480,12 +480,50 @@ for url_str, num_urls in _test_url_strs.items(): def chrome_cleanup(): """ - Cleans up any state or runtime files that chrome leaves behind when killed by - a timeout or other error + Cleans up any state or runtime files that Chrome leaves behind when killed by + a timeout or other error. Handles: + - Persona-based chrome_user_data directories (from ACTIVE_PERSONA) + - Explicit CHROME_USER_DATA_DIR + - Legacy Docker chromium path """ import os + from pathlib import Path from archivebox.config.permissions import IN_DOCKER - + + # Clean up persona-based user data directories + try: + from archivebox.config.configset import get_config + from archivebox.config.constants import CONSTANTS + + config = get_config() + + # Clean up the active persona's chrome_user_data SingletonLock + chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR') + if chrome_user_data_dir: + singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock' + if singleton_lock.exists(): + try: + singleton_lock.unlink() + except OSError: + pass + + # Clean up all persona directories + personas_dir = CONSTANTS.PERSONAS_DIR + if personas_dir.exists(): + for persona_dir in personas_dir.iterdir(): + if not persona_dir.is_dir(): + continue + user_data_dir = persona_dir / 'chrome_user_data' + singleton_lock = user_data_dir / 'SingletonLock' + if singleton_lock.exists(): + try: + singleton_lock.unlink() + except OSError: + pass + except Exception: + pass # Config not available during early startup + + # Legacy Docker cleanup if IN_DOCKER: singleton_lock = "/home/archivebox/.config/chromium/SingletonLock" if os.path.lexists(singleton_lock): diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index d448923b..dda6612b 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -257,6 +257,31 @@ function killZombieChrome(dataDir = null) { console.error('[+] No zombies found'); } + // Clean up stale SingletonLock files from persona chrome_user_data directories + const personasDir = path.join(dataDir, 'personas'); + if (fs.existsSync(personasDir)) { + try { + const personas = fs.readdirSync(personasDir, { withFileTypes: true }); + for (const persona of personas) { + if (!persona.isDirectory()) continue; + + const userDataDir = path.join(personasDir, persona.name, 'chrome_user_data'); + const singletonLock = path.join(userDataDir, 'SingletonLock'); + + if (fs.existsSync(singletonLock)) { + try { + fs.unlinkSync(singletonLock); + console.error(`[+] Removed stale SingletonLock: ${singletonLock}`); + } catch (e) { + // Ignore - may be in use by active Chrome + } + } + } + } catch (e) { + // Ignore errors scanning personas directory + } + } + return killed; } @@ -270,6 +295,7 @@ function killZombieChrome(dataDir = null) { * @param {Object} options - Launch options * @param {string} [options.binary] - Chrome binary path (auto-detected if not provided) * @param {string} [options.outputDir='chrome'] - Directory for output files + * @param {string} [options.userDataDir] - Chrome user data directory for persistent sessions * @param {string} [options.resolution='1440,2000'] - Window resolution * @param {boolean} [options.headless=true] - Run in headless mode * @param {boolean} [options.checkSsl=true] - Check SSL certificates @@ -281,6 +307,7 @@ async function launchChromium(options = {}) { const { binary = findChromium(), outputDir = 'chrome', + userDataDir = getEnv('CHROME_USER_DATA_DIR'), resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'), headless = getEnvBool('CHROME_HEADLESS', true), checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)), @@ -304,6 +331,24 @@ async function launchChromium(options = {}) { fs.mkdirSync(outputDir, { recursive: true }); } + // Create user data directory if specified and doesn't exist + if (userDataDir) { + if (!fs.existsSync(userDataDir)) { + fs.mkdirSync(userDataDir, { recursive: true }); + console.error(`[*] Created user data directory: ${userDataDir}`); + } + // Clean up any stale SingletonLock file from previous crashed sessions + const singletonLock = path.join(userDataDir, 'SingletonLock'); + if (fs.existsSync(singletonLock)) { + try { + fs.unlinkSync(singletonLock); + console.error(`[*] Removed stale SingletonLock: ${singletonLock}`); + } catch (e) { + console.error(`[!] Failed to remove SingletonLock: ${e.message}`); + } + } + } + // Find a free port const debugPort = await findFreePort(); console.error(`[*] Using debug port: ${debugPort}`); @@ -335,6 +380,7 @@ async function launchChromium(options = {}) { '--font-render-hinting=none', '--force-color-profile=srgb', `--window-size=${width},${height}`, + ...(userDataDir ? [`--user-data-dir=${userDataDir}`] : []), ...(headless ? ['--headless=new'] : []), ...(checkSsl ? [] : ['--ignore-certificate-errors']), ]; diff --git a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js index c2d62775..ed264c95 100644 --- a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js +++ b/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js @@ -115,12 +115,17 @@ async function main() { if (version) console.error(`[*] Version: ${version}`); // Load installed extensions - const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') || - path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions'); + // CHROME_EXTENSIONS_DIR is derived from ACTIVE_PERSONA by get_config() in configset.py + const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR'); + const userDataDir = getEnv('CHROME_USER_DATA_DIR'); + + if (userDataDir) { + console.error(`[*] Using user data dir: ${userDataDir}`); + } const installedExtensions = []; const extensionPaths = []; - if (fs.existsSync(extensionsDir)) { + if (extensionsDir && fs.existsSync(extensionsDir)) { const files = fs.readdirSync(extensionsDir); for (const file of files) { if (file.endsWith('.extension.json')) { @@ -151,9 +156,11 @@ async function main() { writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime); // Launch Chromium using consolidated function + // userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set const result = await launchChromium({ binary, outputDir: OUTPUT_DIR, + userDataDir, extensionPaths, }); From 1a867895234d23ed7f41c8f712380bb5ed8c6836 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 31 Dec 2025 00:57:29 +0000 Subject: [PATCH 2/7] Move Chrome default args to config.json CHROME_ARGS - Add comprehensive default CHROME_ARGS in config.json with 55+ flags for deterministic rendering, security, performance, and UI suppression - Update chrome_utils.js launchChromium() to read CHROME_ARGS and CHROME_ARGS_EXTRA from environment variables (set by get_config()) - Add getEnvArray() helper to parse JSON arrays or comma-separated strings from environment variables - Separate args into three categories: 1. baseArgs: Static flags from CHROME_ARGS config (configurable) 2. dynamicArgs: Runtime-computed flags (port, sandbox, headless, etc.) 3. extraArgs: User overrides from CHROME_ARGS_EXTRA - Add CHROME_SANDBOX config option to control --no-sandbox flag Args are now configurable via: - config.json defaults - ArchiveBox.conf file - Environment variables - Per-crawl/snapshot config overrides --- archivebox/plugins/chrome/chrome_utils.js | 81 +++++++++++++++++------ archivebox/plugins/chrome/config.json | 66 ++++++++++++++++-- 2 files changed, 121 insertions(+), 26 deletions(-) diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index dda6612b..def11874 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -56,6 +56,36 @@ function getEnvInt(name, defaultValue = 0) { return isNaN(val) ? defaultValue : val; } +/** + * Get array environment variable (JSON array or comma-separated string). + * @param {string} name - Environment variable name + * @param {string[]} [defaultValue=[]] - Default value if not set + * @returns {string[]} - Array of strings + */ +function getEnvArray(name, defaultValue = []) { + const val = getEnv(name, ''); + if (!val) return defaultValue; + + // Try parsing as JSON array first + if (val.startsWith('[')) { + try { + const parsed = JSON.parse(val); + if (Array.isArray(parsed)) return parsed; + } catch (e) { + // Fall through to comma-separated parsing + } + } + + // Parse as comma-separated (but be careful with args that contain commas) + // For Chrome args, we split on comma followed by '--' to be safe + if (val.includes(',--')) { + return val.split(/,(?=--)/).map(s => s.trim()).filter(Boolean); + } + + // Simple comma-separated + return val.split(',').map(s => s.trim()).filter(Boolean); +} + /** * Parse resolution string into width/height. * @param {string} resolution - Resolution string like "1440,2000" @@ -298,6 +328,7 @@ function killZombieChrome(dataDir = null) { * @param {string} [options.userDataDir] - Chrome user data directory for persistent sessions * @param {string} [options.resolution='1440,2000'] - Window resolution * @param {boolean} [options.headless=true] - Run in headless mode + * @param {boolean} [options.sandbox=true] - Enable Chrome sandbox * @param {boolean} [options.checkSsl=true] - Check SSL certificates * @param {string[]} [options.extensionPaths=[]] - Paths to unpacked extensions * @param {boolean} [options.killZombies=true] - Kill zombie processes first @@ -310,6 +341,7 @@ async function launchChromium(options = {}) { userDataDir = getEnv('CHROME_USER_DATA_DIR'), resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'), headless = getEnvBool('CHROME_HEADLESS', true), + sandbox = getEnvBool('CHROME_SANDBOX', true), checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)), extensionPaths = [], killZombies = true, @@ -353,38 +385,43 @@ async function launchChromium(options = {}) { const debugPort = await findFreePort(); console.error(`[*] Using debug port: ${debugPort}`); - // Build Chrome arguments - const chromiumArgs = [ + // Get base Chrome args from config (static flags from CHROME_ARGS env var) + // These come from config.json defaults, merged by get_config() in Python + const baseArgs = getEnvArray('CHROME_ARGS', []); + + // Get extra user-provided args + const extraArgs = getEnvArray('CHROME_ARGS_EXTRA', []); + + // Build dynamic Chrome arguments (these must be computed at runtime) + const dynamicArgs = [ + // Remote debugging setup `--remote-debugging-port=${debugPort}`, '--remote-debugging-address=127.0.0.1', - '--no-sandbox', - '--disable-setuid-sandbox', + + // Sandbox settings (disable in Docker) + ...(sandbox ? [] : ['--no-sandbox', '--disable-setuid-sandbox']), + + // Docker-specific workarounds '--disable-dev-shm-usage', '--disable-gpu', - '--disable-sync', - '--no-first-run', - '--no-default-browser-check', - '--disable-default-apps', - '--disable-infobars', - '--disable-blink-features=AutomationControlled', - '--disable-component-update', - '--disable-domain-reliability', - '--disable-breakpad', - '--disable-background-networking', - '--disable-background-timer-throttling', - '--disable-backgrounding-occluded-windows', - '--disable-renderer-backgrounding', - '--disable-ipc-flooding-protection', - '--password-store=basic', - '--use-mock-keychain', - '--font-render-hinting=none', - '--force-color-profile=srgb', + + // Window size `--window-size=${width},${height}`, + + // User data directory (for persistent sessions with persona) ...(userDataDir ? [`--user-data-dir=${userDataDir}`] : []), + + // Headless mode ...(headless ? ['--headless=new'] : []), + + // SSL certificate checking ...(checkSsl ? [] : ['--ignore-certificate-errors']), ]; + // Combine all args: base (from config) + dynamic (runtime) + extra (user overrides) + // Dynamic args come after base so they can override if needed + const chromiumArgs = [...baseArgs, ...dynamicArgs, ...extraArgs]; + // Add extension loading flags if (extensionPaths.length > 0) { const extPathsArg = extensionPaths.join(','); diff --git a/archivebox/plugins/chrome/config.json b/archivebox/plugins/chrome/config.json index 4ff40faa..0bc9e754 100644 --- a/archivebox/plugins/chrome/config.json +++ b/archivebox/plugins/chrome/config.json @@ -42,7 +42,7 @@ "CHROME_USER_DATA_DIR": { "type": "string", "default": "", - "description": "Path to Chrome user data directory for persistent sessions" + "description": "Path to Chrome user data directory for persistent sessions (derived from ACTIVE_PERSONA if not set)" }, "CHROME_USER_AGENT": { "type": "string", @@ -53,16 +53,74 @@ "CHROME_ARGS": { "type": "array", "items": {"type": "string"}, - "default": [], + "default": [ + "--no-first-run", + "--no-default-browser-check", + "--disable-default-apps", + "--disable-sync", + "--disable-infobars", + "--disable-blink-features=AutomationControlled", + "--disable-component-update", + "--disable-domain-reliability", + "--disable-breakpad", + "--disable-client-side-phishing-detection", + "--disable-hang-monitor", + "--disable-speech-synthesis-api", + "--disable-speech-api", + "--disable-print-preview", + "--disable-notifications", + "--disable-desktop-notifications", + "--disable-popup-blocking", + "--disable-prompt-on-repost", + "--disable-external-intent-requests", + "--disable-session-crashed-bubble", + "--disable-search-engine-choice-screen", + "--disable-datasaver-prompt", + "--ash-no-nudges", + "--hide-crash-restore-bubble", + "--suppress-message-center-popups", + "--noerrdialogs", + "--no-pings", + "--silent-debugger-extension-api", + "--deny-permission-prompts", + "--safebrowsing-disable-auto-update", + "--metrics-recording-only", + "--password-store=basic", + "--use-mock-keychain", + "--disable-cookie-encryption", + "--font-render-hinting=none", + "--force-color-profile=srgb", + "--disable-partial-raster", + "--disable-skia-runtime-opts", + "--disable-2d-canvas-clip-aa", + "--enable-webgl", + "--hide-scrollbars", + "--export-tagged-pdf", + "--generate-pdf-document-outline", + "--disable-lazy-loading", + "--disable-renderer-backgrounding", + "--disable-background-networking", + "--disable-background-timer-throttling", + "--disable-backgrounding-occluded-windows", + "--disable-ipc-flooding-protection", + "--disable-extensions-http-throttling", + "--disable-field-trial-config", + "--disable-back-forward-cache", + "--autoplay-policy=no-user-gesture-required", + "--disable-gesture-requirement-for-media-playback", + "--lang=en-US,en;q=0.9", + "--log-level=2", + "--enable-logging=stderr" + ], "x-aliases": ["CHROME_DEFAULT_ARGS"], - "description": "Default Chrome command-line arguments" + "description": "Default Chrome command-line arguments (static flags only, dynamic args like --user-data-dir are added at runtime)" }, "CHROME_ARGS_EXTRA": { "type": "array", "items": {"type": "string"}, "default": [], "x-aliases": ["CHROME_EXTRA_ARGS"], - "description": "Extra arguments to append to Chrome command" + "description": "Extra arguments to append to Chrome command (for user customization)" }, "CHROME_PAGELOAD_TIMEOUT": { "type": "integer", From 503a2f77cb5282dd4c97ca8d62b697ef71d39dd5 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 31 Dec 2025 00:59:37 +0000 Subject: [PATCH 3/7] Add Persona class with cleanup_chrome() method - Create Persona class in personas/models.py for managing browser profiles/identities used for archiving sessions - Each Persona has: - chrome_user_data_dir: Chrome profile directory - chrome_extensions_dir: Installed extensions - cookies_file: Cookies for wget/curl - config_file: Persona-specific config overrides - Add Persona methods: - cleanup_chrome(): Remove stale SingletonLock/SingletonSocket files - get_config(): Load persona config from config.json - save_config(): Save persona config to config.json - ensure_dirs(): Create persona directory structure - all(): Iterator over all personas - get_active(): Get persona based on ACTIVE_PERSONA config - cleanup_chrome_all(): Clean up all personas - Update chrome_cleanup() in misc/util.py to use Persona.cleanup_chrome_all() instead of manual directory iteration - Add convenience functions: - cleanup_chrome_for_persona(name) - cleanup_chrome_all_personas() --- archivebox/misc/util.py | 35 ++-- archivebox/personas/models.py | 296 +++++++++++++++++++++++++++------- 2 files changed, 254 insertions(+), 77 deletions(-) diff --git a/archivebox/misc/util.py b/archivebox/misc/util.py index 423d187b..67e9b45b 100644 --- a/archivebox/misc/util.py +++ b/archivebox/misc/util.py @@ -482,22 +482,25 @@ def chrome_cleanup(): """ Cleans up any state or runtime files that Chrome leaves behind when killed by a timeout or other error. Handles: - - Persona-based chrome_user_data directories (from ACTIVE_PERSONA) - - Explicit CHROME_USER_DATA_DIR + - All persona chrome_user_data directories (via Persona.cleanup_chrome_all()) + - Explicit CHROME_USER_DATA_DIR from config - Legacy Docker chromium path """ import os from pathlib import Path from archivebox.config.permissions import IN_DOCKER - # Clean up persona-based user data directories + # Clean up all persona chrome directories using Persona class try: + from archivebox.personas.models import Persona + + # Clean up all personas + Persona.cleanup_chrome_all() + + # Also clean up the active persona's explicit CHROME_USER_DATA_DIR if set + # (in case it's a custom path not under PERSONAS_DIR) from archivebox.config.configset import get_config - from archivebox.config.constants import CONSTANTS - config = get_config() - - # Clean up the active persona's chrome_user_data SingletonLock chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR') if chrome_user_data_dir: singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock' @@ -506,24 +509,10 @@ def chrome_cleanup(): singleton_lock.unlink() except OSError: pass - - # Clean up all persona directories - personas_dir = CONSTANTS.PERSONAS_DIR - if personas_dir.exists(): - for persona_dir in personas_dir.iterdir(): - if not persona_dir.is_dir(): - continue - user_data_dir = persona_dir / 'chrome_user_data' - singleton_lock = user_data_dir / 'SingletonLock' - if singleton_lock.exists(): - try: - singleton_lock.unlink() - except OSError: - pass except Exception: - pass # Config not available during early startup + pass # Persona/config not available during early startup - # Legacy Docker cleanup + # Legacy Docker cleanup (for backwards compatibility) if IN_DOCKER: singleton_lock = "/home/archivebox/.config/chromium/SingletonLock" if os.path.lexists(singleton_lock): diff --git a/archivebox/personas/models.py b/archivebox/personas/models.py index 99f8ef87..3b38c49f 100644 --- a/archivebox/personas/models.py +++ b/archivebox/personas/models.py @@ -1,59 +1,247 @@ -# from django.db import models +""" +Persona management for ArchiveBox. -# from django.conf import settings +A Persona represents a browser profile/identity used for archiving. +Each persona has its own: +- Chrome user data directory (for cookies, localStorage, extensions, etc.) +- Chrome extensions directory +- Cookies file +- Config overrides + +Personas are stored as directories under PERSONAS_DIR (default: data/personas/). +""" + +__package__ = 'archivebox.personas' + +from pathlib import Path +from typing import Optional, Dict, Any, Iterator -# class Persona(models.Model): -# """Aka a "SessionType", its a template for a crawler browsing session containing some config.""" +class Persona: + """ + Represents a browser persona/profile for archiving sessions. -# id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') - -# created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False) -# created_at = AutoDateTimeField(default=None, null=False, db_index=True) -# modified_at = models.DateTimeField(auto_now=True) - -# name = models.CharField(max_length=100, blank=False, null=False, editable=False) - -# persona_dir = models.FilePathField(path=settings.PERSONAS_DIR, allow_files=False, allow_folders=True, blank=True, null=False, editable=False) -# config = models.JSONField(default=dict) -# # e.g. { -# # USER_AGENT: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36', -# # COOKIES_TXT_FILE: '/path/to/cookies.txt', -# # CHROME_USER_DATA_DIR: '/path/to/chrome/user/data/dir', -# # CHECK_SSL_VALIDITY: False, -# # SAVE_ARCHIVEDOTORG: True, -# # CHROME_BINARY: 'chromium' -# # ... -# # } -# # domain_allowlist = models.CharField(max_length=1024, blank=True, null=False, default='') -# # domain_denylist = models.CharField(max_length=1024, blank=True, null=False, default='') - -# class Meta: -# app_label = 'personas' -# verbose_name = 'Session Type' -# verbose_name_plural = 'Session Types' -# unique_together = (('created_by', 'name'),) - + Each persona is a directory containing: + - chrome_user_data/ Chrome profile directory + - chrome_extensions/ Installed extensions + - cookies.txt Cookies file for wget/curl + - config.json Persona-specific config overrides -# def clean(self): -# self.persona_dir = settings.PERSONAS_DIR / self.name -# assert self.persona_dir == settings.PERSONAS_DIR / self.name, f'Persona dir {self.persona_dir} must match settings.PERSONAS_DIR / self.name' - - -# # make sure config keys all exist in FLAT_CONFIG -# # make sure config values all match expected types -# pass - -# def save(self, *args, **kwargs): -# self.full_clean() - -# # make sure basic file structure is present in persona_dir: -# # - PERSONAS_DIR / self.name / -# # - chrome_profile/ -# # - chrome_downloads/ -# # - chrome_extensions/ -# # - cookies.txt -# # - auth.json -# # - config.json # json dump of the model - -# super().save(*args, **kwargs) + Usage: + persona = Persona('Default') + persona.cleanup_chrome() + + # Or iterate all personas: + for persona in Persona.all(): + persona.cleanup_chrome() + """ + + def __init__(self, name: str, personas_dir: Optional[Path] = None): + """ + Initialize a Persona by name. + + Args: + name: Persona name (directory name under PERSONAS_DIR) + personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR) + """ + self.name = name + + if personas_dir is None: + from archivebox.config.constants import CONSTANTS + personas_dir = CONSTANTS.PERSONAS_DIR + + self.personas_dir = Path(personas_dir) + self.path = self.personas_dir / name + + @property + def chrome_user_data_dir(self) -> Path: + """Path to Chrome user data directory for this persona.""" + return self.path / 'chrome_user_data' + + @property + def chrome_extensions_dir(self) -> Path: + """Path to Chrome extensions directory for this persona.""" + return self.path / 'chrome_extensions' + + @property + def cookies_file(self) -> Path: + """Path to cookies.txt file for this persona.""" + return self.path / 'cookies.txt' + + @property + def config_file(self) -> Path: + """Path to config.json file for this persona.""" + return self.path / 'config.json' + + @property + def singleton_lock(self) -> Path: + """Path to Chrome's SingletonLock file.""" + return self.chrome_user_data_dir / 'SingletonLock' + + def exists(self) -> bool: + """Check if persona directory exists.""" + return self.path.is_dir() + + def ensure_dirs(self) -> None: + """Create persona directories if they don't exist.""" + self.path.mkdir(parents=True, exist_ok=True) + self.chrome_user_data_dir.mkdir(parents=True, exist_ok=True) + self.chrome_extensions_dir.mkdir(parents=True, exist_ok=True) + + def cleanup_chrome(self) -> bool: + """ + Clean up Chrome state files for this persona. + + Removes stale SingletonLock files left behind when Chrome crashes + or is killed unexpectedly. This allows Chrome to start fresh. + + Returns: + True if cleanup was performed, False if no cleanup needed + """ + cleaned = False + + # Remove SingletonLock if it exists + if self.singleton_lock.exists(): + try: + self.singleton_lock.unlink() + cleaned = True + except OSError: + pass # May be in use by active Chrome + + # Also clean up any other stale lock files Chrome might leave + if self.chrome_user_data_dir.exists(): + for lock_file in self.chrome_user_data_dir.glob('**/SingletonLock'): + try: + lock_file.unlink() + cleaned = True + except OSError: + pass + + # Clean up socket files + for socket_file in self.chrome_user_data_dir.glob('**/SingletonSocket'): + try: + socket_file.unlink() + cleaned = True + except OSError: + pass + + return cleaned + + def get_config(self) -> Dict[str, Any]: + """ + Load persona-specific config overrides from config.json. + + Returns: + Dict of config overrides, or empty dict if no config file + """ + import json + + if not self.config_file.exists(): + return {} + + try: + return json.loads(self.config_file.read_text()) + except (json.JSONDecodeError, OSError): + return {} + + def save_config(self, config: Dict[str, Any]) -> None: + """ + Save persona-specific config overrides to config.json. + + Args: + config: Dict of config overrides to save + """ + import json + + self.ensure_dirs() + self.config_file.write_text(json.dumps(config, indent=2)) + + @classmethod + def all(cls, personas_dir: Optional[Path] = None) -> Iterator['Persona']: + """ + Iterate over all personas in PERSONAS_DIR. + + Args: + personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR) + + Yields: + Persona instances for each persona directory + """ + if personas_dir is None: + from archivebox.config.constants import CONSTANTS + personas_dir = CONSTANTS.PERSONAS_DIR + + personas_dir = Path(personas_dir) + + if not personas_dir.exists(): + return + + for persona_path in personas_dir.iterdir(): + if persona_path.is_dir(): + yield cls(persona_path.name, personas_dir) + + @classmethod + def get_active(cls) -> 'Persona': + """ + Get the currently active persona based on ACTIVE_PERSONA config. + + Returns: + Persona instance for the active persona + """ + from archivebox.config.configset import get_config + + config = get_config() + active_name = config.get('ACTIVE_PERSONA', 'Default') + return cls(active_name) + + @classmethod + def cleanup_chrome_all(cls, personas_dir: Optional[Path] = None) -> int: + """ + Clean up Chrome state files for all personas. + + Args: + personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR) + + Returns: + Number of personas that had cleanup performed + """ + cleaned_count = 0 + for persona in cls.all(personas_dir): + if persona.cleanup_chrome(): + cleaned_count += 1 + return cleaned_count + + def __str__(self) -> str: + return f"Persona({self.name})" + + def __repr__(self) -> str: + return f"Persona(name={self.name!r}, path={self.path!r})" + + +# Convenience functions for use without instantiating Persona class + +def cleanup_chrome_for_persona(name: str, personas_dir: Optional[Path] = None) -> bool: + """ + Clean up Chrome state files for a specific persona. + + Args: + name: Persona name + personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR) + + Returns: + True if cleanup was performed, False if no cleanup needed + """ + return Persona(name, personas_dir).cleanup_chrome() + + +def cleanup_chrome_all_personas(personas_dir: Optional[Path] = None) -> int: + """ + Clean up Chrome state files for all personas. + + Args: + personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR) + + Returns: + Number of personas that had cleanup performed + """ + return Persona.cleanup_chrome_all(personas_dir) From b1e31c3def83861797d4bfda11460b2e5cc4402a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 31 Dec 2025 01:00:52 +0000 Subject: [PATCH 4/7] Simplify Persona class: remove convenience functions, fix get_active() - Remove standalone convenience functions (cleanup_chrome_for_persona, cleanup_chrome_all_personas) to reduce LOC - Change Persona.get_active(config) to accept config dict as argument instead of calling get_config() internally, since the caller needs to pass user/crawl/snapshot/archiveresult context for proper config --- archivebox/personas/models.py | 41 +++++------------------------------ 1 file changed, 6 insertions(+), 35 deletions(-) diff --git a/archivebox/personas/models.py b/archivebox/personas/models.py index 3b38c49f..87e7369e 100644 --- a/archivebox/personas/models.py +++ b/archivebox/personas/models.py @@ -181,17 +181,17 @@ class Persona: yield cls(persona_path.name, personas_dir) @classmethod - def get_active(cls) -> 'Persona': + def get_active(cls, config: Dict[str, Any]) -> 'Persona': """ - Get the currently active persona based on ACTIVE_PERSONA config. + Get the currently active persona from a merged config dict. + + Args: + config: Merged config dict from get_config(user=, crawl=, snapshot=, ...) Returns: Persona instance for the active persona """ - from archivebox.config.configset import get_config - - config = get_config() - active_name = config.get('ACTIVE_PERSONA', 'Default') + active_name = config.get('ACTIVE_PERSONA') or config.get('DEFAULT_PERSONA') or 'Default' return cls(active_name) @classmethod @@ -216,32 +216,3 @@ class Persona: def __repr__(self) -> str: return f"Persona(name={self.name!r}, path={self.path!r})" - - -# Convenience functions for use without instantiating Persona class - -def cleanup_chrome_for_persona(name: str, personas_dir: Optional[Path] = None) -> bool: - """ - Clean up Chrome state files for a specific persona. - - Args: - name: Persona name - personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR) - - Returns: - True if cleanup was performed, False if no cleanup needed - """ - return Persona(name, personas_dir).cleanup_chrome() - - -def cleanup_chrome_all_personas(personas_dir: Optional[Path] = None) -> int: - """ - Clean up Chrome state files for all personas. - - Args: - personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR) - - Returns: - Number of personas that had cleanup performed - """ - return Persona.cleanup_chrome_all(personas_dir) From b8a66c4a84b991cc6075cce8e0bff51633867baa Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 31 Dec 2025 01:07:29 +0000 Subject: [PATCH 5/7] Convert Persona to Django ModelWithConfig, add to get_config() - Convert Persona from plain Python class to Django model with ModelWithConfig - Add config JSONField for persona-specific config overrides - Add get_derived_config() method that returns config with derived paths: - CHROME_USER_DATA_DIR, CHROME_EXTENSIONS_DIR, COOKIES_FILE, ACTIVE_PERSONA - Update get_config() to accept persona parameter in merge chain: get_config(persona=crawl.persona, crawl=crawl, snapshot=snapshot) - Remove _derive_persona_paths() - derivation now happens in Persona model - Merge order (highest to lowest priority): 1. snapshot.config 2. crawl.config 3. user.config 4. persona.get_derived_config() <- NEW 5. environment variables 6. ArchiveBox.conf file 7. plugin defaults 8. core defaults Usage: config = get_config(persona=crawl.persona, crawl=crawl) config['CHROME_USER_DATA_DIR'] # derived from persona --- archivebox/config/configset.py | 61 ++------ archivebox/personas/models.py | 269 +++++++++++++-------------------- 2 files changed, 114 insertions(+), 216 deletions(-) diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py index afc02c38..00835ab7 100644 --- a/archivebox/config/configset.py +++ b/archivebox/config/configset.py @@ -120,6 +120,7 @@ class BaseConfigSet(BaseSettings): def get_config( scope: str = "global", defaults: Optional[Dict] = None, + persona: Any = None, user: Any = None, crawl: Any = None, snapshot: Any = None, @@ -131,14 +132,16 @@ def get_config( 1. Per-snapshot config (snapshot.config JSON field) 2. Per-crawl config (crawl.config JSON field) 3. Per-user config (user.config JSON field) - 4. Environment variables - 5. Config file (ArchiveBox.conf) - 6. Plugin schema defaults (config.json) - 7. Core config defaults + 4. Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.) + 5. Environment variables + 6. Config file (ArchiveBox.conf) + 7. Plugin schema defaults (config.json) + 8. Core config defaults Args: scope: Config scope ('global', 'crawl', 'snapshot', etc.) defaults: Default values to start with + persona: Persona object (provides derived paths like CHROME_USER_DATA_DIR) user: User object with config JSON field crawl: Crawl object with config JSON field snapshot: Snapshot object with config JSON field @@ -205,6 +208,10 @@ def get_config( except ImportError: pass + # Apply persona config overrides (includes derived paths like CHROME_USER_DATA_DIR) + if persona and hasattr(persona, "get_derived_config"): + config.update(persona.get_derived_config()) + # Apply user config overrides if user and hasattr(user, "config") and user.config: config.update(user.config) @@ -240,52 +247,6 @@ def get_config( except ImportError: pass - # Derive persona-based paths if not explicitly set - # This allows plugins to just use CHROME_USER_DATA_DIR without knowing about personas - config = _derive_persona_paths(config, CONSTANTS) - - return config - - -def _derive_persona_paths(config: Dict[str, Any], CONSTANTS: Any) -> Dict[str, Any]: - """ - Derive persona-specific paths from ACTIVE_PERSONA if not explicitly set. - - This runs after all config sources are merged, so plugins receive - the final resolved paths without needing to know about the persona system. - - Derived paths: - CHROME_USER_DATA_DIR <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_user_data - CHROME_EXTENSIONS_DIR <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_extensions - COOKIES_FILE <- PERSONAS_DIR / ACTIVE_PERSONA / cookies.txt (if exists) - """ - # Get active persona (defaults to "Default") - active_persona = config.get('ACTIVE_PERSONA') or config.get('DEFAULT_PERSONA') or 'Default' - - # Ensure ACTIVE_PERSONA is always set in config for downstream use - config['ACTIVE_PERSONA'] = active_persona - - # Get personas directory - personas_dir = CONSTANTS.PERSONAS_DIR - persona_dir = personas_dir / active_persona - - # Derive CHROME_USER_DATA_DIR if not explicitly set - chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR') - if not chrome_user_data_dir: - config['CHROME_USER_DATA_DIR'] = str(persona_dir / 'chrome_user_data') - - # Derive CHROME_EXTENSIONS_DIR if not explicitly set - chrome_extensions_dir = config.get('CHROME_EXTENSIONS_DIR') - if not chrome_extensions_dir: - config['CHROME_EXTENSIONS_DIR'] = str(persona_dir / 'chrome_extensions') - - # Derive COOKIES_FILE if not explicitly set and file exists - cookies_file = config.get('COOKIES_FILE') - if not cookies_file: - persona_cookies = persona_dir / 'cookies.txt' - if persona_cookies.exists(): - config['COOKIES_FILE'] = str(persona_cookies) - return config diff --git a/archivebox/personas/models.py b/archivebox/personas/models.py index 87e7369e..470ec846 100644 --- a/archivebox/personas/models.py +++ b/archivebox/personas/models.py @@ -7,212 +7,149 @@ Each persona has its own: - Chrome extensions directory - Cookies file - Config overrides - -Personas are stored as directories under PERSONAS_DIR (default: data/personas/). """ __package__ = 'archivebox.personas' from pathlib import Path -from typing import Optional, Dict, Any, Iterator +from typing import TYPE_CHECKING, Iterator + +from django.db import models +from django.conf import settings +from django.utils import timezone + +from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk + +if TYPE_CHECKING: + from django.db.models import QuerySet -class Persona: +class Persona(ModelWithConfig): """ - Represents a browser persona/profile for archiving sessions. + Browser persona/profile for archiving sessions. - Each persona is a directory containing: - - chrome_user_data/ Chrome profile directory - - chrome_extensions/ Installed extensions - - cookies.txt Cookies file for wget/curl - - config.json Persona-specific config overrides + Each persona provides: + - CHROME_USER_DATA_DIR: Chrome profile directory + - CHROME_EXTENSIONS_DIR: Installed extensions directory + - COOKIES_FILE: Cookies file for wget/curl + - config: JSON field with persona-specific config overrides Usage: - persona = Persona('Default') - persona.cleanup_chrome() + # Get persona and its derived config + config = get_config(persona=crawl.persona, crawl=crawl, snapshot=snapshot) + chrome_dir = config['CHROME_USER_DATA_DIR'] - # Or iterate all personas: - for persona in Persona.all(): - persona.cleanup_chrome() + # Or access directly from persona + persona = Persona.objects.get(name='Default') + persona.CHROME_USER_DATA_DIR # -> Path to chrome_user_data """ - def __init__(self, name: str, personas_dir: Optional[Path] = None): + name = models.CharField(max_length=64, unique=True) + created_at = models.DateTimeField(default=timezone.now, db_index=True) + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk) + + class Meta: + app_label = 'personas' + + def __str__(self) -> str: + return self.name + + @property + def path(self) -> Path: + """Path to persona directory under PERSONAS_DIR.""" + from archivebox.config.constants import CONSTANTS + return CONSTANTS.PERSONAS_DIR / self.name + + @property + def CHROME_USER_DATA_DIR(self) -> str: + """Derived path to Chrome user data directory for this persona.""" + return str(self.path / 'chrome_user_data') + + @property + def CHROME_EXTENSIONS_DIR(self) -> str: + """Derived path to Chrome extensions directory for this persona.""" + return str(self.path / 'chrome_extensions') + + @property + def COOKIES_FILE(self) -> str: + """Derived path to cookies.txt file for this persona (if exists).""" + cookies_path = self.path / 'cookies.txt' + return str(cookies_path) if cookies_path.exists() else '' + + def get_derived_config(self) -> dict: """ - Initialize a Persona by name. + Get config dict with derived paths filled in. - Args: - name: Persona name (directory name under PERSONAS_DIR) - personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR) + Returns dict with: + - All values from self.config JSONField + - CHROME_USER_DATA_DIR (derived from persona path) + - CHROME_EXTENSIONS_DIR (derived from persona path) + - COOKIES_FILE (derived from persona path, if file exists) + - ACTIVE_PERSONA (set to this persona's name) """ - self.name = name + derived = dict(self.config or {}) - if personas_dir is None: - from archivebox.config.constants import CONSTANTS - personas_dir = CONSTANTS.PERSONAS_DIR + # Add derived paths (don't override if explicitly set in config) + if 'CHROME_USER_DATA_DIR' not in derived: + derived['CHROME_USER_DATA_DIR'] = self.CHROME_USER_DATA_DIR + if 'CHROME_EXTENSIONS_DIR' not in derived: + derived['CHROME_EXTENSIONS_DIR'] = self.CHROME_EXTENSIONS_DIR + if 'COOKIES_FILE' not in derived and self.COOKIES_FILE: + derived['COOKIES_FILE'] = self.COOKIES_FILE - self.personas_dir = Path(personas_dir) - self.path = self.personas_dir / name + # Always set ACTIVE_PERSONA to this persona's name + derived['ACTIVE_PERSONA'] = self.name - @property - def chrome_user_data_dir(self) -> Path: - """Path to Chrome user data directory for this persona.""" - return self.path / 'chrome_user_data' - - @property - def chrome_extensions_dir(self) -> Path: - """Path to Chrome extensions directory for this persona.""" - return self.path / 'chrome_extensions' - - @property - def cookies_file(self) -> Path: - """Path to cookies.txt file for this persona.""" - return self.path / 'cookies.txt' - - @property - def config_file(self) -> Path: - """Path to config.json file for this persona.""" - return self.path / 'config.json' - - @property - def singleton_lock(self) -> Path: - """Path to Chrome's SingletonLock file.""" - return self.chrome_user_data_dir / 'SingletonLock' - - def exists(self) -> bool: - """Check if persona directory exists.""" - return self.path.is_dir() + return derived def ensure_dirs(self) -> None: """Create persona directories if they don't exist.""" self.path.mkdir(parents=True, exist_ok=True) - self.chrome_user_data_dir.mkdir(parents=True, exist_ok=True) - self.chrome_extensions_dir.mkdir(parents=True, exist_ok=True) + (self.path / 'chrome_user_data').mkdir(parents=True, exist_ok=True) + (self.path / 'chrome_extensions').mkdir(parents=True, exist_ok=True) def cleanup_chrome(self) -> bool: """ - Clean up Chrome state files for this persona. - - Removes stale SingletonLock files left behind when Chrome crashes - or is killed unexpectedly. This allows Chrome to start fresh. + Clean up Chrome state files (SingletonLock, etc.) for this persona. Returns: True if cleanup was performed, False if no cleanup needed """ cleaned = False + chrome_dir = self.path / 'chrome_user_data' - # Remove SingletonLock if it exists - if self.singleton_lock.exists(): + if not chrome_dir.exists(): + return False + + # Clean up SingletonLock files + for lock_file in chrome_dir.glob('**/SingletonLock'): try: - self.singleton_lock.unlink() + lock_file.unlink() cleaned = True except OSError: - pass # May be in use by active Chrome + pass - # Also clean up any other stale lock files Chrome might leave - if self.chrome_user_data_dir.exists(): - for lock_file in self.chrome_user_data_dir.glob('**/SingletonLock'): - try: - lock_file.unlink() - cleaned = True - except OSError: - pass - - # Clean up socket files - for socket_file in self.chrome_user_data_dir.glob('**/SingletonSocket'): - try: - socket_file.unlink() - cleaned = True - except OSError: - pass + # Clean up SingletonSocket files + for socket_file in chrome_dir.glob('**/SingletonSocket'): + try: + socket_file.unlink() + cleaned = True + except OSError: + pass return cleaned - def get_config(self) -> Dict[str, Any]: - """ - Load persona-specific config overrides from config.json. - - Returns: - Dict of config overrides, or empty dict if no config file - """ - import json - - if not self.config_file.exists(): - return {} - - try: - return json.loads(self.config_file.read_text()) - except (json.JSONDecodeError, OSError): - return {} - - def save_config(self, config: Dict[str, Any]) -> None: - """ - Save persona-specific config overrides to config.json. - - Args: - config: Dict of config overrides to save - """ - import json - - self.ensure_dirs() - self.config_file.write_text(json.dumps(config, indent=2)) + @classmethod + def get_or_create_default(cls) -> 'Persona': + """Get or create the Default persona.""" + persona, _ = cls.objects.get_or_create(name='Default') + return persona @classmethod - def all(cls, personas_dir: Optional[Path] = None) -> Iterator['Persona']: - """ - Iterate over all personas in PERSONAS_DIR. - - Args: - personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR) - - Yields: - Persona instances for each persona directory - """ - if personas_dir is None: - from archivebox.config.constants import CONSTANTS - personas_dir = CONSTANTS.PERSONAS_DIR - - personas_dir = Path(personas_dir) - - if not personas_dir.exists(): - return - - for persona_path in personas_dir.iterdir(): - if persona_path.is_dir(): - yield cls(persona_path.name, personas_dir) - - @classmethod - def get_active(cls, config: Dict[str, Any]) -> 'Persona': - """ - Get the currently active persona from a merged config dict. - - Args: - config: Merged config dict from get_config(user=, crawl=, snapshot=, ...) - - Returns: - Persona instance for the active persona - """ - active_name = config.get('ACTIVE_PERSONA') or config.get('DEFAULT_PERSONA') or 'Default' - return cls(active_name) - - @classmethod - def cleanup_chrome_all(cls, personas_dir: Optional[Path] = None) -> int: - """ - Clean up Chrome state files for all personas. - - Args: - personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR) - - Returns: - Number of personas that had cleanup performed - """ - cleaned_count = 0 - for persona in cls.all(personas_dir): + def cleanup_chrome_all(cls) -> int: + """Clean up Chrome state files for all personas.""" + cleaned = 0 + for persona in cls.objects.all(): if persona.cleanup_chrome(): - cleaned_count += 1 - return cleaned_count - - def __str__(self) -> str: - return f"Persona({self.name})" - - def __repr__(self) -> str: - return f"Persona(name={self.name!r}, path={self.path!r})" + cleaned += 1 + return cleaned From f7b186d7c8c643edb5a65084dc8870e4dcc35136 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 31 Dec 2025 02:31:46 -0500 Subject: [PATCH 6/7] Apply suggestion from @cubic-dev-ai[bot] Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com> --- archivebox/misc/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archivebox/misc/util.py b/archivebox/misc/util.py index 67e9b45b..c69c8c86 100644 --- a/archivebox/misc/util.py +++ b/archivebox/misc/util.py @@ -504,7 +504,7 @@ def chrome_cleanup(): chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR') if chrome_user_data_dir: singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock' - if singleton_lock.exists(): + if os.path.lexists(singleton_lock): try: singleton_lock.unlink() except OSError: From 4285a05d19a8b246fbdcbad2ef66f186ed0b1ed7 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Wed, 31 Dec 2025 07:39:49 +0000 Subject: [PATCH 7/7] Fix getEnvArray to parse JSON when '[' present, CSV otherwise Simplifies the comma-separated parsing logic to: - If value contains '[', parse as JSON array - Otherwise, parse as comma-separated values This prevents incorrect splitting of arguments containing internal commas when there's only one argument. For arguments with commas, users should use JSON format: CHROME_ARGS='["--arg1,val", "--arg2"]' Also exports getEnvArray in module.exports for consistency. Co-authored-by: Nick Sweeting --- archivebox/plugins/chrome/chrome_utils.js | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index def11874..263f2cbf 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -58,6 +58,15 @@ function getEnvInt(name, defaultValue = 0) { /** * Get array environment variable (JSON array or comma-separated string). + * + * Parsing strategy: + * - If value contains '[' anywhere, parse as JSON array + * - Otherwise, parse as comma-separated values + * + * This prevents incorrect splitting of arguments that contain internal commas. + * For arguments with commas, use JSON format: + * CHROME_ARGS='["--user-data-dir=/path/with,comma", "--window-size=1440,900"]' + * * @param {string} name - Environment variable name * @param {string[]} [defaultValue=[]] - Default value if not set * @returns {string[]} - Array of strings @@ -66,23 +75,18 @@ function getEnvArray(name, defaultValue = []) { const val = getEnv(name, ''); if (!val) return defaultValue; - // Try parsing as JSON array first - if (val.startsWith('[')) { + // If contains '[', parse as JSON array + if (val.includes('[')) { try { const parsed = JSON.parse(val); if (Array.isArray(parsed)) return parsed; } catch (e) { + console.error(`[!] Failed to parse ${name} as JSON array: ${e.message}`); // Fall through to comma-separated parsing } } - // Parse as comma-separated (but be careful with args that contain commas) - // For Chrome args, we split on comma followed by '--' to be safe - if (val.includes(',--')) { - return val.split(/,(?=--)/).map(s => s.trim()).filter(Boolean); - } - - // Simple comma-separated + // Parse as comma-separated values return val.split(',').map(s => s.trim()).filter(Boolean); } @@ -1314,6 +1318,7 @@ module.exports = { getEnv, getEnvBool, getEnvInt, + getEnvArray, parseResolution, // PID file management writePidWithMtime,