Derive CHROME_USER_DATA_DIR from ACTIVE_PERSONA in config system

- Add _derive_persona_paths() in configset.py to automatically derive
  CHROME_USER_DATA_DIR and CHROME_EXTENSIONS_DIR from ACTIVE_PERSONA
  when not explicitly set. This allows plugins to use these paths
  without knowing about the persona system.

- Update chrome_utils.js launchChromium() to accept userDataDir option
  and pass --user-data-dir to Chrome. Also cleans up SingletonLock
  before launch.

- Update killZombieChrome() to clean up SingletonLock files from all
  persona chrome_user_data directories after killing zombies.

- Update chrome_cleanup() in misc/util.py to handle persona-based
  user data directories when cleaning up stale Chrome state.

- Simplify on_Crawl__20_chrome_launch.bg.js to use CHROME_USER_DATA_DIR
  and CHROME_EXTENSIONS_DIR from env (derived by get_config()).

Config priority flow:
  ACTIVE_PERSONA=WorkAccount (set on crawl/snapshot)
  -> get_config() derives:
     CHROME_USER_DATA_DIR = PERSONAS_DIR/WorkAccount/chrome_user_data
     CHROME_EXTENSIONS_DIR = PERSONAS_DIR/WorkAccount/chrome_extensions
  -> hooks receive these as env vars without needing persona logic
This commit is contained in:
Claude
2025-12-31 00:21:07 +00:00
parent ba8c28a866
commit 877b5f91c2
4 changed files with 143 additions and 6 deletions

View File

@@ -240,6 +240,52 @@ def get_config(
except ImportError:
pass
# Derive persona-based paths if not explicitly set
# This allows plugins to just use CHROME_USER_DATA_DIR without knowing about personas
config = _derive_persona_paths(config, CONSTANTS)
return config
def _derive_persona_paths(config: Dict[str, Any], CONSTANTS: Any) -> Dict[str, Any]:
    """
    Fill in persona-scoped path settings that the merged config left unset.

    Intended to run once every config source has been merged, so that
    consumers can read the resolved paths directly instead of computing
    them from the persona name themselves.

    Resolution rules (each applies only when the key is missing or falsy):
        ACTIVE_PERSONA        <- DEFAULT_PERSONA, else the literal 'Default'
        CHROME_USER_DATA_DIR  <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_user_data
        CHROME_EXTENSIONS_DIR <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_extensions
        COOKIES_FILE          <- PERSONAS_DIR / ACTIVE_PERSONA / cookies.txt,
                                 and only if that file exists on disk

    Args:
        config: Merged config mapping. It is updated in place.
        CONSTANTS: Object exposing a ``PERSONAS_DIR`` attribute that supports
            the ``/`` path-joining operator.

    Returns:
        The same ``config`` object that was passed in.
    """
    # First truthy persona setting wins; fall back to the stock persona name.
    persona_name = 'Default'
    for persona_key in ('ACTIVE_PERSONA', 'DEFAULT_PERSONA'):
        candidate = config.get(persona_key)
        if candidate:
            persona_name = candidate
            break

    # Always written back so downstream readers see a concrete persona.
    config['ACTIVE_PERSONA'] = persona_name

    persona_root = CONSTANTS.PERSONAS_DIR / persona_name

    # Directory settings are derived unconditionally when unset; the
    # directories themselves are not created or checked here.
    derived_dirs = (
        ('CHROME_USER_DATA_DIR', 'chrome_user_data'),
        ('CHROME_EXTENSIONS_DIR', 'chrome_extensions'),
    )
    for setting, dirname in derived_dirs:
        if config.get(setting):
            continue  # an explicit value takes precedence
        config[setting] = str(persona_root / dirname)

    # An explicit cookies file takes precedence; nothing left to derive.
    if config.get('COOKIES_FILE'):
        return config

    # Unlike the directories, the cookies path is only set if the file exists.
    cookies_path = persona_root / 'cookies.txt'
    if cookies_path.exists():
        config['COOKIES_FILE'] = str(cookies_path)

    return config

View File

@@ -480,12 +480,50 @@ for url_str, num_urls in _test_url_strs.items():
def chrome_cleanup():
"""
Cleans up any state or runtime files that chrome leaves behind when killed by
a timeout or other error
Cleans up any state or runtime files that Chrome leaves behind when killed by
a timeout or other error. Handles:
- Persona-based chrome_user_data directories (from ACTIVE_PERSONA)
- Explicit CHROME_USER_DATA_DIR
- Legacy Docker chromium path
"""
import os
from pathlib import Path
from archivebox.config.permissions import IN_DOCKER
# Clean up persona-based user data directories
try:
from archivebox.config.configset import get_config
from archivebox.config.constants import CONSTANTS
config = get_config()
# Clean up the active persona's chrome_user_data SingletonLock
chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR')
if chrome_user_data_dir:
singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock'
if singleton_lock.exists():
try:
singleton_lock.unlink()
except OSError:
pass
# Clean up all persona directories
personas_dir = CONSTANTS.PERSONAS_DIR
if personas_dir.exists():
for persona_dir in personas_dir.iterdir():
if not persona_dir.is_dir():
continue
user_data_dir = persona_dir / 'chrome_user_data'
singleton_lock = user_data_dir / 'SingletonLock'
if singleton_lock.exists():
try:
singleton_lock.unlink()
except OSError:
pass
except Exception:
pass # Config not available during early startup
# Legacy Docker cleanup
if IN_DOCKER:
singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"
if os.path.lexists(singleton_lock):

View File

@@ -257,6 +257,31 @@ function killZombieChrome(dataDir = null) {
console.error('[+] No zombies found');
}
// Clean up stale SingletonLock files from persona chrome_user_data directories
const personasDir = path.join(dataDir, 'personas');
if (fs.existsSync(personasDir)) {
try {
const personas = fs.readdirSync(personasDir, { withFileTypes: true });
for (const persona of personas) {
if (!persona.isDirectory()) continue;
const userDataDir = path.join(personasDir, persona.name, 'chrome_user_data');
const singletonLock = path.join(userDataDir, 'SingletonLock');
if (fs.existsSync(singletonLock)) {
try {
fs.unlinkSync(singletonLock);
console.error(`[+] Removed stale SingletonLock: ${singletonLock}`);
} catch (e) {
// Ignore - may be in use by active Chrome
}
}
}
} catch (e) {
// Ignore errors scanning personas directory
}
}
return killed;
}
@@ -270,6 +295,7 @@ function killZombieChrome(dataDir = null) {
* @param {Object} options - Launch options
* @param {string} [options.binary] - Chrome binary path (auto-detected if not provided)
* @param {string} [options.outputDir='chrome'] - Directory for output files
* @param {string} [options.userDataDir] - Chrome user data directory for persistent sessions
* @param {string} [options.resolution='1440,2000'] - Window resolution
* @param {boolean} [options.headless=true] - Run in headless mode
* @param {boolean} [options.checkSsl=true] - Check SSL certificates
@@ -281,6 +307,7 @@ async function launchChromium(options = {}) {
const {
binary = findChromium(),
outputDir = 'chrome',
userDataDir = getEnv('CHROME_USER_DATA_DIR'),
resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'),
headless = getEnvBool('CHROME_HEADLESS', true),
checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)),
@@ -304,6 +331,24 @@ async function launchChromium(options = {}) {
fs.mkdirSync(outputDir, { recursive: true });
}
// Create user data directory if specified and doesn't exist
if (userDataDir) {
if (!fs.existsSync(userDataDir)) {
fs.mkdirSync(userDataDir, { recursive: true });
console.error(`[*] Created user data directory: ${userDataDir}`);
}
// Clean up any stale SingletonLock file from previous crashed sessions
const singletonLock = path.join(userDataDir, 'SingletonLock');
if (fs.existsSync(singletonLock)) {
try {
fs.unlinkSync(singletonLock);
console.error(`[*] Removed stale SingletonLock: ${singletonLock}`);
} catch (e) {
console.error(`[!] Failed to remove SingletonLock: ${e.message}`);
}
}
}
// Find a free port
const debugPort = await findFreePort();
console.error(`[*] Using debug port: ${debugPort}`);
@@ -335,6 +380,7 @@ async function launchChromium(options = {}) {
'--font-render-hinting=none',
'--force-color-profile=srgb',
`--window-size=${width},${height}`,
...(userDataDir ? [`--user-data-dir=${userDataDir}`] : []),
...(headless ? ['--headless=new'] : []),
...(checkSsl ? [] : ['--ignore-certificate-errors']),
];

View File

@@ -115,12 +115,17 @@ async function main() {
if (version) console.error(`[*] Version: ${version}`);
// Load installed extensions
const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
// CHROME_EXTENSIONS_DIR is derived from ACTIVE_PERSONA by get_config() in configset.py
const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR');
const userDataDir = getEnv('CHROME_USER_DATA_DIR');
if (userDataDir) {
console.error(`[*] Using user data dir: ${userDataDir}`);
}
const installedExtensions = [];
const extensionPaths = [];
if (fs.existsSync(extensionsDir)) {
if (extensionsDir && fs.existsSync(extensionsDir)) {
const files = fs.readdirSync(extensionsDir);
for (const file of files) {
if (file.endsWith('.extension.json')) {
@@ -151,9 +156,11 @@ async function main() {
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
// Launch Chromium using consolidated function
// userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set
const result = await launchChromium({
binary,
outputDir: OUTPUT_DIR,
userDataDir,
extensionPaths,
});