mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 09:25:42 +10:00
Derive CHROME_USER_DATA_DIR from ACTIVE_PERSONA in config system
- Add _derive_persona_paths() in configset.py to automatically derive
CHROME_USER_DATA_DIR and CHROME_EXTENSIONS_DIR from ACTIVE_PERSONA
when not explicitly set. This allows plugins to use these paths
without knowing about the persona system.
- Update chrome_utils.js launchChromium() to accept userDataDir option
and pass --user-data-dir to Chrome. Also cleans up SingletonLock
before launch.
- Update killZombieChrome() to clean up SingletonLock files from all
persona chrome_user_data directories after killing zombies.
- Update chrome_cleanup() in misc/util.py to handle persona-based
user data directories when cleaning up stale Chrome state.
- Simplify on_Crawl__20_chrome_launch.bg.js to use CHROME_USER_DATA_DIR
and CHROME_EXTENSIONS_DIR from env (derived by get_config()).
Config priority flow:
ACTIVE_PERSONA=WorkAccount (set on crawl/snapshot)
-> get_config() derives:
CHROME_USER_DATA_DIR = PERSONAS_DIR/WorkAccount/chrome_user_data
CHROME_EXTENSIONS_DIR = PERSONAS_DIR/WorkAccount/chrome_extensions
-> hooks receive these as env vars without needing persona logic
This commit is contained in:
@@ -240,6 +240,52 @@ def get_config(
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Derive persona-based paths if not explicitly set
|
||||
# This allows plugins to just use CHROME_USER_DATA_DIR without knowing about personas
|
||||
config = _derive_persona_paths(config, CONSTANTS)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _derive_persona_paths(config: Dict[str, Any], CONSTANTS: Any) -> Dict[str, Any]:
|
||||
"""
|
||||
Derive persona-specific paths from ACTIVE_PERSONA if not explicitly set.
|
||||
|
||||
This runs after all config sources are merged, so plugins receive
|
||||
the final resolved paths without needing to know about the persona system.
|
||||
|
||||
Derived paths:
|
||||
CHROME_USER_DATA_DIR <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_user_data
|
||||
CHROME_EXTENSIONS_DIR <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_extensions
|
||||
COOKIES_FILE <- PERSONAS_DIR / ACTIVE_PERSONA / cookies.txt (if exists)
|
||||
"""
|
||||
# Get active persona (defaults to "Default")
|
||||
active_persona = config.get('ACTIVE_PERSONA') or config.get('DEFAULT_PERSONA') or 'Default'
|
||||
|
||||
# Ensure ACTIVE_PERSONA is always set in config for downstream use
|
||||
config['ACTIVE_PERSONA'] = active_persona
|
||||
|
||||
# Get personas directory
|
||||
personas_dir = CONSTANTS.PERSONAS_DIR
|
||||
persona_dir = personas_dir / active_persona
|
||||
|
||||
# Derive CHROME_USER_DATA_DIR if not explicitly set
|
||||
chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR')
|
||||
if not chrome_user_data_dir:
|
||||
config['CHROME_USER_DATA_DIR'] = str(persona_dir / 'chrome_user_data')
|
||||
|
||||
# Derive CHROME_EXTENSIONS_DIR if not explicitly set
|
||||
chrome_extensions_dir = config.get('CHROME_EXTENSIONS_DIR')
|
||||
if not chrome_extensions_dir:
|
||||
config['CHROME_EXTENSIONS_DIR'] = str(persona_dir / 'chrome_extensions')
|
||||
|
||||
# Derive COOKIES_FILE if not explicitly set and file exists
|
||||
cookies_file = config.get('COOKIES_FILE')
|
||||
if not cookies_file:
|
||||
persona_cookies = persona_dir / 'cookies.txt'
|
||||
if persona_cookies.exists():
|
||||
config['COOKIES_FILE'] = str(persona_cookies)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
|
||||
@@ -480,12 +480,50 @@ for url_str, num_urls in _test_url_strs.items():
|
||||
|
||||
def chrome_cleanup():
|
||||
"""
|
||||
Cleans up any state or runtime files that chrome leaves behind when killed by
|
||||
a timeout or other error
|
||||
Cleans up any state or runtime files that Chrome leaves behind when killed by
|
||||
a timeout or other error. Handles:
|
||||
- Persona-based chrome_user_data directories (from ACTIVE_PERSONA)
|
||||
- Explicit CHROME_USER_DATA_DIR
|
||||
- Legacy Docker chromium path
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
from archivebox.config.permissions import IN_DOCKER
|
||||
|
||||
|
||||
# Clean up persona-based user data directories
|
||||
try:
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
config = get_config()
|
||||
|
||||
# Clean up the active persona's chrome_user_data SingletonLock
|
||||
chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR')
|
||||
if chrome_user_data_dir:
|
||||
singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock'
|
||||
if singleton_lock.exists():
|
||||
try:
|
||||
singleton_lock.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# Clean up all persona directories
|
||||
personas_dir = CONSTANTS.PERSONAS_DIR
|
||||
if personas_dir.exists():
|
||||
for persona_dir in personas_dir.iterdir():
|
||||
if not persona_dir.is_dir():
|
||||
continue
|
||||
user_data_dir = persona_dir / 'chrome_user_data'
|
||||
singleton_lock = user_data_dir / 'SingletonLock'
|
||||
if singleton_lock.exists():
|
||||
try:
|
||||
singleton_lock.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
except Exception:
|
||||
pass # Config not available during early startup
|
||||
|
||||
# Legacy Docker cleanup
|
||||
if IN_DOCKER:
|
||||
singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"
|
||||
if os.path.lexists(singleton_lock):
|
||||
|
||||
@@ -257,6 +257,31 @@ function killZombieChrome(dataDir = null) {
|
||||
console.error('[+] No zombies found');
|
||||
}
|
||||
|
||||
// Clean up stale SingletonLock files from persona chrome_user_data directories
|
||||
const personasDir = path.join(dataDir, 'personas');
|
||||
if (fs.existsSync(personasDir)) {
|
||||
try {
|
||||
const personas = fs.readdirSync(personasDir, { withFileTypes: true });
|
||||
for (const persona of personas) {
|
||||
if (!persona.isDirectory()) continue;
|
||||
|
||||
const userDataDir = path.join(personasDir, persona.name, 'chrome_user_data');
|
||||
const singletonLock = path.join(userDataDir, 'SingletonLock');
|
||||
|
||||
if (fs.existsSync(singletonLock)) {
|
||||
try {
|
||||
fs.unlinkSync(singletonLock);
|
||||
console.error(`[+] Removed stale SingletonLock: ${singletonLock}`);
|
||||
} catch (e) {
|
||||
// Ignore - may be in use by active Chrome
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
// Ignore errors scanning personas directory
|
||||
}
|
||||
}
|
||||
|
||||
return killed;
|
||||
}
|
||||
|
||||
@@ -270,6 +295,7 @@ function killZombieChrome(dataDir = null) {
|
||||
* @param {Object} options - Launch options
|
||||
* @param {string} [options.binary] - Chrome binary path (auto-detected if not provided)
|
||||
* @param {string} [options.outputDir='chrome'] - Directory for output files
|
||||
* @param {string} [options.userDataDir] - Chrome user data directory for persistent sessions
|
||||
* @param {string} [options.resolution='1440,2000'] - Window resolution
|
||||
* @param {boolean} [options.headless=true] - Run in headless mode
|
||||
* @param {boolean} [options.checkSsl=true] - Check SSL certificates
|
||||
@@ -281,6 +307,7 @@ async function launchChromium(options = {}) {
|
||||
const {
|
||||
binary = findChromium(),
|
||||
outputDir = 'chrome',
|
||||
userDataDir = getEnv('CHROME_USER_DATA_DIR'),
|
||||
resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'),
|
||||
headless = getEnvBool('CHROME_HEADLESS', true),
|
||||
checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)),
|
||||
@@ -304,6 +331,24 @@ async function launchChromium(options = {}) {
|
||||
fs.mkdirSync(outputDir, { recursive: true });
|
||||
}
|
||||
|
||||
// Create user data directory if specified and doesn't exist
|
||||
if (userDataDir) {
|
||||
if (!fs.existsSync(userDataDir)) {
|
||||
fs.mkdirSync(userDataDir, { recursive: true });
|
||||
console.error(`[*] Created user data directory: ${userDataDir}`);
|
||||
}
|
||||
// Clean up any stale SingletonLock file from previous crashed sessions
|
||||
const singletonLock = path.join(userDataDir, 'SingletonLock');
|
||||
if (fs.existsSync(singletonLock)) {
|
||||
try {
|
||||
fs.unlinkSync(singletonLock);
|
||||
console.error(`[*] Removed stale SingletonLock: ${singletonLock}`);
|
||||
} catch (e) {
|
||||
console.error(`[!] Failed to remove SingletonLock: ${e.message}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Find a free port
|
||||
const debugPort = await findFreePort();
|
||||
console.error(`[*] Using debug port: ${debugPort}`);
|
||||
@@ -335,6 +380,7 @@ async function launchChromium(options = {}) {
|
||||
'--font-render-hinting=none',
|
||||
'--force-color-profile=srgb',
|
||||
`--window-size=${width},${height}`,
|
||||
...(userDataDir ? [`--user-data-dir=${userDataDir}`] : []),
|
||||
...(headless ? ['--headless=new'] : []),
|
||||
...(checkSsl ? [] : ['--ignore-certificate-errors']),
|
||||
];
|
||||
|
||||
@@ -115,12 +115,17 @@ async function main() {
|
||||
if (version) console.error(`[*] Version: ${version}`);
|
||||
|
||||
// Load installed extensions
|
||||
const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
|
||||
path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
|
||||
// CHROME_EXTENSIONS_DIR is derived from ACTIVE_PERSONA by get_config() in configset.py
|
||||
const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR');
|
||||
const userDataDir = getEnv('CHROME_USER_DATA_DIR');
|
||||
|
||||
if (userDataDir) {
|
||||
console.error(`[*] Using user data dir: ${userDataDir}`);
|
||||
}
|
||||
|
||||
const installedExtensions = [];
|
||||
const extensionPaths = [];
|
||||
if (fs.existsSync(extensionsDir)) {
|
||||
if (extensionsDir && fs.existsSync(extensionsDir)) {
|
||||
const files = fs.readdirSync(extensionsDir);
|
||||
for (const file of files) {
|
||||
if (file.endsWith('.extension.json')) {
|
||||
@@ -151,9 +156,11 @@ async function main() {
|
||||
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
|
||||
|
||||
// Launch Chromium using consolidated function
|
||||
// userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set
|
||||
const result = await launchChromium({
|
||||
binary,
|
||||
outputDir: OUTPUT_DIR,
|
||||
userDataDir,
|
||||
extensionPaths,
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user