mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Derive CHROME_USER_DATA_DIR from ACTIVE_PERSONA in config system
- Add _derive_persona_paths() in configset.py to automatically derive
CHROME_USER_DATA_DIR and CHROME_EXTENSIONS_DIR from ACTIVE_PERSONA
when they are not explicitly set (COOKIES_FILE is also derived, but only
if the persona directory already contains a cookies.txt). This allows
plugins to use these paths without knowing about the persona system.
- Update chrome_utils.js launchChromium() to accept userDataDir option
and pass --user-data-dir to Chrome. Also cleans up SingletonLock
before launch.
- Update killZombieChrome() to clean up SingletonLock files from all
persona chrome_user_data directories after killing zombies.
- Update chrome_cleanup() in misc/util.py to handle persona-based
user data directories when cleaning up stale Chrome state.
- Simplify on_Crawl__20_chrome_launch.bg.js to use CHROME_USER_DATA_DIR
and CHROME_EXTENSIONS_DIR from env (derived by get_config()).
Config priority flow:
ACTIVE_PERSONA=WorkAccount (set on crawl/snapshot)
-> get_config() derives:
CHROME_USER_DATA_DIR = PERSONAS_DIR/WorkAccount/chrome_user_data
CHROME_EXTENSIONS_DIR = PERSONAS_DIR/WorkAccount/chrome_extensions
-> hooks receive these as env vars without needing persona logic
This commit is contained in:
@@ -240,6 +240,52 @@ def get_config(
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Derive persona-based paths if not explicitly set
|
||||||
|
# This allows plugins to just use CHROME_USER_DATA_DIR without knowing about personas
|
||||||
|
config = _derive_persona_paths(config, CONSTANTS)
|
||||||
|
|
||||||
|
return config
|
||||||
|
|
||||||
|
|
||||||
|
def _derive_persona_paths(config: Dict[str, Any], CONSTANTS: Any) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Derive persona-specific paths from ACTIVE_PERSONA if not explicitly set.
|
||||||
|
|
||||||
|
This runs after all config sources are merged, so plugins receive
|
||||||
|
the final resolved paths without needing to know about the persona system.
|
||||||
|
|
||||||
|
Derived paths:
|
||||||
|
CHROME_USER_DATA_DIR <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_user_data
|
||||||
|
CHROME_EXTENSIONS_DIR <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_extensions
|
||||||
|
COOKIES_FILE <- PERSONAS_DIR / ACTIVE_PERSONA / cookies.txt (if exists)
|
||||||
|
"""
|
||||||
|
# Get active persona (defaults to "Default")
|
||||||
|
active_persona = config.get('ACTIVE_PERSONA') or config.get('DEFAULT_PERSONA') or 'Default'
|
||||||
|
|
||||||
|
# Ensure ACTIVE_PERSONA is always set in config for downstream use
|
||||||
|
config['ACTIVE_PERSONA'] = active_persona
|
||||||
|
|
||||||
|
# Get personas directory
|
||||||
|
personas_dir = CONSTANTS.PERSONAS_DIR
|
||||||
|
persona_dir = personas_dir / active_persona
|
||||||
|
|
||||||
|
# Derive CHROME_USER_DATA_DIR if not explicitly set
|
||||||
|
chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR')
|
||||||
|
if not chrome_user_data_dir:
|
||||||
|
config['CHROME_USER_DATA_DIR'] = str(persona_dir / 'chrome_user_data')
|
||||||
|
|
||||||
|
# Derive CHROME_EXTENSIONS_DIR if not explicitly set
|
||||||
|
chrome_extensions_dir = config.get('CHROME_EXTENSIONS_DIR')
|
||||||
|
if not chrome_extensions_dir:
|
||||||
|
config['CHROME_EXTENSIONS_DIR'] = str(persona_dir / 'chrome_extensions')
|
||||||
|
|
||||||
|
# Derive COOKIES_FILE if not explicitly set and file exists
|
||||||
|
cookies_file = config.get('COOKIES_FILE')
|
||||||
|
if not cookies_file:
|
||||||
|
persona_cookies = persona_dir / 'cookies.txt'
|
||||||
|
if persona_cookies.exists():
|
||||||
|
config['COOKIES_FILE'] = str(persona_cookies)
|
||||||
|
|
||||||
return config
|
return config
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -480,12 +480,50 @@ for url_str, num_urls in _test_url_strs.items():
|
|||||||
|
|
||||||
def chrome_cleanup():
|
def chrome_cleanup():
|
||||||
"""
|
"""
|
||||||
Cleans up any state or runtime files that chrome leaves behind when killed by
|
Cleans up any state or runtime files that Chrome leaves behind when killed by
|
||||||
a timeout or other error
|
a timeout or other error. Handles:
|
||||||
|
- Persona-based chrome_user_data directories (from ACTIVE_PERSONA)
|
||||||
|
- Explicit CHROME_USER_DATA_DIR
|
||||||
|
- Legacy Docker chromium path
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
|
from pathlib import Path
|
||||||
from archivebox.config.permissions import IN_DOCKER
|
from archivebox.config.permissions import IN_DOCKER
|
||||||
|
|
||||||
|
# Clean up persona-based user data directories
|
||||||
|
try:
|
||||||
|
from archivebox.config.configset import get_config
|
||||||
|
from archivebox.config.constants import CONSTANTS
|
||||||
|
|
||||||
|
config = get_config()
|
||||||
|
|
||||||
|
# Clean up the active persona's chrome_user_data SingletonLock
|
||||||
|
chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR')
|
||||||
|
if chrome_user_data_dir:
|
||||||
|
singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock'
|
||||||
|
if singleton_lock.exists():
|
||||||
|
try:
|
||||||
|
singleton_lock.unlink()
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Clean up all persona directories
|
||||||
|
personas_dir = CONSTANTS.PERSONAS_DIR
|
||||||
|
if personas_dir.exists():
|
||||||
|
for persona_dir in personas_dir.iterdir():
|
||||||
|
if not persona_dir.is_dir():
|
||||||
|
continue
|
||||||
|
user_data_dir = persona_dir / 'chrome_user_data'
|
||||||
|
singleton_lock = user_data_dir / 'SingletonLock'
|
||||||
|
if singleton_lock.exists():
|
||||||
|
try:
|
||||||
|
singleton_lock.unlink()
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass # Config not available during early startup
|
||||||
|
|
||||||
|
# Legacy Docker cleanup
|
||||||
if IN_DOCKER:
|
if IN_DOCKER:
|
||||||
singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"
|
singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"
|
||||||
if os.path.lexists(singleton_lock):
|
if os.path.lexists(singleton_lock):
|
||||||
|
|||||||
@@ -257,6 +257,31 @@ function killZombieChrome(dataDir = null) {
|
|||||||
console.error('[+] No zombies found');
|
console.error('[+] No zombies found');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clean up stale SingletonLock files from persona chrome_user_data directories
|
||||||
|
const personasDir = path.join(dataDir, 'personas');
|
||||||
|
if (fs.existsSync(personasDir)) {
|
||||||
|
try {
|
||||||
|
const personas = fs.readdirSync(personasDir, { withFileTypes: true });
|
||||||
|
for (const persona of personas) {
|
||||||
|
if (!persona.isDirectory()) continue;
|
||||||
|
|
||||||
|
const userDataDir = path.join(personasDir, persona.name, 'chrome_user_data');
|
||||||
|
const singletonLock = path.join(userDataDir, 'SingletonLock');
|
||||||
|
|
||||||
|
if (fs.existsSync(singletonLock)) {
|
||||||
|
try {
|
||||||
|
fs.unlinkSync(singletonLock);
|
||||||
|
console.error(`[+] Removed stale SingletonLock: ${singletonLock}`);
|
||||||
|
} catch (e) {
|
||||||
|
// Ignore - may be in use by active Chrome
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// Ignore errors scanning personas directory
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return killed;
|
return killed;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -270,6 +295,7 @@ function killZombieChrome(dataDir = null) {
|
|||||||
* @param {Object} options - Launch options
|
* @param {Object} options - Launch options
|
||||||
* @param {string} [options.binary] - Chrome binary path (auto-detected if not provided)
|
* @param {string} [options.binary] - Chrome binary path (auto-detected if not provided)
|
||||||
* @param {string} [options.outputDir='chrome'] - Directory for output files
|
* @param {string} [options.outputDir='chrome'] - Directory for output files
|
||||||
|
* @param {string} [options.userDataDir] - Chrome user data directory for persistent sessions
|
||||||
* @param {string} [options.resolution='1440,2000'] - Window resolution
|
* @param {string} [options.resolution='1440,2000'] - Window resolution
|
||||||
* @param {boolean} [options.headless=true] - Run in headless mode
|
* @param {boolean} [options.headless=true] - Run in headless mode
|
||||||
* @param {boolean} [options.checkSsl=true] - Check SSL certificates
|
* @param {boolean} [options.checkSsl=true] - Check SSL certificates
|
||||||
@@ -281,6 +307,7 @@ async function launchChromium(options = {}) {
|
|||||||
const {
|
const {
|
||||||
binary = findChromium(),
|
binary = findChromium(),
|
||||||
outputDir = 'chrome',
|
outputDir = 'chrome',
|
||||||
|
userDataDir = getEnv('CHROME_USER_DATA_DIR'),
|
||||||
resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'),
|
resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000'),
|
||||||
headless = getEnvBool('CHROME_HEADLESS', true),
|
headless = getEnvBool('CHROME_HEADLESS', true),
|
||||||
checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)),
|
checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)),
|
||||||
@@ -304,6 +331,24 @@ async function launchChromium(options = {}) {
|
|||||||
fs.mkdirSync(outputDir, { recursive: true });
|
fs.mkdirSync(outputDir, { recursive: true });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Create user data directory if specified and doesn't exist
|
||||||
|
if (userDataDir) {
|
||||||
|
if (!fs.existsSync(userDataDir)) {
|
||||||
|
fs.mkdirSync(userDataDir, { recursive: true });
|
||||||
|
console.error(`[*] Created user data directory: ${userDataDir}`);
|
||||||
|
}
|
||||||
|
// Clean up any stale SingletonLock file from previous crashed sessions
|
||||||
|
const singletonLock = path.join(userDataDir, 'SingletonLock');
|
||||||
|
if (fs.existsSync(singletonLock)) {
|
||||||
|
try {
|
||||||
|
fs.unlinkSync(singletonLock);
|
||||||
|
console.error(`[*] Removed stale SingletonLock: ${singletonLock}`);
|
||||||
|
} catch (e) {
|
||||||
|
console.error(`[!] Failed to remove SingletonLock: ${e.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Find a free port
|
// Find a free port
|
||||||
const debugPort = await findFreePort();
|
const debugPort = await findFreePort();
|
||||||
console.error(`[*] Using debug port: ${debugPort}`);
|
console.error(`[*] Using debug port: ${debugPort}`);
|
||||||
@@ -335,6 +380,7 @@ async function launchChromium(options = {}) {
|
|||||||
'--font-render-hinting=none',
|
'--font-render-hinting=none',
|
||||||
'--force-color-profile=srgb',
|
'--force-color-profile=srgb',
|
||||||
`--window-size=${width},${height}`,
|
`--window-size=${width},${height}`,
|
||||||
|
...(userDataDir ? [`--user-data-dir=${userDataDir}`] : []),
|
||||||
...(headless ? ['--headless=new'] : []),
|
...(headless ? ['--headless=new'] : []),
|
||||||
...(checkSsl ? [] : ['--ignore-certificate-errors']),
|
...(checkSsl ? [] : ['--ignore-certificate-errors']),
|
||||||
];
|
];
|
||||||
|
|||||||
@@ -115,12 +115,17 @@ async function main() {
|
|||||||
if (version) console.error(`[*] Version: ${version}`);
|
if (version) console.error(`[*] Version: ${version}`);
|
||||||
|
|
||||||
// Load installed extensions
|
// Load installed extensions
|
||||||
const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
|
// CHROME_EXTENSIONS_DIR is derived from ACTIVE_PERSONA by get_config() in configset.py
|
||||||
path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
|
const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR');
|
||||||
|
const userDataDir = getEnv('CHROME_USER_DATA_DIR');
|
||||||
|
|
||||||
|
if (userDataDir) {
|
||||||
|
console.error(`[*] Using user data dir: ${userDataDir}`);
|
||||||
|
}
|
||||||
|
|
||||||
const installedExtensions = [];
|
const installedExtensions = [];
|
||||||
const extensionPaths = [];
|
const extensionPaths = [];
|
||||||
if (fs.existsSync(extensionsDir)) {
|
if (extensionsDir && fs.existsSync(extensionsDir)) {
|
||||||
const files = fs.readdirSync(extensionsDir);
|
const files = fs.readdirSync(extensionsDir);
|
||||||
for (const file of files) {
|
for (const file of files) {
|
||||||
if (file.endsWith('.extension.json')) {
|
if (file.endsWith('.extension.json')) {
|
||||||
@@ -151,9 +156,11 @@ async function main() {
|
|||||||
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
|
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
|
||||||
|
|
||||||
// Launch Chromium using consolidated function
|
// Launch Chromium using consolidated function
|
||||||
|
// userDataDir is derived from ACTIVE_PERSONA by get_config() if not explicitly set
|
||||||
const result = await launchChromium({
|
const result = await launchChromium({
|
||||||
binary,
|
binary,
|
||||||
outputDir: OUTPUT_DIR,
|
outputDir: OUTPUT_DIR,
|
||||||
|
userDataDir,
|
||||||
extensionPaths,
|
extensionPaths,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user