mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-04 18:05:36 +10:00
Convert Persona to Django ModelWithConfig, add to get_config()
- Convert Persona from plain Python class to Django model with ModelWithConfig
- Add config JSONField for persona-specific config overrides
- Add get_derived_config() method that returns config with derived paths:
  - CHROME_USER_DATA_DIR, CHROME_EXTENSIONS_DIR, COOKIES_FILE, ACTIVE_PERSONA
- Update get_config() to accept persona parameter in merge chain:
  get_config(persona=crawl.persona, crawl=crawl, snapshot=snapshot)
- Remove _derive_persona_paths() - derivation now happens in Persona model
- Merge order (highest to lowest priority):
  1. snapshot.config
  2. crawl.config
  3. user.config
  4. persona.get_derived_config() <- NEW
  5. environment variables
  6. ArchiveBox.conf file
  7. plugin defaults
  8. core defaults

Usage:
  config = get_config(persona=crawl.persona, crawl=crawl)
  config['CHROME_USER_DATA_DIR']  # derived from persona
This commit is contained in:
@@ -120,6 +120,7 @@ class BaseConfigSet(BaseSettings):
|
||||
def get_config(
|
||||
scope: str = "global",
|
||||
defaults: Optional[Dict] = None,
|
||||
persona: Any = None,
|
||||
user: Any = None,
|
||||
crawl: Any = None,
|
||||
snapshot: Any = None,
|
||||
@@ -131,14 +132,16 @@ def get_config(
|
||||
1. Per-snapshot config (snapshot.config JSON field)
|
||||
2. Per-crawl config (crawl.config JSON field)
|
||||
3. Per-user config (user.config JSON field)
|
||||
4. Environment variables
|
||||
5. Config file (ArchiveBox.conf)
|
||||
6. Plugin schema defaults (config.json)
|
||||
7. Core config defaults
|
||||
4. Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.)
|
||||
5. Environment variables
|
||||
6. Config file (ArchiveBox.conf)
|
||||
7. Plugin schema defaults (config.json)
|
||||
8. Core config defaults
|
||||
|
||||
Args:
|
||||
scope: Config scope ('global', 'crawl', 'snapshot', etc.)
|
||||
defaults: Default values to start with
|
||||
persona: Persona object (provides derived paths like CHROME_USER_DATA_DIR)
|
||||
user: User object with config JSON field
|
||||
crawl: Crawl object with config JSON field
|
||||
snapshot: Snapshot object with config JSON field
|
||||
@@ -205,6 +208,10 @@ def get_config(
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Apply persona config overrides (includes derived paths like CHROME_USER_DATA_DIR)
|
||||
if persona and hasattr(persona, "get_derived_config"):
|
||||
config.update(persona.get_derived_config())
|
||||
|
||||
# Apply user config overrides
|
||||
if user and hasattr(user, "config") and user.config:
|
||||
config.update(user.config)
|
||||
@@ -240,52 +247,6 @@ def get_config(
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
# Derive persona-based paths if not explicitly set
|
||||
# This allows plugins to just use CHROME_USER_DATA_DIR without knowing about personas
|
||||
config = _derive_persona_paths(config, CONSTANTS)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
def _derive_persona_paths(config: Dict[str, Any], CONSTANTS: Any) -> Dict[str, Any]:
|
||||
"""
|
||||
Derive persona-specific paths from ACTIVE_PERSONA if not explicitly set.
|
||||
|
||||
This runs after all config sources are merged, so plugins receive
|
||||
the final resolved paths without needing to know about the persona system.
|
||||
|
||||
Derived paths:
|
||||
CHROME_USER_DATA_DIR <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_user_data
|
||||
CHROME_EXTENSIONS_DIR <- PERSONAS_DIR / ACTIVE_PERSONA / chrome_extensions
|
||||
COOKIES_FILE <- PERSONAS_DIR / ACTIVE_PERSONA / cookies.txt (if exists)
|
||||
"""
|
||||
# Get active persona (defaults to "Default")
|
||||
active_persona = config.get('ACTIVE_PERSONA') or config.get('DEFAULT_PERSONA') or 'Default'
|
||||
|
||||
# Ensure ACTIVE_PERSONA is always set in config for downstream use
|
||||
config['ACTIVE_PERSONA'] = active_persona
|
||||
|
||||
# Get personas directory
|
||||
personas_dir = CONSTANTS.PERSONAS_DIR
|
||||
persona_dir = personas_dir / active_persona
|
||||
|
||||
# Derive CHROME_USER_DATA_DIR if not explicitly set
|
||||
chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR')
|
||||
if not chrome_user_data_dir:
|
||||
config['CHROME_USER_DATA_DIR'] = str(persona_dir / 'chrome_user_data')
|
||||
|
||||
# Derive CHROME_EXTENSIONS_DIR if not explicitly set
|
||||
chrome_extensions_dir = config.get('CHROME_EXTENSIONS_DIR')
|
||||
if not chrome_extensions_dir:
|
||||
config['CHROME_EXTENSIONS_DIR'] = str(persona_dir / 'chrome_extensions')
|
||||
|
||||
# Derive COOKIES_FILE if not explicitly set and file exists
|
||||
cookies_file = config.get('COOKIES_FILE')
|
||||
if not cookies_file:
|
||||
persona_cookies = persona_dir / 'cookies.txt'
|
||||
if persona_cookies.exists():
|
||||
config['COOKIES_FILE'] = str(persona_cookies)
|
||||
|
||||
return config
|
||||
|
||||
|
||||
|
||||
@@ -7,212 +7,149 @@ Each persona has its own:
|
||||
- Chrome extensions directory
|
||||
- Cookies file
|
||||
- Config overrides
|
||||
|
||||
Personas are stored as directories under PERSONAS_DIR (default: data/personas/).
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.personas'
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, Iterator
|
||||
from typing import TYPE_CHECKING, Iterator
|
||||
|
||||
from django.db import models
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from django.db.models import QuerySet
|
||||
|
||||
|
||||
class Persona:
|
||||
class Persona(ModelWithConfig):
|
||||
"""
|
||||
Represents a browser persona/profile for archiving sessions.
|
||||
Browser persona/profile for archiving sessions.
|
||||
|
||||
Each persona is a directory containing:
|
||||
- chrome_user_data/ Chrome profile directory
|
||||
- chrome_extensions/ Installed extensions
|
||||
- cookies.txt Cookies file for wget/curl
|
||||
- config.json Persona-specific config overrides
|
||||
Each persona provides:
|
||||
- CHROME_USER_DATA_DIR: Chrome profile directory
|
||||
- CHROME_EXTENSIONS_DIR: Installed extensions directory
|
||||
- COOKIES_FILE: Cookies file for wget/curl
|
||||
- config: JSON field with persona-specific config overrides
|
||||
|
||||
Usage:
|
||||
persona = Persona('Default')
|
||||
persona.cleanup_chrome()
|
||||
# Get persona and its derived config
|
||||
config = get_config(persona=crawl.persona, crawl=crawl, snapshot=snapshot)
|
||||
chrome_dir = config['CHROME_USER_DATA_DIR']
|
||||
|
||||
# Or iterate all personas:
|
||||
for persona in Persona.all():
|
||||
persona.cleanup_chrome()
|
||||
# Or access directly from persona
|
||||
persona = Persona.objects.get(name='Default')
|
||||
persona.CHROME_USER_DATA_DIR # -> Path to chrome_user_data
|
||||
"""
|
||||
|
||||
def __init__(self, name: str, personas_dir: Optional[Path] = None):
|
||||
name = models.CharField(max_length=64, unique=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
|
||||
|
||||
class Meta:
|
||||
app_label = 'personas'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.name
|
||||
|
||||
@property
|
||||
def path(self) -> Path:
|
||||
"""Path to persona directory under PERSONAS_DIR."""
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
return CONSTANTS.PERSONAS_DIR / self.name
|
||||
|
||||
@property
|
||||
def CHROME_USER_DATA_DIR(self) -> str:
|
||||
"""Derived path to Chrome user data directory for this persona."""
|
||||
return str(self.path / 'chrome_user_data')
|
||||
|
||||
@property
|
||||
def CHROME_EXTENSIONS_DIR(self) -> str:
|
||||
"""Derived path to Chrome extensions directory for this persona."""
|
||||
return str(self.path / 'chrome_extensions')
|
||||
|
||||
@property
|
||||
def COOKIES_FILE(self) -> str:
|
||||
"""Derived path to cookies.txt file for this persona (if exists)."""
|
||||
cookies_path = self.path / 'cookies.txt'
|
||||
return str(cookies_path) if cookies_path.exists() else ''
|
||||
|
||||
def get_derived_config(self) -> dict:
|
||||
"""
|
||||
Initialize a Persona by name.
|
||||
Get config dict with derived paths filled in.
|
||||
|
||||
Args:
|
||||
name: Persona name (directory name under PERSONAS_DIR)
|
||||
personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)
|
||||
Returns dict with:
|
||||
- All values from self.config JSONField
|
||||
- CHROME_USER_DATA_DIR (derived from persona path)
|
||||
- CHROME_EXTENSIONS_DIR (derived from persona path)
|
||||
- COOKIES_FILE (derived from persona path, if file exists)
|
||||
- ACTIVE_PERSONA (set to this persona's name)
|
||||
"""
|
||||
self.name = name
|
||||
derived = dict(self.config or {})
|
||||
|
||||
if personas_dir is None:
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
personas_dir = CONSTANTS.PERSONAS_DIR
|
||||
# Add derived paths (don't override if explicitly set in config)
|
||||
if 'CHROME_USER_DATA_DIR' not in derived:
|
||||
derived['CHROME_USER_DATA_DIR'] = self.CHROME_USER_DATA_DIR
|
||||
if 'CHROME_EXTENSIONS_DIR' not in derived:
|
||||
derived['CHROME_EXTENSIONS_DIR'] = self.CHROME_EXTENSIONS_DIR
|
||||
if 'COOKIES_FILE' not in derived and self.COOKIES_FILE:
|
||||
derived['COOKIES_FILE'] = self.COOKIES_FILE
|
||||
|
||||
self.personas_dir = Path(personas_dir)
|
||||
self.path = self.personas_dir / name
|
||||
# Always set ACTIVE_PERSONA to this persona's name
|
||||
derived['ACTIVE_PERSONA'] = self.name
|
||||
|
||||
@property
|
||||
def chrome_user_data_dir(self) -> Path:
|
||||
"""Path to Chrome user data directory for this persona."""
|
||||
return self.path / 'chrome_user_data'
|
||||
|
||||
@property
|
||||
def chrome_extensions_dir(self) -> Path:
|
||||
"""Path to Chrome extensions directory for this persona."""
|
||||
return self.path / 'chrome_extensions'
|
||||
|
||||
@property
|
||||
def cookies_file(self) -> Path:
|
||||
"""Path to cookies.txt file for this persona."""
|
||||
return self.path / 'cookies.txt'
|
||||
|
||||
@property
|
||||
def config_file(self) -> Path:
|
||||
"""Path to config.json file for this persona."""
|
||||
return self.path / 'config.json'
|
||||
|
||||
@property
|
||||
def singleton_lock(self) -> Path:
|
||||
"""Path to Chrome's SingletonLock file."""
|
||||
return self.chrome_user_data_dir / 'SingletonLock'
|
||||
|
||||
def exists(self) -> bool:
|
||||
"""Check if persona directory exists."""
|
||||
return self.path.is_dir()
|
||||
return derived
|
||||
|
||||
def ensure_dirs(self) -> None:
|
||||
"""Create persona directories if they don't exist."""
|
||||
self.path.mkdir(parents=True, exist_ok=True)
|
||||
self.chrome_user_data_dir.mkdir(parents=True, exist_ok=True)
|
||||
self.chrome_extensions_dir.mkdir(parents=True, exist_ok=True)
|
||||
(self.path / 'chrome_user_data').mkdir(parents=True, exist_ok=True)
|
||||
(self.path / 'chrome_extensions').mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def cleanup_chrome(self) -> bool:
|
||||
"""
|
||||
Clean up Chrome state files for this persona.
|
||||
|
||||
Removes stale SingletonLock files left behind when Chrome crashes
|
||||
or is killed unexpectedly. This allows Chrome to start fresh.
|
||||
Clean up Chrome state files (SingletonLock, etc.) for this persona.
|
||||
|
||||
Returns:
|
||||
True if cleanup was performed, False if no cleanup needed
|
||||
"""
|
||||
cleaned = False
|
||||
chrome_dir = self.path / 'chrome_user_data'
|
||||
|
||||
# Remove SingletonLock if it exists
|
||||
if self.singleton_lock.exists():
|
||||
if not chrome_dir.exists():
|
||||
return False
|
||||
|
||||
# Clean up SingletonLock files
|
||||
for lock_file in chrome_dir.glob('**/SingletonLock'):
|
||||
try:
|
||||
self.singleton_lock.unlink()
|
||||
lock_file.unlink()
|
||||
cleaned = True
|
||||
except OSError:
|
||||
pass # May be in use by active Chrome
|
||||
pass
|
||||
|
||||
# Also clean up any other stale lock files Chrome might leave
|
||||
if self.chrome_user_data_dir.exists():
|
||||
for lock_file in self.chrome_user_data_dir.glob('**/SingletonLock'):
|
||||
try:
|
||||
lock_file.unlink()
|
||||
cleaned = True
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# Clean up socket files
|
||||
for socket_file in self.chrome_user_data_dir.glob('**/SingletonSocket'):
|
||||
try:
|
||||
socket_file.unlink()
|
||||
cleaned = True
|
||||
except OSError:
|
||||
pass
|
||||
# Clean up SingletonSocket files
|
||||
for socket_file in chrome_dir.glob('**/SingletonSocket'):
|
||||
try:
|
||||
socket_file.unlink()
|
||||
cleaned = True
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
return cleaned
|
||||
|
||||
def get_config(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Load persona-specific config overrides from config.json.
|
||||
|
||||
Returns:
|
||||
Dict of config overrides, or empty dict if no config file
|
||||
"""
|
||||
import json
|
||||
|
||||
if not self.config_file.exists():
|
||||
return {}
|
||||
|
||||
try:
|
||||
return json.loads(self.config_file.read_text())
|
||||
except (json.JSONDecodeError, OSError):
|
||||
return {}
|
||||
|
||||
def save_config(self, config: Dict[str, Any]) -> None:
|
||||
"""
|
||||
Save persona-specific config overrides to config.json.
|
||||
|
||||
Args:
|
||||
config: Dict of config overrides to save
|
||||
"""
|
||||
import json
|
||||
|
||||
self.ensure_dirs()
|
||||
self.config_file.write_text(json.dumps(config, indent=2))
|
||||
@classmethod
|
||||
def get_or_create_default(cls) -> 'Persona':
|
||||
"""Get or create the Default persona."""
|
||||
persona, _ = cls.objects.get_or_create(name='Default')
|
||||
return persona
|
||||
|
||||
@classmethod
|
||||
def all(cls, personas_dir: Optional[Path] = None) -> Iterator['Persona']:
|
||||
"""
|
||||
Iterate over all personas in PERSONAS_DIR.
|
||||
|
||||
Args:
|
||||
personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)
|
||||
|
||||
Yields:
|
||||
Persona instances for each persona directory
|
||||
"""
|
||||
if personas_dir is None:
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
personas_dir = CONSTANTS.PERSONAS_DIR
|
||||
|
||||
personas_dir = Path(personas_dir)
|
||||
|
||||
if not personas_dir.exists():
|
||||
return
|
||||
|
||||
for persona_path in personas_dir.iterdir():
|
||||
if persona_path.is_dir():
|
||||
yield cls(persona_path.name, personas_dir)
|
||||
|
||||
@classmethod
|
||||
def get_active(cls, config: Dict[str, Any]) -> 'Persona':
|
||||
"""
|
||||
Get the currently active persona from a merged config dict.
|
||||
|
||||
Args:
|
||||
config: Merged config dict from get_config(user=, crawl=, snapshot=, ...)
|
||||
|
||||
Returns:
|
||||
Persona instance for the active persona
|
||||
"""
|
||||
active_name = config.get('ACTIVE_PERSONA') or config.get('DEFAULT_PERSONA') or 'Default'
|
||||
return cls(active_name)
|
||||
|
||||
@classmethod
|
||||
def cleanup_chrome_all(cls, personas_dir: Optional[Path] = None) -> int:
|
||||
"""
|
||||
Clean up Chrome state files for all personas.
|
||||
|
||||
Args:
|
||||
personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)
|
||||
|
||||
Returns:
|
||||
Number of personas that had cleanup performed
|
||||
"""
|
||||
cleaned_count = 0
|
||||
for persona in cls.all(personas_dir):
|
||||
def cleanup_chrome_all(cls) -> int:
|
||||
"""Clean up Chrome state files for all personas."""
|
||||
cleaned = 0
|
||||
for persona in cls.objects.all():
|
||||
if persona.cleanup_chrome():
|
||||
cleaned_count += 1
|
||||
return cleaned_count
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"Persona({self.name})"
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"Persona(name={self.name!r}, path={self.path!r})"
|
||||
cleaned += 1
|
||||
return cleaned
|
||||
|
||||
Reference in New Issue
Block a user