mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 01:15:57 +10:00
- Convert Persona from plain Python class to Django model with ModelWithConfig
- Add config JSONField for persona-specific config overrides
- Add get_derived_config() method that returns config with derived paths:
  - CHROME_USER_DATA_DIR, CHROME_EXTENSIONS_DIR, COOKIES_FILE, ACTIVE_PERSONA
- Update get_config() to accept persona parameter in merge chain:
  get_config(persona=crawl.persona, crawl=crawl, snapshot=snapshot)
- Remove _derive_persona_paths() - derivation now happens in Persona model
- Merge order (highest to lowest priority):
  1. snapshot.config
  2. crawl.config
  3. user.config
  4. persona.get_derived_config() <- NEW
  5. environment variables
  6. ArchiveBox.conf file
  7. plugin defaults
  8. core defaults

Usage:
    config = get_config(persona=crawl.persona, crawl=crawl)
    config['CHROME_USER_DATA_DIR']  # derived from persona
156 lines
5.1 KiB
Python
156 lines
5.1 KiB
Python
"""
|
|
Persona management for ArchiveBox.
|
|
|
|
A Persona represents a browser profile/identity used for archiving.
|
|
Each persona has its own:
|
|
- Chrome user data directory (for cookies, localStorage, extensions, etc.)
|
|
- Chrome extensions directory
|
|
- Cookies file
|
|
- Config overrides
|
|
"""
|
|
|
|
# Explicitly declare which package this module belongs to; Python consults
# __package__ when resolving relative imports.
__package__ = 'archivebox.personas'
|
|
|
|
from pathlib import Path
|
|
from typing import TYPE_CHECKING, Iterator
|
|
|
|
from django.db import models
|
|
from django.conf import settings
|
|
from django.utils import timezone
|
|
|
|
from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk
|
|
|
|
if TYPE_CHECKING:
|
|
from django.db.models import QuerySet
|
|
|
|
|
|
class Persona(ModelWithConfig):
    """
    Browser persona/profile for archiving sessions.

    Each persona provides:
    - CHROME_USER_DATA_DIR: Chrome profile directory
    - CHROME_EXTENSIONS_DIR: Installed extensions directory
    - COOKIES_FILE: Cookies file for wget/curl
    - config: JSON field with persona-specific config overrides

    Usage:
        # Get persona and its derived config
        config = get_config(persona=crawl.persona, crawl=crawl, snapshot=snapshot)
        chrome_dir = config['CHROME_USER_DATA_DIR']

        # Or access directly from persona
        persona = Persona.objects.get(name='Default')
        persona.CHROME_USER_DATA_DIR  # -> str path ending in 'chrome_user_data'
    """

    # Names of the per-persona entries that live directly under `self.path`.
    # Kept in one place so the derived-path properties, ensure_dirs() and
    # cleanup_chrome() cannot drift apart.
    _CHROME_USER_DATA_DIRNAME = 'chrome_user_data'
    _CHROME_EXTENSIONS_DIRNAME = 'chrome_extensions'
    _COOKIES_FILENAME = 'cookies.txt'

    # Chrome process-singleton state files removed by cleanup_chrome().
    _CHROME_SINGLETON_FILENAMES = frozenset({'SingletonLock', 'SingletonSocket'})

    name = models.CharField(max_length=64, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)

    class Meta:
        app_label = 'personas'

    def __str__(self) -> str:
        """Return the persona's name."""
        return self.name

    @property
    def path(self) -> Path:
        """Path to persona directory under PERSONAS_DIR."""
        # Imported lazily inside the property rather than at module level.
        from archivebox.config.constants import CONSTANTS

        # NOTE(review): `name` is used verbatim as a path component, so a name
        # containing path separators or '..' would resolve to a nested/outside
        # location -- consider validating names where personas are created.
        return CONSTANTS.PERSONAS_DIR / self.name

    @property
    def CHROME_USER_DATA_DIR(self) -> str:
        """Derived path to Chrome user data directory for this persona."""
        return str(self.path / self._CHROME_USER_DATA_DIRNAME)

    @property
    def CHROME_EXTENSIONS_DIR(self) -> str:
        """Derived path to Chrome extensions directory for this persona."""
        return str(self.path / self._CHROME_EXTENSIONS_DIRNAME)

    @property
    def COOKIES_FILE(self) -> str:
        """
        Derived path to cookies.txt file for this persona.

        Returns:
            The path as a string if the file currently exists, otherwise ''.
        """
        cookies_path = self.path / self._COOKIES_FILENAME
        return str(cookies_path) if cookies_path.exists() else ''

    def get_derived_config(self) -> dict:
        """
        Get config dict with derived paths filled in.

        Returns dict with:
        - All values from self.config JSONField
        - CHROME_USER_DATA_DIR (derived from persona path)
        - CHROME_EXTENSIONS_DIR (derived from persona path)
        - COOKIES_FILE (derived from persona path, if file exists)
        - ACTIVE_PERSONA (set to this persona's name)
        """
        # Copy so the stored JSON config is never mutated by the additions below.
        derived = dict(self.config or {})

        # Add derived paths (don't override if explicitly set in config)
        for key in ('CHROME_USER_DATA_DIR', 'CHROME_EXTENSIONS_DIR'):
            if key not in derived:
                derived[key] = getattr(self, key)

        if 'COOKIES_FILE' not in derived:
            # Read the property once: it stats the filesystem on every access,
            # and reading it twice could yield '' the second time if the file
            # is removed between the check and the assignment.
            cookies_file = self.COOKIES_FILE
            if cookies_file:
                derived['COOKIES_FILE'] = cookies_file

        # Always set ACTIVE_PERSONA to this persona's name
        derived['ACTIVE_PERSONA'] = self.name

        return derived

    def ensure_dirs(self) -> None:
        """Create persona directories if they don't exist."""
        persona_dir = self.path
        for directory in (
            persona_dir,
            persona_dir / self._CHROME_USER_DATA_DIRNAME,
            persona_dir / self._CHROME_EXTENSIONS_DIRNAME,
        ):
            directory.mkdir(parents=True, exist_ok=True)

    def cleanup_chrome(self) -> bool:
        """
        Clean up Chrome state files (SingletonLock, etc.) for this persona.

        Walks the persona's chrome_user_data directory once and removes every
        entry named SingletonLock or SingletonSocket. Removal is best-effort:
        entries that cannot be deleted are skipped.

        Returns:
            True if cleanup was performed, False if no cleanup needed
        """
        import os

        chrome_dir = self.path / self._CHROME_USER_DATA_DIRNAME
        if not chrome_dir.exists():
            return False

        cleaned = False
        # os.walk() reports directory entries by name without resolving them,
        # so dangling symlinks are included. Chrome on POSIX normally creates
        # SingletonLock as a symlink whose target is not a real file, and a
        # literal-name Path.glob() pattern can skip such entries on older
        # Python versions because it checks existence by following the link.
        for dirpath, _dirnames, filenames in os.walk(chrome_dir):
            for filename in self._CHROME_SINGLETON_FILENAMES.intersection(filenames):
                try:
                    Path(dirpath, filename).unlink()
                except OSError:
                    # Best-effort: the file may already be gone or may not be
                    # removable by this user; leave it and keep going.
                    continue
                cleaned = True

        return cleaned

    @classmethod
    def get_or_create_default(cls) -> 'Persona':
        """Get or create the Default persona."""
        persona, _ = cls.objects.get_or_create(name='Default')
        return persona

    @classmethod
    def cleanup_chrome_all(cls) -> int:
        """
        Clean up Chrome state files for all personas.

        Returns:
            Number of personas for which cleanup_chrome() removed something.
        """
        return sum(1 for persona in cls.objects.all() if persona.cleanup_chrome())
|