Add Persona class with cleanup_chrome() method

- Create Persona class in personas/models.py for managing browser
  profiles/identities used for archiving sessions

- Each Persona has:
  - chrome_user_data_dir: Chrome profile directory
  - chrome_extensions_dir: Installed extensions
  - cookies_file: Cookies for wget/curl
  - config_file: Persona-specific config overrides

- Add Persona methods:
  - cleanup_chrome(): Remove stale SingletonLock/SingletonSocket files
  - get_config(): Load persona config from config.json
  - save_config(): Save persona config to config.json
  - ensure_dirs(): Create persona directory structure
  - all(): Iterator over all personas
  - get_active(): Get persona based on ACTIVE_PERSONA config
  - cleanup_chrome_all(): Clean up all personas

- Update chrome_cleanup() in misc/util.py to use Persona.cleanup_chrome_all()
  instead of manual directory iteration

- Add convenience functions:
  - cleanup_chrome_for_persona(name)
  - cleanup_chrome_all_personas()
This commit is contained in:
Claude
2025-12-31 00:59:37 +00:00
parent 1a86789523
commit 503a2f77cb
2 changed files with 254 additions and 77 deletions

View File

@@ -482,22 +482,25 @@ def chrome_cleanup():
"""
Cleans up any state or runtime files that Chrome leaves behind when killed by
a timeout or other error. Handles:
- Persona-based chrome_user_data directories (from ACTIVE_PERSONA)
- Explicit CHROME_USER_DATA_DIR
- All persona chrome_user_data directories (via Persona.cleanup_chrome_all())
- Explicit CHROME_USER_DATA_DIR from config
- Legacy Docker chromium path
"""
import os
from pathlib import Path
from archivebox.config.permissions import IN_DOCKER
# Clean up persona-based user data directories
# Clean up all persona chrome directories using Persona class
try:
from archivebox.personas.models import Persona
# Clean up all personas
Persona.cleanup_chrome_all()
# Also clean up the active persona's explicit CHROME_USER_DATA_DIR if set
# (in case it's a custom path not under PERSONAS_DIR)
from archivebox.config.configset import get_config
from archivebox.config.constants import CONSTANTS
config = get_config()
# Clean up the active persona's chrome_user_data SingletonLock
chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR')
if chrome_user_data_dir:
singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock'
@@ -506,24 +509,10 @@ def chrome_cleanup():
singleton_lock.unlink()
except OSError:
pass
# Clean up all persona directories
personas_dir = CONSTANTS.PERSONAS_DIR
if personas_dir.exists():
for persona_dir in personas_dir.iterdir():
if not persona_dir.is_dir():
continue
user_data_dir = persona_dir / 'chrome_user_data'
singleton_lock = user_data_dir / 'SingletonLock'
if singleton_lock.exists():
try:
singleton_lock.unlink()
except OSError:
pass
except Exception:
pass # Config not available during early startup
pass # Persona/config not available during early startup
# Legacy Docker cleanup
# Legacy Docker cleanup (for backwards compatibility)
if IN_DOCKER:
singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"
if os.path.lexists(singleton_lock):

View File

@@ -1,59 +1,247 @@
# from django.db import models
"""
Persona management for ArchiveBox.
# from django.conf import settings
A Persona represents a browser profile/identity used for archiving.
Each persona has its own:
- Chrome user data directory (for cookies, localStorage, extensions, etc.)
- Chrome extensions directory
- Cookies file
- Config overrides
Personas are stored as directories under PERSONAS_DIR (default: data/personas/).
"""
__package__ = 'archivebox.personas'
from pathlib import Path
from typing import Optional, Dict, Any, Iterator
# class Persona(models.Model):
# """Aka a "SessionType", its a template for a crawler browsing session containing some config."""
class Persona:
    """
    Represents a browser persona/profile for archiving sessions.

    Each persona is a directory under PERSONAS_DIR containing:
    - chrome_user_data/     Chrome profile directory
    - chrome_extensions/    Installed extensions
    - cookies.txt           Cookies file for wget/curl
    - config.json           Persona-specific config overrides

    Usage:
        persona = Persona('Default')
        persona.cleanup_chrome()

        # Or iterate all personas:
        for persona in Persona.all():
            persona.cleanup_chrome()
    """

    # Names of the state files cleanup_chrome() removes from a profile tree.
    _CHROME_SINGLETON_FILES = ('SingletonLock', 'SingletonSocket')

    def __init__(self, name: str, personas_dir: Optional[Path] = None):
        """
        Initialize a Persona by name.

        Args:
            name: Persona name (directory name under PERSONAS_DIR)
            personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)
        """
        self.name = name
        if personas_dir is None:
            # Imported lazily so the class can be used with an explicit
            # personas_dir even when the archivebox config is not importable.
            from archivebox.config.constants import CONSTANTS
            personas_dir = CONSTANTS.PERSONAS_DIR
        self.personas_dir = Path(personas_dir)
        self.path = self.personas_dir / name

    @property
    def chrome_user_data_dir(self) -> Path:
        """Path to Chrome user data directory for this persona."""
        return self.path / 'chrome_user_data'

    @property
    def chrome_extensions_dir(self) -> Path:
        """Path to Chrome extensions directory for this persona."""
        return self.path / 'chrome_extensions'

    @property
    def cookies_file(self) -> Path:
        """Path to cookies.txt file for this persona."""
        return self.path / 'cookies.txt'

    @property
    def config_file(self) -> Path:
        """Path to config.json file for this persona."""
        return self.path / 'config.json'

    @property
    def singleton_lock(self) -> Path:
        """Path to Chrome's SingletonLock file."""
        return self.chrome_user_data_dir / 'SingletonLock'

    def exists(self) -> bool:
        """Check if persona directory exists."""
        return self.path.is_dir()

    def ensure_dirs(self) -> None:
        """Create persona directories if they don't exist."""
        self.path.mkdir(parents=True, exist_ok=True)
        self.chrome_user_data_dir.mkdir(parents=True, exist_ok=True)
        self.chrome_extensions_dir.mkdir(parents=True, exist_ok=True)

    def cleanup_chrome(self) -> bool:
        """
        Clean up Chrome state files for this persona.

        Removes stale SingletonLock / SingletonSocket entries anywhere under
        the persona's chrome_user_data directory, so Chrome can start fresh
        after a crash or kill.

        Returns:
            True if at least one entry was removed, False otherwise
        """
        import os

        cleaned = False
        # One pass over the tree. os.walk() silently yields nothing when the
        # directory is missing, so no separate existence check is needed.
        for root, _dirnames, _filenames in os.walk(self.chrome_user_data_dir):
            for stale_name in self._CHROME_SINGLETON_FILES:
                # Unlink directly instead of testing Path.exists() first:
                # exists() follows symlinks, and Chrome typically creates
                # these entries as symlinks whose target is not a real file
                # (NOTE(review): confirm on non-POSIX hosts). An existence
                # test would therefore skip exactly the stale locks we want.
                try:
                    os.unlink(os.path.join(root, stale_name))
                except OSError:
                    # Not present here, or not removable (e.g. permissions).
                    continue
                cleaned = True
        return cleaned

    def get_config(self) -> Dict[str, Any]:
        """
        Load persona-specific config overrides from config.json.

        Returns:
            Dict of config overrides, or an empty dict if the file is
            missing, unreadable, not valid UTF-8 JSON, or does not contain
            a JSON object at the top level
        """
        import json

        try:
            loaded = json.loads(self.config_file.read_text(encoding='utf-8'))
        except (OSError, ValueError):
            # OSError: missing/unreadable file.
            # ValueError: json.JSONDecodeError and UnicodeDecodeError.
            return {}
        # Callers are promised a dict; ignore e.g. a top-level list or string.
        return loaded if isinstance(loaded, dict) else {}

    def save_config(self, config: Dict[str, Any]) -> None:
        """
        Save persona-specific config overrides to config.json.

        Args:
            config: Dict of config overrides to save
        """
        import json

        self.ensure_dirs()
        self.config_file.write_text(json.dumps(config, indent=2), encoding='utf-8')

    @classmethod
    def all(cls, personas_dir: Optional[Path] = None) -> Iterator['Persona']:
        """
        Iterate over all personas in PERSONAS_DIR.

        Args:
            personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)

        Yields:
            Persona instances for each persona directory, in sorted path order
        """
        if personas_dir is None:
            from archivebox.config.constants import CONSTANTS
            personas_dir = CONSTANTS.PERSONAS_DIR
        personas_dir = Path(personas_dir)
        # is_dir() (rather than exists()) also covers the case where the
        # path is a regular file, which iterdir() would raise on.
        if not personas_dir.is_dir():
            return
        # Sorted so callers see a stable order regardless of the filesystem.
        for persona_path in sorted(personas_dir.iterdir()):
            if persona_path.is_dir():
                yield cls(persona_path.name, personas_dir)

    @classmethod
    def get_active(cls) -> 'Persona':
        """
        Get the currently active persona based on ACTIVE_PERSONA config.

        Returns:
            Persona instance for the active persona ('Default' when
            ACTIVE_PERSONA is unset or empty)
        """
        from archivebox.config.configset import get_config

        config = get_config()
        # `or` instead of a .get() default: a key that is present but
        # empty/None must also fall back, otherwise the persona path would
        # collapse onto PERSONAS_DIR itself (or fail on None).
        active_name = config.get('ACTIVE_PERSONA') or 'Default'
        return cls(active_name)

    @classmethod
    def cleanup_chrome_all(cls, personas_dir: Optional[Path] = None) -> int:
        """
        Clean up Chrome state files for all personas.

        Args:
            personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)

        Returns:
            Number of personas that had cleanup performed
        """
        cleaned_count = 0
        for persona in cls.all(personas_dir):
            if persona.cleanup_chrome():
                cleaned_count += 1
        return cleaned_count

    def __str__(self) -> str:
        return f"Persona({self.name})"

    def __repr__(self) -> str:
        return f"Persona(name={self.name!r}, path={self.path!r})"
# Convenience functions for use without instantiating Persona class
def cleanup_chrome_for_persona(name: str, personas_dir: Optional[Path] = None) -> bool:
    """
    Run Chrome state cleanup for a single persona, looked up by name.

    Args:
        name: Persona name
        personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)

    Returns:
        Whatever Persona.cleanup_chrome() reports: True if cleanup was
        performed, False if no cleanup was needed
    """
    persona = Persona(name, personas_dir)
    return persona.cleanup_chrome()
def cleanup_chrome_all_personas(personas_dir: Optional[Path] = None) -> int:
    """
    Run Chrome state cleanup for every persona.

    Thin module-level wrapper around Persona.cleanup_chrome_all().

    Args:
        personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)

    Returns:
        Number of personas that had cleanup performed
    """
    cleaned_total = Persona.cleanup_chrome_all(personas_dir)
    return cleaned_total