mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-05 10:26:03 +10:00
Add Persona class with cleanup_chrome() method
- Create Persona class in personas/models.py for managing browser profiles/identities used for archiving sessions
- Each Persona has:
  - chrome_user_data_dir: Chrome profile directory
  - chrome_extensions_dir: Installed extensions
  - cookies_file: Cookies for wget/curl
  - config_file: Persona-specific config overrides
- Add Persona methods:
  - cleanup_chrome(): Remove stale SingletonLock/SingletonSocket files
  - get_config(): Load persona config from config.json
  - save_config(): Save persona config to config.json
  - ensure_dirs(): Create persona directory structure
  - all(): Iterator over all personas
  - get_active(): Get persona based on ACTIVE_PERSONA config
  - cleanup_chrome_all(): Clean up all personas
- Update chrome_cleanup() in misc/util.py to use Persona.cleanup_chrome_all() instead of manual directory iteration
- Add convenience functions:
  - cleanup_chrome_for_persona(name)
  - cleanup_chrome_all_personas()
This commit is contained in:
@@ -482,22 +482,25 @@ def chrome_cleanup():
|
||||
"""
|
||||
Cleans up any state or runtime files that Chrome leaves behind when killed by
|
||||
a timeout or other error. Handles:
|
||||
- Persona-based chrome_user_data directories (from ACTIVE_PERSONA)
|
||||
- Explicit CHROME_USER_DATA_DIR
|
||||
- All persona chrome_user_data directories (via Persona.cleanup_chrome_all())
|
||||
- Explicit CHROME_USER_DATA_DIR from config
|
||||
- Legacy Docker chromium path
|
||||
"""
|
||||
import os
|
||||
from pathlib import Path
|
||||
from archivebox.config.permissions import IN_DOCKER
|
||||
|
||||
# Clean up persona-based user data directories
|
||||
# Clean up all persona chrome directories using Persona class
|
||||
try:
|
||||
from archivebox.personas.models import Persona
|
||||
|
||||
# Clean up all personas
|
||||
Persona.cleanup_chrome_all()
|
||||
|
||||
# Also clean up the active persona's explicit CHROME_USER_DATA_DIR if set
|
||||
# (in case it's a custom path not under PERSONAS_DIR)
|
||||
from archivebox.config.configset import get_config
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
config = get_config()
|
||||
|
||||
# Clean up the active persona's chrome_user_data SingletonLock
|
||||
chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR')
|
||||
if chrome_user_data_dir:
|
||||
singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock'
|
||||
@@ -506,24 +509,10 @@ def chrome_cleanup():
|
||||
singleton_lock.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# Clean up all persona directories
|
||||
personas_dir = CONSTANTS.PERSONAS_DIR
|
||||
if personas_dir.exists():
|
||||
for persona_dir in personas_dir.iterdir():
|
||||
if not persona_dir.is_dir():
|
||||
continue
|
||||
user_data_dir = persona_dir / 'chrome_user_data'
|
||||
singleton_lock = user_data_dir / 'SingletonLock'
|
||||
if singleton_lock.exists():
|
||||
try:
|
||||
singleton_lock.unlink()
|
||||
except OSError:
|
||||
pass
|
||||
except Exception:
|
||||
pass # Config not available during early startup
|
||||
pass # Persona/config not available during early startup
|
||||
|
||||
# Legacy Docker cleanup
|
||||
# Legacy Docker cleanup (for backwards compatibility)
|
||||
if IN_DOCKER:
|
||||
singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"
|
||||
if os.path.lexists(singleton_lock):
|
||||
|
||||
@@ -1,59 +1,247 @@
|
||||
# from django.db import models
|
||||
"""
|
||||
Persona management for ArchiveBox.
|
||||
|
||||
# from django.conf import settings
|
||||
A Persona represents a browser profile/identity used for archiving.
|
||||
Each persona has its own:
|
||||
- Chrome user data directory (for cookies, localStorage, extensions, etc.)
|
||||
- Chrome extensions directory
|
||||
- Cookies file
|
||||
- Config overrides
|
||||
|
||||
Personas are stored as directories under PERSONAS_DIR (default: data/personas/).
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.personas'
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, Iterator
|
||||
|
||||
|
||||
# class Persona(models.Model):
|
||||
# """Aka a "SessionType", its a template for a crawler browsing session containing some config."""
|
||||
class Persona:
    """
    Represents a browser persona/profile for archiving sessions.

    Each persona is a directory containing:
    - chrome_user_data/      Chrome profile directory
    - chrome_extensions/     Installed extensions
    - cookies.txt            Cookies file for wget/curl
    - config.json            Persona-specific config overrides

    Usage:
        persona = Persona('Default')
        persona.cleanup_chrome()

        # Or iterate all personas:
        for persona in Persona.all():
            persona.cleanup_chrome()
    """

    def __init__(self, name: str, personas_dir: Optional[Path] = None):
        """
        Initialize a Persona by name.

        Constructing a Persona never touches the filesystem; use exists()
        or ensure_dirs() to check for / create the directory.

        Args:
            name: Persona name (directory name under PERSONAS_DIR)
            personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)
        """
        self.name = name

        if personas_dir is None:
            # Imported lazily so the class can be used with an explicit
            # personas_dir without loading the archivebox config.
            from archivebox.config.constants import CONSTANTS
            personas_dir = CONSTANTS.PERSONAS_DIR

        self.personas_dir = Path(personas_dir)
        self.path = self.personas_dir / name

    @property
    def chrome_user_data_dir(self) -> Path:
        """Path to Chrome user data directory for this persona."""
        return self.path / 'chrome_user_data'

    @property
    def chrome_extensions_dir(self) -> Path:
        """Path to Chrome extensions directory for this persona."""
        return self.path / 'chrome_extensions'

    @property
    def cookies_file(self) -> Path:
        """Path to cookies.txt file for this persona."""
        return self.path / 'cookies.txt'

    @property
    def config_file(self) -> Path:
        """Path to config.json file for this persona."""
        return self.path / 'config.json'

    @property
    def singleton_lock(self) -> Path:
        """Path to Chrome's SingletonLock file."""
        return self.chrome_user_data_dir / 'SingletonLock'

    def exists(self) -> bool:
        """Check if persona directory exists."""
        return self.path.is_dir()

    def ensure_dirs(self) -> None:
        """Create the persona directory and its chrome subdirectories if missing."""
        self.path.mkdir(parents=True, exist_ok=True)
        self.chrome_user_data_dir.mkdir(parents=True, exist_ok=True)
        self.chrome_extensions_dir.mkdir(parents=True, exist_ok=True)

    def cleanup_chrome(self) -> bool:
        """
        Clean up Chrome state files for this persona.

        Removes stale SingletonLock / SingletonSocket entries anywhere under
        the persona's chrome_user_data directory, as left behind when Chrome
        crashes or is killed unexpectedly. This allows Chrome to start fresh.

        Returns:
            True if at least one entry was removed, False otherwise
            (including when the user data directory does not exist).
        """
        import os

        stale_names = ('SingletonLock', 'SingletonSocket')
        cleaned = False

        # NOTE: Chrome's SingletonLock is normally a *dangling* symlink, so
        # existence checks that follow symlinks (Path.exists()) report it as
        # missing. os.walk lists directory entries without resolving them,
        # and yields nothing when the directory is absent.
        for dirpath, dirnames, filenames in os.walk(self.chrome_user_data_dir):
            # A symlink that points at a directory is listed in dirnames,
            # so both lists are checked.
            for entry in (*dirnames, *filenames):
                if entry not in stale_names:
                    continue
                try:
                    os.unlink(os.path.join(dirpath, entry))
                except OSError:
                    continue  # Real directory, already gone, or not removable
                cleaned = True

        return cleaned

    def get_config(self) -> Dict[str, Any]:
        """
        Load persona-specific config overrides from config.json.

        Returns:
            Dict of config overrides, or empty dict if the config file is
            missing, unreadable, not valid JSON, or not a JSON object.
        """
        import json

        try:
            loaded = json.loads(self.config_file.read_text(encoding='utf-8'))
        except (OSError, ValueError):
            # OSError: missing/unreadable file.
            # ValueError: json.JSONDecodeError or UnicodeDecodeError.
            return {}

        return loaded if isinstance(loaded, dict) else {}

    def save_config(self, config: Dict[str, Any]) -> None:
        """
        Save persona-specific config overrides to config.json.

        Creates the persona directory structure first if it does not exist.

        Args:
            config: Dict of config overrides to save
        """
        import json

        self.ensure_dirs()
        self.config_file.write_text(json.dumps(config, indent=2), encoding='utf-8')

    @classmethod
    def all(cls, personas_dir: Optional[Path] = None) -> Iterator['Persona']:
        """
        Iterate over all personas in PERSONAS_DIR.

        Args:
            personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)

        Yields:
            Persona instances for each subdirectory, in sorted path order.
            Yields nothing if personas_dir is missing or is not a directory.
        """
        if personas_dir is None:
            from archivebox.config.constants import CONSTANTS
            personas_dir = CONSTANTS.PERSONAS_DIR

        personas_dir = Path(personas_dir)

        if not personas_dir.is_dir():
            return

        for persona_path in sorted(personas_dir.iterdir()):
            if persona_path.is_dir():
                yield cls(persona_path.name, personas_dir)

    @classmethod
    def get_active(cls) -> 'Persona':
        """
        Get the currently active persona based on ACTIVE_PERSONA config.

        Falls back to 'Default' when ACTIVE_PERSONA is unset or empty.

        Returns:
            Persona instance for the active persona
        """
        from archivebox.config.configset import get_config

        active_name = get_config().get('ACTIVE_PERSONA') or 'Default'
        return cls(active_name)

    @classmethod
    def cleanup_chrome_all(cls, personas_dir: Optional[Path] = None) -> int:
        """
        Clean up Chrome state files for all personas.

        Args:
            personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)

        Returns:
            Number of personas that had cleanup performed
        """
        return sum(1 for persona in cls.all(personas_dir) if persona.cleanup_chrome())

    def __str__(self) -> str:
        return f"Persona({self.name})"

    def __repr__(self) -> str:
        return f"Persona(name={self.name!r}, path={self.path!r})"
|
||||
|
||||
|
||||
# Convenience functions for use without instantiating Persona class
|
||||
|
||||
def cleanup_chrome_for_persona(name: str, personas_dir: Optional[Path] = None) -> bool:
    """
    Remove stale Chrome state files for a single named persona.

    Thin wrapper around Persona(name, personas_dir).cleanup_chrome().

    Args:
        name: Persona name
        personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)

    Returns:
        True if cleanup was performed, False if no cleanup needed
    """
    persona = Persona(name, personas_dir)
    return persona.cleanup_chrome()
|
||||
|
||||
|
||||
def cleanup_chrome_all_personas(personas_dir: Optional[Path] = None) -> int:
    """
    Remove stale Chrome state files for every persona.

    Thin wrapper around Persona.cleanup_chrome_all().

    Args:
        personas_dir: Override PERSONAS_DIR (defaults to CONSTANTS.PERSONAS_DIR)

    Returns:
        Number of personas that had cleanup performed
    """
    cleaned_count = Persona.cleanup_chrome_all(personas_dir)
    return cleaned_count
|
||||
|
||||
Reference in New Issue
Block a user