Files
ArchiveBox/archivebox/personas/models.py
2026-03-22 20:25:18 -07:00

291 lines
10 KiB
Python

"""
Persona management for ArchiveBox.

A Persona represents a browser profile/identity used for archiving.
Each persona has its own:
- Chrome user data directory (for cookies, localStorage, extensions, etc.)
- Chrome extensions directory
- Chrome downloads directory
- Cookies file (cookies.txt)
- Auth storage file (auth.json)
- Config overrides
"""
__package__ = 'archivebox.personas'
import shutil
import subprocess
import sys
from contextlib import contextmanager
from pathlib import Path
from typing import TYPE_CHECKING, Any
from django.db import models
from django.conf import settings
from django.utils import timezone
from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk
from archivebox.uuid_compat import uuid7
_fcntl: Any | None = None
try:
import fcntl as _fcntl_import
except ImportError: # pragma: no cover
pass
else:
_fcntl = _fcntl_import
if TYPE_CHECKING:
import fcntl
else:
fcntl = _fcntl
# Directory names that Persona.cleanup_chrome_profile() deletes recursively.
# A directory is matched by its exact name anywhere underneath the profile dir.
VOLATILE_PROFILE_DIR_NAMES = {
    'BrowserMetrics',
    'Cache',
    'Code Cache',
    'Crashpad',
    'GCM Store',
    'GPUCache',
    'Service Worker',
    'ShaderCache',
}
# File names that Persona.cleanup_chrome_profile() unlinks.
# A file is matched by its exact name anywhere underneath the profile dir.
VOLATILE_PROFILE_FILE_NAMES = {
    'SingletonCookie',
    'SingletonLock',
    'SingletonSocket',
    'BrowserMetrics-spare.pma',
}
class Persona(ModelWithConfig):
    """
    Browser persona/profile for archiving sessions.

    Each persona provides:
    - CHROME_USER_DATA_DIR: Chrome profile directory
    - CHROME_EXTENSIONS_DIR: Installed extensions directory
    - CHROME_DOWNLOADS_DIR: Chrome downloads directory
    - COOKIES_FILE: Cookies file for wget/curl
    - AUTH_STORAGE_FILE: auth.json file (empty string when it does not exist)
    - config: JSON field with persona-specific config overrides

    Usage:
        # Get persona and its derived config
        config = get_config(persona=crawl.persona, crawl=crawl, snapshot=snapshot)
        chrome_dir = config['CHROME_USER_DATA_DIR']

        # Or access directly from persona
        persona = Persona.objects.get(name='Default')
        persona.CHROME_USER_DATA_DIR  # -> str path ending in chrome_user_data
    """

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    name = models.CharField(max_length=64, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)

    class Meta(ModelWithConfig.Meta):
        app_label = 'personas'

    def __str__(self) -> str:
        """Return the persona's name."""
        return self.name

    @property
    def path(self) -> Path:
        """Path to persona directory under PERSONAS_DIR."""
        # Imported lazily inside the property, as in the rest of this class.
        from archivebox.config.constants import CONSTANTS

        # NOTE(review): `name` is used verbatim as a path component; presumably
        # it is validated elsewhere so it cannot escape PERSONAS_DIR -- confirm.
        return CONSTANTS.PERSONAS_DIR / self.name

    @property
    def CHROME_USER_DATA_DIR(self) -> str:
        """Derived path to Chrome user data directory for this persona."""
        return str(self.path / 'chrome_user_data')

    @property
    def CHROME_EXTENSIONS_DIR(self) -> str:
        """Derived path to Chrome extensions directory for this persona."""
        return str(self.path / 'chrome_extensions')

    @property
    def CHROME_DOWNLOADS_DIR(self) -> str:
        """Derived path to Chrome downloads directory for this persona."""
        return str(self.path / 'chrome_downloads')

    @property
    def COOKIES_FILE(self) -> str:
        """Derived path to cookies.txt for this persona, or '' if the file does not exist."""
        cookies_path = self.path / 'cookies.txt'
        return str(cookies_path) if cookies_path.exists() else ''

    @property
    def AUTH_STORAGE_FILE(self) -> str:
        """Derived path to auth.json for this persona, or '' if the file does not exist."""
        auth_path = self.path / 'auth.json'
        return str(auth_path) if auth_path.exists() else ''

    def get_derived_config(self) -> dict:
        """
        Get config dict with derived paths filled in.

        Returns dict with:
        - All values from self.config JSONField
        - CHROME_USER_DATA_DIR (derived from persona path)
        - CHROME_EXTENSIONS_DIR (derived from persona path)
        - CHROME_DOWNLOADS_DIR (derived from persona path)
        - COOKIES_FILE (derived from persona path, if file exists)
        - AUTH_STORAGE_FILE (derived from persona path, if file exists)
        - ACTIVE_PERSONA (set to this persona's name)

        Values explicitly present in self.config are never overridden, except
        ACTIVE_PERSONA which is always set to this persona's name.
        """
        derived = dict(self.config or {})

        # Directory paths are always filled in unless explicitly configured.
        for key in ('CHROME_USER_DATA_DIR', 'CHROME_EXTENSIONS_DIR', 'CHROME_DOWNLOADS_DIR'):
            if key not in derived:
                derived[key] = getattr(self, key)

        # File paths are only filled in when the file exists. Each property
        # checks the filesystem, so read it once and reuse the value instead of
        # checking and then re-reading it.
        for key in ('COOKIES_FILE', 'AUTH_STORAGE_FILE'):
            if key in derived:
                continue
            value = getattr(self, key)
            if value:
                derived[key] = value

        # Always set ACTIVE_PERSONA to this persona's name
        derived['ACTIVE_PERSONA'] = self.name
        return derived

    def ensure_dirs(self) -> None:
        """Create persona directories if they don't exist."""
        persona_dir = self.path
        persona_dir.mkdir(parents=True, exist_ok=True)
        for subdir in ('chrome_user_data', 'chrome_extensions', 'chrome_downloads'):
            (persona_dir / subdir).mkdir(parents=True, exist_ok=True)

    def cleanup_chrome_profile(self, profile_dir: Path) -> bool:
        """
        Remove volatile Chrome state that should never be reused across launches.

        Deletes, anywhere under profile_dir:
        - files whose name is in VOLATILE_PROFILE_FILE_NAMES
        - directories whose name is in VOLATILE_PROFILE_DIR_NAMES
        - files matching *.log

        Deletion is best-effort: OS errors on individual paths are ignored.

        Returns:
            True if at least one file was removed or at least one volatile
            directory was found (directory removal errors are ignored, so the
            flag is set even if that removal was incomplete); False otherwise,
            including when profile_dir does not exist.
        """
        if not profile_dir.exists():
            return False

        cleaned = False

        # Each walk is snapshotted into a list before anything is deleted, so
        # the lazy rglob generator is never iterating a tree that is being
        # modified underneath it.
        volatile_files = [
            path for path in profile_dir.rglob('*')
            if path.name in VOLATILE_PROFILE_FILE_NAMES
        ]
        for path in volatile_files:
            try:
                path.unlink()
            except OSError:
                continue
            cleaned = True

        for dirname in VOLATILE_PROFILE_DIR_NAMES:
            volatile_dirs = [path for path in profile_dir.rglob(dirname) if path.is_dir()]
            for path in volatile_dirs:
                shutil.rmtree(path, ignore_errors=True)
                cleaned = True

        for path in list(profile_dir.rglob('*.log')):
            try:
                path.unlink()
            except OSError:
                continue
            cleaned = True

        return cleaned

    def cleanup_chrome(self) -> bool:
        """Clean up volatile Chrome state for this persona's base profile."""
        return self.cleanup_chrome_profile(self.path / 'chrome_user_data')

    @contextmanager
    def lock_runtime_for_crawl(self):
        """
        Context manager that holds an exclusive lock for this persona.

        The lock is an fcntl.flock() on '.archivebox-crawl-profile.lock' inside
        the persona directory (created if missing). When the fcntl module is
        unavailable no lock is taken and the body simply runs.
        """
        lock_path = self.path / '.archivebox-crawl-profile.lock'
        lock_path.parent.mkdir(parents=True, exist_ok=True)
        with lock_path.open('w') as lock_file:
            if fcntl is not None:
                # Blocks until the exclusive lock is acquired.
                fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
            try:
                yield
            finally:
                if fcntl is not None:
                    fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)

    def runtime_root_for_crawl(self, crawl) -> Path:
        """Return '<crawl.output_dir>/.persona/<persona name>' for the given crawl."""
        return Path(crawl.output_dir) / '.persona' / self.name

    def runtime_profile_dir_for_crawl(self, crawl) -> Path:
        """Return the per-crawl Chrome user data dir under the runtime root."""
        return self.runtime_root_for_crawl(crawl) / 'chrome_user_data'

    def runtime_downloads_dir_for_crawl(self, crawl) -> Path:
        """Return the per-crawl Chrome downloads dir under the runtime root."""
        return self.runtime_root_for_crawl(crawl) / 'chrome_downloads'

    def copy_chrome_profile(self, source_dir: Path, destination_dir: Path) -> None:
        """
        Replace destination_dir with a copy of the contents of source_dir.

        Any existing destination_dir is removed first. On macOS and Linux the
        system `cp` is tried first; if it cannot be started or exits non-zero,
        the destination is reset and shutil.copytree() is used instead. On
        other platforms shutil.copytree() is used directly.

        Raises:
            Whatever shutil.copytree() raises if the fallback copy fails.
        """
        destination_dir.parent.mkdir(parents=True, exist_ok=True)
        shutil.rmtree(destination_dir, ignore_errors=True)
        destination_dir.mkdir(parents=True, exist_ok=True)

        copy_cmd: list[str] | None = None
        # Trailing '/.' makes cp copy the directory's contents rather than the
        # directory itself into destination_dir.
        source_contents = f'{source_dir}/.'
        if sys.platform == 'darwin':
            # -c: ask cp to clone files instead of copying data (see macOS cp(1)).
            copy_cmd = ['cp', '-cR', source_contents, str(destination_dir)]
        elif sys.platform.startswith('linux'):
            # -a: archive mode (recursive, preserving attributes and symlinks).
            copy_cmd = ['cp', '-a', source_contents, str(destination_dir)]

        if copy_cmd:
            try:
                # Output is never inspected, so discard it rather than capturing
                # and decoding it.
                result = subprocess.run(
                    copy_cmd,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                )
            except OSError:
                # cp is missing or not executable: use the pure-Python fallback.
                pass
            else:
                if result.returncode == 0:
                    return

            # cp may have left a partial copy behind; start from a clean dir.
            shutil.rmtree(destination_dir, ignore_errors=True)
            destination_dir.mkdir(parents=True, exist_ok=True)

        shutil.copytree(source_dir, destination_dir, symlinks=True, dirs_exist_ok=True)

    def prepare_runtime_for_crawl(self, crawl, chrome_binary: str = '') -> dict[str, str]:
        """
        Prepare the per-crawl runtime Chrome profile for this persona.

        While holding the persona lock:
        - if the runtime profile dir does not exist yet, it is created as a copy
          of the persona's chrome_user_data dir (or as an empty dir when that
          template is missing or empty)
        - the runtime downloads dir is created
        - volatile Chrome state is removed from the runtime profile
        - persona_name.txt, template_dir.txt and (if chrome_binary is given)
          chrome_binary.txt are written into the runtime root

        Args:
            crawl: object exposing `output_dir`, the crawl's output directory.
            chrome_binary: optional Chrome binary path to record; skipped if ''.

        Returns:
            Dict with CHROME_USER_DATA_DIR and CHROME_DOWNLOADS_DIR pointing at
            the per-crawl runtime directories.
        """
        self.ensure_dirs()

        template_dir = Path(self.CHROME_USER_DATA_DIR)
        runtime_root = self.runtime_root_for_crawl(crawl)
        runtime_profile_dir = self.runtime_profile_dir_for_crawl(crawl)
        runtime_downloads_dir = self.runtime_downloads_dir_for_crawl(crawl)

        with self.lock_runtime_for_crawl():
            # An existing runtime profile is reused as-is and never re-copied.
            if not runtime_profile_dir.exists():
                if template_dir.exists() and any(template_dir.iterdir()):
                    self.copy_chrome_profile(template_dir, runtime_profile_dir)
                else:
                    runtime_profile_dir.mkdir(parents=True, exist_ok=True)

            runtime_downloads_dir.mkdir(parents=True, exist_ok=True)
            self.cleanup_chrome_profile(runtime_profile_dir)

            (runtime_root / 'persona_name.txt').write_text(self.name)
            (runtime_root / 'template_dir.txt').write_text(str(template_dir))
            if chrome_binary:
                (runtime_root / 'chrome_binary.txt').write_text(chrome_binary)

        return {
            'CHROME_USER_DATA_DIR': str(runtime_profile_dir),
            'CHROME_DOWNLOADS_DIR': str(runtime_downloads_dir),
        }

    def cleanup_runtime_for_crawl(self, crawl) -> None:
        """
        Remove the crawl's entire '.persona' runtime directory, ignoring errors.

        Note that this deletes '<crawl.output_dir>/.persona' as a whole, not
        only this persona's subdirectory inside it.
        """
        shutil.rmtree(Path(crawl.output_dir) / '.persona', ignore_errors=True)

    @classmethod
    def get_or_create_default(cls) -> 'Persona':
        """Get or create the Default persona."""
        persona, _ = cls.objects.get_or_create(name='Default')
        return persona

    @classmethod
    def cleanup_chrome_all(cls) -> int:
        """
        Clean up Chrome state files for all personas.

        Returns:
            Number of personas for which cleanup_chrome() reported a cleanup.
        """
        return sum(1 for persona in cls.objects.all() if persona.cleanup_chrome())