Files
ArchiveBox/archivebox/personas/models.py
Nick Sweeting b749b26c5d wip
2026-03-23 03:58:32 -07:00

292 lines
10 KiB
Python

"""
Persona management for ArchiveBox.
A Persona represents a browser profile/identity used for archiving.
Each persona has its own:
- Chrome user data directory (for cookies, localStorage, extensions, etc.)
- Chrome extensions directory
- Cookies file
- Config overrides
"""
__package__ = "archivebox.personas"
import shutil
import subprocess
import sys
from contextlib import contextmanager
from pathlib import Path
from typing import TYPE_CHECKING, Any
from django.db import models
from django.conf import settings
from django.utils import timezone
from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk
from archivebox.uuid_compat import uuid7
_fcntl: Any | None = None
try:
import fcntl as _fcntl_import
except ImportError: # pragma: no cover
pass
else:
_fcntl = _fcntl_import
if TYPE_CHECKING:
import fcntl
else:
fcntl = _fcntl
# Directory names that Persona.cleanup_chrome_profile() removes, at any depth,
# from a Chrome profile directory.
VOLATILE_PROFILE_DIR_NAMES = {
    "BrowserMetrics",
    "Cache",
    "Code Cache",
    "Crashpad",
    "GCM Store",
    "GPUCache",
    "Service Worker",
    "ShaderCache",
}

# File names that Persona.cleanup_chrome_profile() unlinks, at any depth,
# from a Chrome profile directory.
VOLATILE_PROFILE_FILE_NAMES = {
    "BrowserMetrics-spare.pma",
    "SingletonCookie",
    "SingletonLock",
    "SingletonSocket",
}
class Persona(ModelWithConfig):
    """
    Browser persona/profile for archiving sessions.

    Each persona provides:
    - CHROME_USER_DATA_DIR: Chrome profile directory
    - CHROME_EXTENSIONS_DIR: Installed extensions directory
    - CHROME_DOWNLOADS_DIR: Chrome downloads directory
    - COOKIES_FILE: Cookies file for wget/curl
    - AUTH_STORAGE_FILE: auth.json file (only when it exists)
    - config: JSON field with persona-specific config overrides

    Usage:
        # Get persona and its derived config
        config = get_config(persona=crawl.persona, crawl=crawl, snapshot=snapshot)
        chrome_dir = config['CHROME_USER_DATA_DIR']

        # Or access directly from persona
        persona = Persona.objects.get(name='Default')
        persona.CHROME_USER_DATA_DIR  # -> Path to chrome_user_data
    """

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    name = models.CharField(max_length=64, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)

    class Meta(ModelWithConfig.Meta):
        app_label = "personas"

    def __str__(self) -> str:
        return self.name

    @property
    def path(self) -> Path:
        """Path to persona directory under PERSONAS_DIR."""
        from archivebox.config.constants import CONSTANTS

        # NOTE(review): the directory name is the raw ``name`` field; confirm
        # that names containing path separators are rejected before they get here.
        return CONSTANTS.PERSONAS_DIR / self.name

    @property
    def CHROME_USER_DATA_DIR(self) -> str:
        """Derived path to Chrome user data directory for this persona."""
        return str(self.path / "chrome_user_data")

    @property
    def CHROME_EXTENSIONS_DIR(self) -> str:
        """Derived path to Chrome extensions directory for this persona."""
        return str(self.path / "chrome_extensions")

    @property
    def CHROME_DOWNLOADS_DIR(self) -> str:
        """Derived path to Chrome downloads directory for this persona."""
        return str(self.path / "chrome_downloads")

    @property
    def COOKIES_FILE(self) -> str:
        """Derived path to cookies.txt for this persona, or "" if the file does not exist."""
        cookies_path = self.path / "cookies.txt"
        return str(cookies_path) if cookies_path.exists() else ""

    @property
    def AUTH_STORAGE_FILE(self) -> str:
        """Derived path to auth.json for this persona, or "" if the file does not exist."""
        auth_path = self.path / "auth.json"
        return str(auth_path) if auth_path.exists() else ""

    def get_derived_config(self) -> dict:
        """
        Get config dict with derived paths filled in.

        Returns dict with:
        - All values from self.config JSONField
        - CHROME_USER_DATA_DIR (derived from persona path)
        - CHROME_EXTENSIONS_DIR (derived from persona path)
        - CHROME_DOWNLOADS_DIR (derived from persona path)
        - COOKIES_FILE (derived from persona path, if file exists)
        - AUTH_STORAGE_FILE (derived from persona path, if file exists)
        - ACTIVE_PERSONA (set to this persona's name)
        """
        derived = dict(self.config or {})

        # Add derived paths (don't override if explicitly set in config)
        for key in ("CHROME_USER_DATA_DIR", "CHROME_EXTENSIONS_DIR", "CHROME_DOWNLOADS_DIR"):
            if key not in derived:
                derived[key] = getattr(self, key)

        # These properties stat the filesystem, so read each one once and reuse
        # the value instead of checking and then re-evaluating it.
        for key in ("COOKIES_FILE", "AUTH_STORAGE_FILE"):
            if key not in derived:
                value = getattr(self, key)
                if value:
                    derived[key] = value

        # Always set ACTIVE_PERSONA to this persona's name
        derived["ACTIVE_PERSONA"] = self.name
        return derived

    def ensure_dirs(self) -> None:
        """Create persona directories if they don't exist."""
        self.path.mkdir(parents=True, exist_ok=True)
        (self.path / "chrome_user_data").mkdir(parents=True, exist_ok=True)
        (self.path / "chrome_extensions").mkdir(parents=True, exist_ok=True)
        (self.path / "chrome_downloads").mkdir(parents=True, exist_ok=True)

    def cleanup_chrome_profile(self, profile_dir: Path) -> bool:
        """
        Remove volatile Chrome state that should never be reused across launches.

        Deletes, at any depth below ``profile_dir``:
        - directories named in VOLATILE_PROFILE_DIR_NAMES
        - entries named in VOLATILE_PROFILE_FILE_NAMES
        - entries matching ``*.log``

        Deletion is best-effort: failures to remove an individual entry are ignored.

        Returns True if anything was removed (or a volatile directory was found
        and its removal attempted), False otherwise or if ``profile_dir`` does
        not exist.
        """
        if not profile_dir.exists():
            return False

        # Take a filtered snapshot in a single traversal *before* deleting
        # anything, so the tree is never mutated underneath a live rglob()
        # generator and is walked once instead of once per pattern.
        candidates = [
            path
            for path in profile_dir.rglob("*")
            if path.name in VOLATILE_PROFILE_DIR_NAMES
            or path.name in VOLATILE_PROFILE_FILE_NAMES
            or path.match("*.log")
        ]

        cleaned = False
        for path in candidates:
            # is_dir() is evaluated at deletion time: entries that lived inside
            # an already-removed volatile directory simply no longer exist.
            if path.name in VOLATILE_PROFILE_DIR_NAMES and path.is_dir():
                shutil.rmtree(path, ignore_errors=True)
                cleaned = True
            if path.name in VOLATILE_PROFILE_FILE_NAMES or path.match("*.log"):
                try:
                    path.unlink()
                except OSError:
                    # Already gone, a directory, or not removable: best-effort.
                    continue
                cleaned = True
        return cleaned

    def cleanup_chrome(self) -> bool:
        """Clean up volatile Chrome state for this persona's base profile."""
        return self.cleanup_chrome_profile(self.path / "chrome_user_data")

    @contextmanager
    def lock_runtime_for_crawl(self):
        """
        Context manager holding an exclusive ``flock`` on this persona's
        ``.archivebox-crawl-profile.lock`` file for the duration of the block.

        The lock file (and the persona directory) is created if missing. When
        ``fcntl`` is unavailable no lock is taken and the block runs unguarded.
        """
        lock_path = self.path / ".archivebox-crawl-profile.lock"
        lock_path.parent.mkdir(parents=True, exist_ok=True)
        with lock_path.open("w") as lock_file:
            if fcntl is not None:
                fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
            try:
                yield
            finally:
                if fcntl is not None:
                    fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)

    def runtime_root_for_crawl(self, crawl) -> Path:
        """Per-crawl runtime directory for this persona: ``<crawl.output_dir>/.persona/<name>``."""
        return Path(crawl.output_dir) / ".persona" / self.name

    def runtime_profile_dir_for_crawl(self, crawl) -> Path:
        """Chrome user data directory inside this persona's per-crawl runtime root."""
        return self.runtime_root_for_crawl(crawl) / "chrome_user_data"

    def runtime_downloads_dir_for_crawl(self, crawl) -> Path:
        """Chrome downloads directory inside this persona's per-crawl runtime root."""
        return self.runtime_root_for_crawl(crawl) / "chrome_downloads"

    def copy_chrome_profile(self, source_dir: Path, destination_dir: Path) -> None:
        """
        Replace ``destination_dir`` with a copy of the contents of ``source_dir``.

        Any existing ``destination_dir`` is removed first. On macOS and Linux
        the copy is attempted with the system ``cp``; if that is unavailable or
        exits non-zero (or on any other platform) it falls back to
        ``shutil.copytree``.

        Raises:
            Whatever ``shutil.copytree`` raises if the fallback copy fails.
        """
        destination_dir.parent.mkdir(parents=True, exist_ok=True)
        shutil.rmtree(destination_dir, ignore_errors=True)
        destination_dir.mkdir(parents=True, exist_ok=True)

        copy_cmd: list[str] | None = None
        # Trailing "/." copies the directory's contents rather than the directory itself.
        source_contents = f"{source_dir}/."
        if sys.platform == "darwin":
            # -c: ask cp to clone files (clonefile) instead of copying bytes.
            copy_cmd = ["cp", "-cR", source_contents, str(destination_dir)]
        elif sys.platform.startswith("linux"):
            # -a: archive mode (recursive, preserve attributes and links).
            copy_cmd = ["cp", "-a", source_contents, str(destination_dir)]

        if copy_cmd:
            try:
                result = subprocess.run(copy_cmd, capture_output=True, text=True)
            except OSError:
                # cp could not be launched at all; use the pure-Python fallback.
                result = None
            if result is not None and result.returncode == 0:
                return
            # Discard whatever a failed cp left behind before retrying.
            shutil.rmtree(destination_dir, ignore_errors=True)
            destination_dir.mkdir(parents=True, exist_ok=True)

        shutil.copytree(source_dir, destination_dir, symlinks=True, dirs_exist_ok=True)

    def prepare_runtime_for_crawl(self, crawl, chrome_binary: str = "") -> dict[str, str]:
        """
        Prepare this persona's per-crawl runtime directories and return config
        overrides pointing at them.

        Under the persona lock: if the runtime profile directory does not exist
        yet it is seeded from the persona's ``chrome_user_data`` template (or
        created empty when the template is missing or empty); the downloads
        directory is created; volatile Chrome state is cleaned from the runtime
        profile; and ``persona_name.txt``, ``template_dir.txt`` and (when
        ``chrome_binary`` is given) ``chrome_binary.txt`` are written into the
        runtime root.

        Returns:
            Dict with ``CHROME_USER_DATA_DIR`` and ``CHROME_DOWNLOADS_DIR``.
        """
        self.ensure_dirs()
        template_dir = Path(self.CHROME_USER_DATA_DIR)
        runtime_root = self.runtime_root_for_crawl(crawl)
        runtime_profile_dir = self.runtime_profile_dir_for_crawl(crawl)
        runtime_downloads_dir = self.runtime_downloads_dir_for_crawl(crawl)

        with self.lock_runtime_for_crawl():
            if not runtime_profile_dir.exists():
                if template_dir.exists() and any(template_dir.iterdir()):
                    try:
                        self.copy_chrome_profile(template_dir, runtime_profile_dir)
                    except Exception:
                        # Existence of the directory is what marks the profile
                        # as prepared, so never leave a half-copied one behind.
                        shutil.rmtree(runtime_profile_dir, ignore_errors=True)
                        raise
                else:
                    runtime_profile_dir.mkdir(parents=True, exist_ok=True)

            runtime_downloads_dir.mkdir(parents=True, exist_ok=True)
            self.cleanup_chrome_profile(runtime_profile_dir)

            (runtime_root / "persona_name.txt").write_text(self.name)
            (runtime_root / "template_dir.txt").write_text(str(template_dir))
            if chrome_binary:
                (runtime_root / "chrome_binary.txt").write_text(chrome_binary)

        return {
            "CHROME_USER_DATA_DIR": str(runtime_profile_dir),
            "CHROME_DOWNLOADS_DIR": str(runtime_downloads_dir),
        }

    def cleanup_runtime_for_crawl(self, crawl) -> None:
        """
        Remove the crawl's whole ``.persona`` runtime directory (best-effort).

        Note this deletes ``<crawl.output_dir>/.persona`` itself, i.e. the
        runtime roots of every persona under that crawl, not only this one's.
        """
        shutil.rmtree(Path(crawl.output_dir) / ".persona", ignore_errors=True)

    @classmethod
    def get_or_create_default(cls) -> "Persona":
        """Get or create the Default persona."""
        persona, _ = cls.objects.get_or_create(name="Default")
        return persona

    @classmethod
    def cleanup_chrome_all(cls) -> int:
        """Clean up Chrome state files for all personas; return how many personas had something cleaned."""
        cleaned = 0
        for persona in cls.objects.all():
            if persona.cleanup_chrome():
                cleaned += 1
        return cleaned