mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
292 lines
10 KiB
Python
292 lines
10 KiB
Python
"""
Persona management for ArchiveBox.

A Persona represents a browser profile/identity used for archiving.
Each persona has its own:
- Chrome user data directory (for cookies, localStorage, extensions, etc.)
- Chrome extensions directory
- Cookies file
- Config overrides
"""
|
|
|
|
# Pin this module's package name explicitly; Python consults ``__package__``
# when resolving relative imports.
__package__ = "archivebox.personas"
|
|
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
from contextlib import contextmanager
|
|
from pathlib import Path
|
|
from typing import TYPE_CHECKING, Any
|
|
|
|
from django.db import models
|
|
from django.conf import settings
|
|
from django.utils import timezone
|
|
|
|
from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk
|
|
from archivebox.uuid_compat import uuid7
|
|
|
|
_fcntl: Any | None = None
|
|
try:
|
|
import fcntl as _fcntl_import
|
|
except ImportError: # pragma: no cover
|
|
pass
|
|
else:
|
|
_fcntl = _fcntl_import
|
|
|
|
if TYPE_CHECKING:
|
|
import fcntl
|
|
else:
|
|
fcntl = _fcntl
|
|
|
|
|
|
# Directory names that Persona.cleanup_chrome_profile() removes recursively
# wherever they appear inside a Chrome profile tree.
VOLATILE_PROFILE_DIR_NAMES = {
    # caches
    "Cache",
    "Code Cache",
    "GPUCache",
    "ShaderCache",
    # other per-launch state
    "BrowserMetrics",
    "Crashpad",
    "GCM Store",
    "Service Worker",
}

# File names that Persona.cleanup_chrome_profile() unlinks wherever they
# appear inside a Chrome profile tree.
VOLATILE_PROFILE_FILE_NAMES = {
    "BrowserMetrics-spare.pma",
    # Singleton* entries
    "SingletonCookie",
    "SingletonLock",
    "SingletonSocket",
}
|
|
|
|
|
|
class Persona(ModelWithConfig):
    """
    Browser persona/profile for archiving sessions.

    Each persona provides:
    - CHROME_USER_DATA_DIR: Chrome profile directory
    - CHROME_EXTENSIONS_DIR: Installed extensions directory
    - CHROME_DOWNLOADS_DIR: Chrome downloads directory
    - COOKIES_FILE: Cookies file for wget/curl
    - AUTH_STORAGE_FILE: auth.json file (only when it exists on disk)
    - config: JSON field with persona-specific config overrides

    Usage:
        # Get persona and its derived config
        config = get_config(persona=crawl.persona, crawl=crawl, snapshot=snapshot)
        chrome_dir = config['CHROME_USER_DATA_DIR']

        # Or access directly from persona
        persona = Persona.objects.get(name='Default')
        persona.CHROME_USER_DATA_DIR  # -> path (as str) to chrome_user_data
    """

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    # The name doubles as the persona's directory name under PERSONAS_DIR (see `path`).
    name = models.CharField(max_length=64, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)

    class Meta(ModelWithConfig.Meta):
        app_label = "personas"

    def __str__(self) -> str:
        return self.name

    @property
    def path(self) -> Path:
        """Path to persona directory under PERSONAS_DIR."""
        # Imported lazily inside the property rather than at module import time.
        from archivebox.config.constants import CONSTANTS

        return CONSTANTS.PERSONAS_DIR / self.name

    @property
    def CHROME_USER_DATA_DIR(self) -> str:
        """Derived path to Chrome user data directory for this persona."""
        return str(self.path / "chrome_user_data")

    @property
    def CHROME_EXTENSIONS_DIR(self) -> str:
        """Derived path to Chrome extensions directory for this persona."""
        return str(self.path / "chrome_extensions")

    @property
    def CHROME_DOWNLOADS_DIR(self) -> str:
        """Derived path to Chrome downloads directory for this persona."""
        return str(self.path / "chrome_downloads")

    @property
    def COOKIES_FILE(self) -> str:
        """Derived path to cookies.txt for this persona, or "" if the file does not exist."""
        cookies_path = self.path / "cookies.txt"
        return str(cookies_path) if cookies_path.exists() else ""

    @property
    def AUTH_STORAGE_FILE(self) -> str:
        """Derived path to auth.json for this persona, or "" if the file does not exist."""
        auth_path = self.path / "auth.json"
        return str(auth_path) if auth_path.exists() else ""

    def get_derived_config(self) -> dict[str, Any]:
        """
        Get config dict with derived paths filled in.

        Returns dict with:
        - All values from self.config JSONField
        - CHROME_USER_DATA_DIR (derived from persona path)
        - CHROME_EXTENSIONS_DIR (derived from persona path)
        - CHROME_DOWNLOADS_DIR (derived from persona path)
        - COOKIES_FILE (derived from persona path, if file exists)
        - AUTH_STORAGE_FILE (derived from persona path, if file exists)
        - ACTIVE_PERSONA (set to this persona's name)

        Keys already present in self.config are never overridden, except
        ACTIVE_PERSONA which is always set.
        """
        derived = dict(self.config or {})

        # Add derived paths (don't override if explicitly set in config)
        if "CHROME_USER_DATA_DIR" not in derived:
            derived["CHROME_USER_DATA_DIR"] = self.CHROME_USER_DATA_DIR
        if "CHROME_EXTENSIONS_DIR" not in derived:
            derived["CHROME_EXTENSIONS_DIR"] = self.CHROME_EXTENSIONS_DIR
        if "CHROME_DOWNLOADS_DIR" not in derived:
            derived["CHROME_DOWNLOADS_DIR"] = self.CHROME_DOWNLOADS_DIR

        # The file-backed properties hit the filesystem on every access, so
        # read each one exactly once: this avoids a second stat() and can
        # never store "" if the file disappears between two reads.
        if "COOKIES_FILE" not in derived:
            cookies_file = self.COOKIES_FILE
            if cookies_file:
                derived["COOKIES_FILE"] = cookies_file
        if "AUTH_STORAGE_FILE" not in derived:
            auth_storage_file = self.AUTH_STORAGE_FILE
            if auth_storage_file:
                derived["AUTH_STORAGE_FILE"] = auth_storage_file

        # Always set ACTIVE_PERSONA to this persona's name
        derived["ACTIVE_PERSONA"] = self.name

        return derived

    def ensure_dirs(self) -> None:
        """Create persona directories if they don't exist."""
        persona_root = self.path
        persona_root.mkdir(parents=True, exist_ok=True)
        for subdir in ("chrome_user_data", "chrome_extensions", "chrome_downloads"):
            (persona_root / subdir).mkdir(parents=True, exist_ok=True)

    def cleanup_chrome_profile(self, profile_dir: Path) -> bool:
        """
        Remove volatile Chrome state that should never be reused across launches.

        Deletes, anywhere below ``profile_dir``:
        - entries named in VOLATILE_PROFILE_FILE_NAMES
        - directories named in VOLATILE_PROFILE_DIR_NAMES (recursively)
        - entries matching ``*.log``

        Deletion is best-effort: individual failures are ignored.

        Returns:
            True if at least one file was unlinked or a volatile directory
            removal was attempted, False otherwise (including when
            ``profile_dir`` does not exist).
        """
        if not profile_dir.exists():
            return False

        # Walk the tree once and snapshot every candidate *before* deleting
        # anything, so the tree is never mutated underneath a lazy rglob()
        # generator and is not re-walked once per pattern.
        candidates = [
            path
            for path in profile_dir.rglob("*")
            if path.name in VOLATILE_PROFILE_FILE_NAMES
            or path.name in VOLATILE_PROFILE_DIR_NAMES
            or path.match("*.log")
        ]

        cleaned = False

        for path in candidates:
            if path.name not in VOLATILE_PROFILE_FILE_NAMES:
                continue
            try:
                path.unlink()
            except OSError:
                continue
            cleaned = True

        for path in candidates:
            # is_dir() is also False for directories already removed together
            # with a volatile parent earlier in this loop.
            if path.name in VOLATILE_PROFILE_DIR_NAMES and path.is_dir():
                shutil.rmtree(path, ignore_errors=True)
                cleaned = True

        for path in candidates:
            if not path.match("*.log"):
                continue
            try:
                path.unlink()
            except OSError:
                # Includes logs that vanished along with a removed directory.
                continue
            cleaned = True

        return cleaned

    def cleanup_chrome(self) -> bool:
        """Clean up volatile Chrome state for this persona's base profile."""
        return self.cleanup_chrome_profile(self.path / "chrome_user_data")

    @contextmanager
    def lock_runtime_for_crawl(self):
        """
        Context manager holding an exclusive lock on this persona's lock file.

        The lock file lives at ``<persona path>/.archivebox-crawl-profile.lock``.
        When the ``fcntl`` module is unavailable no lock is taken and the body
        simply runs with the lock file open.
        """
        lock_path = self.path / ".archivebox-crawl-profile.lock"
        lock_path.parent.mkdir(parents=True, exist_ok=True)

        with lock_path.open("w") as lock_file:
            if fcntl is not None:
                fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
            try:
                yield
            finally:
                if fcntl is not None:
                    fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN)

    def runtime_root_for_crawl(self, crawl) -> Path:
        """Return ``<crawl.output_dir>/.persona/<persona name>``."""
        return Path(crawl.output_dir) / ".persona" / self.name

    def runtime_profile_dir_for_crawl(self, crawl) -> Path:
        """Return the ``chrome_user_data`` directory inside the crawl's runtime root."""
        return self.runtime_root_for_crawl(crawl) / "chrome_user_data"

    def runtime_downloads_dir_for_crawl(self, crawl) -> Path:
        """Return the ``chrome_downloads`` directory inside the crawl's runtime root."""
        return self.runtime_root_for_crawl(crawl) / "chrome_downloads"

    def copy_chrome_profile(self, source_dir: Path, destination_dir: Path) -> None:
        """
        Replace ``destination_dir`` with a copy of the contents of ``source_dir``.

        On macOS and Linux the copy is first attempted with the system ``cp``
        (``cp -cR`` / ``cp -a`` respectively). If ``cp`` cannot be launched or
        exits non-zero, or on any other platform, ``shutil.copytree`` is used.
        """
        destination_dir.parent.mkdir(parents=True, exist_ok=True)
        shutil.rmtree(destination_dir, ignore_errors=True)
        destination_dir.mkdir(parents=True, exist_ok=True)

        copy_cmd: list[str] | None = None
        # Trailing "/." makes cp copy the directory's contents, not the directory itself.
        source_contents = f"{source_dir}/."

        if sys.platform == "darwin":
            copy_cmd = ["cp", "-cR", source_contents, str(destination_dir)]
        elif sys.platform.startswith("linux"):
            copy_cmd = ["cp", "-a", source_contents, str(destination_dir)]

        if copy_cmd:
            try:
                result = subprocess.run(copy_cmd, capture_output=True, text=True)
            except OSError:
                # `cp` is missing or not executable: treat it like a failed
                # copy and use the pure-Python fallback below.
                result = None

            if result is not None and result.returncode == 0:
                return

            # A failed `cp` may have left a partial copy behind; start clean.
            shutil.rmtree(destination_dir, ignore_errors=True)
            destination_dir.mkdir(parents=True, exist_ok=True)

        shutil.copytree(source_dir, destination_dir, symlinks=True, dirs_exist_ok=True)

    def prepare_runtime_for_crawl(self, crawl, chrome_binary: str = "") -> dict[str, str]:
        """
        Prepare the per-crawl runtime copy of this persona's Chrome profile.

        While holding the persona lock, this:
        - creates the runtime profile dir, seeded from the persona's
          chrome_user_data template when that template is non-empty
        - creates the runtime downloads dir
        - strips volatile Chrome state from the runtime profile
        - writes persona_name.txt, template_dir.txt and (when ``chrome_binary``
          is non-empty) chrome_binary.txt into the runtime root

        Returns:
            Dict with CHROME_USER_DATA_DIR and CHROME_DOWNLOADS_DIR pointing
            at the runtime directories for this crawl.
        """
        self.ensure_dirs()

        template_dir = Path(self.CHROME_USER_DATA_DIR)
        runtime_root = self.runtime_root_for_crawl(crawl)
        runtime_profile_dir = self.runtime_profile_dir_for_crawl(crawl)
        runtime_downloads_dir = self.runtime_downloads_dir_for_crawl(crawl)

        with self.lock_runtime_for_crawl():
            # Only seed the runtime profile once; an existing one is reused as-is.
            if not runtime_profile_dir.exists():
                if template_dir.exists() and any(template_dir.iterdir()):
                    self.copy_chrome_profile(template_dir, runtime_profile_dir)
                else:
                    runtime_profile_dir.mkdir(parents=True, exist_ok=True)

            runtime_downloads_dir.mkdir(parents=True, exist_ok=True)
            self.cleanup_chrome_profile(runtime_profile_dir)

            (runtime_root / "persona_name.txt").write_text(self.name)
            (runtime_root / "template_dir.txt").write_text(str(template_dir))
            if chrome_binary:
                (runtime_root / "chrome_binary.txt").write_text(chrome_binary)

        return {
            "CHROME_USER_DATA_DIR": str(runtime_profile_dir),
            "CHROME_DOWNLOADS_DIR": str(runtime_downloads_dir),
        }

    def cleanup_runtime_for_crawl(self, crawl) -> None:
        """Remove the crawl's entire ``.persona`` runtime directory (errors are ignored)."""
        shutil.rmtree(Path(crawl.output_dir) / ".persona", ignore_errors=True)

    @classmethod
    def get_or_create_default(cls) -> "Persona":
        """Get or create the Default persona."""
        persona, _ = cls.objects.get_or_create(name="Default")
        return persona

    @classmethod
    def cleanup_chrome_all(cls) -> int:
        """Clean up Chrome state files for all personas.

        Returns:
            Number of personas for which cleanup_chrome() reported a cleanup.
        """
        return sum(1 for persona in cls.objects.all() if persona.cleanup_chrome())
|