ArchiveBox/archivebox/cli/archivebox_persona.py

#!/usr/bin/env python3

"""
archivebox persona <action> [args...] [--filters]

Manage Persona records (browser profiles for archiving).

Actions:
    create  - Create Personas
    list    - List Personas as JSONL (with optional filters)
    update  - Update Personas from stdin JSONL
    delete  - Delete Personas from stdin JSONL

Examples:
    # Create a new persona
    archivebox persona create work
    archivebox persona create --import=chrome personal

    # List all personas
    archivebox persona list

    # Delete a persona
    archivebox persona list --name=old | archivebox persona delete --yes
"""

__package__ = 'archivebox.cli'
__command__ = 'archivebox persona'

import os
import sys
import shutil
import platform
import subprocess
import tempfile
from pathlib import Path
from typing import Optional, Iterable

import rich_click as click
from rich import print as rprint

from archivebox.cli.cli_utils import apply_filters


# =============================================================================
# Browser Profile Locations
# =============================================================================

def get_chrome_user_data_dir() -> Optional[Path]:
    """
    Locate an existing Chrome/Chromium user data directory for this OS.

    Known install locations are probed in priority order (Google Chrome
    before Chromium) and the first one that contains a ``Default`` profile
    folder is returned.

    Returns:
        The matching user data directory, or None on unsupported platforms
        or when no known location holds a ``Default`` profile.
    """
    os_name = platform.system()
    home_dir = Path.home()

    if os_name == 'Darwin':  # macOS
        app_support = home_dir / 'Library' / 'Application Support'
        search_paths = [
            app_support / 'Google' / 'Chrome',
            app_support / 'Chromium',
        ]
    elif os_name == 'Linux':
        config_root = home_dir / '.config'
        search_paths = [
            config_root / 'google-chrome',
            config_root / 'chromium',
            config_root / 'chrome',
            home_dir / 'snap' / 'chromium' / 'common' / 'chromium',
        ]
    elif os_name == 'Windows':
        # LOCALAPPDATA may be unset; fall back to its conventional location
        local_root = Path(os.environ.get('LOCALAPPDATA', home_dir / 'AppData' / 'Local'))
        search_paths = [
            local_root / 'Google' / 'Chrome' / 'User Data',
            local_root / 'Chromium' / 'User Data',
        ]
    else:
        search_paths = []

    # First location that both exists and has a Default profile wins
    return next(
        (path for path in search_paths if path.exists() and (path / 'Default').exists()),
        None,
    )


def get_firefox_profile_dir() -> Optional[Path]:
    """
    Get the default Firefox profile directory for the current platform.

    The per-OS profiles folder is listed once, in sorted order so the result
    is deterministic, and the best candidate is picked by this priority:

    1. a folder whose name contains ``default-release``
    2. a folder whose name contains ``default``
    3. the first remaining folder

    Returns:
        The chosen profile directory, or None if the platform is unsupported,
        the profiles folder is missing, or it contains no candidate folders.
    """
    system = platform.system()
    home = Path.home()

    if system == 'Darwin':
        profiles_dir = home / 'Library' / 'Application Support' / 'Firefox' / 'Profiles'
    elif system == 'Linux':
        profiles_dir = home / '.mozilla' / 'firefox'
    elif system == 'Windows':
        app_data = Path(os.environ.get('APPDATA', home / 'AppData' / 'Roaming'))
        profiles_dir = app_data / 'Mozilla' / 'Firefox' / 'Profiles'
    else:
        return None

    # is_dir() (rather than exists()) also guards against a regular file
    # sitting at this path, which would make iterdir() raise.
    if not profiles_dir.is_dir():
        return None

    # Bookkeeping folders Firefox may keep alongside the profiles (seen on
    # Linux, where profiles live directly in ~/.mozilla/firefox); they are
    # never profiles themselves.
    non_profile_dirs = {'crash reports', 'pending pings'}

    # Sort by name: iterdir() order is arbitrary, which would otherwise make
    # the chosen profile vary between runs/filesystems.
    profiles = sorted(
        (
            entry for entry in profiles_dir.iterdir()
            if entry.is_dir() and entry.name.lower() not in non_profile_dirs
        ),
        key=lambda entry: entry.name,
    )
    if not profiles:
        return None

    def rank(profile: Path) -> int:
        lowered = profile.name.lower()
        # Current Firefox releases use "<salt>.default-release" for the regular
        # install and may leave an unused legacy "<salt>.default" next to it.
        if 'default-release' in lowered:
            return 0
        if 'default' in lowered:
            return 1
        return 2

    # min() returns the first best-ranked entry, so ties resolve by sorted name
    return min(profiles, key=rank)


def get_brave_user_data_dir() -> Optional[Path]:
    """
    Locate an existing Brave user data directory for this OS.

    Each supported platform has a single known location; it is returned only
    if it exists and contains a ``Default`` profile folder.

    Returns:
        The user data directory, or None when the platform is unsupported or
        the location does not hold a ``Default`` profile.
    """
    os_name = platform.system()
    home_dir = Path.home()
    brave_subpath = Path('BraveSoftware') / 'Brave-Browser'

    if os_name == 'Darwin':
        search_paths = [home_dir / 'Library' / 'Application Support' / brave_subpath]
    elif os_name == 'Linux':
        search_paths = [home_dir / '.config' / brave_subpath]
    elif os_name == 'Windows':
        # LOCALAPPDATA may be unset; fall back to its conventional location
        local_root = Path(os.environ.get('LOCALAPPDATA', home_dir / 'AppData' / 'Local'))
        search_paths = [local_root / brave_subpath / 'User Data']
    else:
        search_paths = []

    for path in search_paths:
        if not path.exists():
            continue
        if (path / 'Default').exists():
            return path

    return None


# Maps each browser name accepted by `persona create --import=<browser>` to the
# zero-argument function that locates that browser's profile directory.
BROWSER_PROFILE_FINDERS = dict(
    chrome=get_chrome_user_data_dir,
    chromium=get_chrome_user_data_dir,  # the Chrome finder also probes Chromium locations
    firefox=get_firefox_profile_dir,
    brave=get_brave_user_data_dir,
)


# =============================================================================
# Cookie Extraction via CDP
# =============================================================================

def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
    """
    Run the chrome plugin's ``extract_cookies.js`` under Node.js to dump cookies.

    The script receives its inputs through environment variables
    (NODE_MODULES_DIR, CHROME_USER_DATA_DIR, COOKIES_OUTPUT_FILE and
    CHROME_HEADLESS) and is given 60 seconds to finish. Every failure mode is
    reported on stderr instead of being raised.

    Args:
        user_data_dir: Chrome user data directory passed to the script.
        output_file: Path passed to the script as the cookies output file.

    Returns:
        True when the script exits with status 0, False otherwise.
    """
    from archivebox.config.constants import CONSTANTS

    # The script ships with the chrome plugin, two levels up from this module
    script_path = Path(__file__).parent.parent / 'plugins' / 'chrome' / 'extract_cookies.js'
    if not script_path.exists():
        rprint(f'[yellow]Cookie extraction script not found at {script_path}[/yellow]', file=sys.stderr)
        return False

    # Child process inherits our environment plus the script's parameters
    child_env = os.environ.copy()
    child_env.update({
        'NODE_MODULES_DIR': str(CONSTANTS.LIB_DIR / 'npm' / 'node_modules'),
        'CHROME_USER_DATA_DIR': str(user_data_dir),
        'COOKIES_OUTPUT_FILE': str(output_file),
        'CHROME_HEADLESS': 'true',
    })

    try:
        completed = subprocess.run(
            ['node', str(script_path)],
            env=child_env,
            capture_output=True,
            text=True,
            timeout=60,
        )
        if completed.returncode != 0:
            rprint(f'[yellow]Cookie extraction failed: {completed.stderr}[/yellow]', file=sys.stderr)
            return False
        return True
    except subprocess.TimeoutExpired:
        rprint('[yellow]Cookie extraction timed out[/yellow]', file=sys.stderr)
        return False
    except FileNotFoundError:
        # Raised by subprocess when the `node` executable is not on PATH
        rprint('[yellow]Node.js not found. Cannot extract cookies.[/yellow]', file=sys.stderr)
        return False
    except Exception as exc:
        rprint(f'[yellow]Cookie extraction error: {exc}[/yellow]', file=sys.stderr)
        return False


# =============================================================================
# Validation Helpers
# =============================================================================

def validate_persona_name(name: str) -> tuple[bool, str]:
    """
    Check that a persona name is safe to use as a directory name.

    Rules are applied in order and the first one violated determines the
    message: empty/blank, path separators, "..", leading dot, then
    NUL / newline / carriage-return characters.

    Returns:
        (True, "") when the name is acceptable, otherwise (False, reason).
    """
    if not (name and name.strip()):
        return False, "Persona name cannot be empty"

    # (violated?, reason) pairs, listed in the order they must be reported
    rejections = (
        (
            any(separator in name for separator in ('/', '\\')),
            "Persona name cannot contain path separators (/ or \\)",
        ),
        (
            '..' in name,
            "Persona name cannot contain parent directory references (..)",
        ),
        (
            name.startswith('.'),
            "Persona name cannot start with a dot (.)",
        ),
        (
            any(char in name for char in ('\x00', '\n', '\r')),
            "Persona name contains invalid characters",
        ),
    )
    for violated, reason in rejections:
        if violated:
            return False, reason

    return True, ""


def ensure_path_within_personas_dir(persona_path: Path) -> bool:
    """
    Verify that a persona path is strictly inside PERSONAS_DIR.

    This is a safety check to prevent path traversal attacks where
    a malicious persona name could cause operations on paths outside
    the expected PERSONAS_DIR. PERSONAS_DIR itself is also rejected: a
    persona whose path resolves to the root would otherwise pass the check,
    and deleting it would remove every persona's data.

    Args:
        persona_path: Path to check; symlinks and ".." segments are resolved
            before comparison.

    Returns:
        True if the path is a descendant of PERSONAS_DIR, False otherwise
        (including when either path cannot be resolved).
    """
    from archivebox.config.constants import CONSTANTS

    try:
        # Resolve both paths to absolute paths
        personas_dir = CONSTANTS.PERSONAS_DIR.resolve()
        resolved_path = Path(persona_path).resolve()

        # is_relative_to() is also True for the directory itself, so the
        # root has to be excluded explicitly.
        if resolved_path == personas_dir:
            return False

        return resolved_path.is_relative_to(personas_dir)
    except (ValueError, RuntimeError, OSError):
        # Unresolvable paths (symlink loops, permission errors, ...) are unsafe
        return False


# =============================================================================
# CREATE
# =============================================================================

def _copy_browser_profile(source_dir: Path, target_dir: Path) -> None:
    """Copy source_dir to target_dir, replacing target_dir only after the copy succeeded."""
    # Stage next to the target so the final rename stays on one filesystem
    staging_dir = target_dir.with_name(target_dir.name + '.importing')

    # Clear leftovers from a previously interrupted import
    if staging_dir.exists():
        shutil.rmtree(staging_dir)

    try:
        # We copy the entire user data dir, not just Default profile
        shutil.copytree(
            source_dir,
            staging_dir,
            symlinks=True,
            ignore=shutil.ignore_patterns(
                'Cache', 'Code Cache', 'GPUCache', 'ShaderCache',
                'Service Worker', 'GCM Store', '*.log', 'Crashpad',
                'BrowserMetrics', 'BrowserMetrics-spare.pma',
                'SingletonLock', 'SingletonSocket', 'SingletonCookie',
            ),
        )
    except Exception:
        # Leave the existing target untouched and don't litter a partial copy
        shutil.rmtree(staging_dir, ignore_errors=True)
        raise

    # Only now drop the previous profile and move the fresh copy into place
    if target_dir.exists():
        shutil.rmtree(target_dir)
    staging_dir.rename(target_dir)


def create_personas(
    names: Iterable[str],
    import_from: Optional[str] = None,
) -> int:
    """
    Create Personas from names.

    If --import is specified, copy the browser profile to the persona directory
    and extract cookies. Names that are blank or fail validation are skipped
    (invalid ones with an error on stderr). When stdout is not a TTY, one
    JSONL record is written per processed persona, including ones that
    already existed.

    Args:
        names: Persona names to create.
        import_from: Optional browser name (a key of BROWSER_PROFILE_FINDERS,
            case-insensitive) whose profile is copied into every persona.

    Exit codes:
        0: Success
        1: Failure (no names, unknown browser, profile not found, or the
           profile copy failed)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.personas.models import Persona

    is_tty = sys.stdout.isatty()
    name_list = list(names) if names else []

    if not name_list:
        rprint('[yellow]No persona names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
        return 1

    # Validate import source if specified
    source_profile_dir = None
    if import_from:
        import_from = import_from.lower()
        if import_from not in BROWSER_PROFILE_FINDERS:
            rprint(f'[red]Unknown browser: {import_from}[/red]', file=sys.stderr)
            rprint(f'[dim]Supported browsers: {", ".join(BROWSER_PROFILE_FINDERS.keys())}[/dim]', file=sys.stderr)
            return 1

        source_profile_dir = BROWSER_PROFILE_FINDERS[import_from]()
        if not source_profile_dir:
            rprint(f'[red]Could not find {import_from} profile directory[/red]', file=sys.stderr)
            return 1

        rprint(f'[dim]Found {import_from} profile: {source_profile_dir}[/dim]', file=sys.stderr)

    created_count = 0
    for name in name_list:
        name = name.strip()
        if not name:
            continue

        # Validate persona name to prevent path traversal
        is_valid, error_msg = validate_persona_name(name)
        if not is_valid:
            rprint(f'[red]Invalid persona name "{name}": {error_msg}[/red]', file=sys.stderr)
            continue

        persona, created = Persona.objects.get_or_create(name=name)

        if created:
            persona.ensure_dirs()
            created_count += 1
            rprint(f'[green]Created persona: {name}[/green]', file=sys.stderr)
        else:
            rprint(f'[dim]Persona already exists: {name}[/dim]', file=sys.stderr)

        # Import browser profile if requested
        # NOTE(review): the profile is copied into the persona's Chrome user
        # data dir whatever the source browser is, including 'firefox' --
        # confirm that is intended.
        if import_from and source_profile_dir:
            persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR)

            rprint(f'[dim]Copying browser profile to {persona_chrome_dir}...[/dim]', file=sys.stderr)

            try:
                # Replaces any existing chrome_user_data, but only once the
                # new copy is complete, so a failed copy cannot lose data.
                _copy_browser_profile(source_profile_dir, persona_chrome_dir)
                rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)

                # Extract cookies via CDP
                cookies_file = Path(persona.path) / 'cookies.txt'
                rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)

                if extract_cookies_via_cdp(persona_chrome_dir, cookies_file):
                    rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
                else:
                    rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
                    rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)

            except Exception as e:
                # Deliberately broad: any failure here aborts the command with
                # a readable message instead of a traceback.
                rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr)
                return 1

        if not is_tty:
            write_record({
                'id': str(persona.id) if hasattr(persona, 'id') else None,
                'name': persona.name,
                'path': str(persona.path),
                'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
                'COOKIES_FILE': persona.COOKIES_FILE,
            })

    rprint(f'[green]Created {created_count} new persona(s)[/green]', file=sys.stderr)
    return 0


# =============================================================================
# LIST
# =============================================================================

def list_personas(
    name: Optional[str] = None,
    name__icontains: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    Print Personas ordered by name, optionally filtered and limited.

    On a TTY each persona is shown as a human-readable line with cookie and
    chrome-profile indicators; otherwise one JSONL record per persona is
    written to stdout. A summary count always goes to stderr.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.personas.models import Persona

    interactive = sys.stdout.isatty()
    present_mark, absent_mark = '[green]✓[/green]', '[dim]✗[/dim]'

    # Apply filters on top of the name-ordered base queryset
    personas = apply_filters(
        Persona.objects.all().order_by('name'),
        {
            'name': name,
            'name__icontains': name__icontains,
        },
        limit=limit,
    )

    listed = 0
    for persona in personas:
        # Cookies are judged by the COOKIES_FILE value being truthy, the
        # chrome profile by its directory existing on disk.
        cookies_mark = present_mark if persona.COOKIES_FILE else absent_mark
        chrome_mark = present_mark if Path(persona.CHROME_USER_DATA_DIR).exists() else absent_mark

        if not interactive:
            write_record({
                'id': str(persona.id) if hasattr(persona, 'id') else None,
                'name': persona.name,
                'path': str(persona.path),
                'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
                'COOKIES_FILE': persona.COOKIES_FILE,
            })
        else:
            rprint(f'[cyan]{persona.name:20}[/cyan] cookies:{cookies_mark} chrome:{chrome_mark} [dim]{persona.path}[/dim]')
        listed += 1

    rprint(f'[dim]Listed {listed} persona(s)[/dim]', file=sys.stderr)
    return 0


# =============================================================================
# UPDATE
# =============================================================================

def update_personas(name: Optional[str] = None) -> int:
    """
    Update Personas from stdin JSONL.

    Reads Persona records from stdin and applies updates.
    Uses PATCH semantics - only specified fields are updated.

    Each record is looked up by ``id`` (preferred) or ``name``. When a new
    name is given, the persona's directory is renamed along with the record.
    A rename is refused, with an error on stderr, if another persona already
    uses the new name or if the target directory already exists: moving onto
    an existing directory would nest the old directory inside it instead of
    renaming it.

    Args:
        name: New name to set on every matched persona, or None to leave
            names unchanged.

    Exit codes:
        0: Success
        1: No input or error
    """
    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.personas.models import Persona

    is_tty = sys.stdout.isatty()

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    updated_count = 0
    for record in records:
        persona_id = record.get('id')
        old_name = record.get('name')

        if not persona_id and not old_name:
            continue

        # Keep the try body to the lookup, the only place DoesNotExist can come from
        try:
            if persona_id:
                persona = Persona.objects.get(id=persona_id)
            else:
                persona = Persona.objects.get(name=old_name)
        except Persona.DoesNotExist:
            rprint(f'[yellow]Persona not found: {persona_id or old_name}[/yellow]', file=sys.stderr)
            continue

        # Apply updates from CLI flags
        if name:
            # Validate new name to prevent path traversal
            is_valid, error_msg = validate_persona_name(name)
            if not is_valid:
                rprint(f'[red]Invalid new persona name "{name}": {error_msg}[/red]', file=sys.stderr)
                continue

            previous_name = persona.name

            # Two personas with one name would share a single directory
            if name != previous_name and Persona.objects.filter(name=name).exclude(pk=persona.pk).exists():
                rprint(
                    f'[red]Cannot rename persona "{previous_name}" to "{name}": '
                    f'another persona already uses that name[/red]',
                    file=sys.stderr,
                )
                continue

            # Rename the persona directory too
            old_path = persona.path
            persona.name = name
            new_path = persona.path

            if old_path.exists() and old_path != new_path:
                # shutil.move() into an existing directory nests the source
                # inside it. samefile() lets a case-only rename through on
                # case-insensitive filesystems, where both paths are one dir.
                if new_path.exists() and not new_path.samefile(old_path):
                    persona.name = previous_name
                    rprint(
                        f'[red]Cannot rename persona "{previous_name}" to "{name}": '
                        f'{new_path} already exists[/red]',
                        file=sys.stderr,
                    )
                    continue
                shutil.move(str(old_path), str(new_path))

            persona.save()

        updated_count += 1

        if not is_tty:
            write_record({
                'id': str(persona.id) if hasattr(persona, 'id') else None,
                'name': persona.name,
                'path': str(persona.path),
            })

    rprint(f'[green]Updated {updated_count} persona(s)[/green]', file=sys.stderr)
    return 0


# =============================================================================
# DELETE
# =============================================================================

def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete the Personas referenced by JSONL records on stdin.

    Records are matched by ``id`` when present, otherwise by ``name``. With
    dry_run the matches are only listed (no --yes needed); a real deletion
    requires yes=True and removes both the persona directory and its
    database row. Personas whose path is not inside PERSONAS_DIR are skipped.

    Exit codes:
        0: Success
        1: No input or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.personas.models import Persona

    stdin_records = list(read_stdin())
    if not stdin_records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    # An id takes precedence: a record's name is only used when it has no id
    wanted_ids = [rec['id'] for rec in stdin_records if rec.get('id')]
    wanted_names = [
        rec['name'] for rec in stdin_records
        if not rec.get('id') and rec.get('name')
    ]

    if not (wanted_ids or wanted_names):
        rprint('[yellow]No valid persona IDs or names in input[/yellow]', file=sys.stderr)
        return 1

    from django.db.models import Q
    selector = Q()
    if wanted_ids:
        selector = selector | Q(id__in=wanted_ids)
    if wanted_names:
        selector = selector | Q(name__in=wanted_names)

    matches = Persona.objects.filter(selector)
    total = matches.count()

    if total == 0:
        rprint('[yellow]No matching personas found[/yellow]', file=sys.stderr)
        return 0

    if dry_run:
        rprint(f'[yellow]Would delete {total} persona(s) (dry run)[/yellow]', file=sys.stderr)
        for persona in matches:
            rprint(f'  {persona.name} ({persona.path})', file=sys.stderr)
        return 0

    if not yes:
        rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
        return 1

    # Remove each persona's directory first, then its database row
    removed = 0
    for persona in matches:
        target_dir = persona.path

        # Never rmtree anything that is not inside PERSONAS_DIR
        if not ensure_path_within_personas_dir(target_dir):
            rprint(f'[red]Security error: persona path "{target_dir}" is outside PERSONAS_DIR. Skipping deletion.[/red]', file=sys.stderr)
            continue

        if target_dir.exists():
            shutil.rmtree(target_dir)
        persona.delete()
        removed += 1

    rprint(f'[green]Deleted {removed} persona(s)[/green]', file=sys.stderr)
    return 0


# =============================================================================
# CLI Commands
# =============================================================================

@click.group()
def main():
    """Manage Persona records (browser profiles)."""
    # Intentionally empty: this only anchors the command group; the actions
    # are registered on it with @main.command.


@main.command('create')
@click.argument('names', nargs=-1)
@click.option(
    '--import',
    'import_from',
    # Built from the finder table so the help can't drift from what is accepted
    help=f'Import profile from browser ({", ".join(BROWSER_PROFILE_FINDERS)})',
)
def create_cmd(names: tuple, import_from: Optional[str]):
    """Create Personas, optionally importing from a browser profile."""
    # The exit status of the command is create_personas()'s return code
    sys.exit(create_personas(names, import_from=import_from))


@main.command('list')
@click.option('--name', help='Filter by exact name')
@click.option('--name__icontains', help='Filter by name contains')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
    """List Personas as JSONL."""
    # Forward the parsed options unchanged and exit with the returned status
    exit_code = list_personas(
        name=name,
        name__icontains=name__icontains,
        limit=limit,
    )
    sys.exit(exit_code)


@main.command('update')
@click.option('--name', '-n', help='Set new name')
def update_cmd(name: Optional[str]):
    """Update Personas from stdin JSONL."""
    # Forward the new name (if any) and exit with the returned status
    exit_code = update_personas(name=name)
    sys.exit(exit_code)


@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Personas from stdin JSONL."""
    # Forward both flags and exit with the returned status
    exit_code = delete_personas(yes=yes, dry_run=dry_run)
    sys.exit(exit_code)


# Invoke the `main` command group when this module is executed directly
# as a script rather than imported.
if __name__ == '__main__':
    main()