mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-02 17:05:38 +10:00
Add persona CLI command with browser cookie import (#1747)
This commit is contained in:
@@ -36,6 +36,7 @@ class ArchiveBoxGroup(click.Group):
|
||||
'binary': 'archivebox.cli.archivebox_binary.main',
|
||||
'process': 'archivebox.cli.archivebox_process.main',
|
||||
'machine': 'archivebox.cli.archivebox_machine.main',
|
||||
'persona': 'archivebox.cli.archivebox_persona.main',
|
||||
}
|
||||
archive_commands = {
|
||||
# High-level commands
|
||||
|
||||
623
archivebox/cli/archivebox_persona.py
Normal file
623
archivebox/cli/archivebox_persona.py
Normal file
@@ -0,0 +1,623 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox persona <action> [args...] [--filters]
|
||||
|
||||
Manage Persona records (browser profiles for archiving).
|
||||
|
||||
Actions:
|
||||
create - Create Personas
|
||||
list - List Personas as JSONL (with optional filters)
|
||||
update - Update Personas from stdin JSONL
|
||||
delete - Delete Personas from stdin JSONL
|
||||
|
||||
Examples:
|
||||
# Create a new persona
|
||||
archivebox persona create work
|
||||
archivebox persona create --import=chrome personal
|
||||
|
||||
# List all personas
|
||||
archivebox persona list
|
||||
|
||||
# Delete a persona
|
||||
archivebox persona list --name=old | archivebox persona delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox persona'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import platform
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional, Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Browser Profile Locations
|
||||
# =============================================================================
|
||||
|
||||
def get_chrome_user_data_dir() -> Optional[Path]:
    """Locate the default Chrome/Chromium user data directory, or None.

    Probes the conventional per-platform locations and returns the first
    directory that both exists and contains a 'Default' profile subdir.
    """
    home = Path.home()
    system = platform.system()

    if system == 'Darwin':  # macOS
        search_dirs = [
            home / 'Library' / 'Application Support' / 'Google' / 'Chrome',
            home / 'Library' / 'Application Support' / 'Chromium',
        ]
    elif system == 'Linux':
        search_dirs = [
            home / '.config' / 'google-chrome',
            home / '.config' / 'chromium',
            home / '.config' / 'chrome',
            home / 'snap' / 'chromium' / 'common' / 'chromium',
        ]
    elif system == 'Windows':
        local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
        search_dirs = [
            local_app_data / 'Google' / 'Chrome' / 'User Data',
            local_app_data / 'Chromium' / 'User Data',
        ]
    else:
        search_dirs = []

    # A usable user data dir must exist and contain a 'Default' profile.
    return next(
        (d for d in search_dirs if d.exists() and (d / 'Default').exists()),
        None,
    )
|
||||
|
||||
|
||||
def get_firefox_profile_dir() -> Optional[Path]:
    """Locate the default Firefox profile directory, or None if not found.

    Finds the platform's Firefox profiles directory, then prefers a profile
    whose name contains 'default' (e.g. xxxx.default-release), falling back
    to the first profile subdirectory present.
    """
    home = Path.home()
    system = platform.system()

    if system == 'Darwin':
        profiles_dir = home / 'Library' / 'Application Support' / 'Firefox' / 'Profiles'
    elif system == 'Linux':
        profiles_dir = home / '.mozilla' / 'firefox'
    elif system == 'Windows':
        app_data = Path(os.environ.get('APPDATA', home / 'AppData' / 'Roaming'))
        profiles_dir = app_data / 'Mozilla' / 'Firefox' / 'Profiles'
    else:
        return None

    if not profiles_dir.exists():
        return None

    subdirs = [entry for entry in profiles_dir.iterdir() if entry.is_dir()]

    # Prefer the default profile (usually named *.default or *.default-release).
    for subdir in subdirs:
        if 'default' in subdir.name.lower():
            return subdir

    # No default-looking profile: fall back to the first one found, if any.
    return subdirs[0] if subdirs else None
|
||||
|
||||
|
||||
def get_brave_user_data_dir() -> Optional[Path]:
    """Locate the default Brave user data directory, or None.

    Same probing strategy as Chrome: the directory must exist and contain
    a 'Default' profile subdirectory to be considered usable.
    """
    home = Path.home()
    system = platform.system()

    if system == 'Darwin':
        search_dirs = [
            home / 'Library' / 'Application Support' / 'BraveSoftware' / 'Brave-Browser',
        ]
    elif system == 'Linux':
        search_dirs = [
            home / '.config' / 'BraveSoftware' / 'Brave-Browser',
        ]
    elif system == 'Windows':
        local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
        search_dirs = [
            local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'User Data',
        ]
    else:
        search_dirs = []

    return next(
        (d for d in search_dirs if d.exists() and (d / 'Default').exists()),
        None,
    )
|
||||
|
||||
|
||||
# Maps the --import browser name (lowercased by create_personas) to the
# function that locates that browser's profile/user-data directory.
# Each finder returns a Path or None if the browser profile was not found.
BROWSER_PROFILE_FINDERS = {
    'chrome': get_chrome_user_data_dir,
    'chromium': get_chrome_user_data_dir,  # Chromium shares Chrome's lookup paths
    'firefox': get_firefox_profile_dir,
    'brave': get_brave_user_data_dir,
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Cookie Extraction via CDP
|
||||
# =============================================================================
|
||||
|
||||
def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path, timeout: int = 60) -> bool:
    """
    Launch Chrome with the given user data dir and extract cookies via CDP.

    Runs the bundled plugins/chrome/extract_cookies.js helper under Node.js,
    passing all configuration through environment variables.

    Args:
        user_data_dir: Chrome user data directory to read cookies from.
        output_file: Destination path for the Netscape-format cookies.txt.
        timeout: Maximum seconds to wait for the Node helper (default: 60).

    Returns True if successful, False otherwise.
    """
    from archivebox.config.constants import CONSTANTS

    # Find the cookie extraction script (lives alongside the chrome plugin)
    chrome_plugin_dir = Path(__file__).parent.parent / 'plugins' / 'chrome'
    extract_script = chrome_plugin_dir / 'extract_cookies.js'

    if not extract_script.exists():
        rprint(f'[yellow]Cookie extraction script not found at {extract_script}[/yellow]', file=sys.stderr)
        return False

    # Node modules dir so the helper can resolve its dependencies (e.g. 'ws')
    node_modules_dir = CONSTANTS.LIB_DIR / 'npm' / 'node_modules'

    # Pass configuration to the helper via environment variables
    env = os.environ.copy()
    env['NODE_MODULES_DIR'] = str(node_modules_dir)
    env['CHROME_USER_DATA_DIR'] = str(user_data_dir)
    env['COOKIES_OUTPUT_FILE'] = str(output_file)
    env['CHROME_HEADLESS'] = 'true'

    try:
        result = subprocess.run(
            ['node', str(extract_script)],
            env=env,
            capture_output=True,
            text=True,
            timeout=timeout,  # previously hard-coded to 60s; now configurable
        )

        if result.returncode == 0:
            return True
        else:
            rprint(f'[yellow]Cookie extraction failed: {result.stderr}[/yellow]', file=sys.stderr)
            return False

    except subprocess.TimeoutExpired:
        rprint('[yellow]Cookie extraction timed out[/yellow]', file=sys.stderr)
        return False
    except FileNotFoundError:
        # 'node' binary not on PATH
        rprint('[yellow]Node.js not found. Cannot extract cookies.[/yellow]', file=sys.stderr)
        return False
    except Exception as e:
        # Best-effort: any other failure is reported but never raised to the caller
        rprint(f'[yellow]Cookie extraction error: {e}[/yellow]', file=sys.stderr)
        return False
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Validation Helpers
|
||||
# =============================================================================
|
||||
|
||||
def validate_persona_name(name: str) -> tuple[bool, str]:
    """
    Validate persona name to prevent path traversal attacks.

    The name is later used as a directory name under PERSONAS_DIR, so it
    must not be empty, contain path separators, parent references, control
    characters, or start with a dot.

    Returns:
        (is_valid, error_message): tuple indicating if name is valid
    """
    if not (name and name.strip()):
        return False, "Persona name cannot be empty"

    # Ordered guard checks: (failure condition, error message)
    checks = (
        ('/' in name or '\\' in name,
         "Persona name cannot contain path separators (/ or \\)"),
        ('..' in name,
         "Persona name cannot contain parent directory references (..)"),
        (name.startswith('.'),
         "Persona name cannot start with a dot (.)"),
        (any(ch in name for ch in ('\x00', '\n', '\r')),
         "Persona name contains invalid characters"),
    )
    for failed, message in checks:
        if failed:
            return False, message

    return True, ""
|
||||
|
||||
|
||||
def ensure_path_within_personas_dir(persona_path: Path) -> bool:
    """
    Verify that a persona path is within PERSONAS_DIR.

    Safety check against path traversal: a malicious persona name must not
    cause operations (e.g. deletion) on paths outside PERSONAS_DIR.

    Returns:
        True if path is safe, False otherwise
    """
    from archivebox.config.constants import CONSTANTS

    try:
        # Resolve symlinks/.. on both sides, then test containment.
        return persona_path.resolve().is_relative_to(CONSTANTS.PERSONAS_DIR.resolve())
    except (ValueError, RuntimeError):
        # resolve() can raise RuntimeError on symlink loops; treat as unsafe.
        return False
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
def create_personas(
    names: Iterable[str],
    import_from: Optional[str] = None,
) -> int:
    """
    Create Personas from names.

    If --import is specified, copy the browser profile to the persona directory
    and extract cookies.

    Args:
        names: Persona names to create (blank entries are skipped).
        import_from: Optional browser key ('chrome', 'chromium', 'firefox',
            'brave') whose profile should be copied into each new persona.

    Exit codes:
        0: Success
        1: Failure
    """
    # NOTE: the original also imported archivebox.config.constants.CONSTANTS
    # here, but it was never used in this function — removed.
    from archivebox.misc.jsonl import write_record
    from archivebox.personas.models import Persona

    # When stdout is piped, emit JSONL records for downstream commands.
    is_tty = sys.stdout.isatty()
    name_list = list(names) if names else []

    if not name_list:
        rprint('[yellow]No persona names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
        return 1

    # Validate import source if specified
    source_profile_dir = None
    if import_from:
        import_from = import_from.lower()
        if import_from not in BROWSER_PROFILE_FINDERS:
            rprint(f'[red]Unknown browser: {import_from}[/red]', file=sys.stderr)
            rprint(f'[dim]Supported browsers: {", ".join(BROWSER_PROFILE_FINDERS.keys())}[/dim]', file=sys.stderr)
            return 1

        source_profile_dir = BROWSER_PROFILE_FINDERS[import_from]()
        if not source_profile_dir:
            rprint(f'[red]Could not find {import_from} profile directory[/red]', file=sys.stderr)
            return 1

        rprint(f'[dim]Found {import_from} profile: {source_profile_dir}[/dim]', file=sys.stderr)

    created_count = 0
    for name in name_list:
        name = name.strip()
        if not name:
            continue

        # Validate persona name to prevent path traversal
        is_valid, error_msg = validate_persona_name(name)
        if not is_valid:
            rprint(f'[red]Invalid persona name "{name}": {error_msg}[/red]', file=sys.stderr)
            continue

        persona, created = Persona.objects.get_or_create(name=name)

        if created:
            # ensure_dirs() presumably creates the persona's on-disk layout —
            # defined on the Persona model, not visible here.
            persona.ensure_dirs()
            created_count += 1
            rprint(f'[green]Created persona: {name}[/green]', file=sys.stderr)
        else:
            rprint(f'[dim]Persona already exists: {name}[/dim]', file=sys.stderr)

        # Import browser profile if requested (runs even for pre-existing
        # personas, overwriting their chrome_user_data dir)
        if import_from and source_profile_dir:
            persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR)

            rprint(f'[dim]Copying browser profile to {persona_chrome_dir}...[/dim]', file=sys.stderr)

            try:
                # Remove existing chrome_user_data if it exists
                if persona_chrome_dir.exists():
                    shutil.rmtree(persona_chrome_dir)

                # Copy the entire user data dir, not just the Default profile,
                # skipping caches, logs, and lock files that would bloat the
                # copy or prevent Chrome from launching against it.
                shutil.copytree(
                    source_profile_dir,
                    persona_chrome_dir,
                    symlinks=True,
                    ignore=shutil.ignore_patterns(
                        'Cache', 'Code Cache', 'GPUCache', 'ShaderCache',
                        'Service Worker', 'GCM Store', '*.log', 'Crashpad',
                        'BrowserMetrics', 'BrowserMetrics-spare.pma',
                        'SingletonLock', 'SingletonSocket', 'SingletonCookie',
                    ),
                )
                rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)

                # Extract cookies via CDP into the persona's cookies.txt
                cookies_file = Path(persona.path) / 'cookies.txt'
                rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)

                if extract_cookies_via_cdp(persona_chrome_dir, cookies_file):
                    rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
                else:
                    # Cookie extraction is best-effort; the persona is still usable.
                    rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
                    rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)

            except Exception as e:
                rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr)
                return 1

        if not is_tty:
            # Emit the created/existing persona as a JSONL record for piping.
            write_record({
                'id': str(persona.id) if hasattr(persona, 'id') else None,
                'name': persona.name,
                'path': str(persona.path),
                'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
                'COOKIES_FILE': persona.COOKIES_FILE,
            })

    rprint(f'[green]Created {created_count} new persona(s)[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
def list_personas(
    name: Optional[str] = None,
    name__icontains: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    List Personas as JSONL with optional filters.

    On a TTY, prints a human-readable table line per persona; when piped,
    emits one JSONL record per persona instead.

    Args:
        name: Filter by exact name.
        name__icontains: Filter by case-insensitive substring of name.
        limit: Maximum number of results.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.personas.models import Persona

    is_tty = sys.stdout.isatty()

    queryset = Persona.objects.all().order_by('name')

    # Apply filters (apply_filters is expected to skip None values)
    filter_kwargs = {
        'name': name,
        'name__icontains': name__icontains,
    }
    queryset = apply_filters(queryset, filter_kwargs, limit=limit)

    count = 0
    for persona in queryset:
        if is_tty:
            # Compute the display-only status flags here rather than for every
            # row unconditionally: the chrome check does a filesystem stat,
            # which is wasted work in JSONL (piped) mode.
            cookies_status = '[green]✓[/green]' if persona.COOKIES_FILE else '[dim]✗[/dim]'
            chrome_status = '[green]✓[/green]' if Path(persona.CHROME_USER_DATA_DIR).exists() else '[dim]✗[/dim]'
            rprint(f'[cyan]{persona.name:20}[/cyan] cookies:{cookies_status} chrome:{chrome_status} [dim]{persona.path}[/dim]')
        else:
            write_record({
                'id': str(persona.id) if hasattr(persona, 'id') else None,
                'name': persona.name,
                'path': str(persona.path),
                'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
                'COOKIES_FILE': persona.COOKIES_FILE,
            })
        count += 1

    rprint(f'[dim]Listed {count} persona(s)[/dim]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
def update_personas(name: Optional[str] = None) -> int:
    """
    Update Personas from stdin JSONL.

    Reads Persona records from stdin and applies updates.
    Uses PATCH semantics - only specified fields are updated.

    Each stdin record is matched by 'id' (preferred) or 'name'. Currently the
    only supported update is renaming via the --name CLI flag; the rename is
    applied to the database record AND the on-disk persona directory.

    NOTE(review): passing --name with multiple stdin records would try to give
    every matched persona the same new name — presumably intended for a single
    record; confirm whether a uniqueness error should be surfaced here.

    Exit codes:
        0: Success
        1: No input or error
    """
    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.personas.models import Persona

    # When stdout is piped, echo updated records as JSONL for chaining.
    is_tty = sys.stdout.isatty()

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    updated_count = 0
    for record in records:
        persona_id = record.get('id')
        old_name = record.get('name')

        # Skip records with no way to identify the persona
        if not persona_id and not old_name:
            continue

        try:
            # Prefer lookup by id; fall back to name
            if persona_id:
                persona = Persona.objects.get(id=persona_id)
            else:
                persona = Persona.objects.get(name=old_name)

            # Apply updates from CLI flags
            if name:
                # Validate new name to prevent path traversal
                is_valid, error_msg = validate_persona_name(name)
                if not is_valid:
                    rprint(f'[red]Invalid new persona name "{name}": {error_msg}[/red]', file=sys.stderr)
                    continue

                # Rename the persona directory too. persona.path appears to be
                # derived from persona.name (it changes after the assignment
                # below) — capture the old path first, then recompute.
                old_path = persona.path
                persona.name = name
                new_path = persona.path

                # Move the on-disk directory before saving the DB record
                if old_path.exists() and old_path != new_path:
                    shutil.move(str(old_path), str(new_path))

            persona.save()

            updated_count += 1

            if not is_tty:
                write_record({
                    'id': str(persona.id) if hasattr(persona, 'id') else None,
                    'name': persona.name,
                    'path': str(persona.path),
                })

        except Persona.DoesNotExist:
            # Missing personas are reported but don't abort the batch
            rprint(f'[yellow]Persona not found: {persona_id or old_name}[/yellow]', file=sys.stderr)
            continue

    rprint(f'[green]Updated {updated_count} persona(s)[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete Personas from stdin JSONL.

    Requires --yes flag to confirm deletion. Deletes both the database record
    and the persona's on-disk directory (after verifying the directory lives
    inside PERSONAS_DIR).

    Args:
        yes: Confirm deletion; without it nothing is deleted and exit code is 1.
        dry_run: List what would be deleted, then exit 0 without deleting.

    Exit codes:
        0: Success
        1: No input or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.personas.models import Persona

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    # Collect persona IDs or names (id takes precedence within one record)
    persona_ids = []
    persona_names = []
    for r in records:
        if r.get('id'):
            persona_ids.append(r['id'])
        elif r.get('name'):
            persona_names.append(r['name'])

    if not persona_ids and not persona_names:
        rprint('[yellow]No valid persona IDs or names in input[/yellow]', file=sys.stderr)
        return 1

    # Build a single OR query matching either set of identifiers
    from django.db.models import Q
    query = Q()
    if persona_ids:
        query |= Q(id__in=persona_ids)
    if persona_names:
        query |= Q(name__in=persona_names)

    personas = Persona.objects.filter(query)
    count = personas.count()

    if count == 0:
        rprint('[yellow]No matching personas found[/yellow]', file=sys.stderr)
        return 0

    # Dry run: report matches without touching disk or DB
    if dry_run:
        rprint(f'[yellow]Would delete {count} persona(s) (dry run)[/yellow]', file=sys.stderr)
        for persona in personas:
            rprint(f'  {persona.name} ({persona.path})', file=sys.stderr)
        return 0

    # Deletion is destructive, so require explicit confirmation
    if not yes:
        rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
        return 1

    # Delete persona directories and database records
    deleted_count = 0
    for persona in personas:
        persona_path = persona.path

        # Safety check: ensure path is within PERSONAS_DIR before deletion,
        # so a corrupted/malicious name can't cause rmtree outside the sandbox
        if not ensure_path_within_personas_dir(persona_path):
            rprint(f'[red]Security error: persona path "{persona_path}" is outside PERSONAS_DIR. Skipping deletion.[/red]', file=sys.stderr)
            continue

        if persona_path.exists():
            shutil.rmtree(persona_path)
        persona.delete()
        deleted_count += 1

    rprint(f'[green]Deleted {deleted_count} persona(s)[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
@click.group()
def main():
    """Manage Persona records (browser profiles)."""
    # Group entry point: subcommands are registered via @main.command below.
|
||||
|
||||
|
||||
@main.command('create')
@click.argument('names', nargs=-1)
@click.option('--import', 'import_from', help='Import profile from browser (chrome, firefox, brave)')
def create_cmd(names: tuple, import_from: Optional[str]):
    """Create Personas, optionally importing from a browser profile."""
    # Delegate to the implementation and propagate its exit code.
    exit_code = create_personas(names, import_from=import_from)
    sys.exit(exit_code)
|
||||
|
||||
|
||||
@main.command('list')
@click.option('--name', help='Filter by exact name')
@click.option('--name__icontains', help='Filter by name contains')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
    """List Personas as JSONL."""
    # Delegate to the implementation and propagate its exit code.
    exit_code = list_personas(name=name, name__icontains=name__icontains, limit=limit)
    sys.exit(exit_code)
|
||||
|
||||
|
||||
@main.command('update')
@click.option('--name', '-n', help='Set new name')
def update_cmd(name: Optional[str]):
    """Update Personas from stdin JSONL."""
    # Delegate to the implementation and propagate its exit code.
    exit_code = update_personas(name=name)
    sys.exit(exit_code)
|
||||
|
||||
|
||||
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Personas from stdin JSONL."""
    # Delegate to the implementation and propagate its exit code.
    exit_code = delete_personas(yes=yes, dry_run=dry_run)
    sys.exit(exit_code)
|
||||
|
||||
|
||||
# Allow running this module directly as a script (outside the archivebox CLI).
if __name__ == '__main__':
    main()
|
||||
@@ -203,7 +203,7 @@ function waitForDebugPort(port, timeout = 30000) {
|
||||
|
||||
/**
|
||||
* Kill zombie Chrome processes from stale crawls.
|
||||
* Recursively scans DATA_DIR for any */chrome/*.pid files from stale crawls.
|
||||
* Recursively scans DATA_DIR for any chrome/*.pid files from stale crawls.
|
||||
* Does not assume specific directory structure - works with nested paths.
|
||||
* @param {string} [dataDir] - Data directory (defaults to DATA_DIR env or '.')
|
||||
* @returns {number} - Number of zombies killed
|
||||
|
||||
254
archivebox/plugins/chrome/extract_cookies.js
Normal file
254
archivebox/plugins/chrome/extract_cookies.js
Normal file
@@ -0,0 +1,254 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Extract cookies from Chrome via CDP and write to Netscape cookies.txt format.
|
||||
*
|
||||
* This script launches Chrome with a given user data directory, connects via CDP,
|
||||
* extracts all cookies, and writes them to a cookies.txt file in Netscape format.
|
||||
*
|
||||
* Usage:
|
||||
* CHROME_USER_DATA_DIR=/path/to/profile COOKIES_OUTPUT_FILE=/path/to/cookies.txt node extract_cookies.js
|
||||
*
|
||||
* Environment variables:
|
||||
* CHROME_USER_DATA_DIR: Path to Chrome user data directory (required)
|
||||
* COOKIES_OUTPUT_FILE: Path to output cookies.txt file (required)
|
||||
* CHROME_HEADLESS: Run in headless mode (default: true)
|
||||
* NODE_MODULES_DIR: Path to node_modules for module resolution
|
||||
*/
|
||||
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) {
|
||||
module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
}
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const {
|
||||
findChromium,
|
||||
launchChromium,
|
||||
killChrome,
|
||||
getEnv,
|
||||
} = require('./chrome_utils.js');
|
||||
|
||||
/**
|
||||
* Convert a cookie object to Netscape cookies.txt format line.
|
||||
*
|
||||
* Format: domain includeSubdomains path secure expiry name value
|
||||
*
|
||||
* @param {Object} cookie - CDP cookie object
|
||||
* @returns {string} - Netscape format cookie line
|
||||
*/
|
||||
/**
 * Serialize a single CDP cookie object as one Netscape cookies.txt line.
 *
 * Format: domain includeSubdomains path secure expiry name value
 * (tab-separated; expiry is 0 for session cookies)
 *
 * @param {Object} cookie - CDP cookie object
 * @returns {string} - Netscape format cookie line
 */
function cookieToNetscape(cookie) {
    // Domain cookies (not host-only) are written with a leading '.'
    let domain = cookie.domain;
    if (!cookie.hostOnly && !domain.startsWith('.')) {
        domain = `.${domain}`;
    }

    const fields = [
        domain,
        // includeSubdomains is TRUE exactly when the domain starts with '.'
        domain.startsWith('.') ? 'TRUE' : 'FALSE',
        cookie.path || '/',
        cookie.secure ? 'TRUE' : 'FALSE',
        // Expiry in whole seconds since epoch; 0 marks a session cookie
        (cookie.expires && cookie.expires > 0) ? String(Math.floor(cookie.expires)) : '0',
        cookie.name,
        cookie.value,
    ];

    return fields.join('\t');
}
|
||||
|
||||
/**
|
||||
* Write cookies to Netscape cookies.txt format file.
|
||||
*
|
||||
* @param {Array} cookies - Array of CDP cookie objects
|
||||
* @param {string} outputPath - Path to output file
|
||||
*/
|
||||
/**
 * Write cookies to a Netscape cookies.txt format file.
 *
 * Prepends the standard Netscape header comment block, then one line per
 * cookie, ending with a trailing newline.
 *
 * @param {Array} cookies - Array of CDP cookie objects
 * @param {string} outputPath - Path to output file
 */
function writeCookiesFile(cookies, outputPath) {
    const header = [
        '# Netscape HTTP Cookie File',
        '# https://curl.se/docs/http-cookies.html',
        '# This file was generated by ArchiveBox persona cookie extraction',
        '#',
        '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
        '',
    ];

    const body = cookies.map(cookieToNetscape);

    fs.writeFileSync(outputPath, header.concat(body).join('\n') + '\n');
}
|
||||
|
||||
/**
 * Entry point: launch Chrome against CHROME_USER_DATA_DIR, fetch all cookies
 * over the Chrome DevTools Protocol, and write them to COOKIES_OUTPUT_FILE
 * in Netscape cookies.txt format. Always exits the process (0 on success,
 * 1 on any failure).
 */
async function main() {
    // Both env vars are required; getEnv comes from ./chrome_utils.js
    const userDataDir = getEnv('CHROME_USER_DATA_DIR');
    const outputFile = getEnv('COOKIES_OUTPUT_FILE');

    if (!userDataDir) {
        console.error('ERROR: CHROME_USER_DATA_DIR environment variable is required');
        process.exit(1);
    }

    if (!outputFile) {
        console.error('ERROR: COOKIES_OUTPUT_FILE environment variable is required');
        process.exit(1);
    }

    if (!fs.existsSync(userDataDir)) {
        console.error(`ERROR: User data directory does not exist: ${userDataDir}`);
        process.exit(1);
    }

    const binary = findChromium();
    if (!binary) {
        console.error('ERROR: Chromium binary not found');
        process.exit(1);
    }

    // All progress/log output goes to stderr so stdout stays clean
    console.error(`[*] Extracting cookies from: ${userDataDir}`);
    console.error(`[*] Output file: ${outputFile}`);
    console.error(`[*] Using browser: ${binary}`);

    // Create a temporary output directory for Chrome files
    const outputDir = fs.mkdtempSync(path.join(require('os').tmpdir(), 'chrome-cookies-'));

    // Track the launched Chrome PID so the catch block can clean it up
    let chromePid = null;

    try {
        // Launch Chrome with the user data directory
        const result = await launchChromium({
            binary,
            outputDir,
            userDataDir,
            headless: true,
            killZombies: false, // Don't kill other Chrome instances
        });

        if (!result.success) {
            console.error(`ERROR: Failed to launch Chrome: ${result.error}`);
            process.exit(1);
        }

        chromePid = result.pid;
        const cdpUrl = result.cdpUrl;
        const port = result.port;

        console.error(`[*] Chrome launched (PID: ${chromePid})`);
        console.error(`[*] CDP URL: ${cdpUrl}`);

        // Connect to CDP and get cookies
        const http = require('http');

        // Use CDP directly via HTTP to get all cookies:
        // 1) GET /json/list to discover a debuggable target,
        // 2) open its WebSocket and send Network.getAllCookies (id=1).
        const getCookies = () => {
            return new Promise((resolve, reject) => {
                const req = http.request(
                    {
                        hostname: '127.0.0.1',
                        port: port,
                        path: '/json/list',
                        method: 'GET',
                    },
                    (res) => {
                        let data = '';
                        res.on('data', (chunk) => (data += chunk));
                        res.on('end', () => {
                            try {
                                const targets = JSON.parse(data);
                                // Find a page target (fall back to any target)
                                const pageTarget = targets.find(t => t.type === 'page') || targets[0];
                                if (!pageTarget) {
                                    reject(new Error('No page target found'));
                                    return;
                                }

                                // Connect via WebSocket and send CDP command
                                const WebSocket = require('ws');
                                const ws = new WebSocket(pageTarget.webSocketDebuggerUrl);

                                ws.on('open', () => {
                                    ws.send(JSON.stringify({
                                        id: 1,
                                        method: 'Network.getAllCookies',
                                    }));
                                });

                                ws.on('message', (message) => {
                                    const response = JSON.parse(message);
                                    // Only handle the reply to our request (id=1);
                                    // other CDP events are ignored
                                    if (response.id === 1) {
                                        ws.close();
                                        if (response.result && response.result.cookies) {
                                            resolve(response.result.cookies);
                                        } else {
                                            reject(new Error('Failed to get cookies: ' + JSON.stringify(response)));
                                        }
                                    }
                                });

                                ws.on('error', (err) => {
                                    reject(err);
                                });
                            } catch (e) {
                                reject(e);
                            }
                        });
                    }
                );

                req.on('error', reject);
                req.end();
            });
        };

        // Wait a moment for the browser to fully initialize
        // NOTE(review): fixed 2s sleep — presumably enough on most machines,
        // but a slow host could still race; confirm against launchChromium's
        // own readiness guarantees.
        await new Promise(r => setTimeout(r, 2000));

        console.error('[*] Fetching cookies via CDP...');
        const cookies = await getCookies();

        console.error(`[+] Retrieved ${cookies.length} cookies`);

        // Write cookies to file
        writeCookiesFile(cookies, outputFile);
        console.error(`[+] Wrote cookies to: ${outputFile}`);

        // Clean up: kill the Chrome we launched, then forget its PID so the
        // catch block doesn't try to kill it again
        await killChrome(chromePid, outputDir);
        chromePid = null;

        // Remove temp directory
        fs.rmSync(outputDir, { recursive: true, force: true });

        console.error('[+] Cookie extraction complete');
        process.exit(0);

    } catch (error) {
        console.error(`ERROR: ${error.message}`);

        // Clean up on error (only if Chrome was actually launched)
        if (chromePid) {
            await killChrome(chromePid, outputDir);
        }

        // Best-effort temp dir removal; ignore secondary failures
        try {
            fs.rmSync(outputDir, { recursive: true, force: true });
        } catch (e) {}

        process.exit(1);
    }
}
|
||||
|
||||
// Top-level invocation: surface any unhandled rejection from main() as a
// fatal error with a non-zero exit code.
main().catch((e) => {
    console.error(`Fatal error: ${e.message}`);
    process.exit(1);
});
|
||||
Reference in New Issue
Block a user