mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-02 17:05:38 +10:00
Add persona CLI command with browser cookie import (#1747)
This commit is contained in:
@@ -36,6 +36,7 @@ class ArchiveBoxGroup(click.Group):
|
||||
'binary': 'archivebox.cli.archivebox_binary.main',
|
||||
'process': 'archivebox.cli.archivebox_process.main',
|
||||
'machine': 'archivebox.cli.archivebox_machine.main',
|
||||
'persona': 'archivebox.cli.archivebox_persona.main',
|
||||
}
|
||||
archive_commands = {
|
||||
# High-level commands
|
||||
|
||||
623
archivebox/cli/archivebox_persona.py
Normal file
623
archivebox/cli/archivebox_persona.py
Normal file
@@ -0,0 +1,623 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox persona <action> [args...] [--filters]
|
||||
|
||||
Manage Persona records (browser profiles for archiving).
|
||||
|
||||
Actions:
|
||||
create - Create Personas
|
||||
list - List Personas as JSONL (with optional filters)
|
||||
update - Update Personas from stdin JSONL
|
||||
delete - Delete Personas from stdin JSONL
|
||||
|
||||
Examples:
|
||||
# Create a new persona
|
||||
archivebox persona create work
|
||||
archivebox persona create --import=chrome personal
|
||||
|
||||
# List all personas
|
||||
archivebox persona list
|
||||
|
||||
# Delete a persona
|
||||
archivebox persona list --name=old | archivebox persona delete --yes
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox persona'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import platform
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional, Iterable
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
|
||||
from archivebox.cli.cli_utils import apply_filters
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Browser Profile Locations
|
||||
# =============================================================================
|
||||
|
||||
def get_chrome_user_data_dir() -> Optional[Path]:
    """Locate the default Chrome/Chromium user data directory, or None.

    Probes the conventional per-platform locations and returns the first
    directory that both exists and contains a 'Default' profile subdir.
    """
    home = Path.home()
    system = platform.system()

    if system == 'Darwin':  # macOS
        search_dirs = [
            home / 'Library' / 'Application Support' / 'Google' / 'Chrome',
            home / 'Library' / 'Application Support' / 'Chromium',
        ]
    elif system == 'Linux':
        search_dirs = [
            home / '.config' / 'google-chrome',
            home / '.config' / 'chromium',
            home / '.config' / 'chrome',
            home / 'snap' / 'chromium' / 'common' / 'chromium',
        ]
    elif system == 'Windows':
        local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
        search_dirs = [
            local_app_data / 'Google' / 'Chrome' / 'User Data',
            local_app_data / 'Chromium' / 'User Data',
        ]
    else:
        search_dirs = []

    # A usable user data dir must exist and contain a 'Default' profile.
    return next(
        (d for d in search_dirs if d.exists() and (d / 'Default').exists()),
        None,
    )
|
||||
|
||||
|
||||
def get_firefox_profile_dir() -> Optional[Path]:
    """Locate the default Firefox profile directory, or None if not found.

    Finds the platform's Firefox profiles directory, then prefers a profile
    whose name contains 'default' (e.g. xxxx.default-release), falling back
    to the first profile subdirectory present.
    """
    home = Path.home()
    system = platform.system()

    if system == 'Darwin':
        profiles_dir = home / 'Library' / 'Application Support' / 'Firefox' / 'Profiles'
    elif system == 'Linux':
        profiles_dir = home / '.mozilla' / 'firefox'
    elif system == 'Windows':
        app_data = Path(os.environ.get('APPDATA', home / 'AppData' / 'Roaming'))
        profiles_dir = app_data / 'Mozilla' / 'Firefox' / 'Profiles'
    else:
        return None

    if not profiles_dir.exists():
        return None

    subdirs = [entry for entry in profiles_dir.iterdir() if entry.is_dir()]

    # Prefer the default profile (usually named *.default or *.default-release).
    for subdir in subdirs:
        if 'default' in subdir.name.lower():
            return subdir

    # No default-looking profile: fall back to the first one found, if any.
    return subdirs[0] if subdirs else None
|
||||
|
||||
|
||||
def get_brave_user_data_dir() -> Optional[Path]:
    """Locate the default Brave user data directory, or None.

    Same probing strategy as Chrome: the directory must exist and contain
    a 'Default' profile subdirectory to be considered usable.
    """
    home = Path.home()
    system = platform.system()

    if system == 'Darwin':
        search_dirs = [
            home / 'Library' / 'Application Support' / 'BraveSoftware' / 'Brave-Browser',
        ]
    elif system == 'Linux':
        search_dirs = [
            home / '.config' / 'BraveSoftware' / 'Brave-Browser',
        ]
    elif system == 'Windows':
        local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
        search_dirs = [
            local_app_data / 'BraveSoftware' / 'Brave-Browser' / 'User Data',
        ]
    else:
        search_dirs = []

    return next(
        (d for d in search_dirs if d.exists() and (d / 'Default').exists()),
        None,
    )
|
||||
|
||||
|
||||
# Maps the --import browser name (lowercased by create_personas) to the
# function that locates that browser's profile/user-data directory.
# Each finder returns a Path or None if the browser profile was not found.
BROWSER_PROFILE_FINDERS = {
    'chrome': get_chrome_user_data_dir,
    'chromium': get_chrome_user_data_dir,  # Chromium shares Chrome's lookup paths
    'firefox': get_firefox_profile_dir,
    'brave': get_brave_user_data_dir,
}
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Cookie Extraction via CDP
|
||||
# =============================================================================
|
||||
|
||||
def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path, timeout: int = 60) -> bool:
    """
    Launch Chrome with the given user data dir and extract cookies via CDP.

    Runs the bundled plugins/chrome/extract_cookies.js helper under Node.js,
    passing all configuration through environment variables.

    Args:
        user_data_dir: Chrome user data directory to read cookies from.
        output_file: Destination path for the Netscape-format cookies.txt.
        timeout: Maximum seconds to wait for the Node helper (default: 60).

    Returns True if successful, False otherwise.
    """
    from archivebox.config.constants import CONSTANTS

    # Find the cookie extraction script (lives alongside the chrome plugin)
    chrome_plugin_dir = Path(__file__).parent.parent / 'plugins' / 'chrome'
    extract_script = chrome_plugin_dir / 'extract_cookies.js'

    if not extract_script.exists():
        rprint(f'[yellow]Cookie extraction script not found at {extract_script}[/yellow]', file=sys.stderr)
        return False

    # Node modules dir so the helper can resolve its dependencies (e.g. 'ws')
    node_modules_dir = CONSTANTS.LIB_DIR / 'npm' / 'node_modules'

    # Pass configuration to the helper via environment variables
    env = os.environ.copy()
    env['NODE_MODULES_DIR'] = str(node_modules_dir)
    env['CHROME_USER_DATA_DIR'] = str(user_data_dir)
    env['COOKIES_OUTPUT_FILE'] = str(output_file)
    env['CHROME_HEADLESS'] = 'true'

    try:
        result = subprocess.run(
            ['node', str(extract_script)],
            env=env,
            capture_output=True,
            text=True,
            timeout=timeout,  # previously hard-coded to 60s; now configurable
        )

        if result.returncode == 0:
            return True
        else:
            rprint(f'[yellow]Cookie extraction failed: {result.stderr}[/yellow]', file=sys.stderr)
            return False

    except subprocess.TimeoutExpired:
        rprint('[yellow]Cookie extraction timed out[/yellow]', file=sys.stderr)
        return False
    except FileNotFoundError:
        # 'node' binary not on PATH
        rprint('[yellow]Node.js not found. Cannot extract cookies.[/yellow]', file=sys.stderr)
        return False
    except Exception as e:
        # Best-effort: any other failure is reported but never raised to the caller
        rprint(f'[yellow]Cookie extraction error: {e}[/yellow]', file=sys.stderr)
        return False
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Validation Helpers
|
||||
# =============================================================================
|
||||
|
||||
def validate_persona_name(name: str) -> tuple[bool, str]:
    """
    Validate persona name to prevent path traversal attacks.

    The name is later used as a directory name under PERSONAS_DIR, so it
    must not be empty, contain path separators, parent references, control
    characters, or start with a dot.

    Returns:
        (is_valid, error_message): tuple indicating if name is valid
    """
    if not (name and name.strip()):
        return False, "Persona name cannot be empty"

    # Ordered guard checks: (failure condition, error message)
    checks = (
        ('/' in name or '\\' in name,
         "Persona name cannot contain path separators (/ or \\)"),
        ('..' in name,
         "Persona name cannot contain parent directory references (..)"),
        (name.startswith('.'),
         "Persona name cannot start with a dot (.)"),
        (any(ch in name for ch in ('\x00', '\n', '\r')),
         "Persona name contains invalid characters"),
    )
    for failed, message in checks:
        if failed:
            return False, message

    return True, ""
|
||||
|
||||
|
||||
def ensure_path_within_personas_dir(persona_path: Path) -> bool:
    """
    Verify that a persona path is within PERSONAS_DIR.

    Safety check against path traversal: a malicious persona name must not
    cause operations (e.g. deletion) on paths outside PERSONAS_DIR.

    Returns:
        True if path is safe, False otherwise
    """
    from archivebox.config.constants import CONSTANTS

    try:
        # Resolve symlinks/.. on both sides, then test containment.
        return persona_path.resolve().is_relative_to(CONSTANTS.PERSONAS_DIR.resolve())
    except (ValueError, RuntimeError):
        # resolve() can raise RuntimeError on symlink loops; treat as unsafe.
        return False
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CREATE
|
||||
# =============================================================================
|
||||
|
||||
def create_personas(
    names: Iterable[str],
    import_from: Optional[str] = None,
) -> int:
    """
    Create Personas from names.

    If --import is specified, copy the browser profile to the persona directory
    and extract cookies.

    Args:
        names: Persona names to create (blank entries are skipped).
        import_from: Optional browser key ('chrome', 'chromium', 'firefox',
            'brave') whose profile should be copied into each new persona.

    Exit codes:
        0: Success
        1: Failure
    """
    # NOTE: the original also imported archivebox.config.constants.CONSTANTS
    # here, but it was never used in this function — removed.
    from archivebox.misc.jsonl import write_record
    from archivebox.personas.models import Persona

    # When stdout is piped, emit JSONL records for downstream commands.
    is_tty = sys.stdout.isatty()
    name_list = list(names) if names else []

    if not name_list:
        rprint('[yellow]No persona names provided. Pass names as arguments.[/yellow]', file=sys.stderr)
        return 1

    # Validate import source if specified
    source_profile_dir = None
    if import_from:
        import_from = import_from.lower()
        if import_from not in BROWSER_PROFILE_FINDERS:
            rprint(f'[red]Unknown browser: {import_from}[/red]', file=sys.stderr)
            rprint(f'[dim]Supported browsers: {", ".join(BROWSER_PROFILE_FINDERS.keys())}[/dim]', file=sys.stderr)
            return 1

        source_profile_dir = BROWSER_PROFILE_FINDERS[import_from]()
        if not source_profile_dir:
            rprint(f'[red]Could not find {import_from} profile directory[/red]', file=sys.stderr)
            return 1

        rprint(f'[dim]Found {import_from} profile: {source_profile_dir}[/dim]', file=sys.stderr)

    created_count = 0
    for name in name_list:
        name = name.strip()
        if not name:
            continue

        # Validate persona name to prevent path traversal
        is_valid, error_msg = validate_persona_name(name)
        if not is_valid:
            rprint(f'[red]Invalid persona name "{name}": {error_msg}[/red]', file=sys.stderr)
            continue

        persona, created = Persona.objects.get_or_create(name=name)

        if created:
            # ensure_dirs() presumably creates the persona's on-disk layout —
            # defined on the Persona model, not visible here.
            persona.ensure_dirs()
            created_count += 1
            rprint(f'[green]Created persona: {name}[/green]', file=sys.stderr)
        else:
            rprint(f'[dim]Persona already exists: {name}[/dim]', file=sys.stderr)

        # Import browser profile if requested (runs even for pre-existing
        # personas, overwriting their chrome_user_data dir)
        if import_from and source_profile_dir:
            persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR)

            rprint(f'[dim]Copying browser profile to {persona_chrome_dir}...[/dim]', file=sys.stderr)

            try:
                # Remove existing chrome_user_data if it exists
                if persona_chrome_dir.exists():
                    shutil.rmtree(persona_chrome_dir)

                # Copy the entire user data dir, not just the Default profile,
                # skipping caches, logs, and lock files that would bloat the
                # copy or prevent Chrome from launching against it.
                shutil.copytree(
                    source_profile_dir,
                    persona_chrome_dir,
                    symlinks=True,
                    ignore=shutil.ignore_patterns(
                        'Cache', 'Code Cache', 'GPUCache', 'ShaderCache',
                        'Service Worker', 'GCM Store', '*.log', 'Crashpad',
                        'BrowserMetrics', 'BrowserMetrics-spare.pma',
                        'SingletonLock', 'SingletonSocket', 'SingletonCookie',
                    ),
                )
                rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)

                # Extract cookies via CDP into the persona's cookies.txt
                cookies_file = Path(persona.path) / 'cookies.txt'
                rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)

                if extract_cookies_via_cdp(persona_chrome_dir, cookies_file):
                    rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
                else:
                    # Cookie extraction is best-effort; the persona is still usable.
                    rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
                    rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)

            except Exception as e:
                rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr)
                return 1

        if not is_tty:
            # Emit the created/existing persona as a JSONL record for piping.
            write_record({
                'id': str(persona.id) if hasattr(persona, 'id') else None,
                'name': persona.name,
                'path': str(persona.path),
                'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
                'COOKIES_FILE': persona.COOKIES_FILE,
            })

    rprint(f'[green]Created {created_count} new persona(s)[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# LIST
|
||||
# =============================================================================
|
||||
|
||||
def list_personas(
    name: Optional[str] = None,
    name__icontains: Optional[str] = None,
    limit: Optional[int] = None,
) -> int:
    """
    List Personas as JSONL with optional filters.

    On a TTY, prints a human-readable table line per persona; when piped,
    emits one JSONL record per persona instead.

    Args:
        name: Filter by exact name.
        name__icontains: Filter by case-insensitive substring of name.
        limit: Maximum number of results.

    Exit codes:
        0: Success (even if no results)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.personas.models import Persona

    is_tty = sys.stdout.isatty()

    queryset = Persona.objects.all().order_by('name')

    # Apply filters (apply_filters is expected to skip None values)
    filter_kwargs = {
        'name': name,
        'name__icontains': name__icontains,
    }
    queryset = apply_filters(queryset, filter_kwargs, limit=limit)

    count = 0
    for persona in queryset:
        if is_tty:
            # Compute the display-only status flags here rather than for every
            # row unconditionally: the chrome check does a filesystem stat,
            # which is wasted work in JSONL (piped) mode.
            cookies_status = '[green]✓[/green]' if persona.COOKIES_FILE else '[dim]✗[/dim]'
            chrome_status = '[green]✓[/green]' if Path(persona.CHROME_USER_DATA_DIR).exists() else '[dim]✗[/dim]'
            rprint(f'[cyan]{persona.name:20}[/cyan] cookies:{cookies_status} chrome:{chrome_status} [dim]{persona.path}[/dim]')
        else:
            write_record({
                'id': str(persona.id) if hasattr(persona, 'id') else None,
                'name': persona.name,
                'path': str(persona.path),
                'CHROME_USER_DATA_DIR': persona.CHROME_USER_DATA_DIR,
                'COOKIES_FILE': persona.COOKIES_FILE,
            })
        count += 1

    rprint(f'[dim]Listed {count} persona(s)[/dim]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# UPDATE
|
||||
# =============================================================================
|
||||
|
||||
def update_personas(name: Optional[str] = None) -> int:
    """
    Update Personas from stdin JSONL.

    Reads Persona records from stdin and applies updates.
    Uses PATCH semantics - only specified fields are updated.

    Each stdin record is matched by 'id' (preferred) or 'name'. Currently the
    only supported update is renaming via the --name CLI flag; the rename is
    applied to the database record AND the on-disk persona directory.

    NOTE(review): passing --name with multiple stdin records would try to give
    every matched persona the same new name — presumably intended for a single
    record; confirm whether a uniqueness error should be surfaced here.

    Exit codes:
        0: Success
        1: No input or error
    """
    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.personas.models import Persona

    # When stdout is piped, echo updated records as JSONL for chaining.
    is_tty = sys.stdout.isatty()

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    updated_count = 0
    for record in records:
        persona_id = record.get('id')
        old_name = record.get('name')

        # Skip records with no way to identify the persona
        if not persona_id and not old_name:
            continue

        try:
            # Prefer lookup by id; fall back to name
            if persona_id:
                persona = Persona.objects.get(id=persona_id)
            else:
                persona = Persona.objects.get(name=old_name)

            # Apply updates from CLI flags
            if name:
                # Validate new name to prevent path traversal
                is_valid, error_msg = validate_persona_name(name)
                if not is_valid:
                    rprint(f'[red]Invalid new persona name "{name}": {error_msg}[/red]', file=sys.stderr)
                    continue

                # Rename the persona directory too. persona.path appears to be
                # derived from persona.name (it changes after the assignment
                # below) — capture the old path first, then recompute.
                old_path = persona.path
                persona.name = name
                new_path = persona.path

                # Move the on-disk directory before saving the DB record
                if old_path.exists() and old_path != new_path:
                    shutil.move(str(old_path), str(new_path))

            persona.save()

            updated_count += 1

            if not is_tty:
                write_record({
                    'id': str(persona.id) if hasattr(persona, 'id') else None,
                    'name': persona.name,
                    'path': str(persona.path),
                })

        except Persona.DoesNotExist:
            # Missing personas are reported but don't abort the batch
            rprint(f'[yellow]Persona not found: {persona_id or old_name}[/yellow]', file=sys.stderr)
            continue

    rprint(f'[green]Updated {updated_count} persona(s)[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# DELETE
|
||||
# =============================================================================
|
||||
|
||||
def delete_personas(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete Personas from stdin JSONL.

    Requires --yes flag to confirm deletion. Deletes both the database record
    and the persona's on-disk directory (after verifying the directory lives
    inside PERSONAS_DIR).

    Args:
        yes: Confirm deletion; without it nothing is deleted and exit code is 1.
        dry_run: List what would be deleted, then exit 0 without deleting.

    Exit codes:
        0: Success
        1: No input or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.personas.models import Persona

    records = list(read_stdin())
    if not records:
        rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr)
        return 1

    # Collect persona IDs or names (id takes precedence within one record)
    persona_ids = []
    persona_names = []
    for r in records:
        if r.get('id'):
            persona_ids.append(r['id'])
        elif r.get('name'):
            persona_names.append(r['name'])

    if not persona_ids and not persona_names:
        rprint('[yellow]No valid persona IDs or names in input[/yellow]', file=sys.stderr)
        return 1

    # Build a single OR query matching either set of identifiers
    from django.db.models import Q
    query = Q()
    if persona_ids:
        query |= Q(id__in=persona_ids)
    if persona_names:
        query |= Q(name__in=persona_names)

    personas = Persona.objects.filter(query)
    count = personas.count()

    if count == 0:
        rprint('[yellow]No matching personas found[/yellow]', file=sys.stderr)
        return 0

    # Dry run: report matches without touching disk or DB
    if dry_run:
        rprint(f'[yellow]Would delete {count} persona(s) (dry run)[/yellow]', file=sys.stderr)
        for persona in personas:
            rprint(f'  {persona.name} ({persona.path})', file=sys.stderr)
        return 0

    # Deletion is destructive, so require explicit confirmation
    if not yes:
        rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr)
        return 1

    # Delete persona directories and database records
    deleted_count = 0
    for persona in personas:
        persona_path = persona.path

        # Safety check: ensure path is within PERSONAS_DIR before deletion,
        # so a corrupted/malicious name can't cause rmtree outside the sandbox
        if not ensure_path_within_personas_dir(persona_path):
            rprint(f'[red]Security error: persona path "{persona_path}" is outside PERSONAS_DIR. Skipping deletion.[/red]', file=sys.stderr)
            continue

        if persona_path.exists():
            shutil.rmtree(persona_path)
        persona.delete()
        deleted_count += 1

    rprint(f'[green]Deleted {deleted_count} persona(s)[/green]', file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Commands
|
||||
# =============================================================================
|
||||
|
||||
@click.group()
def main():
    """Manage Persona records (browser profiles)."""
    # Group entry point: subcommands are registered via @main.command below.
|
||||
|
||||
|
||||
@main.command('create')
@click.argument('names', nargs=-1)
@click.option('--import', 'import_from', help='Import profile from browser (chrome, firefox, brave)')
def create_cmd(names: tuple, import_from: Optional[str]):
    """Create Personas, optionally importing from a browser profile."""
    # Delegate to the implementation and propagate its exit code.
    exit_code = create_personas(names, import_from=import_from)
    sys.exit(exit_code)
|
||||
|
||||
|
||||
@main.command('list')
@click.option('--name', help='Filter by exact name')
@click.option('--name__icontains', help='Filter by name contains')
@click.option('--limit', '-n', type=int, help='Limit number of results')
def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]):
    """List Personas as JSONL."""
    # Delegate to the implementation and propagate its exit code.
    exit_code = list_personas(name=name, name__icontains=name__icontains, limit=limit)
    sys.exit(exit_code)
|
||||
|
||||
|
||||
@main.command('update')
@click.option('--name', '-n', help='Set new name')
def update_cmd(name: Optional[str]):
    """Update Personas from stdin JSONL."""
    # Delegate to the implementation and propagate its exit code.
    exit_code = update_personas(name=name)
    sys.exit(exit_code)
|
||||
|
||||
|
||||
@main.command('delete')
@click.option('--yes', '-y', is_flag=True, help='Confirm deletion')
@click.option('--dry-run', is_flag=True, help='Show what would be deleted')
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Personas from stdin JSONL."""
    # Delegate to the implementation and propagate its exit code.
    exit_code = delete_personas(yes=yes, dry_run=dry_run)
    sys.exit(exit_code)
|
||||
|
||||
|
||||
# Allow running this module directly as a script (outside the archivebox CLI).
if __name__ == '__main__':
    main()
|
||||
@@ -203,7 +203,7 @@ function waitForDebugPort(port, timeout = 30000) {
|
||||
|
||||
/**
|
||||
* Kill zombie Chrome processes from stale crawls.
|
||||
* Recursively scans DATA_DIR for any */chrome/*.pid files from stale crawls.
|
||||
* Recursively scans DATA_DIR for any chrome/*.pid files from stale crawls.
|
||||
* Does not assume specific directory structure - works with nested paths.
|
||||
* @param {string} [dataDir] - Data directory (defaults to DATA_DIR env or '.')
|
||||
* @returns {number} - Number of zombies killed
|
||||
|
||||
254
archivebox/plugins/chrome/extract_cookies.js
Normal file
254
archivebox/plugins/chrome/extract_cookies.js
Normal file
@@ -0,0 +1,254 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Extract cookies from Chrome via CDP and write to Netscape cookies.txt format.
|
||||
*
|
||||
* This script launches Chrome with a given user data directory, connects via CDP,
|
||||
* extracts all cookies, and writes them to a cookies.txt file in Netscape format.
|
||||
*
|
||||
* Usage:
|
||||
* CHROME_USER_DATA_DIR=/path/to/profile COOKIES_OUTPUT_FILE=/path/to/cookies.txt node extract_cookies.js
|
||||
*
|
||||
* Environment variables:
|
||||
* CHROME_USER_DATA_DIR: Path to Chrome user data directory (required)
|
||||
* COOKIES_OUTPUT_FILE: Path to output cookies.txt file (required)
|
||||
* CHROME_HEADLESS: Run in headless mode (default: true)
|
||||
* NODE_MODULES_DIR: Path to node_modules for module resolution
|
||||
*/
|
||||
|
||||
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||
if (process.env.NODE_MODULES_DIR) {
|
||||
module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||
}
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const {
|
||||
findChromium,
|
||||
launchChromium,
|
||||
killChrome,
|
||||
getEnv,
|
||||
} = require('./chrome_utils.js');
|
||||
|
||||
/**
|
||||
* Convert a cookie object to Netscape cookies.txt format line.
|
||||
*
|
||||
* Format: domain includeSubdomains path secure expiry name value
|
||||
*
|
||||
* @param {Object} cookie - CDP cookie object
|
||||
* @returns {string} - Netscape format cookie line
|
||||
*/
|
||||
/**
 * Serialize a single CDP cookie object as one Netscape cookies.txt line.
 *
 * Format: domain includeSubdomains path secure expiry name value
 * (tab-separated; expiry is 0 for session cookies)
 *
 * @param {Object} cookie - CDP cookie object
 * @returns {string} - Netscape format cookie line
 */
function cookieToNetscape(cookie) {
    // Domain cookies (not host-only) are written with a leading '.'
    let domain = cookie.domain;
    if (!cookie.hostOnly && !domain.startsWith('.')) {
        domain = `.${domain}`;
    }

    const fields = [
        domain,
        // includeSubdomains is TRUE exactly when the domain starts with '.'
        domain.startsWith('.') ? 'TRUE' : 'FALSE',
        cookie.path || '/',
        cookie.secure ? 'TRUE' : 'FALSE',
        // Expiry in whole seconds since epoch; 0 marks a session cookie
        (cookie.expires && cookie.expires > 0) ? String(Math.floor(cookie.expires)) : '0',
        cookie.name,
        cookie.value,
    ];

    return fields.join('\t');
}
|
||||
|
||||
/**
|
||||
* Write cookies to Netscape cookies.txt format file.
|
||||
*
|
||||
* @param {Array} cookies - Array of CDP cookie objects
|
||||
* @param {string} outputPath - Path to output file
|
||||
*/
|
||||
/**
 * Write cookies to a Netscape cookies.txt format file.
 *
 * Prepends the standard Netscape header comment block, then one line per
 * cookie, ending with a trailing newline.
 *
 * @param {Array} cookies - Array of CDP cookie objects
 * @param {string} outputPath - Path to output file
 */
function writeCookiesFile(cookies, outputPath) {
    const header = [
        '# Netscape HTTP Cookie File',
        '# https://curl.se/docs/http-cookies.html',
        '# This file was generated by ArchiveBox persona cookie extraction',
        '#',
        '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
        '',
    ];

    const body = cookies.map(cookieToNetscape);

    fs.writeFileSync(outputPath, header.concat(body).join('\n') + '\n');
}
|
||||
|
||||
/**
 * Entry point: launch Chrome against CHROME_USER_DATA_DIR, fetch all cookies
 * over the Chrome DevTools Protocol, and write them to COOKIES_OUTPUT_FILE
 * in Netscape cookies.txt format. Always exits the process (0 on success,
 * 1 on any failure).
 */
async function main() {
    // Both env vars are required; getEnv comes from ./chrome_utils.js
    const userDataDir = getEnv('CHROME_USER_DATA_DIR');
    const outputFile = getEnv('COOKIES_OUTPUT_FILE');

    if (!userDataDir) {
        console.error('ERROR: CHROME_USER_DATA_DIR environment variable is required');
        process.exit(1);
    }

    if (!outputFile) {
        console.error('ERROR: COOKIES_OUTPUT_FILE environment variable is required');
        process.exit(1);
    }

    if (!fs.existsSync(userDataDir)) {
        console.error(`ERROR: User data directory does not exist: ${userDataDir}`);
        process.exit(1);
    }

    const binary = findChromium();
    if (!binary) {
        console.error('ERROR: Chromium binary not found');
        process.exit(1);
    }

    // All progress/log output goes to stderr so stdout stays clean
    console.error(`[*] Extracting cookies from: ${userDataDir}`);
    console.error(`[*] Output file: ${outputFile}`);
    console.error(`[*] Using browser: ${binary}`);

    // Create a temporary output directory for Chrome files
    const outputDir = fs.mkdtempSync(path.join(require('os').tmpdir(), 'chrome-cookies-'));

    // Track the launched Chrome PID so the catch block can clean it up
    let chromePid = null;

    try {
        // Launch Chrome with the user data directory
        const result = await launchChromium({
            binary,
            outputDir,
            userDataDir,
            headless: true,
            killZombies: false, // Don't kill other Chrome instances
        });

        if (!result.success) {
            console.error(`ERROR: Failed to launch Chrome: ${result.error}`);
            process.exit(1);
        }

        chromePid = result.pid;
        const cdpUrl = result.cdpUrl;
        const port = result.port;

        console.error(`[*] Chrome launched (PID: ${chromePid})`);
        console.error(`[*] CDP URL: ${cdpUrl}`);

        // Connect to CDP and get cookies
        const http = require('http');

        // Use CDP directly via HTTP to get all cookies:
        // 1) GET /json/list to discover a debuggable target,
        // 2) open its WebSocket and send Network.getAllCookies (id=1).
        const getCookies = () => {
            return new Promise((resolve, reject) => {
                const req = http.request(
                    {
                        hostname: '127.0.0.1',
                        port: port,
                        path: '/json/list',
                        method: 'GET',
                    },
                    (res) => {
                        let data = '';
                        res.on('data', (chunk) => (data += chunk));
                        res.on('end', () => {
                            try {
                                const targets = JSON.parse(data);
                                // Find a page target (fall back to any target)
                                const pageTarget = targets.find(t => t.type === 'page') || targets[0];
                                if (!pageTarget) {
                                    reject(new Error('No page target found'));
                                    return;
                                }

                                // Connect via WebSocket and send CDP command
                                const WebSocket = require('ws');
                                const ws = new WebSocket(pageTarget.webSocketDebuggerUrl);

                                ws.on('open', () => {
                                    ws.send(JSON.stringify({
                                        id: 1,
                                        method: 'Network.getAllCookies',
                                    }));
                                });

                                ws.on('message', (message) => {
                                    const response = JSON.parse(message);
                                    // Only handle the reply to our request (id=1);
                                    // other CDP events are ignored
                                    if (response.id === 1) {
                                        ws.close();
                                        if (response.result && response.result.cookies) {
                                            resolve(response.result.cookies);
                                        } else {
                                            reject(new Error('Failed to get cookies: ' + JSON.stringify(response)));
                                        }
                                    }
                                });

                                ws.on('error', (err) => {
                                    reject(err);
                                });
                            } catch (e) {
                                reject(e);
                            }
                        });
                    }
                );

                req.on('error', reject);
                req.end();
            });
        };

        // Wait a moment for the browser to fully initialize
        // NOTE(review): fixed 2s sleep — presumably enough on most machines,
        // but a slow host could still race; confirm against launchChromium's
        // own readiness guarantees.
        await new Promise(r => setTimeout(r, 2000));

        console.error('[*] Fetching cookies via CDP...');
        const cookies = await getCookies();

        console.error(`[+] Retrieved ${cookies.length} cookies`);

        // Write cookies to file
        writeCookiesFile(cookies, outputFile);
        console.error(`[+] Wrote cookies to: ${outputFile}`);

        // Clean up: kill the Chrome we launched, then forget its PID so the
        // catch block doesn't try to kill it again
        await killChrome(chromePid, outputDir);
        chromePid = null;

        // Remove temp directory
        fs.rmSync(outputDir, { recursive: true, force: true });

        console.error('[+] Cookie extraction complete');
        process.exit(0);

    } catch (error) {
        console.error(`ERROR: ${error.message}`);

        // Clean up on error (only if Chrome was actually launched)
        if (chromePid) {
            await killChrome(chromePid, outputDir);
        }

        // Best-effort temp dir removal; ignore secondary failures
        try {
            fs.rmSync(outputDir, { recursive: true, force: true });
        } catch (e) {}

        process.exit(1);
    }
}
|
||||
|
||||
// Top-level invocation: surface any unhandled rejection from main() as a
// fatal error with a non-zero exit code.
main().catch((e) => {
    console.error(`Fatal error: ${e.message}`);
    process.exit(1);
});
|
||||
Reference in New Issue
Block a user