This commit is contained in:
Nick Sweeting
2026-01-21 03:19:56 -08:00
parent f3f55d3395
commit ec4b27056e
113 changed files with 6929 additions and 2396 deletions

View File

@@ -15,6 +15,7 @@ Examples:
# Create a new persona
archivebox persona create work
archivebox persona create --import=chrome personal
archivebox persona create --import=edge work
# List all personas
archivebox persona list
@@ -34,6 +35,7 @@ import subprocess
import tempfile
from pathlib import Path
from typing import Optional, Iterable
from collections import OrderedDict
import rich_click as click
from rich import print as rprint
@@ -78,34 +80,6 @@ def get_chrome_user_data_dir() -> Optional[Path]:
return None
def get_firefox_profile_dir() -> Optional[Path]:
    """Get the default Firefox profile directory for the current platform.

    Looks in the platform's standard Firefox profiles folder (macOS, Linux,
    Windows) and picks one sub-directory, in this order of preference:

    1. a directory whose name ends with ``default-release``,
    2. any directory whose name contains ``default``,
    3. the first directory by name.

    Returns:
        The chosen profile directory, or None when the platform is not
        supported, the profiles folder is missing, or it has no directories.
    """
    system = platform.system()
    home = Path.home()

    if system == 'Darwin':
        profiles_dir = home / 'Library' / 'Application Support' / 'Firefox' / 'Profiles'
    elif system == 'Linux':
        profiles_dir = home / '.mozilla' / 'firefox'
    elif system == 'Windows':
        app_data = Path(os.environ.get('APPDATA', home / 'AppData' / 'Roaming'))
        profiles_dir = app_data / 'Mozilla' / 'Firefox' / 'Profiles'
    else:
        return None

    # is_dir() rather than exists(): a stray file at this path must not reach iterdir().
    if not profiles_dir.is_dir():
        return None

    # Scan once and sort by name: iterdir() yields entries in arbitrary order,
    # which would otherwise make the chosen profile differ between runs.
    profiles = sorted(p for p in profiles_dir.iterdir() if p.is_dir())

    # Prefer "*.default-release" over a plain "*.default" match so the result
    # does not depend on which of the two happens to be listed first.
    for profile in profiles:
        if profile.name.lower().endswith('default-release'):
            return profile

    # Otherwise accept any profile that looks like a default one.
    for profile in profiles:
        if 'default' in profile.name.lower():
            return profile

    # If no default found, return the first profile (by name).
    return profiles[0] if profiles else None
def get_brave_user_data_dir() -> Optional[Path]:
"""Get the default Brave user data directory for the current platform."""
system = platform.system()
@@ -134,25 +108,99 @@ def get_brave_user_data_dir() -> Optional[Path]:
return None
def get_edge_user_data_dir() -> Optional[Path]:
    """Get the default Edge user data directory for the current platform.

    Builds the list of well-known Microsoft Edge locations for the running
    OS and returns the first one that exists and contains a ``Default``
    entry. Returns None when no location qualifies or the OS is not one of
    Darwin, Linux or Windows.
    """
    os_name = platform.system()
    home_dir = Path.home()

    if os_name == 'Windows':
        local_root = Path(os.environ.get('LOCALAPPDATA', home_dir / 'AppData' / 'Local'))
        search_dirs = [local_root / 'Microsoft' / 'Edge' / 'User Data']
    elif os_name == 'Linux':
        # Stable channel first, then beta, then dev.
        config_root = home_dir / '.config'
        search_dirs = [
            config_root / 'microsoft-edge',
            config_root / 'microsoft-edge-beta',
            config_root / 'microsoft-edge-dev',
        ]
    elif os_name == 'Darwin':
        search_dirs = [home_dir / 'Library' / 'Application Support' / 'Microsoft Edge']
    else:
        search_dirs = []

    # First location that exists and holds a 'Default' entry wins.
    return next(
        (
            data_dir
            for data_dir in search_dirs
            if data_dir.exists() and (data_dir / 'Default').exists()
        ),
        None,
    )
# Maps a browser name to the zero-argument function that locates that
# browser's profile / user data directory (each returns Optional[Path]).
# Presumably looked up with the value passed to `--import` -- verify against
# create_personas.
BROWSER_PROFILE_FINDERS = {
'chrome': get_chrome_user_data_dir,
'chromium': get_chrome_user_data_dir, # Same locations
'firefox': get_firefox_profile_dir,
'brave': get_brave_user_data_dir,
'edge': get_edge_user_data_dir,
}
# Browser names that create_personas tests `import_from` against before using
# the persona's CHROME_USER_DATA_DIR. Note 'firefox' has a finder above but is
# not a member of this set.
CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'}
# =============================================================================
# Cookie Extraction via CDP
# =============================================================================
# Header lines that _write_netscape_cookies puts at the top of every cookie
# file it writes. The lines are joined with '\n', so the trailing empty string
# produces one blank line between the header and the first cookie row.
# The '\\t' sequences in the Format line are escaped backslashes: the file
# contains the two characters backslash + 't' there, not real tab characters.
NETSCAPE_COOKIE_HEADER = [
'# Netscape HTTP Cookie File',
'# https://curl.se/docs/http-cookies.html',
'# This file was generated by ArchiveBox persona cookie extraction',
'#',
'# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
'',
]
def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]":
cookies = OrderedDict()
if not path.exists():
return cookies
for line in path.read_text().splitlines():
if not line or line.startswith('#'):
continue
parts = line.split('\t')
if len(parts) < 7:
continue
domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7]
key = (domain, cookie_path, name)
cookies[key] = (domain, include_subdomains, cookie_path, secure, expiry, name, value)
return cookies
def _write_netscape_cookies(path: Path, cookies: "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]") -> None:
    """Write cookies to *path* in Netscape cookie-file format.

    The file starts with NETSCAPE_COOKIE_HEADER, followed by one line per
    cookie (its fields joined with tabs, in mapping order), and ends with a
    newline. Any existing file at *path* is overwritten.
    """
    cookie_rows = ['\t'.join(fields) for fields in cookies.values()]
    content = '\n'.join([*NETSCAPE_COOKIE_HEADER, *cookie_rows]) + '\n'
    path.write_text(content)
def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
    """Merge the cookies from *new_file* into *existing_file* in place.

    Both files are parsed; cookies from *new_file* replace entries in
    *existing_file* that share the same key and are appended otherwise. The
    merged result is written back to *existing_file*.
    """
    merged = _parse_netscape_cookies(existing_file)
    # update() overwrites matching keys in place and appends unseen ones.
    merged.update(_parse_netscape_cookies(new_file))
    _write_netscape_cookies(existing_file, merged)
def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
"""
Launch Chrome with the given user data dir and extract cookies via CDP.
Returns True if successful, False otherwise.
"""
from archivebox.config.constants import CONSTANTS
from archivebox.config.common import STORAGE_CONFIG
# Find the cookie extraction script
chrome_plugin_dir = Path(__file__).parent.parent / 'plugins' / 'chrome'
@@ -163,14 +211,21 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
return False
# Get node modules dir
node_modules_dir = CONSTANTS.LIB_DIR / 'npm' / 'node_modules'
node_modules_dir = STORAGE_CONFIG.LIB_DIR / 'npm' / 'node_modules'
# Set up environment
env = os.environ.copy()
env['NODE_MODULES_DIR'] = str(node_modules_dir)
env['CHROME_USER_DATA_DIR'] = str(user_data_dir)
env['COOKIES_OUTPUT_FILE'] = str(output_file)
env['CHROME_HEADLESS'] = 'true'
output_path = output_file
temp_output = None
temp_dir = None
if output_file.exists():
temp_dir = Path(tempfile.mkdtemp(prefix='ab_cookies_'))
temp_output = temp_dir / 'cookies.txt'
output_path = temp_output
env['COOKIES_OUTPUT_FILE'] = str(output_path)
try:
result = subprocess.run(
@@ -182,6 +237,8 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
)
if result.returncode == 0:
if temp_output and temp_output.exists():
_merge_netscape_cookies(output_file, temp_output)
return True
else:
rprint(f'[yellow]Cookie extraction failed: {result.stderr}[/yellow]', file=sys.stderr)
@@ -196,6 +253,9 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
except Exception as e:
rprint(f'[yellow]Cookie extraction error: {e}[/yellow]', file=sys.stderr)
return False
finally:
if temp_dir and temp_dir.exists():
shutil.rmtree(temp_dir, ignore_errors=True)
# =============================================================================
@@ -323,6 +383,9 @@ def create_personas(
# Import browser profile if requested
if import_from and source_profile_dir:
cookies_file = Path(persona.path) / 'cookies.txt'
if import_from in CHROMIUM_BROWSERS:
persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR)
# Copy the browser profile
@@ -349,7 +412,6 @@ def create_personas(
rprint(f'[green]Copied browser profile to persona[/green]', file=sys.stderr)
# Extract cookies via CDP
cookies_file = Path(persona.path) / 'cookies.txt'
rprint(f'[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
if extract_cookies_via_cdp(persona_chrome_dir, cookies_file):
@@ -589,7 +651,7 @@ def main():
# `--import` is declared exactly once: a second declaration with the same
# option name and destination ('import_from') would register two competing
# parameters for one flag and show a stale browser list in `--help`.
@main.command('create')
@click.argument('names', nargs=-1)
@click.option('--import', 'import_from', help='Import profile from browser (chrome, chromium, brave, edge)')
def create_cmd(names: tuple, import_from: Optional[str]):
    """Create Personas, optionally importing from a browser profile."""
    # Exit the process with the status code returned by create_personas.
    sys.exit(create_personas(names, import_from=import_from))

View File

@@ -3,6 +3,9 @@
__package__ = 'archivebox.cli'
from typing import Iterable
import os
import sys
import subprocess
import rich_click as click
from rich import print
@@ -60,6 +63,26 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
pass
if run_in_debug:
os.environ['ARCHIVEBOX_RUNSERVER'] = '1'
if reload:
os.environ['ARCHIVEBOX_AUTORELOAD'] = '1'
os.environ['ARCHIVEBOX_ORCHESTRATOR_MANAGED_BY_WATCHER'] = '1'
from archivebox.config.common import STORAGE_CONFIG
pidfile = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid')
os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = pidfile
from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
is_reloader_child = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
if not is_reloader_child:
env = os.environ.copy()
env['ARCHIVEBOX_ORCHESTRATOR_WATCHER'] = '1'
subprocess.Popen(
[sys.executable, '-m', 'archivebox', 'manage', 'orchestrator_watch', f'--pidfile={pidfile}'],
env=env,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
from django.core.management import call_command
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
@@ -79,7 +102,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
is_port_in_use,
)
from archivebox.workers.orchestrator import Orchestrator
import sys
# Check if port is already in use
if is_port_in_use(host, int(port)):