Mirror of https://github.com/ArchiveBox/ArchiveBox.git, last synced 2026-04-06 07:47:53 +10:00.
wip
This commit is contained in:
@@ -15,6 +15,7 @@ Examples:
|
||||
# Create a new persona
|
||||
archivebox persona create work
|
||||
archivebox persona create --import=chrome personal
|
||||
archivebox persona create --import=edge work
|
||||
|
||||
# List all personas
|
||||
archivebox persona list
|
||||
@@ -34,6 +35,7 @@ import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional, Iterable
|
||||
from collections import OrderedDict
|
||||
|
||||
import rich_click as click
|
||||
from rich import print as rprint
|
||||
@@ -78,34 +80,6 @@ def get_chrome_user_data_dir() -> Optional[Path]:
|
||||
return None
|
||||
|
||||
|
||||
def get_firefox_profile_dir(profiles_dir: Optional[Path] = None) -> Optional[Path]:
    """Locate the default Firefox profile directory for the current platform.

    Args:
        profiles_dir: Optional override for the Firefox profiles root
            directory (useful for testing); when None, the platform-specific
            default location is used.

    Returns:
        Path to the best-matching profile directory, or None when the
        platform is unsupported or no profiles root exists.
    """
    if profiles_dir is None:
        system = platform.system()
        home = Path.home()

        if system == 'Darwin':
            profiles_dir = home / 'Library' / 'Application Support' / 'Firefox' / 'Profiles'
        elif system == 'Linux':
            profiles_dir = home / '.mozilla' / 'firefox'
        elif system == 'Windows':
            app_data = Path(os.environ.get('APPDATA', home / 'AppData' / 'Roaming'))
            profiles_dir = app_data / 'Mozilla' / 'Firefox' / 'Profiles'
        else:
            # Unsupported platform
            return None

    if not profiles_dir.exists():
        return None

    # Sort so the result is deterministic regardless of filesystem order.
    profiles = [p for p in sorted(profiles_dir.iterdir()) if p.is_dir()]

    # Modern Firefox names its default profile '<hash>.default-release';
    # older versions use '<hash>.default'. Prefer the more specific marker
    # first so we don't pick a stale '.default' dir on modern installs.
    for marker in ('default-release', 'default'):
        for profile in profiles:
            if marker in profile.name.lower():
                return profile

    # If no default found, return the first profile
    return profiles[0] if profiles else None
|
||||
|
||||
|
||||
def get_brave_user_data_dir() -> Optional[Path]:
|
||||
"""Get the default Brave user data directory for the current platform."""
|
||||
system = platform.system()
|
||||
@@ -134,25 +108,99 @@ def get_brave_user_data_dir() -> Optional[Path]:
|
||||
return None
|
||||
|
||||
|
||||
def get_edge_user_data_dir() -> Optional[Path]:
    """Get the default Edge user data directory for the current platform."""
    home = Path.home()
    system = platform.system()

    if system == 'Darwin':
        search_paths = [home / 'Library' / 'Application Support' / 'Microsoft Edge']
    elif system == 'Linux':
        config_root = home / '.config'
        search_paths = [
            config_root / 'microsoft-edge',
            config_root / 'microsoft-edge-beta',
            config_root / 'microsoft-edge-dev',
        ]
    elif system == 'Windows':
        local_app_data = Path(os.environ.get('LOCALAPPDATA', home / 'AppData' / 'Local'))
        search_paths = [local_app_data / 'Microsoft' / 'Edge' / 'User Data']
    else:
        # Unknown platform: nothing to search.
        search_paths = []

    # A usable installation has a 'Default' profile inside the data dir;
    # return the first candidate that qualifies, else None.
    return next(
        (p for p in search_paths if p.exists() and (p / 'Default').exists()),
        None,
    )
|
||||
|
||||
|
||||
# Maps a browser name (as accepted by `--import=<name>`) to the function
# that locates that browser's profile / user-data directory on this machine.
BROWSER_PROFILE_FINDERS = {
    'chrome': get_chrome_user_data_dir,
    'chromium': get_chrome_user_data_dir,  # Same locations
    'firefox': get_firefox_profile_dir,
    'brave': get_brave_user_data_dir,
    'edge': get_edge_user_data_dir,
}

# Browsers that share the Chromium user-data-dir layout (profile copying and
# CDP-based cookie extraction apply to these); Firefox is deliberately absent.
CHROMIUM_BROWSERS = {'chrome', 'chromium', 'brave', 'edge'}
|
||||
|
||||
|
||||
# =============================================================================
# Cookie Extraction via CDP
# =============================================================================

# Header lines written at the top of every generated cookies.txt file
# (Netscape/curl cookie-jar format). The trailing '' produces a blank line
# separating the header comments from the cookie entries.
NETSCAPE_COOKIE_HEADER = [
    '# Netscape HTTP Cookie File',
    '# https://curl.se/docs/http-cookies.html',
    '# This file was generated by ArchiveBox persona cookie extraction',
    '#',
    '# Format: domain\\tincludeSubdomains\\tpath\\tsecure\\texpiry\\tname\\tvalue',
    '',
]
|
||||
|
||||
|
||||
def _parse_netscape_cookies(path: Path) -> "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]":
|
||||
cookies = OrderedDict()
|
||||
if not path.exists():
|
||||
return cookies
|
||||
|
||||
for line in path.read_text().splitlines():
|
||||
if not line or line.startswith('#'):
|
||||
continue
|
||||
parts = line.split('\t')
|
||||
if len(parts) < 7:
|
||||
continue
|
||||
domain, include_subdomains, cookie_path, secure, expiry, name, value = parts[:7]
|
||||
key = (domain, cookie_path, name)
|
||||
cookies[key] = (domain, include_subdomains, cookie_path, secure, expiry, name, value)
|
||||
return cookies
|
||||
|
||||
|
||||
def _write_netscape_cookies(path: Path, cookies: "OrderedDict[tuple[str, str, str], tuple[str, str, str, str, str, str, str]]") -> None:
    """Serialize *cookies* to *path* in Netscape cookie-jar format.

    The standard header is written first, followed by one tab-separated
    line per cookie in the mapping's insertion order; the file always ends
    with a trailing newline.
    """
    rows = ['\t'.join(fields) for fields in cookies.values()]
    content = '\n'.join(NETSCAPE_COOKIE_HEADER + rows) + '\n'
    path.write_text(content)
|
||||
|
||||
|
||||
def _merge_netscape_cookies(existing_file: Path, new_file: Path) -> None:
    """Merge cookies from *new_file* into *existing_file* on disk.

    Cookies sharing the same (domain, path, name) identity are replaced by
    the newer value; all other existing cookies are kept. The merged result
    is written back to *existing_file*.
    """
    merged = _parse_netscape_cookies(existing_file)
    merged.update(_parse_netscape_cookies(new_file))
    _write_netscape_cookies(existing_file, merged)
|
||||
|
||||
|
||||
def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
|
||||
"""
|
||||
Launch Chrome with the given user data dir and extract cookies via CDP.
|
||||
|
||||
Returns True if successful, False otherwise.
|
||||
"""
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
# Find the cookie extraction script
|
||||
chrome_plugin_dir = Path(__file__).parent.parent / 'plugins' / 'chrome'
|
||||
@@ -163,14 +211,21 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
|
||||
return False
|
||||
|
||||
# Get node modules dir
|
||||
node_modules_dir = CONSTANTS.LIB_DIR / 'npm' / 'node_modules'
|
||||
node_modules_dir = STORAGE_CONFIG.LIB_DIR / 'npm' / 'node_modules'
|
||||
|
||||
# Set up environment
|
||||
env = os.environ.copy()
|
||||
env['NODE_MODULES_DIR'] = str(node_modules_dir)
|
||||
env['CHROME_USER_DATA_DIR'] = str(user_data_dir)
|
||||
env['COOKIES_OUTPUT_FILE'] = str(output_file)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
output_path = output_file
|
||||
temp_output = None
|
||||
temp_dir = None
|
||||
if output_file.exists():
|
||||
temp_dir = Path(tempfile.mkdtemp(prefix='ab_cookies_'))
|
||||
temp_output = temp_dir / 'cookies.txt'
|
||||
output_path = temp_output
|
||||
env['COOKIES_OUTPUT_FILE'] = str(output_path)
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
@@ -182,6 +237,8 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
if temp_output and temp_output.exists():
|
||||
_merge_netscape_cookies(output_file, temp_output)
|
||||
return True
|
||||
else:
|
||||
rprint(f'[yellow]Cookie extraction failed: {result.stderr}[/yellow]', file=sys.stderr)
|
||||
@@ -196,6 +253,9 @@ def extract_cookies_via_cdp(user_data_dir: Path, output_file: Path) -> bool:
|
||||
except Exception as e:
|
||||
rprint(f'[yellow]Cookie extraction error: {e}[/yellow]', file=sys.stderr)
|
||||
return False
|
||||
finally:
|
||||
if temp_dir and temp_dir.exists():
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -323,6 +383,9 @@ def create_personas(
|
||||
|
||||
# Import browser profile if requested
|
||||
if import_from and source_profile_dir:
|
||||
cookies_file = Path(persona.path) / 'cookies.txt'
|
||||
|
||||
if import_from in CHROMIUM_BROWSERS:
|
||||
persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR)
|
||||
|
||||
# Copy the browser profile
|
||||
@@ -349,7 +412,6 @@ def create_personas(
|
||||
rprint(f'[green]Copied browser profile to persona[/green]', file=sys.stderr)
|
||||
|
||||
# Extract cookies via CDP
|
||||
cookies_file = Path(persona.path) / 'cookies.txt'
|
||||
rprint(f'[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
|
||||
|
||||
if extract_cookies_via_cdp(persona_chrome_dir, cookies_file):
|
||||
@@ -589,7 +651,7 @@ def main():
|
||||
|
||||
@main.command('create')
@click.argument('names', nargs=-1)
@click.option('--import', 'import_from', help='Import profile from browser (chrome, chromium, brave, edge)')
def create_cmd(names: tuple, import_from: Optional[str]):
    """Create Personas, optionally importing from a browser profile."""
    # Fix: the '--import' option was registered twice (a stale decorator with
    # an outdated browser list alongside the updated one); keep only the
    # up-to-date registration so click doesn't expose duplicate options.
    # Exit code is whatever create_personas reports (0 on success).
    sys.exit(create_personas(names, import_from=import_from))
|
||||
|
||||
@@ -3,6 +3,9 @@
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
from typing import Iterable
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
|
||||
import rich_click as click
|
||||
from rich import print
|
||||
@@ -60,6 +63,26 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
pass
|
||||
|
||||
if run_in_debug:
|
||||
os.environ['ARCHIVEBOX_RUNSERVER'] = '1'
|
||||
if reload:
|
||||
os.environ['ARCHIVEBOX_AUTORELOAD'] = '1'
|
||||
os.environ['ARCHIVEBOX_ORCHESTRATOR_MANAGED_BY_WATCHER'] = '1'
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
pidfile = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid')
|
||||
os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = pidfile
|
||||
|
||||
from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
|
||||
is_reloader_child = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
|
||||
if not is_reloader_child:
|
||||
env = os.environ.copy()
|
||||
env['ARCHIVEBOX_ORCHESTRATOR_WATCHER'] = '1'
|
||||
subprocess.Popen(
|
||||
[sys.executable, '-m', 'archivebox', 'manage', 'orchestrator_watch', f'--pidfile={pidfile}'],
|
||||
env=env,
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
)
|
||||
|
||||
from django.core.management import call_command
|
||||
print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
|
||||
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
|
||||
@@ -79,7 +102,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
|
||||
is_port_in_use,
|
||||
)
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
import sys
|
||||
|
||||
# Check if port is already in use
|
||||
if is_port_in_use(host, int(port)):
|
||||
|
||||
Reference in New Issue
Block a user