Files
ArchiveBox/archivebox/tests/conftest.py
Nick Sweeting ec4b27056e wip
2026-01-21 03:19:56 -08:00

408 lines
13 KiB
Python

"""archivebox/tests/conftest.py - Pytest fixtures for CLI tests."""
import os
import shutil
import sys
import subprocess
import textwrap
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import pytest
# =============================================================================
# CLI Helpers (defined before fixtures that use them)
# =============================================================================
def run_archivebox_cmd(
    args: List[str],
    data_dir: Path,
    stdin: Optional[str] = None,
    timeout: int = 60,
    env: Optional[Dict[str, str]] = None,
) -> Tuple[str, str, int]:
    """
    Run an archivebox CLI command in a subprocess.

    Args:
        args: Command arguments (e.g., ['crawl', 'create', 'https://example.com'])
        data_dir: The DATA_DIR to use
        stdin: Optional string to pipe to stdin
        timeout: Command timeout in seconds
        env: Additional environment variables (override the defaults below)

    Returns:
        Tuple of (stdout, stderr, returncode)
    """
    # Extractors disabled by default to keep tests fast.
    slow_extractors = (
        'ARCHIVEDOTORG', 'TITLE', 'FAVICON', 'WGET', 'WARC',
        'PDF', 'SCREENSHOT', 'DOM', 'SINGLEFILE', 'READABILITY',
        'MERCURY', 'GIT', 'YTDLP', 'HEADERS', 'HTMLTOTEXT',
    )
    run_env = os.environ.copy()
    run_env.update({
        'DATA_DIR': str(data_dir),
        'USE_COLOR': 'False',
        'SHOW_PROGRESS': 'False',
    })
    run_env.update({f'SAVE_{name}': 'False' for name in slow_extractors})
    # Caller-supplied env wins over everything above.
    run_env.update(env or {})
    completed = subprocess.run(
        [sys.executable, '-m', 'archivebox', *args],
        input=stdin,
        capture_output=True,
        text=True,
        cwd=data_dir,
        env=run_env,
        timeout=timeout,
    )
    return completed.stdout, completed.stderr, completed.returncode
# =============================================================================
# Fixtures
# =============================================================================
@pytest.fixture
def isolated_data_dir(tmp_path):
    """
    Return a fresh DATA_DIR for a single test.

    Backed by pytest's tmp_path, so every test is fully isolated.
    """
    archive_dir = tmp_path / 'archivebox_data'
    archive_dir.mkdir()
    return archive_dir
@pytest.fixture
def initialized_archive(isolated_data_dir):
    """
    Set up an ArchiveBox archive (database + directories) in an isolated dir.

    Shells out to `archivebox init --quick` and returns the data dir.
    """
    _, stderr, code = run_archivebox_cmd(
        ['init', '--quick'],
        data_dir=isolated_data_dir,
        timeout=60,
    )
    assert code == 0, f"archivebox init failed: {stderr}"
    return isolated_data_dir
# =============================================================================
# CWD-based CLI Helpers (no DATA_DIR env)
# =============================================================================
def run_archivebox_cmd_cwd(
    args: List[str],
    cwd: Path,
    stdin: Optional[str] = None,
    timeout: int = 60,
    env: Optional[Dict[str, str]] = None,
) -> Tuple[str, str, int]:
    """
    Run an archivebox command in a subprocess, using cwd as the implicit
    DATA_DIR (the DATA_DIR env var is explicitly removed).

    Returns:
        Tuple of (stdout, stderr, returncode).
    """
    child_env = os.environ.copy()
    child_env.pop('DATA_DIR', None)  # force archivebox to fall back to cwd
    child_env['USE_COLOR'] = 'False'
    child_env['SHOW_PROGRESS'] = 'False'
    child_env.update(env or {})
    completed = subprocess.run(
        [sys.executable, '-m', 'archivebox', *args],
        input=stdin,
        capture_output=True,
        text=True,
        cwd=cwd,
        env=child_env,
        timeout=timeout,
    )
    return completed.stdout, completed.stderr, completed.returncode
def run_python_cwd(
    script: str,
    cwd: Path,
    timeout: int = 60,
) -> Tuple[str, str, int]:
    """
    Execute a Python script (piped via stdin) in cwd with DATA_DIR removed
    from the environment.

    Returns:
        Tuple of (stdout, stderr, returncode).
    """
    child_env = {k: v for k, v in os.environ.items() if k != 'DATA_DIR'}
    completed = subprocess.run(
        [sys.executable, '-'],
        input=script,
        capture_output=True,
        text=True,
        cwd=cwd,
        env=child_env,
        timeout=timeout,
    )
    return completed.stdout, completed.stderr, completed.returncode
def _get_machine_type() -> str:
import platform
os_name = platform.system().lower()
arch = platform.machine().lower()
in_docker = os.environ.get('IN_DOCKER', '').lower() in ('1', 'true', 'yes')
suffix = '-docker' if in_docker else ''
return f'{arch}-{os_name}{suffix}'
def _find_cached_chromium(lib_dir: Path) -> Optional[Path]:
candidates = [
lib_dir / 'puppeteer',
lib_dir / 'npm' / 'node_modules' / 'puppeteer' / '.local-chromium',
]
for base in candidates:
if not base.exists():
continue
for path in base.rglob('Chromium.app/Contents/MacOS/Chromium'):
return path
for path in base.rglob('chrome-linux/chrome'):
return path
for path in base.rglob('chrome-linux64/chrome'):
return path
return None
def _find_system_browser() -> Optional[Path]:
candidates = [
Path('/Applications/Chromium.app/Contents/MacOS/Chromium'),
Path('/usr/bin/chromium'),
Path('/usr/bin/chromium-browser'),
]
for candidate in candidates:
if candidate.exists():
return candidate
return None
def _ensure_puppeteer(shared_lib: Path) -> None:
npm_prefix = shared_lib / 'npm'
node_modules = npm_prefix / 'node_modules'
puppeteer_dir = node_modules / 'puppeteer'
if puppeteer_dir.exists():
return
npm_prefix.mkdir(parents=True, exist_ok=True)
env = os.environ.copy()
env['PUPPETEER_SKIP_DOWNLOAD'] = '1'
subprocess.run(
['npm', 'install', 'puppeteer'],
cwd=str(npm_prefix),
env=env,
check=True,
capture_output=True,
text=True,
timeout=600,
)
@pytest.fixture(scope="class")
def real_archive_with_example(tmp_path_factory, request):
    """
    Initialize archive and add https://example.com using chrome+responses only.

    Uses cwd for DATA_DIR and symlinks lib dir to a shared cache so the heavy
    npm/puppeteer/chromium dependencies are fetched at most once per machine
    across test runs. Class-scoped: one archive per test class.
    """
    tmp_path = tmp_path_factory.mktemp("archivebox_data")
    # When used from a test class, expose the data dir as a class attribute.
    if getattr(request, "cls", None) is not None:
        request.cls.data_dir = tmp_path
    # Step 1: create the archive (collection dirs + sqlite db) in tmp_path.
    stdout, stderr, returncode = run_archivebox_cmd_cwd(
        ['init', '--quick'],
        cwd=tmp_path,
        timeout=120,
    )
    assert returncode == 0, f"archivebox init failed: {stderr}"
    # Step 2: open up the archive so tests can hit public views without auth.
    stdout, stderr, returncode = run_archivebox_cmd_cwd(
        [
            'config',
            '--set',
            'LISTEN_HOST=archivebox.localhost:8000',
            'PUBLIC_INDEX=True',
            'PUBLIC_SNAPSHOTS=True',
            'PUBLIC_ADD_VIEW=True',
        ],
        cwd=tmp_path,
    )
    assert returncode == 0, f"archivebox config failed: {stderr}"
    # Step 3: replace <data>/lib/<machine> with a symlink into a shared
    # per-machine-type cache under the repo's tmp/ dir.
    machine_type = _get_machine_type()
    shared_root = Path(__file__).resolve().parents[3] / 'tmp' / 'test_lib_cache'
    shared_lib = shared_root / machine_type
    shared_lib.mkdir(parents=True, exist_ok=True)
    lib_target = tmp_path / 'lib' / machine_type
    # If init created a real directory here, remove it so we can symlink.
    if lib_target.exists() and not lib_target.is_symlink():
        shutil.rmtree(lib_target)
    if not lib_target.exists():
        lib_target.parent.mkdir(parents=True, exist_ok=True)
        lib_target.symlink_to(shared_lib, target_is_directory=True)
    _ensure_puppeteer(shared_lib)
    # Step 4: locate a chromium binary -- prefer the shared puppeteer cache,
    # fall back to a system-wide install (exposed via a stable symlink).
    cached_chromium = _find_cached_chromium(shared_lib)
    if cached_chromium:
        browser_binary = cached_chromium
    else:
        browser_binary = _find_system_browser()
        if browser_binary:
            chromium_link = shared_lib / 'chromium-bin'
            # NOTE(review): Path.exists() returns False for a broken symlink,
            # so symlink_to() could raise FileExistsError here -- confirm.
            if not chromium_link.exists():
                chromium_link.symlink_to(browser_binary)
            browser_binary = chromium_link
    if browser_binary:
        # Step 5: point archivebox config at the chosen browser binary.
        stdout, stderr, returncode = run_archivebox_cmd_cwd(
            [f'config', '--set', f'CHROME_BINARY={browser_binary}'],
            cwd=tmp_path,
        )
        assert returncode == 0, f"archivebox config CHROME_BINARY failed: {stderr}"
        # Register the binary directly in the Machine/Binary tables so the
        # dependency check does not try to (re)install chromium during `add`.
        # NOTE(review): the filter().update() and the update_or_create() in
        # this script look redundant with each other -- confirm both needed.
        script = textwrap.dedent(f"""\
            import os
            os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
            import django
            django.setup()
            from django.utils import timezone
            from archivebox.machine.models import Binary, Machine
            machine = Machine.current()
            Binary.objects.filter(machine=machine, name='chromium').update(
                status='installed',
                abspath='{browser_binary}',
                binprovider='env',
                retry_at=timezone.now(),
            )
            Binary.objects.update_or_create(
                machine=machine,
                name='chromium',
                defaults={{
                    'status': 'installed',
                    'abspath': '{browser_binary}',
                    'binprovider': 'env',
                    'retry_at': timezone.now(),
                }},
            )
            print('OK')
        """
        )
        stdout, stderr, returncode = run_python_cwd(script, cwd=tmp_path, timeout=60)
        assert returncode == 0, f"Register chromium binary failed: {stderr}"
    # Step 6: archive a single page with only the chrome+responses plugins.
    add_env = {
        'CHROME_ENABLED': 'True',
        'RESPONSES_ENABLED': 'True',
        'DOM_ENABLED': 'False',
        'SHOW_PROGRESS': 'False',
        'USE_COLOR': 'False',
        'CHROME_HEADLESS': 'True',
        'CHROME_PAGELOAD_TIMEOUT': '45',
        'CHROME_TIMEOUT': '60',
        'RESPONSES_TIMEOUT': '30',
    }
    if browser_binary:
        add_env['CHROME_BINARY'] = str(browser_binary)
    if cached_chromium:
        add_env['PUPPETEER_CACHE_DIR'] = str(shared_lib / 'puppeteer')
    stdout, stderr, returncode = run_archivebox_cmd_cwd(
        ['add', '--depth=0', '--plugins=chrome,responses', 'https://example.com'],
        cwd=tmp_path,
        timeout=600,
        env=add_env,
    )
    assert returncode == 0, f"archivebox add failed: {stderr}"
    return tmp_path
# =============================================================================
# Output Assertions
# =============================================================================
def parse_jsonl_output(stdout: str) -> List[Dict[str, Any]]:
    """Parse JSONL output into list of dicts via Process parser."""
    # Imported lazily so this module loads without Django configured.
    from archivebox.machine.models import Process
    text = stdout if stdout else ''
    return Process.parse_records_from_text(text)
def assert_jsonl_contains_type(stdout: str, record_type: str, min_count: int = 1):
    """Assert output contains at least min_count records of the given type."""
    matching = [
        record
        for record in parse_jsonl_output(stdout)
        if record.get('type') == record_type
    ]
    count = len(matching)
    assert count >= min_count, \
        f"Expected >= {min_count} {record_type}, got {count}"
    return matching
def assert_jsonl_pass_through(stdout: str, input_records: List[Dict[str, Any]]):
    """Assert that input records appear in output (pass-through behavior)."""
    seen_ids = {
        rec.get('id') for rec in parse_jsonl_output(stdout) if rec.get('id')
    }
    for rec in input_records:
        rec_id = rec.get('id')
        if not rec_id:
            continue  # records without ids can't be matched; skip them
        assert rec_id in seen_ids, \
            f"Input record {rec_id} not found in output (pass-through failed)"
def assert_record_has_fields(record: Dict[str, Any], required_fields: List[str]):
    """Assert record has all required fields with non-None values."""
    for name in required_fields:
        assert name in record, f"Record missing field: {name}"
        assert record[name] is not None, f"Record field is None: {name}"
# =============================================================================
# Test Data Factories
# =============================================================================
def create_test_url(domain: str = 'example.com', path: Optional[str] = None) -> str:
    """Generate a unique test URL.

    Args:
        domain: Hostname for the URL.
        path: URL path segment; a random 8-char hex string when None.

    Returns:
        URL of the form 'https://{domain}/{path}'.
    """
    # Annotation fixed: default is None, so the parameter is Optional[str].
    import uuid
    path = path or uuid.uuid4().hex[:8]
    return f'https://{domain}/{path}'
def create_test_crawl_json(urls: Optional[List[str]] = None, **kwargs) -> Dict[str, Any]:
    """Create a Crawl JSONL record for testing.

    Args:
        urls: URLs to include (newline-joined); defaults to one random test URL.
        **kwargs: Extra fields merged into the record. 'max_depth', 'tags_str',
            and 'status' override the defaults; any other keys pass through
            as-is (and may override 'type'/'urls').

    Returns:
        Dict shaped like a Crawl JSONL record.
    """
    # Annotation fixed: default is None, so the parameter is Optional[List[str]].
    urls = urls or [create_test_url()]
    record: Dict[str, Any] = {
        'type': 'Crawl',
        'urls': '\n'.join(urls),
        'max_depth': kwargs.get('max_depth', 0),
        'tags_str': kwargs.get('tags_str', ''),
        'status': kwargs.get('status', 'queued'),
    }
    # Merge remaining kwargs last so callers can add (or override) fields.
    record.update({k: v for k, v in kwargs.items() if k not in ('max_depth', 'tags_str', 'status')})
    return record
def create_test_snapshot_json(url: Optional[str] = None, **kwargs) -> Dict[str, Any]:
    """Create a Snapshot JSONL record for testing.

    Args:
        url: Snapshot URL; defaults to a random test URL.
        **kwargs: Extra fields merged into the record. 'tags_str' and 'status'
            override the defaults; other keys pass through as-is.

    Returns:
        Dict shaped like a Snapshot JSONL record.
    """
    # Annotation fixed: default is None, so the parameter is Optional[str].
    record: Dict[str, Any] = {
        'type': 'Snapshot',
        'url': url or create_test_url(),
        'tags_str': kwargs.get('tags_str', ''),
        'status': kwargs.get('status', 'queued'),
    }
    # Merge remaining kwargs last so callers can add (or override) fields.
    record.update({k: v for k, v in kwargs.items() if k not in ('tags_str', 'status')})
    return record