wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -0,0 +1,80 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_WGET": {
"type": "boolean",
"default": true,
"description": "Enable wget archiving"
},
"SAVE_WARC": {
"type": "boolean",
"default": true,
"description": "Save WARC archive file"
},
"SAVE_WGET_REQUISITES": {
"type": "boolean",
"default": true,
"description": "Download page requisites (CSS, JS, images)"
},
"WGET_BINARY": {
"type": "string",
"default": "wget",
"description": "Path to wget binary"
},
"WGET_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for wget in seconds"
},
"WGET_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string for wget"
},
"WGET_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"x-aliases": ["CHECK_SSL_VALIDITY"],
"description": "Whether to verify SSL certificates"
},
"WGET_COOKIES_FILE": {
"type": "string",
"default": "",
"x-fallback": "COOKIES_FILE",
"description": "Path to cookies file"
},
"WGET_RESTRICT_FILE_NAMES": {
"type": "string",
"default": "windows",
"enum": ["windows", "unix", "ascii", "nocontrol", "lowercase", "uppercase"],
"x-fallback": "RESTRICT_FILE_NAMES",
"description": "Filename restriction mode"
},
"WGET_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [
"--no-verbose",
"--adjust-extension",
"--convert-links",
"--force-directories",
"--backup-converted",
"--span-hosts",
"--no-parent",
"-e", "robots=off"
],
"description": "Default wget arguments"
},
"WGET_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for wget (space-separated)"
}
}
}

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
Validation hook for wget binary.
Runs at crawl start to verify wget is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
# wget version string: "GNU Wget 1.24.5 built on ..."
first_line = result.stdout.strip().split('\n')[0]
# Extract version number
parts = first_line.split()
for i, part in enumerate(parts):
if part.lower() == 'wget' and i + 1 < len(parts):
return parts[i + 1]
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_wget() -> dict | None:
    """Locate the wget binary and describe it.

    Tries abx-pkg resolution first (when the package is importable), then
    falls back to ``shutil.which`` / the WGET_BINARY env var. Returns a
    metadata dict (name/abspath/version/sha256/binprovider) or None when
    wget cannot be found at all.
    """
    try:
        from abx_pkg import Binary, EnvProvider

        class WgetBinary(Binary):
            name: str = 'wget'
            binproviders_supported = [EnvProvider()]

        loaded = WgetBinary().load()
        if loaded and loaded.abspath:
            provider_name = loaded.binprovider.name if loaded.binprovider else 'env'
            return {
                'name': 'wget',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': getattr(loaded, 'sha256', None),
                'binprovider': provider_name,
            }
    except Exception:
        # abx-pkg missing or failed to resolve -- fall through to PATH lookup
        pass

    candidate = shutil.which('wget') or os.environ.get('WGET_BINARY', '')
    if candidate and Path(candidate).is_file():
        return {
            'name': 'wget',
            'abspath': candidate,
            'version': get_binary_version(candidate),
            'sha256': get_binary_hash(candidate),
            'binprovider': 'env',
        }
    return None
def main():
    """Validate the wget binary and emit JSONL records on stdout.

    On success: prints an InstalledBinary record plus Machine config updates
    (config/WGET_BINARY, and config/WGET_VERSION when known), then exits 0.
    On failure: prints a Dependency request record, a message on stderr,
    and exits 1 so the caller knows the binary is unavailable.
    """
    result = find_wget()
    if result and result.get('abspath'):
        # InstalledBinary record for the discovered binary
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        # Persist the resolved path into machine-level config
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/WGET_BINARY',
            'value': result['abspath'],
        }))
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/WGET_VERSION',
                'value': result['version'],
            }))
        sys.exit(0)
    else:
        # Ask the orchestrator to install wget via one of the listed providers
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'wget',
            'bin_providers': 'apt,brew,env',
        }))
        # Exit non-zero to indicate binary not found
        # (was an f-string with no placeholders -- plain literal is equivalent)
        print("wget binary not found", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,130 @@
#!/usr/bin/env python3
"""
Validate and compute derived wget config values.
This hook runs early in the Crawl lifecycle to:
1. Validate config values with warnings (not hard errors)
2. Compute derived values (USE_WGET from SAVE_WGET/SAVE_WARC)
3. Check binary availability and version
Output:
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
- InstalledBinary JSONL records to stdout when binaries are found
"""
import json
import os
import shutil
import subprocess
import sys
from abx_pkg import Binary, EnvProvider
# Read config from environment (already validated by JSONSchema)
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable, stripped of surrounding whitespace."""
    raw = os.environ.get(name, default)
    return raw.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var; unrecognised or unset values yield *default*."""
    raw = os.environ.get(name, '').strip().lower()
    if raw in ('true', '1', 'yes', 'on'):
        return True
    if raw in ('false', '0', 'no', 'off'):
        return False
    return default
def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer env var, returning *default* on bad or missing input."""
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
def output_installed_binary(binary: Binary, name: str):
    """Emit an InstalledBinary JSONL record for *binary* on stdout."""
    record = {
        'type': 'InstalledBinary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        # Tag the record with the machine it was discovered on, when known
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))
def main():
    """Validate wget config, compute derived values, and report them.

    Prints COMPUTED:KEY=VALUE lines on stdout (parsed by hooks.py),
    WARNING:/ERROR: lines on stderr, and exits 1 only on hard errors.
    """
    warning_msgs = []
    error_msgs = []
    derived = {}

    # Pull raw config (JSONSchema validation already ran upstream)
    save_wget = get_env_bool('SAVE_WGET', True)
    save_warc = get_env_bool('SAVE_WARC', True)
    wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
    wget_binary = get_env('WGET_BINARY', 'wget')

    # USE_WGET is derived: enabled if either wget or warc output is wanted
    use_wget = save_wget or save_warc
    derived['USE_WGET'] = str(use_wget).lower()

    # A very low timeout is legal but likely to fail -- warn, don't error
    if use_wget and wget_timeout < 20:
        warning_msgs.append(
            f"WGET_TIMEOUT={wget_timeout} is very low. "
            "wget may fail to archive sites if set to less than ~20 seconds. "
            "Consider setting WGET_TIMEOUT=60 or higher."
        )

    # Resolve the binary via abx-pkg's env provider
    try:
        binary = Binary(name=wget_binary, binproviders=[EnvProvider()]).load()
        binary_path = str(binary.abspath) if binary.abspath else ''
    except Exception:
        binary = None
        binary_path = ''

    if binary_path:
        derived['WGET_BINARY'] = binary_path
        derived['WGET_VERSION'] = str(binary.version) if binary.version else 'unknown'
        # Record the discovered binary for the host application
        output_installed_binary(binary, name='wget')
    else:
        # Only a hard error when wget output is actually requested
        if use_wget:
            error_msgs.append(f"WGET_BINARY={wget_binary} not found. Install wget or set SAVE_WGET=false.")
        derived['WGET_BINARY'] = ''

    # Probe for --compression=auto support (older wget builds lack it)
    if derived.get('WGET_BINARY'):
        try:
            probe = subprocess.run(
                [derived['WGET_BINARY'], '--compression=auto', '--help'],
                capture_output=True, timeout=5
            )
            derived['WGET_AUTO_COMPRESSION'] = 'true' if probe.returncode == 0 else 'false'
        except Exception:
            derived['WGET_AUTO_COMPRESSION'] = 'false'

    # Emit results: COMPUTED lines for hooks.py, diagnostics on stderr
    for key, value in derived.items():
        print(f"COMPUTED:{key}={value}")
    for msg in warning_msgs:
        print(f"WARNING:{msg}", file=sys.stderr)
    for msg in error_msgs:
        print(f"ERROR:{msg}", file=sys.stderr)
    sys.exit(1 if error_msgs else 0)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,325 @@
#!/usr/bin/env python3
"""
Archive a URL using wget.
Usage: on_Snapshot__wget.py --url=<url> --snapshot-id=<uuid>
Output: Downloads files to $PWD
Environment variables:
WGET_BINARY: Path to wget binary (optional, falls back to PATH)
WGET_TIMEOUT: Timeout in seconds (default: 60)
WGET_USER_AGENT: User agent string
WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
WGET_COOKIES_FILE: Path to cookies file (optional)
WGET_RESTRICT_FILE_NAMES: Filename restriction mode (default: windows)
WGET_EXTRA_ARGS: Extra arguments for wget (space-separated)
# Wget feature toggles
SAVE_WGET: Enable wget archiving (default: True)
SAVE_WARC: Save WARC file (default: True)
SAVE_WGET_REQUISITES: Download page requisites (default: True)
# Fallback to ARCHIVING_CONFIG values if WGET_* not set:
TIMEOUT: Fallback timeout
USER_AGENT: Fallback user agent
CHECK_SSL_VALIDITY: Fallback SSL check
COOKIES_FILE: Fallback cookies file
RESTRICT_FILE_NAMES: Fallback filename restriction
"""
import json
import os
import re
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'wget'  # reported as "extractor" in RESULT_JSON output
BIN_NAME = 'wget'  # binary name reported via DEPENDENCY_NEEDED when missing
BIN_PROVIDERS = 'apt,brew,env'  # providers that can satisfy the dependency
OUTPUT_DIR = 'wget'  # NOTE(review): not referenced in this file -- presumably read by the host; confirm
def get_env(name: str, default: str = '') -> str:
    """Fetch an environment variable with surrounding whitespace removed."""
    value = os.environ.get(name, default)
    return value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret an env var as a boolean; fall back to *default* otherwise."""
    text = os.environ.get(name, '').strip().lower()
    if text in ('true', '1', 'yes', 'on'):
        return True
    return False if text in ('false', '0', 'no', 'off') else default
def get_env_int(name: str, default: int = 0) -> int:
    """Interpret an env var as an int; fall back to *default* on bad input."""
    text = os.environ.get(name, str(default)).strip()
    try:
        return int(text)
    except ValueError:
        return default
# Directory name the staticfile extractor writes into (relative to $PWD)
STATICFILE_DIR = 'staticfile'


def has_staticfile_output() -> bool:
    """Return True if the staticfile extractor already produced output here."""
    candidate = Path(STATICFILE_DIR)
    if not candidate.exists():
        return False
    # Non-empty directory => staticfile already handled this URL
    return next(iter(candidate.iterdir()), None) is not None
def find_wget() -> str | None:
"""Find wget binary."""
wget = get_env('WGET_BINARY')
if wget and os.path.isfile(wget):
return wget
return shutil.which('wget')
def get_version(binary: str) -> str:
    """Return the first line of ``binary --version`` (max 64 chars), or ''."""
    try:
        proc = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
        return proc.stdout.split('\n')[0].strip()[:64]
    except Exception:
        # Missing binary, timeout, etc. -- version is simply unknown
        return ''
def check_wget_compression(binary: str) -> bool:
    """Probe whether this wget build accepts ``--compression=auto``."""
    try:
        probe = subprocess.run(
            [binary, '--compression=auto', '--help'],
            capture_output=True,
            timeout=5
        )
    except Exception:
        return False
    # Unsupported flag makes wget exit non-zero before printing help
    return probe.returncode == 0
# Default wget args (from old WGET_CONFIG)
WGET_DEFAULT_ARGS = [
    '--no-verbose',        # terse output instead of per-chunk progress
    '--adjust-extension',  # add .html/.css extensions based on content type
    '--convert-links',     # rewrite links so the mirror browses locally
    '--force-directories', # mirror the URL path as a directory tree
    '--backup-converted',  # keep .orig copies of link-converted files
    '--span-hosts',        # allow fetching requisites from other hosts
    '--no-parent',         # never ascend above the starting URL's path
    '-e', 'robots=off',    # ignore robots.txt for archival purposes
]
def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Archive *url* into the current directory using wget.

    Reads WGET_*-prefixed config from the environment (falling back to the
    generic ARCHIVING_CONFIG names), builds the wget command line, runs it,
    and inspects the files it left behind.

    Returns: (success, output_path, error_message) where output_path is the
    main HTML file (or the first downloaded file) relative to $PWD.
    """
    # Get config from env (with WGET_ prefix or fallback to ARCHIVING_CONFIG style)
    timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
    user_agent = get_env('WGET_USER_AGENT') or get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    cookies_file = get_env('WGET_COOKIES_FILE') or get_env('COOKIES_FILE', '')
    restrict_names = get_env('WGET_RESTRICT_FILE_NAMES') or get_env('RESTRICT_FILE_NAMES', 'windows')
    extra_args = get_env('WGET_EXTRA_ARGS', '')
    # Feature toggles
    save_warc = get_env_bool('SAVE_WARC', True)
    save_requisites = get_env_bool('SAVE_WGET_REQUISITES', True)
    # Older wget builds don't support --compression=auto; probe first
    supports_compression = check_wget_compression(binary)
    # Build wget command (later options take precedence)
    cmd = [
        binary,
        *WGET_DEFAULT_ARGS,
        f'--timeout={timeout}',
        '--tries=2',
    ]
    if user_agent:
        cmd.append(f'--user-agent={user_agent}')
    if restrict_names:
        cmd.append(f'--restrict-file-names={restrict_names}')
    if save_requisites:
        cmd.append('--page-requisites')
    if save_warc:
        warc_dir = Path('warc')
        warc_dir.mkdir(exist_ok=True)
        # Unix-timestamp prefix; wget appends the .warc(.gz) suffix itself
        warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
        cmd.append(f'--warc-file={warc_path}')
    else:
        cmd.append('--timestamping')
    if cookies_file and Path(cookies_file).is_file():
        cmd.extend(['--load-cookies', cookies_file])
    if supports_compression:
        cmd.append('--compression=auto')
    if not check_ssl:
        cmd.extend(['--no-check-certificate', '--no-hsts'])
    if extra_args:
        cmd.extend(extra_args.split())
    cmd.append(url)
    # Run wget
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            timeout=timeout * 2,  # Allow extra time for large downloads
        )
        # Everything under $PWD except the WARC output counts as page content
        downloaded_files = [
            f for f in Path('.').rglob('*')
            if f.is_file() and f.name != '.gitkeep' and not str(f).startswith('warc/')
        ]
        if not downloaded_files:
            stderr = result.stderr.decode('utf-8', errors='replace')
            stdout = result.stdout.decode('utf-8', errors='replace')
            combined = stderr + stdout
            # Map common HTTP failures to friendlier messages
            if '403' in combined or 'Forbidden' in combined:
                return False, None, '403 Forbidden (try changing USER_AGENT)'
            elif '404' in combined or 'Not Found' in combined:
                return False, None, '404 Not Found'
            elif '500' in combined:
                return False, None, '500 Internal Server Error'
            else:
                return False, None, f'No files downloaded: {stderr[:200]}'
        # Prefer an HTML-ish file (.htm/.html/.shtml, any case) as the main output
        html_files = [
            f for f in downloaded_files
            if re.search(r'\.[Ss]?[Hh][Tt][Mm][Ll]?$', str(f))
        ]
        output_path = str(html_files[0]) if html_files else str(downloaded_files[0])
        # (removed: unused output_tail / files_count locals -- dead code)
        return True, output_path, ''
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout * 2} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to archive')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Archive a URL using wget.

    Emits machine-parseable KEY=VALUE lines on stdout (START_TS, END_TS,
    DURATION, CMD, VERSION, OUTPUT, STATUS, RESULT_JSON) and exits 0 on
    success or skip, 1 on failure or missing dependency.
    """
    start_ts = datetime.now(timezone.utc)
    # Defaults reported if anything below fails early
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    cmd_str = ''
    try:
        # Check if wget is enabled
        if not get_env_bool('SAVE_WGET', True):
            print('Skipping wget (SAVE_WGET=False)')
            status = 'skipped'
            end_ts = datetime.now(timezone.utc)
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={end_ts.isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            # sys.exit raises SystemExit, which the except Exception below does NOT catch
            sys.exit(0)
        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
            print(f'Skipping wget - staticfile extractor already downloaded this')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - staticfile already handled
        # Find binary; if missing, emit a dependency request for the orchestrator
        binary = find_wget()
        if not binary:
            print(f'ERROR: {BIN_NAME} binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            print(f'INSTALL_HINT=apt install wget OR brew install wget', file=sys.stderr)
            sys.exit(1)
        version = get_version(binary)
        # Abbreviated command for reporting; full argv is built inside save_wget()
        cmd_str = f'{binary} ... {url}'
        # Run extraction
        success, output, error = save_wget(url, binary)
        status = 'succeeded' if success else 'failed'
        if success:
            # Count downloaded files (includes warc/ output, unlike save_wget's count)
            files = list(Path('.').rglob('*'))
            file_count = len([f for f in files if f.is_file()])
            print(f'wget completed: {file_count} files downloaded')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if cmd_str:
        print(f'CMD={cmd_str}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,369 @@
"""
Integration tests for wget plugin
Tests verify:
1. Plugin reports missing dependency correctly
2. wget can be installed via brew/apt provider hooks
3. Config options work (SAVE_WGET, SAVE_WARC, etc.)
4. Extraction works against real example.com
5. Output files contain actual page content
6. Skip cases work (SAVE_WGET=False, staticfile present)
7. Failure cases handled (404, network errors)
"""
import json
import os
import shutil
import subprocess
import sys
import tempfile
import uuid
from pathlib import Path
import pytest
# Paths resolved relative to this test file (tests/ lives inside the plugin dir)
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
# Matched by glob so any numeric ordering prefix in the hook filename is tolerated
WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
TEST_URL = 'https://example.com'  # small, stable page used by the live-network tests
def test_hook_script_exists():
    """The wget Snapshot hook script must exist on disk."""
    hook = WGET_HOOK
    assert hook.exists(), f"Hook script not found: {hook}"
def test_reports_missing_dependency_when_not_installed():
    """With an empty PATH the hook must emit a DEPENDENCY_NEEDED request."""
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        # PATH points nowhere, so the hook cannot find any wget binary
        env = {'PATH': '/nonexistent', 'HOME': str(workdir)}
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env
        )
        assert proc.returncode != 0, "Should exit non-zero when dependency missing"
        merged = proc.stdout + proc.stderr
        assert 'DEPENDENCY_NEEDED' in merged, "Should output DEPENDENCY_NEEDED"
        assert 'wget' in merged.lower(), "Should mention wget"
        assert 'BIN_PROVIDERS' in merged, "Should report available providers (apt,brew,env)"
def test_can_install_wget_via_provider():
    """Test that wget can be installed via brew/apt provider hooks."""
    # Determine which provider to use (brew preferred when both exist)
    if shutil.which('brew'):
        provider_hook = BREW_HOOK
        provider_name = 'brew'
    elif shutil.which('apt-get'):
        provider_hook = APT_HOOK
        provider_name = 'apt'
    else:
        pytest.skip("Neither brew nor apt available on this system")
    assert provider_hook.exists(), f"Provider hook not found: {provider_hook}"
    # Test installation via provider hook
    dependency_id = str(uuid.uuid4())
    result = subprocess.run(
        [
            sys.executable,
            str(provider_hook),
            '--dependency-id', dependency_id,
            '--bin-name', 'wget',
            '--bin-providers', 'apt,brew,env'
        ],
        capture_output=True,
        text=True,
        timeout=300  # Installation can take time
    )
    # Should succeed (wget installs successfully or is already installed)
    assert result.returncode == 0, f"{provider_name} install failed: {result.stderr}"
    # Should output InstalledBinary JSONL record
    assert 'InstalledBinary' in result.stdout or 'wget' in result.stderr, \
        f"Should output installation info: stdout={result.stdout}, stderr={result.stderr}"
    # Parse JSONL if present; non-JSON lines are tolerated and skipped
    if result.stdout.strip():
        for line in result.stdout.strip().split('\n'):
            try:
                record = json.loads(line)
                if record.get('type') == 'InstalledBinary':
                    assert record['name'] == 'wget'
                    assert record['binprovider'] in ['brew', 'apt']
                    assert record['abspath'], "Should have binary path"
                    assert Path(record['abspath']).exists(), f"Binary should exist at {record['abspath']}"
                    break  # only the first InstalledBinary record is validated
            except json.JSONDecodeError:
                continue
    # Verify wget is now available
    # NOTE(review): `which` is a unix shell utility -- this check is not portable to Windows
    result = subprocess.run(['which', 'wget'], capture_output=True, text=True)
    assert result.returncode == 0, "wget should be available after installation"
def test_archives_example_com():
    """Test full workflow: ensure wget installed then archive example.com."""
    # First ensure wget is installed via provider
    if shutil.which('brew'):
        provider_hook = BREW_HOOK
    elif shutil.which('apt-get'):
        provider_hook = APT_HOOK
    else:
        pytest.skip("Neither brew nor apt available")
    # Run installation (idempotent - will succeed if already installed)
    install_result = subprocess.run(
        [
            sys.executable,
            str(provider_hook),
            '--dependency-id', str(uuid.uuid4()),
            '--bin-name', 'wget',
            '--bin-providers', 'apt,brew,env'
        ],
        capture_output=True,
        text=True,
        timeout=300
    )
    if install_result.returncode != 0:
        pytest.skip(f"Could not install wget: {install_result.stderr}")
    # Now test archiving (requires live network access to example.com)
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Run wget extraction
        result = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=120
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        # Verify output in stdout
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        assert 'wget completed' in result.stdout, "Should report completion"
        # Verify files were downloaded
        downloaded_files = list(tmpdir.rglob('*.html')) + list(tmpdir.rglob('*.htm'))
        assert len(downloaded_files) > 0, "No HTML files downloaded"
        # Find main HTML file (should contain example.com)
        main_html = None
        for html_file in downloaded_files:
            content = html_file.read_text(errors='ignore')
            if 'example domain' in content.lower():
                main_html = html_file
                break
        assert main_html is not None, "Could not find main HTML file with example.com content"
        # Verify HTML content contains REAL example.com text, not an error page
        html_content = main_html.read_text(errors='ignore')
        assert len(html_content) > 200, f"HTML content too short: {len(html_content)} bytes"
        assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML"
        assert ('this domain' in html_content.lower() or
                'illustrative examples' in html_content.lower()), \
            "Missing example.com description text"
        assert ('iana' in html_content.lower() or
                'more information' in html_content.lower()), \
            "Missing IANA reference"
        # Verify RESULT_JSON is present and valid (only the first occurrence is checked)
        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
        for line in result.stdout.split('\n'):
            if line.startswith('RESULT_JSON='):
                result_json = json.loads(line.replace('RESULT_JSON=', ''))
                assert result_json['extractor'] == 'wget'
                assert result_json['status'] == 'succeeded'
                assert result_json['url'] == TEST_URL
                assert result_json['snapshot_id'] == 'test789'
                assert 'duration' in result_json
                assert result_json['duration'] >= 0
                break
def test_config_save_wget_false_skips():
    """SAVE_WGET=False must short-circuit to a skipped status with exit 0."""
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        # Disable the extractor via config
        env = dict(os.environ, SAVE_WGET='False')
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        assert proc.returncode == 0, f"Should exit 0 when skipping: {proc.stderr}"
        assert 'STATUS=skipped' in proc.stdout, "Should report skipped status"
        assert 'SAVE_WGET=False' in proc.stdout, "Should mention SAVE_WGET=False"
def test_config_save_warc():
    """SAVE_WARC=True should produce WARC output under warc/."""
    # Ensure wget is available
    if not shutil.which('wget'):
        pytest.skip("wget not installed")
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        env = dict(os.environ, SAVE_WARC='True')
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testwarc'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=120
        )
        # Only assert on successful runs (network failures are tolerated)
        if proc.returncode != 0:
            return
        warc_dir = workdir / 'warc'
        if not warc_dir.exists():
            return
        warc_files = [f for f in warc_dir.rglob('*') if f.is_file()]
        assert len(warc_files) > 0, "WARC file not created when SAVE_WARC=True"
def test_staticfile_present_skips():
    """A populated staticfile/ dir means wget must permanently skip."""
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        # Simulate a prior staticfile extractor run with real content
        static_dir = workdir / 'staticfile'
        static_dir.mkdir()
        (static_dir / 'index.html').write_text('<html>test</html>')
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'teststatic'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=30
        )
        assert proc.returncode == 0, "Should exit 0 when skipping"
        assert 'STATUS=skipped' in proc.stdout, "Should report skipped status"
        assert 'staticfile' in proc.stdout.lower(), "Should mention staticfile"
def test_handles_404_gracefully():
    """A 404 URL must produce a failure exit with a useful error message."""
    if not shutil.which('wget'):
        pytest.skip("wget not installed")
    with tempfile.TemporaryDirectory() as tmp:
        # Request a page that does not exist
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', 'https://example.com/nonexistent-page-404', '--snapshot-id', 'test404'],
            cwd=Path(tmp),
            capture_output=True,
            text=True,
            timeout=60
        )
        assert proc.returncode != 0, "Should fail on 404"
        merged = proc.stdout + proc.stderr
        assert '404' in merged or 'Not Found' in merged or 'No files downloaded' in merged, \
            "Should report 404 or no files downloaded"
def test_config_timeout_honored():
    """A short WGET_TIMEOUT must not cause the hook to hang."""
    if not shutil.which('wget'):
        pytest.skip("wget not installed")
    with tempfile.TemporaryDirectory() as tmp:
        # example.com is fast, so a 5s timeout should normally still succeed
        env = dict(os.environ, WGET_TIMEOUT='5')
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=Path(tmp),
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        # Either outcome is fine -- the point is it completed within 30s
        assert proc.returncode in (0, 1), "Should complete (success or fail)"
def test_config_user_agent():
    """A custom WGET_USER_AGENT must not break archiving."""
    if not shutil.which('wget'):
        pytest.skip("wget not installed")
    with tempfile.TemporaryDirectory() as tmp:
        env = dict(os.environ, WGET_USER_AGENT='TestBot/1.0')
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'],
            cwd=Path(tmp),
            capture_output=True,
            text=True,
            env=env,
            timeout=120
        )
        # example.com doesn't block custom agents; only assert on success runs
        if proc.returncode == 0:
            assert 'STATUS=succeeded' in proc.stdout
# Allow running this file directly without invoking the pytest CLI
if __name__ == '__main__':
    pytest.main([__file__, '-v'])