mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 15:27:53 +10:00
wip major changes
This commit is contained in:
80
archivebox/plugins/wget/config.json
Normal file
80
archivebox/plugins/wget/config.json
Normal file
@@ -0,0 +1,80 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_WGET": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable wget archiving"
|
||||
},
|
||||
"SAVE_WARC": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Save WARC archive file"
|
||||
},
|
||||
"SAVE_WGET_REQUISITES": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Download page requisites (CSS, JS, images)"
|
||||
},
|
||||
"WGET_BINARY": {
|
||||
"type": "string",
|
||||
"default": "wget",
|
||||
"description": "Path to wget binary"
|
||||
},
|
||||
"WGET_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for wget in seconds"
|
||||
},
|
||||
"WGET_USER_AGENT": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "USER_AGENT",
|
||||
"description": "User agent string for wget"
|
||||
},
|
||||
"WGET_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"x-aliases": ["CHECK_SSL_VALIDITY"],
|
||||
"description": "Whether to verify SSL certificates"
|
||||
},
|
||||
"WGET_COOKIES_FILE": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "COOKIES_FILE",
|
||||
"description": "Path to cookies file"
|
||||
},
|
||||
"WGET_RESTRICT_FILE_NAMES": {
|
||||
"type": "string",
|
||||
"default": "windows",
|
||||
"enum": ["windows", "unix", "ascii", "nocontrol", "lowercase", "uppercase"],
|
||||
"x-fallback": "RESTRICT_FILE_NAMES",
|
||||
"description": "Filename restriction mode"
|
||||
},
|
||||
"WGET_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [
|
||||
"--no-verbose",
|
||||
"--adjust-extension",
|
||||
"--convert-links",
|
||||
"--force-directories",
|
||||
"--backup-converted",
|
||||
"--span-hosts",
|
||||
"--no-parent",
|
||||
"-e", "robots=off"
|
||||
],
|
||||
"description": "Default wget arguments"
|
||||
},
|
||||
"WGET_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra arguments for wget (space-separated)"
|
||||
}
|
||||
}
|
||||
}
|
||||
136
archivebox/plugins/wget/on_Crawl__00_validate_wget.py
Normal file
136
archivebox/plugins/wget/on_Crawl__00_validate_wget.py
Normal file
@@ -0,0 +1,136 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for wget binary.
|
||||
|
||||
Runs at crawl start to verify wget is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
# wget version string: "GNU Wget 1.24.5 built on ..."
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
# Extract version number
|
||||
parts = first_line.split()
|
||||
for i, part in enumerate(parts):
|
||||
if part.lower() == 'wget' and i + 1 < len(parts):
|
||||
return parts[i + 1]
|
||||
return first_line[:32]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_wget() -> dict | None:
    """Find wget binary using abx-pkg or fallback to shutil.which.

    Returns a dict with keys name/abspath/version/sha256/binprovider,
    or None when no usable wget binary can be located.
    """
    # Try abx-pkg first
    try:
        from abx_pkg import Binary, EnvProvider

        class WgetBinary(Binary):
            name: str = 'wget'
            binproviders_supported = [EnvProvider()]

        binary = WgetBinary()
        loaded = binary.load()
        if loaded and loaded.abspath:
            return {
                'name': 'wget',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
                'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
            }
    except ImportError:
        # abx-pkg not installed -- fall through to the PATH-based lookup
        pass
    except Exception:
        # NOTE(review): any other abx-pkg failure is silently swallowed here;
        # the shutil.which fallback below still runs
        pass

    # Fallback to shutil.which
    # (WGET_BINARY env override is only consulted when PATH lookup fails)
    abspath = shutil.which('wget') or os.environ.get('WGET_BINARY', '')
    if abspath and Path(abspath).is_file():
        return {
            'name': 'wget',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': 'env',
        }

    return None
|
||||
|
||||
|
||||
def main():
    """Validate wget binary and output JSONL.

    On success: emits an InstalledBinary record plus Machine config updates
    (WGET_BINARY, and WGET_VERSION when known) and exits 0.
    On failure: emits a Dependency request record and exits 1.
    """

    result = find_wget()

    if result and result.get('abspath'):
        # Output InstalledBinary
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))

        # Output Machine config update
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/WGET_BINARY',
            'value': result['abspath'],
        }))

        # Version may be None when the binary exists but --version failed
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/WGET_VERSION',
                'value': result['version'],
            }))

        sys.exit(0)
    else:
        # Output Dependency request so an installer hook can provide wget
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'wget',
            'bin_providers': 'apt,brew,env',
        }))

        # Exit non-zero to indicate binary not found
        print(f"wget binary not found", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
130
archivebox/plugins/wget/on_Crawl__00_validate_wget_config.py
Normal file
130
archivebox/plugins/wget/on_Crawl__00_validate_wget_config.py
Normal file
@@ -0,0 +1,130 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate and compute derived wget config values.
|
||||
|
||||
This hook runs early in the Crawl lifecycle to:
|
||||
1. Validate config values with warnings (not hard errors)
|
||||
2. Compute derived values (USE_WGET from SAVE_WGET/SAVE_WARC)
|
||||
3. Check binary availability and version
|
||||
|
||||
Output:
|
||||
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
|
||||
- InstalledBinary JSONL records to stdout when binaries are found
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
# Read config from environment (already validated by JSONSchema)
|
||||
def get_env(name: str, default: str = '') -> str:
    """Read env var *name*, falling back to *default*; result is whitespace-stripped."""
    value = os.environ.get(name, default)
    return value.strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var; unrecognized or missing values yield *default*."""
    text = get_env(name, '').lower()
    truthy = {'true', '1', 'yes', 'on'}
    falsy = {'false', '0', 'no', 'off'}
    if text in truthy:
        return True
    if text in falsy:
        return False
    return default


def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer env var; malformed values yield *default*."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
|
||||
|
||||
|
||||
def output_installed_binary(binary: Binary, name: str):
    """Emit an InstalledBinary JSONL record for *binary* on stdout."""
    record = {
        'type': 'InstalledBinary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        # MACHINE_ID is provided in the environment by the hook runner
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))
|
||||
|
||||
|
||||
def main():
    """Validate wget config, compute derived values, and emit results.

    Prints COMPUTED:KEY=VALUE lines to stdout (parsed by hooks.py and added
    to the env), WARNING:/ERROR: lines to stderr, and exits 1 only if a hard
    error occurred.
    """
    warnings = []
    errors = []
    computed = {}

    # Get config values
    save_wget = get_env_bool('SAVE_WGET', True)
    save_warc = get_env_bool('SAVE_WARC', True)
    # WGET_TIMEOUT takes precedence; falls back to the global TIMEOUT (default 60)
    wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
    wget_binary = get_env('WGET_BINARY', 'wget')

    # Compute derived values
    # USE_WGET: wget must run if either the plain mirror or the WARC is wanted
    use_wget = save_wget or save_warc
    computed['USE_WGET'] = str(use_wget).lower()

    # Validate timeout with warning (not error)
    if use_wget and wget_timeout < 20:
        warnings.append(
            f"WGET_TIMEOUT={wget_timeout} is very low. "
            "wget may fail to archive sites if set to less than ~20 seconds. "
            "Consider setting WGET_TIMEOUT=60 or higher."
        )

    # Check binary availability using abx-pkg
    provider = EnvProvider()
    try:
        binary = Binary(name=wget_binary, binproviders=[provider]).load()
        binary_path = str(binary.abspath) if binary.abspath else ''
    except Exception:
        # Any abx-pkg failure is treated the same as "binary not found"
        binary = None
        binary_path = ''

    if not binary_path:
        # Missing binary is only a hard error when wget is actually enabled
        if use_wget:
            errors.append(f"WGET_BINARY={wget_binary} not found. Install wget or set SAVE_WGET=false.")
        computed['WGET_BINARY'] = ''
    else:
        computed['WGET_BINARY'] = binary_path
        wget_version = str(binary.version) if binary.version else 'unknown'
        computed['WGET_VERSION'] = wget_version

        # Output InstalledBinary JSONL record
        output_installed_binary(binary, name='wget')

    # Check for compression support
    # (probe because not every wget build accepts --compression=auto)
    if computed.get('WGET_BINARY'):
        try:
            result = subprocess.run(
                [computed['WGET_BINARY'], '--compression=auto', '--help'],
                capture_output=True, timeout=5
            )
            computed['WGET_AUTO_COMPRESSION'] = 'true' if result.returncode == 0 else 'false'
        except Exception:
            computed['WGET_AUTO_COMPRESSION'] = 'false'

    # Output results
    # Format: KEY=VALUE lines that hooks.py will parse and add to env
    for key, value in computed.items():
        print(f"COMPUTED:{key}={value}")

    for warning in warnings:
        print(f"WARNING:{warning}", file=sys.stderr)

    for error in errors:
        print(f"ERROR:{error}", file=sys.stderr)

    # Exit with error if any hard errors
    sys.exit(1 if errors else 0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
325
archivebox/plugins/wget/on_Snapshot__50_wget.py
Normal file
325
archivebox/plugins/wget/on_Snapshot__50_wget.py
Normal file
@@ -0,0 +1,325 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Archive a URL using wget.
|
||||
|
||||
Usage: on_Snapshot__wget.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Downloads files to $PWD
|
||||
|
||||
Environment variables:
|
||||
WGET_BINARY: Path to wget binary (optional, falls back to PATH)
|
||||
WGET_TIMEOUT: Timeout in seconds (default: 60)
|
||||
WGET_USER_AGENT: User agent string
|
||||
WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
|
||||
WGET_COOKIES_FILE: Path to cookies file (optional)
|
||||
WGET_RESTRICT_FILE_NAMES: Filename restriction mode (default: windows)
|
||||
WGET_EXTRA_ARGS: Extra arguments for wget (space-separated)
|
||||
|
||||
# Wget feature toggles
|
||||
SAVE_WGET: Enable wget archiving (default: True)
|
||||
SAVE_WARC: Save WARC file (default: True)
|
||||
SAVE_WGET_REQUISITES: Download page requisites (default: True)
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if WGET_* not set:
|
||||
TIMEOUT: Fallback timeout
|
||||
USER_AGENT: Fallback user agent
|
||||
CHECK_SSL_VALIDITY: Fallback SSL check
|
||||
COOKIES_FILE: Fallback cookies file
|
||||
RESTRICT_FILE_NAMES: Fallback filename restriction
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'wget'
|
||||
BIN_NAME = 'wget'
|
||||
BIN_PROVIDERS = 'apt,brew,env'
|
||||
OUTPUT_DIR = 'wget'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Fetch env var *name* with surrounding whitespace removed."""
    raw = os.environ.get(name, default)
    return raw.strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret env var *name* as a boolean; fall back to *default*."""
    lookup = {
        'true': True, '1': True, 'yes': True, 'on': True,
        'false': False, '0': False, 'no': False, 'off': False,
    }
    return lookup.get(get_env(name, '').lower(), default)


def get_env_int(name: str, default: int = 0) -> int:
    """Interpret env var *name* as an int; fall back to *default* on bad input."""
    text = get_env(name, str(default))
    try:
        return int(text)
    except ValueError:
        return default
|
||||
|
||||
|
||||
# Directory the staticfile extractor writes into (relative to $PWD)
STATICFILE_DIR = 'staticfile'


def has_staticfile_output() -> bool:
    """Return True if the staticfile extractor left any file in $PWD/staticfile."""
    path = Path(STATICFILE_DIR)
    if not path.exists():
        return False
    # An existing but empty directory does not count as output
    return any(path.iterdir())
|
||||
|
||||
|
||||
def find_wget() -> str | None:
    """Locate the wget binary: explicit WGET_BINARY path first, then $PATH."""
    configured = get_env('WGET_BINARY')
    if configured and os.path.isfile(configured):
        return configured
    # Fall back to whatever PATH provides (None if absent)
    return shutil.which('wget')
|
||||
|
||||
|
||||
def get_version(binary: str) -> str:
    """Return the first line of ``<binary> --version`` (max 64 chars), or '' on failure."""
    try:
        proc = subprocess.run(
            [binary, '--version'],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except Exception:
        return ''
    first_line = proc.stdout.split('\n')[0]
    return first_line.strip()[:64]
|
||||
|
||||
|
||||
def check_wget_compression(binary: str) -> bool:
    """Probe whether this wget build accepts the --compression=auto flag."""
    probe_cmd = [binary, '--compression=auto', '--help']
    try:
        proc = subprocess.run(probe_cmd, capture_output=True, timeout=5)
    except Exception:
        # Missing binary / timeout -> assume no compression support
        return False
    return proc.returncode == 0
|
||||
|
||||
|
||||
# Default wget args (from old WGET_CONFIG)
WGET_DEFAULT_ARGS = [
    '--no-verbose',
    '--adjust-extension',
    '--convert-links',
    '--force-directories',
    '--backup-converted',
    '--span-hosts',
    '--no-parent',
    '-e', 'robots=off',
]


def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Archive *url* into $PWD using the wget binary at *binary*.

    Config is read from the environment: WGET_* vars first, then the
    ARCHIVING_CONFIG-style fallbacks (TIMEOUT, USER_AGENT, ...).

    Returns: (success, output_path, error_message) where output_path is the
    main downloaded HTML file (or the first downloaded file) relative to $PWD.
    """
    # Get config from env (with WGET_ prefix or fallback to ARCHIVING_CONFIG style)
    timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
    user_agent = get_env('WGET_USER_AGENT') or get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    cookies_file = get_env('WGET_COOKIES_FILE') or get_env('COOKIES_FILE', '')
    restrict_names = get_env('WGET_RESTRICT_FILE_NAMES') or get_env('RESTRICT_FILE_NAMES', 'windows')
    extra_args = get_env('WGET_EXTRA_ARGS', '')

    # Feature toggles
    save_warc = get_env_bool('SAVE_WARC', True)
    save_requisites = get_env_bool('SAVE_WGET_REQUISITES', True)

    # Probe once for --compression=auto support (not all wget builds have it)
    supports_compression = check_wget_compression(binary)

    # Build wget command (later options take precedence)
    cmd = [
        binary,
        *WGET_DEFAULT_ARGS,
        f'--timeout={timeout}',
        '--tries=2',
    ]

    if user_agent:
        cmd.append(f'--user-agent={user_agent}')

    if restrict_names:
        cmd.append(f'--restrict-file-names={restrict_names}')

    if save_requisites:
        cmd.append('--page-requisites')

    if save_warc:
        # WARC output goes under warc/<unix-timestamp> (wget adds the extension)
        warc_dir = Path('warc')
        warc_dir.mkdir(exist_ok=True)
        warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
        cmd.append(f'--warc-file={warc_path}')
    else:
        cmd.append('--timestamping')

    if cookies_file and Path(cookies_file).is_file():
        cmd.extend(['--load-cookies', cookies_file])

    if supports_compression:
        cmd.append('--compression=auto')

    if not check_ssl:
        cmd.extend(['--no-check-certificate', '--no-hsts'])

    if extra_args:
        cmd.extend(extra_args.split())

    cmd.append(url)

    # Run wget
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            timeout=timeout * 2,  # Allow extra time for large downloads
        )

        # Find downloaded files (everything in $PWD except the WARC output)
        downloaded_files = [
            f for f in Path('.').rglob('*')
            if f.is_file() and f.name != '.gitkeep' and not str(f).startswith('warc/')
        ]

        if not downloaded_files:
            stderr = result.stderr.decode('utf-8', errors='replace')
            stdout = result.stdout.decode('utf-8', errors='replace')
            combined = stderr + stdout

            # Map common HTTP failures to short, actionable error messages
            if '403' in combined or 'Forbidden' in combined:
                return False, None, '403 Forbidden (try changing USER_AGENT)'
            elif '404' in combined or 'Not Found' in combined:
                return False, None, '404 Not Found'
            elif '500' in combined:
                return False, None, '500 Internal Server Error'
            else:
                return False, None, f'No files downloaded: {stderr[:200]}'

        # Prefer an HTML file as the canonical output (.html/.htm/.shtml etc.)
        html_files = [
            f for f in downloaded_files
            if re.search(r'\.[Ss]?[Hh][Tt][Mm][Ll]?$', str(f))
        ]
        output_path = str(html_files[0]) if html_files else str(downloaded_files[0])

        # (dead-code fix: the original also computed an unused output_tail /
        # files_count here; removed since nothing consumed them)
        return True, output_path, ''

    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout * 2} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL to archive')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Archive a URL using wget.

    Prints KEY=VALUE result lines (START_TS, END_TS, STATUS, RESULT_JSON, ...)
    on stdout for the hook runner; exits 0 on success or skip, 1 on failure.
    """

    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    cmd_str = ''

    try:
        # Check if wget is enabled
        if not get_env_bool('SAVE_WGET', True):
            print('Skipping wget (SAVE_WGET=False)')
            status = 'skipped'
            end_ts = datetime.now(timezone.utc)
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={end_ts.isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)

        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
            print(f'Skipping wget - staticfile extractor already downloaded this')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - staticfile already handled

        # Find binary
        binary = find_wget()
        if not binary:
            # Emit machine-readable hints so the orchestrator can install wget
            print(f'ERROR: {BIN_NAME} binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            print(f'INSTALL_HINT=apt install wget OR brew install wget', file=sys.stderr)
            sys.exit(1)

        version = get_version(binary)
        cmd_str = f'{binary} ... {url}'

        # Run extraction
        success, output, error = save_wget(url, binary)
        status = 'succeeded' if success else 'failed'

        if success:
            # Count downloaded files
            files = list(Path('.').rglob('*'))
            file_count = len([f for f in files if f.is_file()])
            print(f'wget completed: {file_count} files downloaded')

    except Exception as e:
        # NOTE: sys.exit() above raises SystemExit, which is not caught here
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()

    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if cmd_str:
        print(f'CMD={cmd_str}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')

    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    sys.exit(0 if status == 'succeeded' else 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
369
archivebox/plugins/wget/tests/test_wget.py
Normal file
369
archivebox/plugins/wget/tests/test_wget.py
Normal file
@@ -0,0 +1,369 @@
|
||||
"""
|
||||
Integration tests for wget plugin
|
||||
|
||||
Tests verify:
|
||||
1. Plugin reports missing dependency correctly
|
||||
2. wget can be installed via brew/apt provider hooks
|
||||
3. Config options work (SAVE_WGET, SAVE_WARC, etc.)
|
||||
4. Extraction works against real example.com
|
||||
5. Output files contain actual page content
|
||||
6. Skip cases work (SAVE_WGET=False, staticfile present)
|
||||
7. Failure cases handled (404, network errors)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
|
||||
BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
|
||||
APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
def test_hook_script_exists():
    """Verify hook script exists."""
    # WGET_HOOK is resolved by glob at import time, so this guards the whole suite
    assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"
|
||||
|
||||
|
||||
def test_reports_missing_dependency_when_not_installed():
    """Test that script reports DEPENDENCY_NEEDED when wget is not found."""
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Run with empty PATH so binary won't be found
        # (HOME is kept so the interpreter's own startup doesn't break)
        env = {'PATH': '/nonexistent', 'HOME': str(tmpdir)}

        result = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env
        )

        # Should fail and report missing dependency
        assert result.returncode != 0, "Should exit non-zero when dependency missing"
        combined = result.stdout + result.stderr
        assert 'DEPENDENCY_NEEDED' in combined, "Should output DEPENDENCY_NEEDED"
        assert 'wget' in combined.lower(), "Should mention wget"
        assert 'BIN_PROVIDERS' in combined, "Should report available providers (apt,brew,env)"
|
||||
|
||||
|
||||
def test_can_install_wget_via_provider():
    """Test that wget can be installed via brew/apt provider hooks."""

    # Determine which provider to use
    if shutil.which('brew'):
        provider_hook = BREW_HOOK
        provider_name = 'brew'
    elif shutil.which('apt-get'):
        provider_hook = APT_HOOK
        provider_name = 'apt'
    else:
        pytest.skip("Neither brew nor apt available on this system")

    assert provider_hook.exists(), f"Provider hook not found: {provider_hook}"

    # Test installation via provider hook
    # (a fresh random UUID is sufficient -- the hook just receives it as an id)
    dependency_id = str(uuid.uuid4())

    result = subprocess.run(
        [
            sys.executable,
            str(provider_hook),
            '--dependency-id', dependency_id,
            '--bin-name', 'wget',
            '--bin-providers', 'apt,brew,env'
        ],
        capture_output=True,
        text=True,
        timeout=300  # Installation can take time
    )

    # Should succeed (wget installs successfully or is already installed)
    assert result.returncode == 0, f"{provider_name} install failed: {result.stderr}"

    # Should output InstalledBinary JSONL record
    assert 'InstalledBinary' in result.stdout or 'wget' in result.stderr, \
        f"Should output installation info: stdout={result.stdout}, stderr={result.stderr}"

    # Parse JSONL if present
    if result.stdout.strip():
        for line in result.stdout.strip().split('\n'):
            try:
                record = json.loads(line)
                if record.get('type') == 'InstalledBinary':
                    assert record['name'] == 'wget'
                    assert record['binprovider'] in ['brew', 'apt']
                    assert record['abspath'], "Should have binary path"
                    assert Path(record['abspath']).exists(), f"Binary should exist at {record['abspath']}"
                    break
            except json.JSONDecodeError:
                # Non-JSON log lines mixed into stdout are tolerated
                continue

    # Verify wget is now available
    result = subprocess.run(['which', 'wget'], capture_output=True, text=True)
    assert result.returncode == 0, "wget should be available after installation"
|
||||
|
||||
|
||||
def test_archives_example_com():
    """Test full workflow: ensure wget installed then archive example.com."""

    # First ensure wget is installed via provider
    if shutil.which('brew'):
        provider_hook = BREW_HOOK
    elif shutil.which('apt-get'):
        provider_hook = APT_HOOK
    else:
        pytest.skip("Neither brew nor apt available")

    # Run installation (idempotent - will succeed if already installed)
    install_result = subprocess.run(
        [
            sys.executable,
            str(provider_hook),
            '--dependency-id', str(uuid.uuid4()),
            '--bin-name', 'wget',
            '--bin-providers', 'apt,brew,env'
        ],
        capture_output=True,
        text=True,
        timeout=300
    )

    if install_result.returncode != 0:
        pytest.skip(f"Could not install wget: {install_result.stderr}")

    # Now test archiving (network access to example.com required)
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Run wget extraction
        result = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=120
        )

        assert result.returncode == 0, f"Extraction failed: {result.stderr}"

        # Verify output in stdout
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        assert 'wget completed' in result.stdout, "Should report completion"

        # Verify files were downloaded
        downloaded_files = list(tmpdir.rglob('*.html')) + list(tmpdir.rglob('*.htm'))
        assert len(downloaded_files) > 0, "No HTML files downloaded"

        # Find main HTML file (should contain example.com)
        main_html = None
        for html_file in downloaded_files:
            content = html_file.read_text(errors='ignore')
            if 'example domain' in content.lower():
                main_html = html_file
                break

        assert main_html is not None, "Could not find main HTML file with example.com content"

        # Verify HTML content contains REAL example.com text
        html_content = main_html.read_text(errors='ignore')
        assert len(html_content) > 200, f"HTML content too short: {len(html_content)} bytes"
        assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML"
        assert ('this domain' in html_content.lower() or
                'illustrative examples' in html_content.lower()), \
            "Missing example.com description text"
        assert ('iana' in html_content.lower() or
                'more information' in html_content.lower()), \
            "Missing IANA reference"

        # Verify RESULT_JSON is present and valid
        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"

        for line in result.stdout.split('\n'):
            if line.startswith('RESULT_JSON='):
                result_json = json.loads(line.replace('RESULT_JSON=', ''))
                assert result_json['extractor'] == 'wget'
                assert result_json['status'] == 'succeeded'
                assert result_json['url'] == TEST_URL
                assert result_json['snapshot_id'] == 'test789'
                assert 'duration' in result_json
                assert result_json['duration'] >= 0
                break
|
||||
|
||||
|
||||
def test_config_save_wget_false_skips():
    """Test that SAVE_WGET=False causes skip."""

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set SAVE_WGET=False
        # (copy the full environment so PATH etc. stay intact)
        env = os.environ.copy()
        env['SAVE_WGET'] = 'False'

        result = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )

        # Should succeed but skip
        assert result.returncode == 0, f"Should exit 0 when skipping: {result.stderr}"
        assert 'STATUS=skipped' in result.stdout, "Should report skipped status"
        assert 'SAVE_WGET=False' in result.stdout, "Should mention SAVE_WGET=False"
|
||||
|
||||
|
||||
def test_config_save_warc():
    """Test that SAVE_WARC=True creates WARC files."""

    # Ensure wget is available
    if not shutil.which('wget'):
        pytest.skip("wget not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set SAVE_WARC=True explicitly
        env = os.environ.copy()
        env['SAVE_WARC'] = 'True'

        result = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testwarc'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=120
        )

        # Only assert on WARC output when the download itself succeeded,
        # so a flaky network doesn't fail this config check
        if result.returncode == 0:
            # Look for WARC files in warc/ subdirectory
            warc_dir = tmpdir / 'warc'
            if warc_dir.exists():
                warc_files = list(warc_dir.rglob('*'))
                warc_files = [f for f in warc_files if f.is_file()]
                assert len(warc_files) > 0, "WARC file not created when SAVE_WARC=True"
|
||||
|
||||
|
||||
def test_staticfile_present_skips():
    """Test that wget skips when staticfile already downloaded."""

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Create staticfile directory with content to simulate staticfile extractor ran
        staticfile_dir = tmpdir / 'staticfile'
        staticfile_dir.mkdir()
        (staticfile_dir / 'index.html').write_text('<html>test</html>')

        result = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'teststatic'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=30
        )

        # Should skip (exit 0 without attempting any download)
        assert result.returncode == 0, "Should exit 0 when skipping"
        assert 'STATUS=skipped' in result.stdout, "Should report skipped status"
        assert 'staticfile' in result.stdout.lower(), "Should mention staticfile"
|
||||
|
||||
|
||||
def test_handles_404_gracefully():
    """Test that wget fails gracefully on 404."""

    if not shutil.which('wget'):
        pytest.skip("wget not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Try to download non-existent page (network access required)
        result = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', 'https://example.com/nonexistent-page-404', '--snapshot-id', 'test404'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )

        # Should fail with a recognizable error rather than crash
        assert result.returncode != 0, "Should fail on 404"
        combined = result.stdout + result.stderr
        assert '404' in combined or 'Not Found' in combined or 'No files downloaded' in combined, \
            "Should report 404 or no files downloaded"
|
||||
|
||||
|
||||
def test_config_timeout_honored():
    """Test that WGET_TIMEOUT config is respected."""

    if not shutil.which('wget'):
        pytest.skip("wget not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set very short timeout
        env = os.environ.copy()
        env['WGET_TIMEOUT'] = '5'

        # This should still succeed for example.com (it's fast)
        result = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )

        # Verify it completed (success or fail, but didn't hang)
        # The 30s subprocess timeout above would raise if WGET_TIMEOUT were ignored
        assert result.returncode in (0, 1), "Should complete (success or fail)"
|
||||
|
||||
|
||||
def test_config_user_agent():
    """Test that WGET_USER_AGENT config is used."""

    if not shutil.which('wget'):
        pytest.skip("wget not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Set custom user agent
        env = os.environ.copy()
        env['WGET_USER_AGENT'] = 'TestBot/1.0'

        result = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=120
        )

        # Should succeed (example.com doesn't block)
        # Best-effort assertion: only check status when the run itself succeeded
        if result.returncode == 0:
            assert 'STATUS=succeeded' in result.stdout
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
Reference in New Issue
Block a user