#!/usr/bin/env python3
"""
Archive a URL using wget.

Usage: on_Snapshot__wget.py --url=<url> --snapshot-id=<uuid>
Output: Downloads files to $PWD

Environment variables:
    WGET_BINARY: Path to wget binary (optional, falls back to PATH)
    WGET_TIMEOUT: Timeout in seconds (default: 60)
    WGET_USER_AGENT: User agent string
    WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
    WGET_COOKIES_FILE: Path to cookies file (optional)
    WGET_RESTRICT_FILE_NAMES: Filename restriction mode (default: windows)
    WGET_EXTRA_ARGS: Extra arguments for wget (space-separated)

    # Wget feature toggles
    SAVE_WGET: Enable wget archiving (default: True)
    SAVE_WARC: Save WARC file (default: True)
    SAVE_WGET_REQUISITES: Download page requisites (default: True)

    # Fallback to ARCHIVING_CONFIG values if WGET_* not set:
    TIMEOUT: Fallback timeout
    USER_AGENT: Fallback user agent
    CHECK_SSL_VALIDITY: Fallback SSL check
    COOKIES_FILE: Fallback cookies file
    RESTRICT_FILE_NAMES: Fallback filename restriction
"""

import json
import os
import re
import shlex
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path

import rich_click as click

# Extractor metadata
EXTRACTOR_NAME = 'wget'
BIN_NAME = 'wget'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = '.'


def get_env(name: str, default: str = '') -> str:
    """Return environment variable *name* (or *default*) with whitespace stripped."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse environment variable *name* as a boolean flag.

    Recognizes true/1/yes/on and false/0/no/off (case-insensitive). Any other
    value, including an unset or empty variable, yields *default*.
    """
    val = get_env(name).lower()
    if val in ('true', '1', 'yes', 'on'):
        return True
    if val in ('false', '0', 'no', 'off'):
        return False
    return default


def get_env_int(name: str, default: int = 0) -> int:
    """Parse environment variable *name* as an int, or return *default*.

    *default* is returned both when the variable is unset and when its value
    is not a valid integer literal.
    """
    try:
        return int(get_env(name, str(default)))
    except ValueError:
        return default


STATICFILE_DIR = '../staticfile'


def has_staticfile_output() -> bool:
    """Check if staticfile extractor already downloaded this URL.

    Returns True only when STATICFILE_DIR is a directory containing at least
    one entry.
    """
    staticfile_dir = Path(STATICFILE_DIR)
    # is_dir() rather than exists(): iterdir() raises NotADirectoryError when
    # the path exists but is a regular file.
    return staticfile_dir.is_dir() and any(staticfile_dir.iterdir())


def find_wget() -> str | None:
    """Find wget binary.

    WGET_BINARY may be a path to an existing file or a command name that can
    be resolved on PATH. If it is unset or cannot be resolved, fall back to
    looking up ``wget`` on PATH. Returns None when nothing is found.
    """
    wget = get_env('WGET_BINARY')
    if wget:
        if os.path.isfile(wget):
            return wget
        # Also accept a bare command name such as "wget2".
        resolved = shutil.which(wget)
        if resolved:
            return resolved
    return shutil.which('wget')


def get_version(binary: str) -> str:
    """Get wget version.

    Returns the first line of ``<binary> --version`` output, truncated to 64
    characters, or an empty string if the command cannot be run.
    """
    try:
        result = subprocess.run(
            [binary, '--version'],
            capture_output=True,
            text=True,
            timeout=10,
        )
        return result.stdout.split('\n')[0].strip()[:64]
    except Exception:
        # Best effort: the version string is informational only.
        return ''


def check_wget_compression(binary: str) -> bool:
    """Check if wget supports --compression=auto.

    Runs ``<binary> --compression=auto --help`` and reports whether it exited
    with status 0. Any failure to run the probe counts as "not supported".
    """
    try:
        result = subprocess.run(
            [binary, '--compression=auto', '--help'],
            capture_output=True,
            timeout=5,
        )
        return result.returncode == 0
    except Exception:
        return False


# Default wget args (from old WGET_CONFIG)
WGET_DEFAULT_ARGS = [
    '--no-verbose',
    '--adjust-extension',
    '--convert-links',
    '--force-directories',
    '--backup-converted',
    '--span-hosts',
    '--no-parent',
    '-e', 'robots=off',
]


def _build_wget_cmd(url: str, binary: str, timeout: int) -> list[str]:
    """Assemble the wget argv for *url* from environment configuration.

    Side effect: creates the ``warc`` directory in the current working
    directory when SAVE_WARC is enabled.
    """
    # Get config from env (with WGET_ prefix or fallback to ARCHIVING_CONFIG style)
    user_agent = get_env('WGET_USER_AGENT') or get_env(
        'USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)'
    )
    check_ssl = get_env_bool(
        'WGET_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True)
    )
    cookies_file = get_env('WGET_COOKIES_FILE') or get_env('COOKIES_FILE', '')
    restrict_names = get_env('WGET_RESTRICT_FILE_NAMES') or get_env(
        'RESTRICT_FILE_NAMES', 'windows'
    )
    extra_args = get_env('WGET_EXTRA_ARGS', '')

    # Feature toggles
    save_warc = get_env_bool('SAVE_WARC', True)
    save_requisites = get_env_bool('SAVE_WGET_REQUISITES', True)

    # Check for compression support
    supports_compression = check_wget_compression(binary)

    # Build wget command (later options take precedence)
    cmd = [
        binary,
        *WGET_DEFAULT_ARGS,
        f'--timeout={timeout}',
        '--tries=2',
    ]

    if user_agent:
        cmd.append(f'--user-agent={user_agent}')

    if restrict_names:
        cmd.append(f'--restrict-file-names={restrict_names}')

    if save_requisites:
        cmd.append('--page-requisites')

    if save_warc:
        warc_dir = Path('warc')
        warc_dir.mkdir(exist_ok=True)
        # WARC file name is the current UTC epoch second.
        warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
        cmd.append(f'--warc-file={warc_path}')
    else:
        cmd.append('--timestamping')

    if cookies_file and Path(cookies_file).is_file():
        cmd.extend(['--load-cookies', cookies_file])

    if supports_compression:
        cmd.append('--compression=auto')

    if not check_ssl:
        cmd.extend(['--no-check-certificate', '--no-hsts'])

    if extra_args:
        try:
            # Shell-style splitting lets a quoted value keep its spaces,
            # e.g. --header="X-Token: a b".
            cmd.extend(shlex.split(extra_args))
        except ValueError:
            # Unbalanced quotes: fall back to a plain whitespace split.
            cmd.extend(extra_args.split())

    cmd.append(url)
    return cmd


def _explain_wget_failure(stderr: str, stdout: str) -> str:
    """Map wget output to a short error message when nothing was downloaded.

    This is a substring heuristic over the combined output, not a parse of
    wget's status line.
    """
    combined = stderr + stdout
    if '403' in combined or 'Forbidden' in combined:
        return '403 Forbidden (try changing USER_AGENT)'
    if '404' in combined or 'Not Found' in combined:
        return '404 Not Found'
    if '500' in combined:
        return '500 Internal Server Error'
    return f'No files downloaded: {stderr[:200]}'


def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Archive URL using wget.

    Runs *binary* in the current working directory, then scans that directory
    for downloaded files (ignoring ``.gitkeep`` and anything under ``warc/``).
    The reported output path is the first ``.html``/``.htm``/``.shtml`` file
    in sorted order, or the first downloaded file if there is none. wget's
    exit status is not consulted; success means at least one file is present.

    Returns: (success, output_path, error_message)
    """
    timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
    # Allow extra time for large downloads. A non-positive timeout means
    # "no limit" (wget treats --timeout=0 as disabled); passing 0 to
    # subprocess.run would make it expire immediately.
    run_timeout = timeout * 2 if timeout > 0 else None

    cmd = _build_wget_cmd(url, binary, timeout)

    # Run wget
    try:
        result = subprocess.run(cmd, capture_output=True, timeout=run_timeout)

        # Find downloaded files; sorted so the chosen output is deterministic.
        warc_dir = Path('warc')
        downloaded_files = sorted(
            f for f in Path('.').rglob('*')
            if f.is_file()
            and f.name != '.gitkeep'
            and warc_dir not in f.parents
        )

        if not downloaded_files:
            stderr = result.stderr.decode('utf-8', errors='replace')
            stdout = result.stdout.decode('utf-8', errors='replace')
            return False, None, _explain_wget_failure(stderr, stdout)

        # Find main HTML file
        html_files = [
            f for f in downloaded_files
            if re.search(r'\.[Ss]?[Hh][Tt][Mm][Ll]?$', str(f))
        ]
        output_path = str(html_files[0] if html_files else downloaded_files[0])
        return True, output_path, ''

    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {run_timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'


def _print_skipped(start_ts: datetime, url: str, snapshot_id: str) -> None:
    """Print the key=value lines and RESULT_JSON for a skipped run."""
    end_ts = datetime.now(timezone.utc)
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print('STATUS=skipped')
    print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')


@click.command()
@click.option('--url', required=True, help='URL to archive')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Archive a URL using wget."""
    # NOTE: the docstring above doubles as the click help text.
    # Exit status: 0 when the run succeeded or was skipped, 1 otherwise.
    # Results are printed as KEY=value lines plus a RESULT_JSON line.

    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    cmd_str = ''

    try:
        # Check if wget is enabled
        if not get_env_bool('SAVE_WGET', True):
            print('Skipping wget (SAVE_WGET=False)')
            _print_skipped(start_ts, url, snapshot_id)
            # SystemExit is not an Exception, so it passes the handler below.
            sys.exit(0)

        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
            print('Skipping wget - staticfile extractor already downloaded this')
            _print_skipped(start_ts, url, snapshot_id)
            sys.exit(0)  # Permanent skip - staticfile already handled

        # Find binary
        binary = find_wget()
        if not binary:
            print(f'ERROR: {BIN_NAME} binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            print('INSTALL_HINT=apt install wget OR brew install wget', file=sys.stderr)
            sys.exit(1)

        version = get_version(binary)
        cmd_str = f'{binary} ... {url}'

        # Run extraction
        success, output, error = save_wget(url, binary)
        status = 'succeeded' if success else 'failed'

        if success:
            # Count every file under the working directory (WARC files included)
            file_count = sum(1 for f in Path('.').rglob('*') if f.is_file())
            print(f'wget completed: {file_count} files downloaded')

    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()

    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if cmd_str:
        print(f'CMD={cmd_str}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')

    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()