Files
ArchiveBox/archivebox/plugins/git/on_Snapshot__62_git.py
Nick Sweeting 80f75126c6 more fixes
2025-12-29 21:03:05 -08:00

147 lines
4.0 KiB
Python

#!/usr/bin/env python3
"""
Clone a git repository from a URL.
Usage: on_Snapshot__git.py --url=<url> --snapshot-id=<uuid>
Output: Clones repository to $PWD/repo
Environment variables:
GIT_BINARY: Path to git binary
GIT_TIMEOUT: Timeout in seconds (default: 120)
GIT_ARGS: Default git arguments (JSON array, default: ["clone", "--depth=1", "--recursive"])
GIT_ARGS_EXTRA: Extra arguments to append (JSON array, default: [])
# Fallback to ARCHIVING_CONFIG values if GIT_* not set:
TIMEOUT: Fallback timeout
"""
import json
import os
import subprocess
import sys
from pathlib import Path
import rich_click as click
# Extractor metadata
PLUGIN_NAME = 'git'
BIN_NAME = 'git'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_int(name: str, default: int = 0) -> int:
try:
return int(get_env(name, str(default)))
except ValueError:
return default
def get_env_array(name: str, default: list[str] | None = None) -> list[str]:
"""Parse a JSON array from environment variable."""
val = get_env(name, '')
if not val:
return default if default is not None else []
try:
result = json.loads(val)
if isinstance(result, list):
return [str(item) for item in result]
return default if default is not None else []
except json.JSONDecodeError:
return default if default is not None else []
def is_git_url(url: str) -> bool:
"""Check if URL looks like a git repository."""
git_patterns = [
'.git',
'github.com',
'gitlab.com',
'bitbucket.org',
'git://',
'ssh://git@',
]
return any(p in url.lower() for p in git_patterns)
def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
Clone git repository.
Returns: (success, output_path, error_message)
"""
timeout = get_env_int('GIT_TIMEOUT') or get_env_int('TIMEOUT', 120)
git_args = get_env_array('GIT_ARGS', ["clone", "--depth=1", "--recursive"])
git_args_extra = get_env_array('GIT_ARGS_EXTRA', [])
cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR]
try:
result = subprocess.run(cmd, capture_output=True, timeout=timeout)
if result.returncode == 0 and Path(OUTPUT_DIR).is_dir():
return True, OUTPUT_DIR, ''
else:
stderr = result.stderr.decode('utf-8', errors='replace')
return False, None, f'git clone failed: {stderr[:200]}'
except subprocess.TimeoutExpired:
return False, None, f'Timed out after {timeout} seconds'
except Exception as e:
return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='Git repository URL')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
"""Clone a git repository from a URL."""
output = None
status = 'failed'
error = ''
try:
# Check if URL looks like a git repo
if not is_git_url(url):
print(f'Skipping git clone for non-git URL: {url}', file=sys.stderr)
print(json.dumps({
'type': 'ArchiveResult',
'status': 'skipped',
'output_str': 'Not a git URL',
}))
sys.exit(0)
# Get binary from environment
binary = get_env('GIT_BINARY', 'git')
# Run extraction
success, output, error = clone_git(url, binary)
status = 'succeeded' if success else 'failed'
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
if error:
print(f'ERROR: {error}', file=sys.stderr)
# Output clean JSONL (no RESULT_JSON= prefix)
result = {
'type': 'ArchiveResult',
'status': status,
'output_str': output or error or '',
}
print(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
main()