wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -0,0 +1,40 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_GIT": {
"type": "boolean",
"default": true,
"description": "Enable git repository cloning"
},
"GIT_BINARY": {
"type": "string",
"default": "git",
"description": "Path to git binary"
},
"GIT_TIMEOUT": {
"type": "integer",
"default": 120,
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Timeout for git operations in seconds"
},
"GIT_DOMAINS": {
"type": "string",
"default": "github.com,gitlab.com,bitbucket.org,gist.github.com,codeberg.org,gitea.com,git.sr.ht",
"description": "Comma-separated list of domains to treat as git repositories"
},
"GIT_CLONE_DEPTH": {
"type": "integer",
"default": 1,
"minimum": 0,
"description": "Depth of git clone (0 for full history, 1 for shallow)"
},
"GIT_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for git clone"
}
}
}

View File

@@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""
Validation hook for git binary.
Runs at crawl start to verify git is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
# git version string: "git version 2.43.0"
first_line = result.stdout.strip().split('\n')[0]
parts = first_line.split()
if len(parts) >= 3 and parts[0] == 'git':
return parts[2]
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_git() -> dict | None:
"""Find git binary."""
try:
from abx_pkg import Binary, EnvProvider
class GitBinary(Binary):
name: str = 'git'
binproviders_supported = [EnvProvider()]
binary = GitBinary()
loaded = binary.load()
if loaded and loaded.abspath:
return {
'name': 'git',
'abspath': str(loaded.abspath),
'version': str(loaded.version) if loaded.version else None,
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
}
except ImportError:
pass
except Exception:
pass
# Fallback to shutil.which
abspath = shutil.which('git') or os.environ.get('GIT_BINARY', '')
if abspath and Path(abspath).is_file():
return {
'name': 'git',
'abspath': abspath,
'version': get_binary_version(abspath),
'sha256': get_binary_hash(abspath),
'binprovider': 'env',
}
return None
def main():
result = find_git()
if result and result.get('abspath'):
print(json.dumps({
'type': 'InstalledBinary',
'name': result['name'],
'abspath': result['abspath'],
'version': result['version'],
'sha256': result['sha256'],
'binprovider': result['binprovider'],
}))
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/GIT_BINARY',
'value': result['abspath'],
}))
if result['version']:
print(json.dumps({
'type': 'Machine',
'_method': 'update',
'key': 'config/GIT_VERSION',
'value': result['version'],
}))
sys.exit(0)
else:
print(json.dumps({
'type': 'Dependency',
'bin_name': 'git',
'bin_providers': 'apt,brew,env',
}))
print(f"git binary not found", file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""
Clone a git repository from a URL.
Usage: on_Snapshot__git.py --url=<url> --snapshot-id=<uuid>
Output: Clones repository to $PWD/repo
Environment variables:
GIT_BINARY: Path to git binary
TIMEOUT: Timeout in seconds (default: 120)
GIT_ARGS: Extra arguments for git clone (space-separated)
"""
import json
import os
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'git'
BIN_NAME = 'git'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = 'repo'
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_int(name: str, default: int = 0) -> int:
try:
return int(get_env(name, str(default)))
except ValueError:
return default
def is_git_url(url: str) -> bool:
"""Check if URL looks like a git repository."""
git_patterns = [
'.git',
'github.com',
'gitlab.com',
'bitbucket.org',
'git://',
'ssh://git@',
]
return any(p in url.lower() for p in git_patterns)
def find_git() -> str | None:
"""Find git binary."""
git = get_env('GIT_BINARY')
if git and os.path.isfile(git):
return git
return shutil.which('git')
def get_version(binary: str) -> str:
"""Get git version."""
try:
result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
return result.stdout.strip()[:64]
except Exception:
return ''
def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
Clone git repository.
Returns: (success, output_path, error_message)
"""
timeout = get_env_int('TIMEOUT', 120)
extra_args = get_env('GIT_ARGS')
cmd = [
binary,
'clone',
'--depth=1',
'--recursive',
]
if extra_args:
cmd.extend(extra_args.split())
cmd.extend([url, OUTPUT_DIR])
try:
result = subprocess.run(cmd, capture_output=True, timeout=timeout)
if result.returncode == 0 and Path(OUTPUT_DIR).is_dir():
return True, OUTPUT_DIR, ''
else:
stderr = result.stderr.decode('utf-8', errors='replace')
return False, None, f'git clone failed: {stderr[:200]}'
except subprocess.TimeoutExpired:
return False, None, f'Timed out after {timeout} seconds'
except Exception as e:
return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='Git repository URL')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
"""Clone a git repository from a URL."""
start_ts = datetime.now(timezone.utc)
version = ''
output = None
status = 'failed'
error = ''
binary = None
try:
# Check if URL looks like a git repo
if not is_git_url(url):
print(f'Skipping git clone for non-git URL: {url}')
status = 'skipped'
end_ts = datetime.now(timezone.utc)
print(f'START_TS={start_ts.isoformat()}')
print(f'END_TS={end_ts.isoformat()}')
print(f'STATUS={status}')
print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url})}')
sys.exit(0)
# Find binary
binary = find_git()
if not binary:
print(f'ERROR: git binary not found', file=sys.stderr)
print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
sys.exit(1)
version = get_version(binary)
# Run extraction
success, output, error = clone_git(url, binary)
status = 'succeeded' if success else 'failed'
if success:
print(f'git clone completed')
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
# Print results
end_ts = datetime.now(timezone.utc)
duration = (end_ts - start_ts).total_seconds()
print(f'START_TS={start_ts.isoformat()}')
print(f'END_TS={end_ts.isoformat()}')
print(f'DURATION={duration:.2f}')
if binary:
print(f'CMD={binary} clone {url}')
if version:
print(f'VERSION={version}')
if output:
print(f'OUTPUT={output}')
print(f'STATUS={status}')
if error:
print(f'ERROR={error}', file=sys.stderr)
# Print JSON result
result_json = {
'extractor': EXTRACTOR_NAME,
'url': url,
'snapshot_id': snapshot_id,
'status': status,
'start_ts': start_ts.isoformat(),
'end_ts': end_ts.isoformat(),
'duration': round(duration, 2),
'cmd_version': version,
'output': output,
'error': error or None,
}
print(f'RESULT_JSON={json.dumps(result_json)}')
sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
main()