mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-04 06:47:57 +10:00
wip major changes
This commit is contained in:
40
archivebox/plugins/git/config.json
Normal file
40
archivebox/plugins/git/config.json
Normal file
@@ -0,0 +1,40 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_GIT": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable git repository cloning"
|
||||
},
|
||||
"GIT_BINARY": {
|
||||
"type": "string",
|
||||
"default": "git",
|
||||
"description": "Path to git binary"
|
||||
},
|
||||
"GIT_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 120,
|
||||
"minimum": 10,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for git operations in seconds"
|
||||
},
|
||||
"GIT_DOMAINS": {
|
||||
"type": "string",
|
||||
"default": "github.com,gitlab.com,bitbucket.org,gist.github.com,codeberg.org,gitea.com,git.sr.ht",
|
||||
"description": "Comma-separated list of domains to treat as git repositories"
|
||||
},
|
||||
"GIT_CLONE_DEPTH": {
|
||||
"type": "integer",
|
||||
"default": 1,
|
||||
"minimum": 0,
|
||||
"description": "Depth of git clone (0 for full history, 1 for shallow)"
|
||||
},
|
||||
"GIT_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra arguments for git clone"
|
||||
}
|
||||
}
|
||||
}
|
||||
126
archivebox/plugins/git/on_Crawl__00_validate_git.py
Normal file
126
archivebox/plugins/git/on_Crawl__00_validate_git.py
Normal file
@@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for git binary.
|
||||
|
||||
Runs at crawl start to verify git is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
# git version string: "git version 2.43.0"
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
parts = first_line.split()
|
||||
if len(parts) >= 3 and parts[0] == 'git':
|
||||
return parts[2]
|
||||
return first_line[:32]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_git() -> dict | None:
|
||||
"""Find git binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
class GitBinary(Binary):
|
||||
name: str = 'git'
|
||||
binproviders_supported = [EnvProvider()]
|
||||
|
||||
binary = GitBinary()
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'git',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('git') or os.environ.get('GIT_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'git',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
result = find_git()
|
||||
|
||||
if result and result.get('abspath'):
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': result['name'],
|
||||
'abspath': result['abspath'],
|
||||
'version': result['version'],
|
||||
'sha256': result['sha256'],
|
||||
'binprovider': result['binprovider'],
|
||||
}))
|
||||
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/GIT_BINARY',
|
||||
'value': result['abspath'],
|
||||
}))
|
||||
|
||||
if result['version']:
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/GIT_VERSION',
|
||||
'value': result['version'],
|
||||
}))
|
||||
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'git',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
print(f"git binary not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
193
archivebox/plugins/git/on_Snapshot__12_git.py
Normal file
193
archivebox/plugins/git/on_Snapshot__12_git.py
Normal file
@@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Clone a git repository from a URL.
|
||||
|
||||
Usage: on_Snapshot__git.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Clones repository to $PWD/repo
|
||||
|
||||
Environment variables:
|
||||
GIT_BINARY: Path to git binary
|
||||
TIMEOUT: Timeout in seconds (default: 120)
|
||||
GIT_ARGS: Extra arguments for git clone (space-separated)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'git'
|
||||
BIN_NAME = 'git'
|
||||
BIN_PROVIDERS = 'apt,brew,env'
|
||||
OUTPUT_DIR = 'repo'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
|
||||
try:
|
||||
return int(get_env(name, str(default)))
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
|
||||
def is_git_url(url: str) -> bool:
|
||||
"""Check if URL looks like a git repository."""
|
||||
git_patterns = [
|
||||
'.git',
|
||||
'github.com',
|
||||
'gitlab.com',
|
||||
'bitbucket.org',
|
||||
'git://',
|
||||
'ssh://git@',
|
||||
]
|
||||
return any(p in url.lower() for p in git_patterns)
|
||||
|
||||
|
||||
def find_git() -> str | None:
|
||||
"""Find git binary."""
|
||||
git = get_env('GIT_BINARY')
|
||||
if git and os.path.isfile(git):
|
||||
return git
|
||||
|
||||
return shutil.which('git')
|
||||
|
||||
|
||||
def get_version(binary: str) -> str:
|
||||
"""Get git version."""
|
||||
try:
|
||||
result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
|
||||
return result.stdout.strip()[:64]
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
|
||||
def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
"""
|
||||
Clone git repository.
|
||||
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
timeout = get_env_int('TIMEOUT', 120)
|
||||
extra_args = get_env('GIT_ARGS')
|
||||
|
||||
cmd = [
|
||||
binary,
|
||||
'clone',
|
||||
'--depth=1',
|
||||
'--recursive',
|
||||
]
|
||||
|
||||
if extra_args:
|
||||
cmd.extend(extra_args.split())
|
||||
|
||||
cmd.extend([url, OUTPUT_DIR])
|
||||
|
||||
try:
|
||||
result = subprocess.run(cmd, capture_output=True, timeout=timeout)
|
||||
|
||||
if result.returncode == 0 and Path(OUTPUT_DIR).is_dir():
|
||||
return True, OUTPUT_DIR, ''
|
||||
else:
|
||||
stderr = result.stderr.decode('utf-8', errors='replace')
|
||||
return False, None, f'git clone failed: {stderr[:200]}'
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
return False, None, f'Timed out after {timeout} seconds'
|
||||
except Exception as e:
|
||||
return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='Git repository URL')
|
||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||
def main(url: str, snapshot_id: str):
|
||||
"""Clone a git repository from a URL."""
|
||||
|
||||
start_ts = datetime.now(timezone.utc)
|
||||
version = ''
|
||||
output = None
|
||||
status = 'failed'
|
||||
error = ''
|
||||
binary = None
|
||||
|
||||
try:
|
||||
# Check if URL looks like a git repo
|
||||
if not is_git_url(url):
|
||||
print(f'Skipping git clone for non-git URL: {url}')
|
||||
status = 'skipped'
|
||||
end_ts = datetime.now(timezone.utc)
|
||||
print(f'START_TS={start_ts.isoformat()}')
|
||||
print(f'END_TS={end_ts.isoformat()}')
|
||||
print(f'STATUS={status}')
|
||||
print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url})}')
|
||||
sys.exit(0)
|
||||
|
||||
# Find binary
|
||||
binary = find_git()
|
||||
if not binary:
|
||||
print(f'ERROR: git binary not found', file=sys.stderr)
|
||||
print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
|
||||
print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
version = get_version(binary)
|
||||
|
||||
# Run extraction
|
||||
success, output, error = clone_git(url, binary)
|
||||
status = 'succeeded' if success else 'failed'
|
||||
|
||||
if success:
|
||||
print(f'git clone completed')
|
||||
|
||||
except Exception as e:
|
||||
error = f'{type(e).__name__}: {e}'
|
||||
status = 'failed'
|
||||
|
||||
# Print results
|
||||
end_ts = datetime.now(timezone.utc)
|
||||
duration = (end_ts - start_ts).total_seconds()
|
||||
|
||||
print(f'START_TS={start_ts.isoformat()}')
|
||||
print(f'END_TS={end_ts.isoformat()}')
|
||||
print(f'DURATION={duration:.2f}')
|
||||
if binary:
|
||||
print(f'CMD={binary} clone {url}')
|
||||
if version:
|
||||
print(f'VERSION={version}')
|
||||
if output:
|
||||
print(f'OUTPUT={output}')
|
||||
print(f'STATUS={status}')
|
||||
|
||||
if error:
|
||||
print(f'ERROR={error}', file=sys.stderr)
|
||||
|
||||
# Print JSON result
|
||||
result_json = {
|
||||
'extractor': EXTRACTOR_NAME,
|
||||
'url': url,
|
||||
'snapshot_id': snapshot_id,
|
||||
'status': status,
|
||||
'start_ts': start_ts.isoformat(),
|
||||
'end_ts': end_ts.isoformat(),
|
||||
'duration': round(duration, 2),
|
||||
'cmd_version': version,
|
||||
'output': output,
|
||||
'error': error or None,
|
||||
}
|
||||
print(f'RESULT_JSON={json.dumps(result_json)}')
|
||||
|
||||
sys.exit(0 if status == 'succeeded' else 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user