Files
ArchiveBox/archivebox/plugins/git/on_Snapshot__62_git.py
Claude 1b5a816022 Implement hook step-based concurrency system
This implements the hook concurrency plan from TODO_hook_concurrency.md:

## Schema Changes
- Add Snapshot.current_step (IntegerField 0-9, default=0)
- Create migration 0034_snapshot_current_step.py
- Fix uuid_compat imports in migrations 0032 and 0003

## Core Logic
- Add extract_step(hook_name) utility - extracts step from __XX_ pattern
- Add is_background_hook(hook_name) utility - checks for .bg. suffix
- Update Snapshot.create_pending_archiveresults() to create one AR per hook
- Update ArchiveResult.run() to handle hook_name field
- Add Snapshot.advance_step_if_ready() method for step advancement
- Integrate with SnapshotMachine.is_finished() to call advance_step_if_ready()

## Worker Coordination
- Update ArchiveResultWorker.get_queue() for step-based filtering
- ARs are only claimable when their step <= snapshot.current_step

## Hook Renumbering
- Step 5 (DOM extraction): singlefile→50, screenshot→51, pdf→52, dom→53,
  title→54, readability→55, headers→55, mercury→56, htmltotext→57
- Step 6 (post-DOM): wget→61, git→62, media→63.bg, gallerydl→64.bg,
  forumdl→65.bg, papersdl→66.bg
- Step 7 (URL extraction): parse_* hooks moved to 70-75

Background hooks (.bg suffix) don't block step advancement, enabling
long-running downloads to continue while other hooks proceed.
2025-12-28 13:47:25 +00:00

141 lines
3.4 KiB
Python

#!/usr/bin/env python3
"""
Clone a git repository from a URL.
Usage: on_Snapshot__git.py --url=<url> --snapshot-id=<uuid>
Output: Clones repository to $PWD/repo
Environment variables:
GIT_BINARY: Path to git binary
GIT_TIMEOUT: Timeout in seconds (default: 120)
GIT_ARGS: Extra arguments for git clone (space-separated)
# Fallback to ARCHIVING_CONFIG values if GIT_* not set:
TIMEOUT: Fallback timeout
"""
import json
import os
import subprocess
import sys
from pathlib import Path
import rich_click as click
# Extractor metadata
PLUGIN_NAME = 'git'
BIN_NAME = 'git'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_int(name: str, default: int = 0) -> int:
try:
return int(get_env(name, str(default)))
except ValueError:
return default
def is_git_url(url: str) -> bool:
"""Check if URL looks like a git repository."""
git_patterns = [
'.git',
'github.com',
'gitlab.com',
'bitbucket.org',
'git://',
'ssh://git@',
]
return any(p in url.lower() for p in git_patterns)
def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
"""
Clone git repository.
Returns: (success, output_path, error_message)
"""
timeout = get_env_int('GIT_TIMEOUT') or get_env_int('TIMEOUT', 120)
extra_args = get_env('GIT_ARGS')
cmd = [
binary,
'clone',
'--depth=1',
'--recursive',
]
if extra_args:
cmd.extend(extra_args.split())
cmd.extend([url, OUTPUT_DIR])
try:
result = subprocess.run(cmd, capture_output=True, timeout=timeout)
if result.returncode == 0 and Path(OUTPUT_DIR).is_dir():
return True, OUTPUT_DIR, ''
else:
stderr = result.stderr.decode('utf-8', errors='replace')
return False, None, f'git clone failed: {stderr[:200]}'
except subprocess.TimeoutExpired:
return False, None, f'Timed out after {timeout} seconds'
except Exception as e:
return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='Git repository URL')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
"""Clone a git repository from a URL."""
output = None
status = 'failed'
error = ''
try:
# Check if URL looks like a git repo
if not is_git_url(url):
print(f'Skipping git clone for non-git URL: {url}', file=sys.stderr)
print(json.dumps({
'type': 'ArchiveResult',
'status': 'skipped',
'output_str': 'Not a git URL',
}))
sys.exit(0)
# Get binary from environment
binary = get_env('GIT_BINARY', 'git')
# Run extraction
success, output, error = clone_git(url, binary)
status = 'succeeded' if success else 'failed'
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
if error:
print(f'ERROR: {error}', file=sys.stderr)
# Output clean JSONL (no RESULT_JSON= prefix)
result = {
'type': 'ArchiveResult',
'status': status,
'output_str': output or error or '',
}
print(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
main()