This commit is contained in:
Nick Sweeting
2026-01-21 03:19:56 -08:00
parent f3f55d3395
commit ec4b27056e
113 changed files with 6929 additions and 2396 deletions

View File

@@ -0,0 +1,20 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"HASHES_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_HASHES", "USE_HASHES"],
"description": "Enable merkle tree hash generation"
},
"HASHES_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for merkle tree generation in seconds"
}
}
}

View File

@@ -0,0 +1,185 @@
#!/usr/bin/env python3
"""
Create a hashed Merkle tree of all archived outputs.
This plugin runs after all extractors complete (priority 93) and generates
a cryptographic Merkle hash tree of all files in the snapshot directory.
Output: hashes.json containing root_hash, tree structure, file list, metadata
Usage: on_Snapshot__93_hashes.py --url=<url> --snapshot-id=<uuid>
Environment variables:
HASHES_ENABLED: Enable hash merkle tree generation (default: true)
DATA_DIR: ArchiveBox data directory
ARCHIVE_DIR: Archive output directory
"""
import os
import sys
import json
import hashlib
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple, Any
import click
def sha256_file(filepath: Path) -> str:
    """Return the SHA256 hex digest of *filepath*'s contents.

    Reads in 64 KiB chunks so large archive outputs are never loaded into
    memory at once. Unreadable files (deleted mid-walk, permission denied)
    yield an all-zeros sentinel digest instead of aborting the tree build.
    """
    h = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            while chunk := f.read(65536):
                h.update(chunk)
        return h.hexdigest()
    # PermissionError is a subclass of OSError, so OSError alone covers
    # both the permission and the file-missing/IO cases.
    except OSError:
        return '0' * 64
def sha256_data(data: bytes) -> str:
    """Return the SHA256 hex digest of the given bytes."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
    """Recursively collect (relative_path, sha256, size) for every regular file.

    Args:
        snapshot_dir: Root directory to walk.
        exclude_dirs: Directory names pruned from the walk; defaults to
            ['hashes', '.git', '__pycache__'] so the tree never hashes its
            own output, VCS metadata, or bytecode caches.

    Returns:
        List of (path relative to snapshot_dir, hex digest, byte size)
        tuples, sorted by path string so the Merkle tree is deterministic.
    """
    exclude_dirs = exclude_dirs or ['hashes', '.git', '__pycache__']
    files: List[Tuple[Path, str, int]] = []
    for root, dirs, filenames in os.walk(snapshot_dir):
        # Prune in place so os.walk never descends into excluded dirs.
        dirs[:] = [d for d in dirs if d not in exclude_dirs]
        for filename in filenames:
            filepath = Path(root) / filename
            rel_path = filepath.relative_to(snapshot_dir)
            # Skip symlinks: their targets may live outside the snapshot
            # dir or duplicate files already being hashed.
            if filepath.is_symlink():
                continue
            file_hash = sha256_file(filepath)
            # stat() directly instead of exists()-then-stat(): avoids the
            # TOCTOU race where the file vanishes between the two calls.
            try:
                file_size = filepath.stat().st_size
            except OSError:
                file_size = 0
            files.append((rel_path, file_hash, file_size))
    files.sort(key=lambda x: str(x[0]))
    return files
def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
    """Fold leaf hashes pairwise into a Merkle tree; return (root, levels).

    An odd node at the end of a level is paired with itself. An empty
    input yields the SHA256 of the empty byte string and one empty level.
    """
    if not file_hashes:
        return hashlib.sha256(b'').hexdigest(), [[]]
    levels: List[List[str]] = [list(file_hashes)]
    while len(levels[-1]) > 1:
        below = levels[-1]
        above: List[str] = []
        for idx in range(0, len(below), 2):
            left = below[idx]
            # Duplicate the last node when the level has an odd count.
            right = below[idx + 1] if idx + 1 < len(below) else left
            above.append(hashlib.sha256((left + right).encode('utf-8')).hexdigest())
        levels.append(above)
    return levels[-1][0], levels
def create_hashes(snapshot_dir: Path) -> Dict[str, Any]:
    """Hash every file under *snapshot_dir* and assemble the Merkle document.

    Returns the dict that is serialized to hashes.json: the tree root, all
    intermediate levels, a per-file manifest (path/hash/size), and metadata
    (UTC timestamp, file count, total size, tree depth).
    """
    entries = collect_files(snapshot_dir)
    leaf_hashes = [digest for _, digest, _ in entries]
    root_hash, tree_levels = build_merkle_tree(leaf_hashes)
    manifest = []
    total_size = 0
    for rel_path, digest, size in entries:
        manifest.append({'path': str(rel_path), 'hash': digest, 'size': size})
        total_size += size
    return {
        'root_hash': root_hash,
        'tree_levels': tree_levels,
        'files': manifest,
        'metadata': {
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'file_count': len(entries),
            'total_size': total_size,
            'tree_depth': len(tree_levels),
        },
    }
@click.command()
@click.option('--url', required=True, help='URL being archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Generate Merkle tree of all archived outputs.

    Runs from inside the extractor output dir (<snapshot>/hashes/), hashes
    everything in the parent snapshot dir, writes hashes.json, and always
    emits a one-line JSON result for the hook runner on stdout.
    Exit code: 0 on success or skip, 1 on failure.
    """
    status = 'failed'
    output = None
    error = ''
    root_hash = None
    file_count = 0
    try:
        # Check if enabled; fall back through the aliases declared in the
        # plugin's config schema (x-aliases: SAVE_HASHES, USE_HASHES) for
        # callers that still export the old variable names.
        raw = (
            os.getenv('HASHES_ENABLED')
            or os.getenv('SAVE_HASHES')
            or os.getenv('USE_HASHES')
            or 'true'
        )
        if raw.lower() not in ('true', '1', 'yes', 'on'):
            status = 'skipped'
            click.echo(json.dumps({'status': status, 'output': 'HASHES_ENABLED=false'}))
            # SystemExit is not an Exception subclass, so this is not
            # swallowed by the handler below.
            sys.exit(0)
        # Working directory is the extractor output dir (e.g., <snapshot>/hashes/)
        # Parent is the snapshot directory
        output_dir = Path.cwd()
        snapshot_dir = output_dir.parent
        if not snapshot_dir.exists():
            raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')
        # Ensure output directory exists
        output_dir.mkdir(exist_ok=True)
        output_path = output_dir / 'hashes.json'
        # Generate Merkle tree
        merkle_data = create_hashes(snapshot_dir)
        # Write output
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(merkle_data, f, indent=2)
        status = 'succeeded'
        output = 'hashes.json'
        root_hash = merkle_data['root_hash']
        file_count = merkle_data['metadata']['file_count']
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
        click.echo(f'Error: {error}', err=True)
    # Print JSON result for hook runner
    result = {
        'status': status,
        'output': output,
        'error': error or None,
        'root_hash': root_hash,
        'file_count': file_count,
    }
    click.echo(json.dumps(result))
    sys.exit(0 if status in ('succeeded', 'skipped') else 1)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1 @@
<span class="abx-output-icon abx-output-icon--hashes" title="Authenticity Hashes"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="5" r="2"/><circle cx="6" cy="18" r="2"/><circle cx="18" cy="18" r="2"/><path d="M12 7v6"/><path d="M12 13l-4 3"/><path d="M12 13l4 3"/></svg></span>

View File

@@ -0,0 +1,157 @@
"""
Tests for the hashes plugin.
Tests the real merkle tree generation with actual files.
"""
import json
import os
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
from django.test import TestCase
# Get the path to the hashes hook
PLUGIN_DIR = Path(__file__).parent.parent
HASHES_HOOK = PLUGIN_DIR / 'on_Snapshot__93_hashes.py'
class TestHashesPlugin(TestCase):
    """Exercise the hashes hook end-to-end by running it as a subprocess."""

    @staticmethod
    def _make_snapshot(temp_dir):
        """Create <temp>/snapshot with its hashes/ output subdir; return both."""
        snapshot_dir = Path(temp_dir) / 'snapshot'
        snapshot_dir.mkdir()
        output_dir = snapshot_dir / 'hashes'
        output_dir.mkdir()
        return snapshot_dir, output_dir

    @staticmethod
    def _run_hook(output_dir, enabled):
        """Invoke the hook from *output_dir* with HASHES_ENABLED=*enabled*."""
        env = os.environ.copy()
        env['HASHES_ENABLED'] = enabled
        return subprocess.run(
            [
                sys.executable, str(HASHES_HOOK),
                '--url=https://example.com',
                '--snapshot-id=test-snapshot',
            ],
            capture_output=True,
            text=True,
            cwd=str(output_dir),  # Hook expects to run from its output dir
            env=env,
            timeout=30,
        )

    def test_hashes_hook_exists(self):
        """Hashes hook script should exist."""
        self.assertTrue(HASHES_HOOK.exists(), f"Hook not found: {HASHES_HOOK}")

    def test_hashes_generates_tree_for_files(self):
        """Hashes hook should generate merkle tree for files in snapshot directory."""
        with tempfile.TemporaryDirectory() as temp_dir:
            snapshot_dir, output_dir = self._make_snapshot(temp_dir)
            # Seed the snapshot with a few representative extractor outputs
            (snapshot_dir / 'index.html').write_text('<html><body>Test</body></html>')
            (snapshot_dir / 'screenshot.png').write_bytes(b'\x89PNG\r\n\x1a\n' + b'\x00' * 100)
            media_dir = snapshot_dir / 'media'
            media_dir.mkdir()
            (media_dir / 'video.mp4').write_bytes(b'\x00\x00\x00\x18ftypmp42')

            result = self._run_hook(output_dir, 'true')
            self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")

            # Output file should exist and parse as the expected document
            output_file = output_dir / 'hashes.json'
            self.assertTrue(output_file.exists(), "hashes.json not created")
            with open(output_file) as f:
                data = json.load(f)
            self.assertIn('root_hash', data)
            self.assertIn('files', data)
            self.assertIn('metadata', data)
            # All seeded files should be indexed
            file_paths = [entry['path'] for entry in data['files']]
            self.assertIn('index.html', file_paths)
            self.assertIn('screenshot.png', file_paths)
            # Metadata counters should reflect the seeded content
            self.assertGreater(data['metadata']['file_count'], 0)
            self.assertGreater(data['metadata']['total_size'], 0)

    def test_hashes_skips_when_disabled(self):
        """Hashes hook should skip when HASHES_ENABLED=false."""
        with tempfile.TemporaryDirectory() as temp_dir:
            _snapshot_dir, output_dir = self._make_snapshot(temp_dir)
            result = self._run_hook(output_dir, 'false')
            # Should succeed (exit 0) but report skipped
            self.assertEqual(result.returncode, 0)
            self.assertIn('skipped', result.stdout)

    def test_hashes_handles_empty_directory(self):
        """Hashes hook should handle empty snapshot directory."""
        with tempfile.TemporaryDirectory() as temp_dir:
            _snapshot_dir, output_dir = self._make_snapshot(temp_dir)
            result = self._run_hook(output_dir, 'true')
            # Should succeed even with nothing to hash
            self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")
            output_file = output_dir / 'hashes.json'
            self.assertTrue(output_file.exists())
            with open(output_file) as f:
                data = json.load(f)
            # Should record an empty file list
            self.assertEqual(data['metadata']['file_count'], 0)
# Allow running this test module directly, without the Django test runner.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])