mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
wip
This commit is contained in:
20
archivebox/plugins/hashes/config.json
Normal file
20
archivebox/plugins/hashes/config.json
Normal file
@@ -0,0 +1,20 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"HASHES_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_HASHES", "USE_HASHES"],
|
||||
"description": "Enable merkle tree hash generation"
|
||||
},
|
||||
"HASHES_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 30,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for merkle tree generation in seconds"
|
||||
}
|
||||
}
|
||||
}
|
||||
185
archivebox/plugins/hashes/on_Snapshot__93_hashes.py
Executable file
185
archivebox/plugins/hashes/on_Snapshot__93_hashes.py
Executable file
@@ -0,0 +1,185 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create a hashed Merkle tree of all archived outputs.
|
||||
|
||||
This plugin runs after all extractors complete (priority 93) and generates
|
||||
a cryptographic Merkle hash tree of all files in the snapshot directory.
|
||||
|
||||
Output: hashes.json containing root_hash, tree structure, file list, metadata
|
||||
|
||||
Usage: on_Snapshot__93_hashes.py --url=<url> --snapshot-id=<uuid>
|
||||
|
||||
Environment variables:
|
||||
HASHES_ENABLED: Enable hash merkle tree generation (default: true; aliases: SAVE_HASHES, USE_HASHES)
|
||||
DATA_DIR: ArchiveBox data directory
|
||||
ARCHIVE_DIR: Archive output directory
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Optional, Tuple, Any
|
||||
|
||||
import click
|
||||
|
||||
|
||||
def sha256_file(filepath: Path) -> str:
    """Compute the SHA256 hex digest of a file's contents.

    Reads in 64 KiB chunks so arbitrarily large archive outputs use
    constant memory.

    Args:
        filepath: Path of the file to hash.

    Returns:
        The 64-char hex digest, or a sentinel of 64 zeros when the file
        cannot be read (deleted mid-walk, permission denied), so callers
        keep a stable leaf count for the Merkle tree.
    """
    h = hashlib.sha256()
    try:
        with open(filepath, 'rb') as f:
            while chunk := f.read(65536):
                h.update(chunk)
        return h.hexdigest()
    except OSError:
        # OSError covers PermissionError and FileNotFoundError; the original
        # (OSError, PermissionError) tuple was redundant.
        return '0' * 64
|
||||
|
||||
|
||||
def sha256_data(data: bytes) -> str:
    """Return the SHA256 hex digest of *data*."""
    digest = hashlib.sha256(data)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
    """Recursively collect (relative_path, sha256, size) for every regular file.

    Args:
        snapshot_dir: Directory tree to walk.
        exclude_dirs: Directory names pruned from the walk. Defaults to the
            plugin's own output dir ('hashes') plus VCS/cache dirs, so the
            manifest never hashes itself.

    Returns:
        List of (path relative to snapshot_dir, hex digest, size in bytes)
        tuples, sorted by path so the resulting Merkle root is deterministic.
    """
    exclude_dirs = exclude_dirs or ['hashes', '.git', '__pycache__']
    files = []

    for root, dirs, filenames in os.walk(snapshot_dir):
        # Prune in place so os.walk never descends into excluded dirs.
        dirs[:] = [d for d in dirs if d not in exclude_dirs]

        for filename in filenames:
            filepath = Path(root) / filename
            rel_path = filepath.relative_to(snapshot_dir)

            # Skip symlinks: following them could double-count files or
            # escape the snapshot directory.
            if filepath.is_symlink():
                continue

            file_hash = sha256_file(filepath)
            try:
                # Single stat() instead of exists()+stat(): the original pair
                # raced with files vanishing mid-archive between the two calls.
                file_size = filepath.stat().st_size
            except OSError:
                file_size = 0
            files.append((rel_path, file_hash, file_size))

    # Stable ordering => stable leaf order => reproducible root hash.
    files.sort(key=lambda x: str(x[0]))
    return files
|
||||
|
||||
|
||||
def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
    """Fold a list of leaf hashes into a Merkle tree.

    Each level pairs adjacent hashes and hashes their concatenation; an odd
    node at the end of a level is paired with itself. An empty leaf list
    yields the hash of the empty byte string and a single empty level.

    Args:
        file_hashes: Leaf-level hex digests (input list is not mutated).

    Returns:
        (root hash, levels from leaves up to the root).
    """
    if not file_hashes:
        return sha256_data(b''), [[]]

    levels = [list(file_hashes)]

    while len(levels[-1]) > 1:
        below = levels[-1]
        above = []
        for idx in range(0, len(below), 2):
            left = below[idx]
            # Duplicate the left node when the level has an odd tail.
            right = below[idx + 1] if idx + 1 < len(below) else left
            above.append(sha256_data((left + right).encode('utf-8')))
        levels.append(above)

    return levels[-1][0], levels
|
||||
|
||||
|
||||
def create_hashes(snapshot_dir: Path) -> Dict[str, Any]:
    """Hash every file under *snapshot_dir* and assemble the Merkle manifest.

    Args:
        snapshot_dir: Snapshot directory to index.

    Returns:
        A JSON-serializable dict with 'root_hash', the full 'tree_levels',
        the per-file 'files' entries, and summary 'metadata' (UTC timestamp,
        file count, total size, tree depth).
    """
    entries = collect_files(snapshot_dir)
    leaf_hashes = [digest for _, digest, _ in entries]
    root_hash, tree_levels = build_merkle_tree(leaf_hashes)

    manifest_files = []
    total_size = 0
    for rel_path, digest, size in entries:
        total_size += size
        manifest_files.append({'path': str(rel_path), 'hash': digest, 'size': size})

    return {
        'root_hash': root_hash,
        'tree_levels': tree_levels,
        'files': manifest_files,
        'metadata': {
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'file_count': len(entries),
            'total_size': total_size,
            'tree_depth': len(tree_levels),
        },
    }
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL being archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Generate Merkle tree of all archived outputs."""
    status = 'failed'
    output = None
    error = ''
    root_hash = None
    file_count = 0

    try:
        # Feature flag: accept the usual truthy spellings.
        enabled = os.getenv('HASHES_ENABLED', 'true').lower() in ('true', '1', 'yes', 'on')
        if not enabled:
            status = 'skipped'
            click.echo(json.dumps({'status': status, 'output': 'HASHES_ENABLED=false'}))
            # SystemExit is not caught by the except Exception below.
            sys.exit(0)

        # The hook runs from its extractor output dir (<snapshot>/hashes/);
        # its parent is the snapshot directory to be hashed.
        output_dir = Path.cwd()
        snapshot_dir = output_dir.parent

        if not snapshot_dir.exists():
            raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')

        output_dir.mkdir(exist_ok=True)
        output_path = output_dir / 'hashes.json'

        merkle_data = create_hashes(snapshot_dir)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(merkle_data, f, indent=2)

        status = 'succeeded'
        output = 'hashes.json'
        root_hash = merkle_data['root_hash']
        file_count = merkle_data['metadata']['file_count']

    except Exception as e:
        status = 'failed'
        error = f'{type(e).__name__}: {e}'
        click.echo(f'Error: {error}', err=True)

    # Emit a machine-readable result for the hook runner on stdout.
    result = {
        'status': status,
        'output': output,
        'error': error or None,
        'root_hash': root_hash,
        'file_count': file_count,
    }
    click.echo(json.dumps(result))

    sys.exit(0 if status in ('succeeded', 'skipped') else 1)
|
||||
|
||||
|
||||
# Entry point: click parses CLI args and invokes the command.
if __name__ == '__main__':
    main()
|
||||
1
archivebox/plugins/hashes/templates/icon.html
Normal file
1
archivebox/plugins/hashes/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
<span class="abx-output-icon abx-output-icon--hashes" title="Authenticity Hashes"><svg width="16" height="16" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="1.8" stroke-linecap="round" stroke-linejoin="round"><circle cx="12" cy="5" r="2"/><circle cx="6" cy="18" r="2"/><circle cx="18" cy="18" r="2"/><path d="M12 7v6"/><path d="M12 13l-4 3"/><path d="M12 13l4 3"/></svg></span>
|
||||
157
archivebox/plugins/hashes/tests/test_hashes.py
Normal file
157
archivebox/plugins/hashes/tests/test_hashes.py
Normal file
@@ -0,0 +1,157 @@
|
||||
"""
|
||||
Tests for the hashes plugin.
|
||||
|
||||
Tests the real merkle tree generation with actual files.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from django.test import TestCase
|
||||
|
||||
|
||||
# Get the path to the hashes hook under test
# (plugin root is two levels up from tests/).
PLUGIN_DIR = Path(__file__).parent.parent
HASHES_HOOK = PLUGIN_DIR / 'on_Snapshot__93_hashes.py'
|
||||
|
||||
|
||||
class TestHashesPlugin(TestCase):
    """Test the hashes plugin.

    Each test invokes the real hook script as a subprocess from a temporary
    mock snapshot layout, mirroring how the hook runner executes it.
    """

    def test_hashes_hook_exists(self):
        """Hashes hook script should exist."""
        self.assertTrue(HASHES_HOOK.exists(), f"Hook not found: {HASHES_HOOK}")

    def test_hashes_generates_tree_for_files(self):
        """Hashes hook should generate merkle tree for files in snapshot directory."""
        with tempfile.TemporaryDirectory() as temp_dir:
            # Create a mock snapshot directory structure
            snapshot_dir = Path(temp_dir) / 'snapshot'
            snapshot_dir.mkdir()

            # Create output directory for hashes
            output_dir = snapshot_dir / 'hashes'
            output_dir.mkdir()

            # Create some test files (text, binary, and one in a subdirectory)
            (snapshot_dir / 'index.html').write_text('<html><body>Test</body></html>')
            (snapshot_dir / 'screenshot.png').write_bytes(b'\x89PNG\r\n\x1a\n' + b'\x00' * 100)

            subdir = snapshot_dir / 'media'
            subdir.mkdir()
            (subdir / 'video.mp4').write_bytes(b'\x00\x00\x00\x18ftypmp42')

            # Run the hook from the output directory
            env = os.environ.copy()
            env['HASHES_ENABLED'] = 'true'

            result = subprocess.run(
                [
                    sys.executable, str(HASHES_HOOK),
                    '--url=https://example.com',
                    '--snapshot-id=test-snapshot',
                ],
                capture_output=True,
                text=True,
                cwd=str(output_dir),  # Hook expects to run from output dir
                env=env,
                timeout=30
            )

            # Should succeed
            self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")

            # Check output file exists
            output_file = output_dir / 'hashes.json'
            self.assertTrue(output_file.exists(), "hashes.json not created")

            # Parse and verify output
            with open(output_file) as f:
                data = json.load(f)

            self.assertIn('root_hash', data)
            self.assertIn('files', data)
            self.assertIn('metadata', data)

            # Should have indexed our test files
            file_paths = [f['path'] for f in data['files']]
            self.assertIn('index.html', file_paths)
            self.assertIn('screenshot.png', file_paths)

            # Verify metadata
            self.assertGreater(data['metadata']['file_count'], 0)
            self.assertGreater(data['metadata']['total_size'], 0)

    def test_hashes_skips_when_disabled(self):
        """Hashes hook should skip when HASHES_ENABLED=false."""
        with tempfile.TemporaryDirectory() as temp_dir:
            snapshot_dir = Path(temp_dir) / 'snapshot'
            snapshot_dir.mkdir()
            output_dir = snapshot_dir / 'hashes'
            output_dir.mkdir()

            env = os.environ.copy()
            env['HASHES_ENABLED'] = 'false'

            result = subprocess.run(
                [
                    sys.executable, str(HASHES_HOOK),
                    '--url=https://example.com',
                    '--snapshot-id=test-snapshot',
                ],
                capture_output=True,
                text=True,
                cwd=str(output_dir),
                env=env,
                timeout=30
            )

            # Should succeed (exit 0) but skip
            self.assertEqual(result.returncode, 0)
            self.assertIn('skipped', result.stdout)

    def test_hashes_handles_empty_directory(self):
        """Hashes hook should handle empty snapshot directory."""
        with tempfile.TemporaryDirectory() as temp_dir:
            snapshot_dir = Path(temp_dir) / 'snapshot'
            snapshot_dir.mkdir()
            output_dir = snapshot_dir / 'hashes'
            output_dir.mkdir()

            env = os.environ.copy()
            env['HASHES_ENABLED'] = 'true'

            result = subprocess.run(
                [
                    sys.executable, str(HASHES_HOOK),
                    '--url=https://example.com',
                    '--snapshot-id=test-snapshot',
                ],
                capture_output=True,
                text=True,
                cwd=str(output_dir),
                env=env,
                timeout=30
            )

            # Should succeed even with empty directory
            self.assertEqual(result.returncode, 0, f"Hook failed: {result.stderr}")

            # Check output file exists
            output_file = output_dir / 'hashes.json'
            self.assertTrue(output_file.exists())

            with open(output_file) as f:
                data = json.load(f)

            # Should have empty file list (the 'hashes' output dir is excluded
            # from the walk, so nothing is indexed)
            self.assertEqual(data['metadata']['file_count'], 0)
|
||||
|
||||
|
||||
# Allow running this test module directly without a Django test runner.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
Reference in New Issue
Block a user