rename archive_org to archivedotorg, add BinaryWorker, fix config pass-through

This commit is contained in:
Nick Sweeting
2026-01-04 22:38:15 -08:00
parent 456aaee287
commit 7ceaeae2d9
32 changed files with 1111 additions and 110 deletions

View File

@@ -0,0 +1,26 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"ARCHIVEDOTORG_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_ARCHIVEDOTORG", "USE_ARCHIVEDOTORG", "SUBMIT_ARCHIVEDOTORG"],
"description": "Submit URLs to archive.org Wayback Machine"
},
"ARCHIVEDOTORG_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Timeout for archive.org submission in seconds"
},
"ARCHIVEDOTORG_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string"
}
}
}

View File

@@ -0,0 +1,140 @@
#!/usr/bin/env python3
"""
Submit a URL to archive.org for archiving.
Usage: on_Snapshot__archivedotorg.py --url=<url> --snapshot-id=<uuid>
Output: Writes archive.org.txt to $PWD with the archived URL
Environment variables:
    ARCHIVEDOTORG_ENABLED: Set to False/0/no/off to skip the submission (default: True)
    ARCHIVEDOTORG_TIMEOUT: Timeout in seconds (default: 60)
    USER_AGENT: User agent string
    # Fallback to ARCHIVING_CONFIG values if ARCHIVEDOTORG_* not set:
    TIMEOUT: Fallback timeout
Note: This extractor uses the 'requests' library which is bundled with ArchiveBox.
It can run standalone if requests is installed: pip install requests
"""
import json
import os
import sys
from pathlib import Path
import rich_click as click
# Extractor metadata
# Plugin identifier. NOTE(review): not referenced by the code visible in this file —
# presumably read by the surrounding plugin system; confirm.
PLUGIN_NAME = 'archivedotorg'
# Output directory. NOTE(review): unused in the visible code; presumably relative to the
# hook's working directory — confirm.
OUTPUT_DIR = '.'
# File (relative to the current working directory) that submit_to_archivedotorg() writes
# the resulting URL into.
OUTPUT_FILE = 'archive.org.txt'
def get_env(name: str, default: str = '') -> str:
    """Return environment variable *name* with surrounding whitespace removed.

    When the variable is not set, *default* is used instead (and is stripped
    in the same way).
    """
    raw_value = os.environ.get(name, default)
    return raw_value.strip()
def get_env_int(name: str, default: int = 0) -> int:
    """Return environment variable *name* parsed as an integer.

    Falls back to *default* when the variable is unset or its value cannot be
    parsed by int().
    """
    raw_value = get_env(name, str(default))
    try:
        parsed = int(raw_value)
    except ValueError:
        # Unparseable (including empty) values fall back to the default.
        parsed = default
    return parsed
def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
    """
    Submit URL to archive.org Wayback Machine.

    Sends a GET request to https://web.archive.org/save/<url> (following
    redirects) and, on success, writes a URL to OUTPUT_FILE in the current
    working directory: the archived URL when one can be determined, otherwise
    the submit URL itself.

    Configuration is read from the environment:
        ARCHIVEDOTORG_TIMEOUT     request timeout in seconds; falls back to
                                  TIMEOUT, then 60
        ARCHIVEDOTORG_USER_AGENT  User-Agent header; falls back to USER_AGENT,
                                  then a built-in default

    Returns: (success, output_path, error_message)
        On success output_path is OUTPUT_FILE and error_message is ''.
        On failure output_path is None and error_message describes the error.
    """
    try:
        import requests
    except ImportError:
        return False, None, 'requests library not installed'

    timeout = get_env_int('ARCHIVEDOTORG_TIMEOUT') or get_env_int('TIMEOUT', 60)
    # Prefer the plugin-specific setting; an empty value means "not set" and
    # falls through to the global USER_AGENT, then to the built-in default.
    user_agent = (
        get_env('ARCHIVEDOTORG_USER_AGENT')
        or get_env('USER_AGENT')
        or 'Mozilla/5.0 (compatible; ArchiveBox/1.0)'
    )
    submit_url = f'https://web.archive.org/save/{url}'

    try:
        response = requests.get(
            submit_url,
            timeout=timeout,
            headers={'User-Agent': user_agent},
            allow_redirects=True,
        )

        content_location = response.headers.get('Content-Location', '')
        if content_location:
            # Header carries the path of the archived copy.
            archive_url = f'https://web.archive.org{content_location}'
        elif response.history and 'web.archive.org' in response.url:
            # We were redirected to an archive page. The request itself goes to
            # web.archive.org, so the host check alone would also match error
            # responses served directly from /save/; require a real redirect.
            archive_url = response.url
        elif 'RobotAccessControlException' in response.text:
            # Blocked by robots.txt - save submit URL for manual retry
            # (considered a soft success).
            archive_url = submit_url
        elif response.status_code >= 400:
            return False, None, f'HTTP {response.status_code}'
        else:
            # No archive URL could be determined; save submit URL anyway.
            archive_url = submit_url

        Path(OUTPUT_FILE).write_text(archive_url, encoding='utf-8')
        return True, OUTPUT_FILE, ''
    except requests.Timeout:
        return False, None, f'Request timed out after {timeout} seconds'
    except Exception as e:
        # Covers requests.RequestException as well as unexpected errors
        # (e.g. failure to write the output file); both are reported alike.
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to submit to archive.org')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Submit a URL to archive.org for archiving."""
    # Exit codes: 0 when the submission succeeded or the feature is disabled,
    # 1 on any error. A JSONL ArchiveResult record is printed to stdout only
    # on success; everything else goes to stderr.
    falsy_values = ('false', '0', 'no', 'off')
    enabled_setting = get_env('ARCHIVEDOTORG_ENABLED', 'True').lower()
    if enabled_setting in falsy_values:
        # Feature disabled via config: log the reason, emit no JSONL.
        print('Skipping archive.org submission (ARCHIVEDOTORG_ENABLED=False)', file=sys.stderr)
        sys.exit(0)

    try:
        success, output, error = submit_to_archivedotorg(url)

        if not success:
            # Transient error (network, timeout, HTTP error): emit no JSONL.
            print(f'ERROR: {error}', file=sys.stderr)
            sys.exit(1)

        # Success: emit an ArchiveResult record naming the output file.
        print(json.dumps({
            'type': 'ArchiveResult',
            'status': 'succeeded',
            'output_str': output or '',
        }))
        sys.exit(0)
    except Exception as e:
        # Unexpected error: treated like a transient one, emit no JSONL.
        # (sys.exit raises SystemExit, which is not caught here.)
        print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1 @@
🏛️

View File

@@ -0,0 +1,12 @@
{% load config_tags %}
{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
{% if enabled %}
<!-- Archive.org thumbnail - iframe preview of archived page -->
<div class="extractor-thumbnail archivedotorg-thumbnail" style="width: 100%; height: 100px; overflow: hidden;">
<iframe src="{{ output_path }}"
style="width: 100%; height: 100px; border: none; pointer-events: none;"
loading="lazy"
sandbox="allow-same-origin">
</iframe>
</div>
{% endif %}

View File

@@ -0,0 +1,93 @@
"""
Integration tests for archivedotorg plugin
Tests verify standalone archive.org extractor execution.
"""
import json
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
# Directory two levels above this test file (the parent of the directory containing it).
PLUGIN_DIR = Path(__file__).parent.parent
# First file in PLUGIN_DIR matching the hook naming pattern, or None when nothing matches.
ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archivedotorg.*'), None)
# URL passed to the hook via --url in every test below.
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """The plugin directory must contain an archivedotorg Snapshot hook script."""
    # The glob lookup yields None when nothing matches; check that first so a
    # missing hook fails with a readable assertion instead of AttributeError.
    assert ARCHIVEDOTORG_HOOK is not None, (
        f'No on_Snapshot__*_archivedotorg.* hook found in {PLUGIN_DIR}'
    )
    assert ARCHIVEDOTORG_HOOK.exists()
def test_submits_to_archivedotorg():
    """Run the hook against TEST_URL and check its exit code / JSONL contract.

    Exit 0 must come with a 'succeeded' ArchiveResult record on stdout;
    exit 1 (transient error) must come with no ArchiveResult and a message on
    stderr.
    """
    import os

    env = os.environ.copy()
    # Pin the settings the hook reads so an ambient ARCHIVEDOTORG_ENABLED=false
    # cannot turn this into an exit-0-without-JSONL run.
    env['ARCHIVEDOTORG_ENABLED'] = 'True'
    # Keep the hook's request timeout below the subprocess timeout so a slow
    # archive.org response normally surfaces as the handled exit-1 path
    # instead of subprocess.TimeoutExpired.
    env['ARCHIVEDOTORG_TIMEOUT'] = '30'

    with tempfile.TemporaryDirectory() as tmpdir:
        result = subprocess.run(
            [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir, capture_output=True, text=True, env=env, timeout=60
        )

    assert result.returncode in (0, 1)

    # Parse clean JSONL output: first stdout line that is an ArchiveResult record.
    result_json = None
    for line in result.stdout.strip().split('\n'):
        line = line.strip()
        if not line.startswith('{'):
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        if record.get('type') == 'ArchiveResult':
            result_json = record
            break

    if result.returncode == 0:
        # Success - should have ArchiveResult
        assert result_json, "Should have ArchiveResult JSONL output on success"
        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
    else:
        # Transient error - no JSONL output, just stderr
        assert not result_json, "Should NOT emit JSONL on transient error"
        assert result.stderr, "Should have error message in stderr"
def test_config_save_archivedotorg_false_skips():
    """With ARCHIVEDOTORG_ENABLED=False the hook exits 0, logs a skip, emits no JSONL."""
    with tempfile.TemporaryDirectory() as tmpdir:
        import os

        hook_env = os.environ.copy()
        hook_env['ARCHIVEDOTORG_ENABLED'] = 'False'
        command = [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999']

        result = subprocess.run(
            command,
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=hook_env,
            timeout=30,
        )

        # Disabled feature is not an error: exit code 0.
        assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"

        # The skip reason goes to stderr.
        skip_logged = 'Skipping' in result.stderr or 'False' in result.stderr
        assert skip_logged, "Should log skip reason to stderr"

        # Nothing resembling a JSON record may appear on stdout.
        stdout_lines = result.stdout.strip().split('\n')
        jsonl_lines = [line for line in stdout_lines if line.strip().startswith('{')]
        assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
def test_handles_timeout():
    """Run the hook with a 1-second TIMEOUT fallback and check it terminates cleanly.

    Either outcome is accepted (the request may still finish in time), but an
    exit code of 1 must not be accompanied by any JSONL on stdout.
    """
    import os

    env = os.environ.copy()
    # The hook prefers ARCHIVEDOTORG_TIMEOUT over TIMEOUT; drop any ambient
    # value so the 1-second TIMEOUT set here is the one actually used.
    env.pop('ARCHIVEDOTORG_TIMEOUT', None)
    env['TIMEOUT'] = '1'

    with tempfile.TemporaryDirectory() as tmpdir:
        result = subprocess.run(
            [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=tmpdir, capture_output=True, text=True, env=env, timeout=30
        )

    # Timeout is a transient error - should exit 1 with no JSONL
    assert result.returncode in (0, 1), "Should complete without hanging"

    # If it timed out (exit 1), should have no JSONL output
    if result.returncode == 1:
        jsonl_lines = [line for line in result.stdout.strip().split('\n')
                       if line.strip().startswith('{')]
        assert len(jsonl_lines) == 0, "Should not emit JSONL on timeout (transient error)"
# Allow running this module directly as a script: runs pytest verbosely on this file only.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])