mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
rename archive_org to archivedotorg, add BinaryWorker, fix config pass-through
This commit is contained in:
26
archivebox/plugins/archivedotorg/config.json
Normal file
26
archivebox/plugins/archivedotorg/config.json
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"ARCHIVEDOTORG_ENABLED": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SAVE_ARCHIVEDOTORG", "USE_ARCHIVEDOTORG", "SUBMIT_ARCHIVEDOTORG"],
|
||||
"description": "Submit URLs to archive.org Wayback Machine"
|
||||
},
|
||||
"ARCHIVEDOTORG_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"minimum": 10,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for archive.org submission in seconds"
|
||||
},
|
||||
"ARCHIVEDOTORG_USER_AGENT": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "USER_AGENT",
|
||||
"description": "User agent string"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Submit a URL to archive.org for archiving.
|
||||
|
||||
Usage: on_Snapshot__archivedotorg.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Writes archive.org.txt to $PWD with the archived URL
|
||||
|
||||
Environment variables:
    ARCHIVEDOTORG_ENABLED: Set to false/0/no/off to skip submission (default: True)
    ARCHIVEDOTORG_TIMEOUT: Timeout in seconds (default: 60)
    USER_AGENT: User agent string sent with the submission request

    # Fallback to ARCHIVING_CONFIG values if ARCHIVEDOTORG_* not set:
    TIMEOUT: Fallback timeout in seconds
|
||||
|
||||
Note: This extractor uses the 'requests' library which is bundled with ArchiveBox.
|
||||
It can run standalone if requests is installed: pip install requests
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
# Identifier of this plugin.
PLUGIN_NAME = 'archivedotorg'
# Output directory; '.' refers to the current working directory.
OUTPUT_DIR = '.'
# Name of the text file that receives the archived (or submit) URL.
OUTPUT_FILE = 'archive.org.txt'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Return environment variable *name* with surrounding whitespace removed.

    When the variable is unset, *default* is used instead (and is stripped
    the same way).
    """
    raw_value = os.getenv(name, default)
    return raw_value.strip()
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
    """Read environment variable *name* as an integer.

    Returns *default* when the variable is unset or its stripped value
    cannot be parsed by ``int()``.
    """
    text = get_env(name, str(default))
    try:
        parsed = int(text)
    except ValueError:
        # Non-numeric (or empty) value: fall back to the caller's default
        parsed = default
    return parsed
|
||||
|
||||
|
||||
def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
    """
    Submit URL to archive.org Wayback Machine.

    Issues a GET to ``https://web.archive.org/save/<url>`` (following
    redirects) and writes the resulting archive URL, or the submit URL for a
    manual retry, to OUTPUT_FILE in the current working directory.

    Configuration is read from the environment:
        ARCHIVEDOTORG_TIMEOUT, then TIMEOUT, then 60 seconds.
        ARCHIVEDOTORG_USER_AGENT, then USER_AGENT, then a built-in
        ArchiveBox user agent.

    Returns: (success, output_path, error_message)
    """
    try:
        import requests
    except ImportError:
        return False, None, 'requests library not installed'

    from urllib.parse import urlsplit

    # A zero/unparseable value at any level falls through to the next one so
    # the request never runs with a timeout of 0.
    timeout = get_env_int('ARCHIVEDOTORG_TIMEOUT') or get_env_int('TIMEOUT', 60) or 60
    # Honour the plugin-specific setting first; empty values fall through so
    # we never send an empty User-Agent header.
    user_agent = (
        get_env('ARCHIVEDOTORG_USER_AGENT')
        or get_env('USER_AGENT')
        or 'Mozilla/5.0 (compatible; ArchiveBox/1.0)'
    )

    submit_url = f'https://web.archive.org/save/{url}'

    try:
        response = requests.get(
            submit_url,
            timeout=timeout,
            headers={'User-Agent': user_agent},
            allow_redirects=True,
        )

        # Explicit snapshot location reported by the server
        content_location = response.headers.get('Content-Location', '')
        if content_location:
            archive_url = f'https://web.archive.org{content_location}'
            Path(OUTPUT_FILE).write_text(archive_url, encoding='utf-8')
            return True, OUTPUT_FILE, ''

        # We were redirected to an archive page. The submit URL itself lives
        # on web.archive.org, so a host substring check is always true; only
        # a final URL under /web/ counts as a snapshot.
        final_url = urlsplit(response.url)
        if final_url.hostname == 'web.archive.org' and final_url.path.startswith('/web/'):
            Path(OUTPUT_FILE).write_text(response.url, encoding='utf-8')
            return True, OUTPUT_FILE, ''

        # Check for errors in response
        if 'RobotAccessControlException' in response.text:
            # Blocked by robots.txt - save submit URL for manual retry
            Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
            return True, OUTPUT_FILE, ''  # Consider this a soft success

        if response.status_code >= 400:
            return False, None, f'HTTP {response.status_code}'

        # No snapshot URL and no error: save submit URL anyway
        Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
        return True, OUTPUT_FILE, ''

    except requests.Timeout:
        return False, None, f'Request timed out after {timeout} seconds'
    except Exception as e:
        # Covers requests.RequestException as well as file-write and other
        # unexpected errors; all are reported to the caller as failures.
        return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL to submit to archive.org')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Submit a URL to archive.org for archiving."""

    # Feature toggle: any of these (case-insensitive) values disables the hook
    disabled_values = ('false', '0', 'no', 'off')
    enabled_setting = get_env('ARCHIVEDOTORG_ENABLED', 'True').lower()
    if enabled_setting in disabled_values:
        # Disabled by config: log to stderr, emit no JSONL, exit cleanly
        print('Skipping archive.org submission (ARCHIVEDOTORG_ENABLED=False)', file=sys.stderr)
        sys.exit(0)

    try:
        success, output, error = submit_to_archivedotorg(url)

        if not success:
            # Transient failure (network, timeout, HTTP error): no JSONL is
            # emitted so the system can retry later
            print(f'ERROR: {error}', file=sys.stderr)
            sys.exit(1)

        # Success: emit a single ArchiveResult record naming the output file
        record = {
            'type': 'ArchiveResult',
            'status': 'succeeded',
            'output_str': output or '',
        }
        print(json.dumps(record))
        sys.exit(0)

    except Exception as e:
        # Unexpected error: treated like a transient failure, no JSONL
        # (sys.exit raises SystemExit, which is not caught here)
        print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
1
archivebox/plugins/archivedotorg/templates/icon.html
Normal file
1
archivebox/plugins/archivedotorg/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
🏛️
|
||||
12
archivebox/plugins/archivedotorg/templates/thumbnail.html
Normal file
12
archivebox/plugins/archivedotorg/templates/thumbnail.html
Normal file
@@ -0,0 +1,12 @@
|
||||
{% load config_tags %}
{# Thumbnail for the archivedotorg plugin; renders only when ARCHIVEDOTORG_ENABLED is truthy. #}
{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
{% if enabled %}
<!-- Archive.org thumbnail - iframe preview of archived page -->
<div class="extractor-thumbnail archivedotorg-thumbnail" style="width: 100%; height: 100px; overflow: hidden;">
    {# pointer-events: none keeps the small preview from capturing clicks or scrolling. #}
    <iframe src="{{ output_path }}"
            style="width: 100%; height: 100px; border: none; pointer-events: none;"
            loading="lazy"
            sandbox="allow-same-origin">
    </iframe>
</div>
{% endif %}
|
||||
93
archivebox/plugins/archivedotorg/tests/test_archivedotorg.py
Normal file
93
archivebox/plugins/archivedotorg/tests/test_archivedotorg.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""
|
||||
Integration tests for archivedotorg plugin
|
||||
|
||||
Tests verify standalone archive.org extractor execution.
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
# Plugin directory: two levels up from this test file.
PLUGIN_DIR = Path(__file__).parent.parent
# First file in PLUGIN_DIR matching the hook naming pattern, or None when nothing matches.
ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archivedotorg.*'), None)
# URL passed to the hook by the tests below.
TEST_URL = 'https://example.com'
|
||||
|
||||
def test_hook_script_exists():
    """The plugin directory must contain the archivedotorg snapshot hook script."""
    # ARCHIVEDOTORG_HOOK is None when the glob matched nothing; check that
    # explicitly so the failure is a readable assertion, not an AttributeError.
    assert ARCHIVEDOTORG_HOOK is not None, f"No archivedotorg hook script found in {PLUGIN_DIR}"
    assert ARCHIVEDOTORG_HOOK.exists()
|
||||
|
||||
def test_submits_to_archivedotorg():
    """Run the hook against TEST_URL and verify its exit-code / JSONL contract.

    Exit 0 must come with a 'succeeded' ArchiveResult record on stdout;
    exit 1 must come with no ArchiveResult record and a message on stderr.
    """
    command = [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789']

    with tempfile.TemporaryDirectory() as tmpdir:
        result = subprocess.run(command, cwd=tmpdir, capture_output=True, text=True, timeout=60)

        assert result.returncode in (0, 1)

        # Pick the first stdout line that parses as an ArchiveResult record
        result_json = None
        for raw_line in result.stdout.strip().split('\n'):
            candidate = raw_line.strip()
            if not candidate.startswith('{'):
                continue
            try:
                record = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                result_json = record
                break

        if result.returncode != 0:
            # Transient error - no JSONL output, just stderr
            assert not result_json, "Should NOT emit JSONL on transient error"
            assert result.stderr, "Should have error message in stderr"
        else:
            # Success - should have ArchiveResult
            assert result_json, "Should have ArchiveResult JSONL output on success"
            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
|
||||
def test_config_save_archivedotorg_false_skips():
    """With ARCHIVEDOTORG_ENABLED=False the hook exits 0, logs a skip, and emits no JSONL."""
    import os

    # Inherit the current environment with the feature switched off
    env = dict(os.environ, ARCHIVEDOTORG_ENABLED='False')
    command = [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999']

    with tempfile.TemporaryDirectory() as tmpdir:
        result = subprocess.run(command, cwd=tmpdir, capture_output=True, text=True, env=env, timeout=30)

        assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"

        # The skip reason goes to stderr, never to stdout
        skip_logged = 'Skipping' in result.stderr or 'False' in result.stderr
        assert skip_logged, "Should log skip reason to stderr"

        # No stdout line may look like a JSON record
        jsonl_lines = []
        for line in result.stdout.strip().split('\n'):
            if line.strip().startswith('{'):
                jsonl_lines.append(line)
        assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
|
||||
|
||||
def test_handles_timeout():
    """With a 1-second TIMEOUT the hook must finish promptly and emit no JSONL if it fails."""
    with tempfile.TemporaryDirectory() as tmpdir:
        import os
        env = os.environ.copy()
        # The hook reads ARCHIVEDOTORG_TIMEOUT before TIMEOUT, so drop any
        # inherited value; otherwise the 1-second override below is ignored.
        env.pop('ARCHIVEDOTORG_TIMEOUT', None)
        env['TIMEOUT'] = '1'

        result = subprocess.run(
            [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=tmpdir, capture_output=True, text=True, env=env, timeout=30
        )

        # Timeout is a transient error - should exit 1 with no JSONL
        assert result.returncode in (0, 1), "Should complete without hanging"

        # If it timed out (exit 1), should have no JSONL output
        if result.returncode == 1:
            jsonl_lines = [line for line in result.stdout.strip().split('\n')
                           if line.strip().startswith('{')]
            assert len(jsonl_lines) == 0, "Should not emit JSONL on timeout (transient error)"
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
Reference in New Issue
Block a user