Mirror of https://github.com/ArchiveBox/ArchiveBox.git (last synced 2026-04-06 07:47:53 +10:00)
rename archive_org to archivedotorg, add BinaryWorker, fix config pass-through
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
"""
|
||||
Submit a URL to archive.org for archiving.
|
||||
|
||||
-Usage: on_Snapshot__archive_org.py --url=<url> --snapshot-id=<uuid>
|
||||
+Usage: on_Snapshot__archivedotorg.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Writes archive.org.txt to $PWD with the archived URL
|
||||
|
||||
Environment variables:
|
||||
@@ -25,7 +25,7 @@ import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
-PLUGIN_NAME = 'archive_org'
|
||||
+PLUGIN_NAME = 'archivedotorg'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'archive.org.txt'
|
||||
|
||||
@@ -41,7 +41,7 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
return default
|
||||
|
||||
|
||||
-def submit_to_archive_org(url: str) -> tuple[bool, str | None, str]:
|
||||
+def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
|
||||
"""
|
||||
Submit URL to archive.org Wayback Machine.
|
||||
|
||||
@@ -113,7 +113,7 @@ def main(url: str, snapshot_id: str):
|
||||
|
||||
try:
|
||||
# Run extraction
|
||||
-success, output, error = submit_to_archive_org(url)
|
||||
+success, output, error = submit_to_archivedotorg(url)
|
||||
|
||||
if success:
|
||||
# Success - emit ArchiveResult with output file
|
||||
@@ -1,5 +1,5 @@
|
||||
"""
|
||||
-Integration tests for archive_org plugin
|
||||
+Integration tests for archivedotorg plugin
|
||||
|
||||
Tests verify standalone archive.org extractor execution.
|
||||
"""
|
||||
@@ -12,13 +12,13 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
-ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archive_org.*'), None)
|
||||
+ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archivedotorg.*'), None)
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
def test_hook_script_exists():
|
||||
assert ARCHIVEDOTORG_HOOK.exists()
|
||||
|
||||
-def test_submits_to_archive_org():
|
||||
+def test_submits_to_archivedotorg():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
|
||||
@@ -49,7 +49,7 @@ def test_submits_to_archive_org():
|
||||
assert not result_json, "Should NOT emit JSONL on transient error"
|
||||
assert result.stderr, "Should have error message in stderr"
|
||||
|
||||
-def test_config_save_archive_org_false_skips():
|
||||
+def test_config_save_archivedotorg_false_skips():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
import os
|
||||
env = os.environ.copy()
|
||||
@@ -26,11 +26,20 @@ const {
|
||||
readCdpUrl,
|
||||
} = require('../chrome/chrome_utils.js');
|
||||
|
||||
+// Flush V8 coverage before exit (needed for NODE_V8_COVERAGE to capture early exits)
|
||||
+function flushCoverageAndExit(exitCode) {
|
||||
+if (process.env.NODE_V8_COVERAGE) {
|
||||
+const v8 = require('v8');
|
||||
+v8.takeCoverage();
|
||||
+}
|
||||
+process.exit(exitCode);
|
||||
+}
|
||||
|
||||
// Check if screenshot is enabled BEFORE requiring puppeteer
|
||||
if (!getEnvBool('SCREENSHOT_ENABLED', true)) {
|
||||
console.error('Skipping screenshot (SCREENSHOT_ENABLED=False)');
|
||||
// Temporary failure (config disabled) - NO JSONL emission
|
||||
-process.exit(0);
|
||||
+flushCoverageAndExit(0);
|
||||
}
|
||||
|
||||
// Now safe to require puppeteer
|
||||
@@ -135,7 +144,7 @@ async function main() {
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Snapshot__51_screenshot.js --url=<url> --snapshot-id=<uuid>');
|
||||
-process.exit(1);
|
||||
+flushCoverageAndExit(1);
|
||||
}
|
||||
|
||||
// Check if staticfile extractor already handled this (permanent skip)
|
||||
@@ -147,7 +156,7 @@ async function main() {
|
||||
status: 'skipped',
|
||||
output_str: 'staticfile already handled',
|
||||
}));
|
||||
-process.exit(0);
|
||||
+flushCoverageAndExit(0);
|
||||
}
|
||||
|
||||
// Take screenshot (throws on error)
|
||||
@@ -166,5 +175,5 @@ async function main() {
|
||||
main().catch(e => {
|
||||
// Transient error - emit NO JSONL
|
||||
console.error(`ERROR: ${e.message}`);
|
||||
-process.exit(1);
|
||||
+flushCoverageAndExit(1);
|
||||
});
|
||||
|
||||
@@ -226,9 +226,6 @@ def test_config_save_screenshot_false_skips():
|
||||
print(f"[DEBUG RESULT] Exit code: {result.returncode}")
|
||||
print(f"[DEBUG RESULT] Stderr: {result.stderr[:200]}")
|
||||
|
||||
-# FORCE FAILURE to verify test actually runs
|
||||
-assert False, f"FORCED FAILURE - NODE_V8_COVERAGE={'NODE_V8_COVERAGE' in env} value={env.get('NODE_V8_COVERAGE', 'NOTSET')}"
|
||||
|
||||
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
|
||||
|
||||
# Feature disabled - temporary failure, should NOT emit JSONL
|
||||
|
||||
@@ -74,7 +74,7 @@
|
||||
"--geo-bypass",
|
||||
"--add-metadata",
|
||||
"--no-progress",
|
||||
-"--remote-components ejs:github",
|
||||
+"--remote-components=ejs:github",
|
||||
"-o",
|
||||
"%(title)s.%(ext)s"
|
||||
],
|
||||
|
||||
Reference in New Issue
Block a user