mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 15:27:53 +10:00
way better plugin hooks system wip
This commit is contained in:
@@ -2,8 +2,8 @@
|
||||
/**
|
||||
* Extract HTTP response headers for a URL.
|
||||
*
|
||||
* If a Chrome session exists (from chrome_session extractor), reads the captured
|
||||
* response headers from chrome_session/response_headers.json.
|
||||
* If a Chrome session exists (from chrome plugin), reads the captured
|
||||
* response headers from chrome plugin/response_headers.json.
|
||||
* Otherwise falls back to making an HTTP HEAD request.
|
||||
*
|
||||
* Usage: on_Snapshot__12_headers.js --url=<url> --snapshot-id=<uuid>
|
||||
@@ -24,7 +24,7 @@ const http = require('http');
|
||||
const EXTRACTOR_NAME = 'headers';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'headers.json';
|
||||
const CHROME_SESSION_DIR = '../chrome_session';
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
const CHROME_HEADERS_FILE = 'response_headers.json';
|
||||
|
||||
// Parse command line arguments
|
||||
@@ -56,7 +56,7 @@ function getEnvInt(name, defaultValue = 0) {
|
||||
return isNaN(val) ? defaultValue : val;
|
||||
}
|
||||
|
||||
// Get headers from chrome_session if available
|
||||
// Get headers from chrome plugin if available
|
||||
function getHeadersFromChromeSession() {
|
||||
const headersFile = path.join(CHROME_SESSION_DIR, CHROME_HEADERS_FILE);
|
||||
if (fs.existsSync(headersFile)) {
|
||||
@@ -117,7 +117,7 @@ async function extractHeaders(url) {
|
||||
const chromeHeaders = getHeadersFromChromeSession();
|
||||
if (chromeHeaders && chromeHeaders.headers) {
|
||||
fs.writeFileSync(outputPath, JSON.stringify(chromeHeaders, null, 2), 'utf8');
|
||||
return { success: true, output: outputPath, method: 'chrome_session', status: chromeHeaders.status };
|
||||
return { success: true, output: outputPath, method: 'chrome', status: chromeHeaders.status };
|
||||
}
|
||||
|
||||
// Fallback to HTTP HEAD request
|
||||
|
||||
@@ -75,16 +75,24 @@ def test_extracts_headers_from_example_com():
|
||||
|
||||
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
|
||||
|
||||
# Verify output in stdout
|
||||
assert 'STATUS=succeeded' in result.stdout, "Should report success"
|
||||
assert 'Headers extracted' in result.stdout, "Should report completion"
|
||||
# Parse clean JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Verify output directory created
|
||||
headers_dir = tmpdir / 'headers'
|
||||
assert headers_dir.exists(), "Output directory not created"
|
||||
assert result_json, "Should have ArchiveResult JSONL output"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
|
||||
# Verify output file exists
|
||||
headers_file = headers_dir / 'headers.json'
|
||||
# Verify output file exists (hook writes to current directory)
|
||||
headers_file = tmpdir / 'headers.json'
|
||||
assert headers_file.exists(), "headers.json not created"
|
||||
|
||||
# Verify headers JSON contains REAL example.com response
|
||||
@@ -106,20 +114,6 @@ def test_extracts_headers_from_example_com():
|
||||
assert 'content-type' in headers_lower or 'content-length' in headers_lower, \
|
||||
"Should have at least one common HTTP header"
|
||||
|
||||
# Verify RESULT_JSON is present and valid
|
||||
assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
|
||||
|
||||
for line in result.stdout.split('\n'):
|
||||
if line.startswith('RESULT_JSON='):
|
||||
result_json = json.loads(line.replace('RESULT_JSON=', ''))
|
||||
assert result_json['extractor'] == 'headers'
|
||||
assert result_json['status'] == 'succeeded'
|
||||
assert result_json['url'] == TEST_URL
|
||||
assert result_json['snapshot_id'] == 'test789'
|
||||
assert 'duration' in result_json
|
||||
assert result_json['duration'] >= 0
|
||||
break
|
||||
|
||||
|
||||
def test_headers_output_structure():
|
||||
"""Test that headers plugin produces correctly structured output."""
|
||||
@@ -140,10 +134,25 @@ def test_headers_output_structure():
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
|
||||
assert 'STATUS=succeeded' in result.stdout, "Should report success"
|
||||
|
||||
# Parse clean JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json, "Should have ArchiveResult JSONL output"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
|
||||
# Verify output structure
|
||||
output_headers_file = tmpdir / 'headers' / 'headers.json'
|
||||
output_headers_file = tmpdir / 'headers.json'
|
||||
assert output_headers_file.exists(), "Output headers.json not created"
|
||||
|
||||
output_data = json.loads(output_headers_file.read_text())
|
||||
@@ -162,8 +171,8 @@ def test_headers_output_structure():
|
||||
assert output_data['status'] in [200, 301, 302]
|
||||
|
||||
|
||||
def test_falls_back_to_http_when_chrome_session_unavailable():
|
||||
"""Test that headers plugin falls back to HTTP HEAD when chrome_session unavailable."""
|
||||
def test_falls_back_to_http_when_chrome_unavailable():
|
||||
"""Test that headers plugin falls back to HTTP HEAD when chrome unavailable."""
|
||||
|
||||
if not shutil.which('node'):
|
||||
pytest.skip("node not installed")
|
||||
@@ -171,7 +180,7 @@ def test_falls_back_to_http_when_chrome_session_unavailable():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Don't create chrome_session directory - force HTTP fallback
|
||||
# Don't create chrome directory - force HTTP fallback
|
||||
|
||||
# Run headers extraction
|
||||
result = subprocess.run(
|
||||
@@ -183,12 +192,25 @@ def test_falls_back_to_http_when_chrome_session_unavailable():
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
|
||||
assert 'STATUS=succeeded' in result.stdout, "Should report success"
|
||||
assert 'http' in result.stdout.lower() or 'HEAD' not in result.stdout, \
|
||||
"Should use HTTP method"
|
||||
|
||||
# Parse clean JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json, "Should have ArchiveResult JSONL output"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
|
||||
# Verify output exists and has real HTTP headers
|
||||
output_headers_file = tmpdir / 'headers' / 'headers.json'
|
||||
output_headers_file = tmpdir / 'headers.json'
|
||||
assert output_headers_file.exists(), "Output headers.json not created"
|
||||
|
||||
output_data = json.loads(output_headers_file.read_text())
|
||||
@@ -250,7 +272,21 @@ def test_config_user_agent():
|
||||
|
||||
# Should succeed (example.com doesn't block)
|
||||
if result.returncode == 0:
|
||||
assert 'STATUS=succeeded' in result.stdout
|
||||
# Parse clean JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json, "Should have ArchiveResult JSONL output"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
|
||||
|
||||
def test_handles_https_urls():
|
||||
@@ -271,7 +307,7 @@ def test_handles_https_urls():
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
output_headers_file = tmpdir / 'headers' / 'headers.json'
|
||||
output_headers_file = tmpdir / 'headers.json'
|
||||
if output_headers_file.exists():
|
||||
output_data = json.loads(output_headers_file.read_text())
|
||||
assert output_data['url'] == 'https://example.org'
|
||||
@@ -298,7 +334,7 @@ def test_handles_404_gracefully():
|
||||
# May succeed or fail depending on server behavior
|
||||
# If it succeeds, verify 404 status is captured
|
||||
if result.returncode == 0:
|
||||
output_headers_file = tmpdir / 'headers' / 'headers.json'
|
||||
output_headers_file = tmpdir / 'headers.json'
|
||||
if output_headers_file.exists():
|
||||
output_data = json.loads(output_headers_file.read_text())
|
||||
assert output_data['status'] == 404, "Should capture 404 status"
|
||||
|
||||
Reference in New Issue
Block a user