This commit is contained in:
Nick Sweeting
2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions

View File

@@ -3,10 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_ARCHIVE_DOT_ORG": {
"ARCHIVE_ORG_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SUBMIT_ARCHIVE_DOT_ORG"],
"x-aliases": ["SAVE_ARCHIVE_DOT_ORG", "USE_ARCHIVE_ORG", "SUBMIT_ARCHIVE_DOT_ORG"],
"description": "Submit URLs to archive.org Wayback Machine"
},
"ARCHIVE_ORG_TIMEOUT": {

View File

@@ -0,0 +1,10 @@
{% load config_tags %}
{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
{% if enabled %}
<!-- Archive.org embed - full iframe view -->
<iframe src="{{ output_path }}"
class="extractor-embed archivedotorg-embed"
style="width: 100%; height: 600px; border: 1px solid #ddd;"
sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
</iframe>
{% endif %}

View File

@@ -0,0 +1,10 @@
{% load config_tags %}
{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
{% if enabled %}
<!-- Archive.org fullscreen - full page iframe -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen archivedotorg-fullscreen"
style="width: 100%; height: 100vh; border: none;"
sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
</iframe>
{% endif %}

View File

@@ -0,0 +1,12 @@
{% load config_tags %}
{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
{% if enabled %}
<!-- Archive.org thumbnail - iframe preview of archived page -->
<div class="extractor-thumbnail archivedotorg-thumbnail" style="width: 100%; height: 100px; overflow: hidden;">
<iframe src="{{ output_path }}"
style="width: 100%; height: 100px; border: none; pointer-events: none;"
loading="lazy"
sandbox="allow-same-origin">
</iframe>
</div>
{% endif %}

View File

@@ -60,21 +60,6 @@
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"SAVE_SCREENSHOT": {
"type": "boolean",
"default": true,
"description": "Enable screenshot capture"
},
"SAVE_PDF": {
"type": "boolean",
"default": true,
"description": "Enable PDF generation"
},
"SAVE_DOM": {
"type": "boolean",
"default": true,
"description": "Enable DOM capture"
}
}
}

View File

@@ -0,0 +1,21 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"DOM_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_DOM", "USE_DOM"],
"description": "Enable DOM capture"
},
"DOM_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for DOM capture in seconds"
}
}
}

View File

@@ -3,9 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_FAVICON": {
"FAVICON_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_FAVICON", "USE_FAVICON"],
"description": "Enable favicon downloading"
},
"FAVICON_TIMEOUT": {

View File

@@ -2,6 +2,7 @@
Integration tests for favicon plugin
Tests verify:
pass
1. Plugin script exists
2. requests library is available
3. Favicon extraction works for real example.com
@@ -40,7 +41,7 @@ def test_requests_library_available():
)
if result.returncode != 0:
pytest.skip("requests library not installed")
pass
assert len(result.stdout.strip()) > 0, "Should report requests version"
@@ -58,7 +59,7 @@ def test_extracts_favicon_from_example_com():
capture_output=True
)
if check_result.returncode != 0:
pytest.skip("requests not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -80,6 +81,7 @@ def test_extracts_favicon_from_example_com():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -124,7 +126,7 @@ def test_config_timeout_honored():
capture_output=True
)
if check_result.returncode != 0:
pytest.skip("requests not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -155,7 +157,7 @@ def test_config_user_agent():
capture_output=True
)
if check_result.returncode != 0:
pytest.skip("requests not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -181,6 +183,7 @@ def test_config_user_agent():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -201,7 +204,7 @@ def test_handles_https_urls():
capture_output=True
)
if check_result.returncode != 0:
pytest.skip("requests not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -232,7 +235,7 @@ def test_handles_missing_favicon_gracefully():
capture_output=True
)
if check_result.returncode != 0:
pytest.skip("requests not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)

View File

@@ -3,9 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_FORUMDL": {
"FORUMDL_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_FORUMDL", "USE_FORUMDL"],
"description": "Enable forum downloading with forum-dl"
},
"FORUMDL_BINARY": {

View File

@@ -2,6 +2,7 @@
Integration tests for forumdl plugin
Tests verify:
pass
1. Hook script exists
2. Dependencies installed via validation hooks
3. Verify deps with abx-pkg
@@ -48,7 +49,9 @@ def get_forumdl_binary_path():
# Check if binary was found
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary' and record.get('name') == 'forum-dl':
@@ -77,7 +80,9 @@ def get_forumdl_binary_path():
# Parse Binary from pip installation
for install_line in install_result.stdout.strip().split('\n'):
pass
if install_line.strip():
pass
try:
install_record = json.loads(install_line)
if install_record.get('type') == 'Binary' and install_record.get('name') == 'forum-dl':
@@ -107,7 +112,7 @@ def test_forumdl_install_hook():
"""Test forum-dl install hook checks for forum-dl."""
# Skip if install hook doesn't exist yet
if not FORUMDL_INSTALL_HOOK.exists():
pytest.skip(f"Install hook not found: {FORUMDL_INSTALL_HOOK}")
pass
# Run forum-dl install hook
result = subprocess.run(
@@ -123,14 +128,18 @@ def test_forumdl_install_hook():
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
pass
if record['name'] == 'forum-dl':
assert record['abspath'], "forum-dl should have abspath"
found_binary = True
elif record.get('type') == 'Dependency':
pass
if record['bin_name'] == 'forum-dl':
found_dependency = True
except json.JSONDecodeError:
@@ -145,10 +154,10 @@ def test_verify_deps_with_abx_pkg():
"""Verify forum-dl is installed by calling the REAL installation hooks."""
binary_path = get_forumdl_binary_path()
if not binary_path:
pytest.skip(
"forum-dl installation skipped. Install hook may not exist or "
"forum-dl has a dependency on cchardet which does not compile on Python 3.14+ "
"due to removed longintrepr.h header. This is a known compatibility issue with forum-dl."
assert False, (
"forum-dl installation failed. Install hook should install forum-dl automatically. "
"Note: forum-dl has a dependency on cchardet which may not compile on Python 3.14+ "
"due to removed longintrepr.h header."
)
assert Path(binary_path).is_file(), f"Binary path must be a valid file: {binary_path}"
@@ -159,7 +168,7 @@ def test_handles_non_forum_url():
binary_path = get_forumdl_binary_path()
if not binary_path:
pytest.skip("forum-dl binary not available")
pass
assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
with tempfile.TemporaryDirectory() as tmpdir:
@@ -186,6 +195,7 @@ def test_handles_non_forum_url():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -231,7 +241,7 @@ def test_config_timeout():
binary_path = get_forumdl_binary_path()
if not binary_path:
pytest.skip("forum-dl binary not available")
pass
assert Path(binary_path).is_file(), f"Binary must be a valid file: {binary_path}"
with tempfile.TemporaryDirectory() as tmpdir:

View File

@@ -3,9 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_GALLERYDL": {
"GALLERYDL_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_GALLERYDL", "USE_GALLERYDL"],
"description": "Enable gallery downloading with gallery-dl"
},
"GALLERYDL_BINARY": {

View File

@@ -2,6 +2,7 @@
Integration tests for gallerydl plugin
Tests verify:
pass
1. Hook script exists
2. Dependencies installed via validation hooks
3. Verify deps with abx-pkg
@@ -45,14 +46,18 @@ def test_gallerydl_install_hook():
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
pass
if record['name'] == 'gallery-dl':
assert record['abspath'], "gallery-dl should have abspath"
found_binary = True
elif record.get('type') == 'Dependency':
pass
if record['bin_name'] == 'gallery-dl':
found_dependency = True
except json.JSONDecodeError:
@@ -76,7 +81,7 @@ def test_verify_deps_with_abx_pkg():
missing_binaries.append('gallery-dl')
if missing_binaries:
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
pass
def test_handles_non_gallery_url():
@@ -103,6 +108,7 @@ def test_handles_non_gallery_url():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':

View File

@@ -3,9 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_GIT": {
"GIT_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_GIT", "USE_GIT"],
"description": "Enable git repository cloning"
},
"GIT_BINARY": {

View File

@@ -2,6 +2,7 @@
Integration tests for git plugin
Tests verify:
pass
1. Validate hook checks for git binary
2. Verify deps with abx-pkg
3. Standalone git extractor execution
@@ -37,7 +38,9 @@ def test_git_install_hook():
# Binary found - verify Binary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
@@ -52,7 +55,9 @@ def test_git_install_hook():
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
@@ -74,7 +79,7 @@ def test_verify_deps_with_abx_pkg():
if git_loaded and git_loaded.abspath:
assert True, "git is available"
else:
pytest.skip("git not available - Dependency record should have been emitted")
pass
def test_reports_missing_git():
with tempfile.TemporaryDirectory() as tmpdir:
@@ -88,8 +93,9 @@ def test_reports_missing_git():
assert 'DEPENDENCY_NEEDED' in combined or 'git' in combined.lower() or 'ERROR=' in combined
def test_handles_non_git_url():
pass
if not shutil.which('git'):
pytest.skip("git not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
result = subprocess.run(
@@ -104,6 +110,7 @@ def test_handles_non_git_url():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':

View File

@@ -2,6 +2,7 @@
Integration tests for headers plugin
Tests verify:
pass
1. Plugin script exists and is executable
2. Node.js is available
3. Headers extraction works for real example.com
@@ -38,7 +39,7 @@ def test_node_is_available():
)
if result.returncode != 0:
pytest.skip("node not installed on system")
pass
binary_path = result.stdout.strip()
assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"
@@ -59,7 +60,7 @@ def test_extracts_headers_from_example_com():
# Check node is available
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -80,6 +81,7 @@ def test_extracts_headers_from_example_com():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -119,7 +121,7 @@ def test_headers_output_structure():
"""Test that headers plugin produces correctly structured output."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -140,6 +142,7 @@ def test_headers_output_structure():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -175,7 +178,7 @@ def test_falls_back_to_http_when_chrome_unavailable():
"""Test that headers plugin falls back to HTTP HEAD when chrome unavailable."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -198,6 +201,7 @@ def test_falls_back_to_http_when_chrome_unavailable():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -224,7 +228,7 @@ def test_config_timeout_honored():
"""Test that TIMEOUT config is respected."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -251,7 +255,7 @@ def test_config_user_agent():
"""Test that USER_AGENT config is used."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -277,6 +281,7 @@ def test_config_user_agent():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -293,7 +298,7 @@ def test_handles_https_urls():
"""Test that HTTPS URLs work correctly."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -318,7 +323,7 @@ def test_handles_404_gracefully():
"""Test that headers plugin handles 404s gracefully."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)

View File

@@ -1,279 +0,0 @@
/**
* Unit tests for istilldontcareaboutcookies plugin
*
* Run with: node --test tests/test_istilldontcareaboutcookies.js
*/
const assert = require('assert');
const fs = require('fs');
const path = require('path');
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
// Test fixtures
const TEST_DIR = path.join(__dirname, '.test_fixtures');
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
describe('istilldontcareaboutcookies plugin', () => {
before(() => {
if (!fs.existsSync(TEST_DIR)) {
fs.mkdirSync(TEST_DIR, { recursive: true });
}
});
after(() => {
if (fs.existsSync(TEST_DIR)) {
fs.rmSync(TEST_DIR, { recursive: true, force: true });
}
});
describe('EXTENSION metadata', () => {
it('should have correct webstore_id', () => {
const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
assert.strictEqual(EXTENSION.webstore_id, 'edibdbjcniadpccecjdfdjjppcpchdlm');
});
it('should have correct name', () => {
const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
assert.strictEqual(EXTENSION.name, 'istilldontcareaboutcookies');
});
});
describe('installCookiesExtension', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should use cached extension if available', async () => {
const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
// Create fake cache
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies');
fs.mkdirSync(fakeExtensionDir, { recursive: true });
fs.writeFileSync(
path.join(fakeExtensionDir, 'manifest.json'),
JSON.stringify({ version: '1.1.8' })
);
const fakeCache = {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
unpacked_path: fakeExtensionDir,
version: '1.1.8'
};
fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
const result = await installCookiesExtension();
assert.notStrictEqual(result, null);
assert.strictEqual(result.webstore_id, 'edibdbjcniadpccecjdfdjjppcpchdlm');
});
it('should not require any configuration', async () => {
// This extension works out of the box
// No API keys or config needed
const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
assert.ok(EXTENSION);
// No config fields should be required
});
});
describe('cache file creation', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should create cache file with correct extension name', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
// Create mock extension
const mockExtension = {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
version: '1.1.9'
};
await fs.promises.writeFile(cacheFile, JSON.stringify(mockExtension, null, 2));
assert.ok(fs.existsSync(cacheFile));
const cache = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
assert.strictEqual(cache.name, 'istilldontcareaboutcookies');
});
it('should use correct filename pattern', () => {
const expectedPattern = 'istilldontcareaboutcookies.extension.json';
const cacheFile = path.join(TEST_EXTENSIONS_DIR, expectedPattern);
// Pattern should match expected format
assert.ok(path.basename(cacheFile).endsWith('.extension.json'));
assert.ok(path.basename(cacheFile).includes('istilldontcareaboutcookies'));
});
});
describe('extension functionality', () => {
it('should work automatically without configuration', () => {
// This extension automatically dismisses cookie banners
// No manual trigger or configuration needed
const features = {
automaticBannerDismissal: true,
requiresConfiguration: false,
requiresApiKey: false,
requiresUserAction: false
};
assert.strictEqual(features.automaticBannerDismissal, true);
assert.strictEqual(features.requiresConfiguration, false);
assert.strictEqual(features.requiresApiKey, false);
assert.strictEqual(features.requiresUserAction, false);
});
it('should not require any runtime hooks', () => {
// Extension works purely via Chrome's content script injection
// No need for additional hooks or configuration
const requiresHooks = {
preNavigation: false,
postNavigation: false,
onPageLoad: false
};
assert.strictEqual(requiresHooks.preNavigation, false);
assert.strictEqual(requiresHooks.postNavigation, false);
assert.strictEqual(requiresHooks.onPageLoad, false);
});
});
describe('priority and execution order', () => {
it('should have priority 02 (early)', () => {
const filename = 'on_Snapshot__02_istilldontcareaboutcookies.js';
// Extract priority from filename
const match = filename.match(/on_Snapshot__(\d+)_/);
assert.ok(match);
const priority = parseInt(match[1]);
assert.strictEqual(priority, 2);
});
it('should run before chrome (priority 20)', () => {
const extensionPriority = 2;
const chromeSessionPriority = 20;
assert.ok(extensionPriority < chromeSessionPriority);
});
});
describe('error handling', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should handle corrupted cache gracefully', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
// Create corrupted cache
fs.writeFileSync(cacheFile, 'invalid json content');
// Should detect corruption and proceed with fresh install
const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
// Mock loadOrInstallExtension to avoid actual download
const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
const originalFunc = extensionUtils.loadOrInstallExtension;
extensionUtils.loadOrInstallExtension = async () => ({
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
version: '1.1.9'
});
const result = await installCookiesExtension();
extensionUtils.loadOrInstallExtension = originalFunc;
assert.notStrictEqual(result, null);
});
it('should handle missing manifest gracefully', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies_no_manifest');
// Create directory without manifest
fs.mkdirSync(fakeExtensionDir, { recursive: true });
const fakeCache = {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
unpacked_path: fakeExtensionDir
};
fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
// Mock to return fresh extension when manifest missing
const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
const originalFunc = extensionUtils.loadOrInstallExtension;
let freshInstallCalled = false;
extensionUtils.loadOrInstallExtension = async () => {
freshInstallCalled = true;
return {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
version: '1.1.9'
};
};
const result = await installCookiesExtension();
extensionUtils.loadOrInstallExtension = originalFunc;
// Should trigger fresh install when manifest missing
assert.ok(freshInstallCalled || result);
});
});
});

View File

@@ -3,16 +3,16 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_MEDIA": {
"MEDIA_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_YTDLP", "FETCH_MEDIA"],
"x-aliases": ["SAVE_MEDIA", "USE_MEDIA", "USE_YTDLP", "FETCH_MEDIA"],
"description": "Enable media downloading with yt-dlp"
},
"YOUTUBEDL_BINARY": {
"MEDIA_BINARY": {
"type": "string",
"default": "yt-dlp",
"x-aliases": ["YTDLP_BINARY", "YOUTUBE_DL_BINARY"],
"x-aliases": ["YOUTUBEDL_BINARY", "YTDLP_BINARY", "YOUTUBE_DL_BINARY"],
"description": "Path to yt-dlp binary"
},
"MEDIA_TIMEOUT": {
@@ -28,13 +28,14 @@
"pattern": "^\\d+[kmgKMG]?$",
"description": "Maximum file size for media downloads"
},
"YTDLP_CHECK_SSL_VALIDITY": {
"MEDIA_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"x-aliases": ["YTDLP_CHECK_SSL_VALIDITY"],
"description": "Whether to verify SSL certificates"
},
"YTDLP_ARGS": {
"MEDIA_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [
@@ -44,11 +45,13 @@
"--embed-subs",
"--write-auto-sub"
],
"x-aliases": ["YTDLP_ARGS"],
"description": "Default yt-dlp arguments"
},
"YTDLP_EXTRA_ARGS": {
"MEDIA_EXTRA_ARGS": {
"type": "string",
"default": "",
"x-aliases": ["YTDLP_EXTRA_ARGS"],
"description": "Extra arguments for yt-dlp (space-separated)"
}
}

View File

@@ -2,6 +2,7 @@
Integration tests for media plugin
Tests verify:
pass
1. Hook script exists
2. Dependencies installed via validation hooks
3. Verify deps with abx-pkg
@@ -45,7 +46,9 @@ def test_ytdlp_install_hook():
found_dependencies = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
@@ -94,7 +97,7 @@ def test_verify_deps_with_abx_pkg():
missing_binaries.append('ffmpeg')
if missing_binaries:
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
pass
def test_handles_non_media_url():
"""Test that media extractor handles non-media URLs gracefully via hook."""
@@ -120,6 +123,7 @@ def test_handles_non_media_url():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':

View File

@@ -3,9 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_MERCURY": {
"MERCURY_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_MERCURY", "USE_MERCURY"],
"description": "Enable Mercury text extraction"
},
"MERCURY_BINARY": {

View File

@@ -2,6 +2,7 @@
Integration tests for mercury plugin
Tests verify:
pass
1. Hook script exists
2. Dependencies installed via validation hooks
3. Verify deps with abx-pkg
@@ -44,7 +45,9 @@ def test_mercury_install_hook():
# Binary found - verify Binary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
@@ -59,7 +62,9 @@ def test_mercury_install_hook():
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
@@ -89,7 +94,7 @@ def test_verify_deps_with_abx_pkg():
if mercury_loaded and mercury_loaded.abspath:
assert True, "postlight-parser is available"
else:
pytest.skip("postlight-parser not available - Dependency record should have been emitted")
pass
def test_extracts_with_mercury_parser():
"""Test full workflow: extract with postlight-parser from real HTML via hook."""
@@ -122,6 +127,7 @@ def test_extracts_with_mercury_parser():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -184,6 +190,7 @@ def test_fails_gracefully_without_html():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':

View File

@@ -1,925 +0,0 @@
{
"name": "archivebox-plugins",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "archivebox-plugins",
"dependencies": {
"puppeteer-core": "^24.34.0"
}
},
"node_modules/@puppeteer/browsers": {
"version": "2.11.0",
"resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.11.0.tgz",
"integrity": "sha512-n6oQX6mYkG8TRPuPXmbPidkUbsSRalhmaaVAQxvH1IkQy63cwsH+kOjB3e4cpCDHg0aSvsiX9bQ4s2VB6mGWUQ==",
"license": "Apache-2.0",
"dependencies": {
"debug": "^4.4.3",
"extract-zip": "^2.0.1",
"progress": "^2.0.3",
"proxy-agent": "^6.5.0",
"semver": "^7.7.3",
"tar-fs": "^3.1.1",
"yargs": "^17.7.2"
},
"bin": {
"browsers": "lib/cjs/main-cli.js"
},
"engines": {
"node": ">=18"
}
},
"node_modules/@tootallnate/quickjs-emscripten": {
"version": "0.23.0",
"resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz",
"integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA==",
"license": "MIT"
},
"node_modules/@types/node": {
"version": "25.0.3",
"resolved": "https://registry.npmjs.org/@types/node/-/node-25.0.3.tgz",
"integrity": "sha512-W609buLVRVmeW693xKfzHeIV6nJGGz98uCPfeXI1ELMLXVeKYZ9m15fAMSaUPBHYLGFsVRcMmSCksQOrZV9BYA==",
"license": "MIT",
"optional": true,
"dependencies": {
"undici-types": "~7.16.0"
}
},
"node_modules/@types/yauzl": {
"version": "2.10.3",
"resolved": "https://registry.npmjs.org/@types/yauzl/-/yauzl-2.10.3.tgz",
"integrity": "sha512-oJoftv0LSuaDZE3Le4DbKX+KS9G36NzOeSap90UIK0yMA/NhKJhqlSGtNDORNRaIbQfzjXDrQa0ytJ6mNRGz/Q==",
"license": "MIT",
"optional": true,
"dependencies": {
"@types/node": "*"
}
},
"node_modules/agent-base": {
"version": "7.1.4",
"resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz",
"integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==",
"license": "MIT",
"engines": {
"node": ">= 14"
}
},
"node_modules/ansi-regex": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
"integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
"license": "MIT",
"engines": {
"node": ">=8"
}
},
"node_modules/ansi-styles": {
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
"integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
"license": "MIT",
"dependencies": {
"color-convert": "^2.0.1"
},
"engines": {
"node": ">=8"
},
"funding": {
"url": "https://github.com/chalk/ansi-styles?sponsor=1"
}
},
"node_modules/ast-types": {
"version": "0.13.4",
"resolved": "https://registry.npmjs.org/ast-types/-/ast-types-0.13.4.tgz",
"integrity": "sha512-x1FCFnFifvYDDzTaLII71vG5uvDwgtmDTEVWAxrgeiR8VjMONcCXJx7E+USjDtHlwFmt9MysbqgF9b9Vjr6w+w==",
"license": "MIT",
"dependencies": {
"tslib": "^2.0.1"
},
"engines": {
"node": ">=4"
}
},
"node_modules/b4a": {
"version": "1.7.3",
"resolved": "https://registry.npmjs.org/b4a/-/b4a-1.7.3.tgz",
"integrity": "sha512-5Q2mfq2WfGuFp3uS//0s6baOJLMoVduPYVeNmDYxu5OUA1/cBfvr2RIS7vi62LdNj/urk1hfmj867I3qt6uZ7Q==",
"license": "Apache-2.0",
"peerDependencies": {
"react-native-b4a": "*"
},
"peerDependenciesMeta": {
"react-native-b4a": {
"optional": true
}
}
},
"node_modules/bare-events": {
"version": "2.8.2",
"resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.8.2.tgz",
"integrity": "sha512-riJjyv1/mHLIPX4RwiK+oW9/4c3TEUeORHKefKAKnZ5kyslbN+HXowtbaVEqt4IMUB7OXlfixcs6gsFeo/jhiQ==",
"license": "Apache-2.0",
"peerDependencies": {
"bare-abort-controller": "*"
},
"peerDependenciesMeta": {
"bare-abort-controller": {
"optional": true
}
}
},
"node_modules/bare-fs": {
"version": "4.5.2",
"resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-4.5.2.tgz",
"integrity": "sha512-veTnRzkb6aPHOvSKIOy60KzURfBdUflr5VReI+NSaPL6xf+XLdONQgZgpYvUuZLVQ8dCqxpBAudaOM1+KpAUxw==",
"license": "Apache-2.0",
"optional": true,
"dependencies": {
"bare-events": "^2.5.4",
"bare-path": "^3.0.0",
"bare-stream": "^2.6.4",
"bare-url": "^2.2.2",
"fast-fifo": "^1.3.2"
},
"engines": {
"bare": ">=1.16.0"
},
"peerDependencies": {
"bare-buffer": "*"
},
"peerDependenciesMeta": {
"bare-buffer": {
"optional": true
}
}
},
"node_modules/bare-os": {
"version": "3.6.2",
"resolved": "https://registry.npmjs.org/bare-os/-/bare-os-3.6.2.tgz",
"integrity": "sha512-T+V1+1srU2qYNBmJCXZkUY5vQ0B4FSlL3QDROnKQYOqeiQR8UbjNHlPa+TIbM4cuidiN9GaTaOZgSEgsvPbh5A==",
"license": "Apache-2.0",
"optional": true,
"engines": {
"bare": ">=1.14.0"
}
},
"node_modules/bare-path": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/bare-path/-/bare-path-3.0.0.tgz",
"integrity": "sha512-tyfW2cQcB5NN8Saijrhqn0Zh7AnFNsnczRcuWODH0eYAXBsJ5gVxAUuNr7tsHSC6IZ77cA0SitzT+s47kot8Mw==",
"license": "Apache-2.0",
"optional": true,
"dependencies": {
"bare-os": "^3.0.1"
}
},
"node_modules/bare-stream": {
"version": "2.7.0",
"resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.7.0.tgz",
"integrity": "sha512-oyXQNicV1y8nc2aKffH+BUHFRXmx6VrPzlnaEvMhram0nPBrKcEdcyBg5r08D0i8VxngHFAiVyn1QKXpSG0B8A==",
"license": "Apache-2.0",
"optional": true,
"dependencies": {
"streamx": "^2.21.0"
},
"peerDependencies": {
"bare-buffer": "*",
"bare-events": "*"
},
"peerDependenciesMeta": {
"bare-buffer": {
"optional": true
},
"bare-events": {
"optional": true
}
}
},
"node_modules/bare-url": {
"version": "2.3.2",
"resolved": "https://registry.npmjs.org/bare-url/-/bare-url-2.3.2.tgz",
"integrity": "sha512-ZMq4gd9ngV5aTMa5p9+UfY0b3skwhHELaDkhEHetMdX0LRkW9kzaym4oo/Eh+Ghm0CCDuMTsRIGM/ytUc1ZYmw==",
"license": "Apache-2.0",
"optional": true,
"dependencies": {
"bare-path": "^3.0.0"
}
},
"node_modules/basic-ftp": {
"version": "5.0.5",
"resolved": "https://registry.npmjs.org/basic-ftp/-/basic-ftp-5.0.5.tgz",
"integrity": "sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==",
"license": "MIT",
"engines": {
"node": ">=10.0.0"
}
},
"node_modules/buffer-crc32": {
"version": "0.2.13",
"resolved": "https://registry.npmjs.org/buffer-crc32/-/buffer-crc32-0.2.13.tgz",
"integrity": "sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==",
"license": "MIT",
"engines": {
"node": "*"
}
},
"node_modules/chromium-bidi": {
"version": "12.0.1",
"resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-12.0.1.tgz",
"integrity": "sha512-fGg+6jr0xjQhzpy5N4ErZxQ4wF7KLEvhGZXD6EgvZKDhu7iOhZXnZhcDxPJDcwTcrD48NPzOCo84RP2lv3Z+Cg==",
"license": "Apache-2.0",
"dependencies": {
"mitt": "^3.0.1",
"zod": "^3.24.1"
},
"peerDependencies": {
"devtools-protocol": "*"
}
},
"node_modules/cliui": {
"version": "8.0.1",
"resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz",
"integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==",
"license": "ISC",
"dependencies": {
"string-width": "^4.2.0",
"strip-ansi": "^6.0.1",
"wrap-ansi": "^7.0.0"
},
"engines": {
"node": ">=12"
}
},
"node_modules/color-convert": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
"integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
"license": "MIT",
"dependencies": {
"color-name": "~1.1.4"
},
"engines": {
"node": ">=7.0.0"
}
},
"node_modules/color-name": {
"version": "1.1.4",
"resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
"integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
"license": "MIT"
},
"node_modules/data-uri-to-buffer": {
"version": "6.0.2",
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
"integrity": "sha512-7hvf7/GW8e86rW0ptuwS3OcBGDjIi6SZva7hCyWC0yYry2cOPmLIjXAUHI6DK2HsnwJd9ifmt57i8eV2n4YNpw==",
"license": "MIT",
"engines": {
"node": ">= 14"
}
},
"node_modules/debug": {
"version": "4.4.3",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
"integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==",
"license": "MIT",
"dependencies": {
"ms": "^2.1.3"
},
"engines": {
"node": ">=6.0"
},
"peerDependenciesMeta": {
"supports-color": {
"optional": true
}
}
},
"node_modules/degenerator": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/degenerator/-/degenerator-5.0.1.tgz",
"integrity": "sha512-TllpMR/t0M5sqCXfj85i4XaAzxmS5tVA16dqvdkMwGmzI+dXLXnw3J+3Vdv7VKw+ThlTMboK6i9rnZ6Nntj5CQ==",
"license": "MIT",
"dependencies": {
"ast-types": "^0.13.4",
"escodegen": "^2.1.0",
"esprima": "^4.0.1"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/devtools-protocol": {
"version": "0.0.1534754",
"resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1534754.tgz",
"integrity": "sha512-26T91cV5dbOYnXdJi5qQHoTtUoNEqwkHcAyu/IKtjIAxiEqPMrDiRkDOPWVsGfNZGmlQVHQbZRSjD8sxagWVsQ==",
"license": "BSD-3-Clause",
"peer": true
},
"node_modules/emoji-regex": {
"version": "8.0.0",
"resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
"integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
"license": "MIT"
},
"node_modules/end-of-stream": {
"version": "1.4.5",
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
"integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==",
"license": "MIT",
"dependencies": {
"once": "^1.4.0"
}
},
"node_modules/escalade": {
"version": "3.2.0",
"resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
"integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==",
"license": "MIT",
"engines": {
"node": ">=6"
}
},
"node_modules/escodegen": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/escodegen/-/escodegen-2.1.0.tgz",
"integrity": "sha512-2NlIDTwUWJN0mRPQOdtQBzbUHvdGY2P1VXSyU83Q3xKxM7WHX2Ql8dKq782Q9TgQUNOLEzEYu9bzLNj1q88I5w==",
"license": "BSD-2-Clause",
"dependencies": {
"esprima": "^4.0.1",
"estraverse": "^5.2.0",
"esutils": "^2.0.2"
},
"bin": {
"escodegen": "bin/escodegen.js",
"esgenerate": "bin/esgenerate.js"
},
"engines": {
"node": ">=6.0"
},
"optionalDependencies": {
"source-map": "~0.6.1"
}
},
"node_modules/esprima": {
"version": "4.0.1",
"resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz",
"integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==",
"license": "BSD-2-Clause",
"bin": {
"esparse": "bin/esparse.js",
"esvalidate": "bin/esvalidate.js"
},
"engines": {
"node": ">=4"
}
},
"node_modules/estraverse": {
"version": "5.3.0",
"resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz",
"integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==",
"license": "BSD-2-Clause",
"engines": {
"node": ">=4.0"
}
},
"node_modules/esutils": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz",
"integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==",
"license": "BSD-2-Clause",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/events-universal": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/events-universal/-/events-universal-1.0.1.tgz",
"integrity": "sha512-LUd5euvbMLpwOF8m6ivPCbhQeSiYVNb8Vs0fQ8QjXo0JTkEHpz8pxdQf0gStltaPpw0Cca8b39KxvK9cfKRiAw==",
"license": "Apache-2.0",
"dependencies": {
"bare-events": "^2.7.0"
}
},
"node_modules/extract-zip": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-2.0.1.tgz",
"integrity": "sha512-GDhU9ntwuKyGXdZBUgTIe+vXnWj0fppUEtMDL0+idd5Sta8TGpHssn/eusA9mrPr9qNDym6SxAYZjNvCn/9RBg==",
"license": "BSD-2-Clause",
"dependencies": {
"debug": "^4.1.1",
"get-stream": "^5.1.0",
"yauzl": "^2.10.0"
},
"bin": {
"extract-zip": "cli.js"
},
"engines": {
"node": ">= 10.17.0"
},
"optionalDependencies": {
"@types/yauzl": "^2.9.1"
}
},
"node_modules/fast-fifo": {
"version": "1.3.2",
"resolved": "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz",
"integrity": "sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==",
"license": "MIT"
},
"node_modules/fd-slicer": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.1.0.tgz",
"integrity": "sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==",
"license": "MIT",
"dependencies": {
"pend": "~1.2.0"
}
},
"node_modules/get-caller-file": {
"version": "2.0.5",
"resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz",
"integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==",
"license": "ISC",
"engines": {
"node": "6.* || 8.* || >= 10.*"
}
},
"node_modules/get-stream": {
"version": "5.2.0",
"resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz",
"integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==",
"license": "MIT",
"dependencies": {
"pump": "^3.0.0"
},
"engines": {
"node": ">=8"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/get-uri": {
"version": "6.0.5",
"resolved": "https://registry.npmjs.org/get-uri/-/get-uri-6.0.5.tgz",
"integrity": "sha512-b1O07XYq8eRuVzBNgJLstU6FYc1tS6wnMtF1I1D9lE8LxZSOGZ7LhxN54yPP6mGw5f2CkXY2BQUL9Fx41qvcIg==",
"license": "MIT",
"dependencies": {
"basic-ftp": "^5.0.2",
"data-uri-to-buffer": "^6.0.2",
"debug": "^4.3.4"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/http-proxy-agent": {
"version": "7.0.2",
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz",
"integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==",
"license": "MIT",
"dependencies": {
"agent-base": "^7.1.0",
"debug": "^4.3.4"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/https-proxy-agent": {
"version": "7.0.6",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz",
"integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==",
"license": "MIT",
"dependencies": {
"agent-base": "^7.1.2",
"debug": "4"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/ip-address": {
"version": "10.1.0",
"resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.1.0.tgz",
"integrity": "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q==",
"license": "MIT",
"engines": {
"node": ">= 12"
}
},
"node_modules/is-fullwidth-code-point": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
"integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==",
"license": "MIT",
"engines": {
"node": ">=8"
}
},
"node_modules/lru-cache": {
"version": "7.18.3",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-7.18.3.tgz",
"integrity": "sha512-jumlc0BIUrS3qJGgIkWZsyfAM7NCWiBcCDhnd+3NNM5KbBmLTgHVfWBcg6W+rLUsIpzpERPsvwUP7CckAQSOoA==",
"license": "ISC",
"engines": {
"node": ">=12"
}
},
"node_modules/mitt": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz",
"integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==",
"license": "MIT"
},
"node_modules/ms": {
"version": "2.1.3",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
"integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
"license": "MIT"
},
"node_modules/netmask": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/netmask/-/netmask-2.0.2.tgz",
"integrity": "sha512-dBpDMdxv9Irdq66304OLfEmQ9tbNRFnFTuZiLo+bD+r332bBmMJ8GBLXklIXXgxd3+v9+KUnZaUR5PJMa75Gsg==",
"license": "MIT",
"engines": {
"node": ">= 0.4.0"
}
},
"node_modules/once": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
"integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
"license": "ISC",
"dependencies": {
"wrappy": "1"
}
},
"node_modules/pac-proxy-agent": {
"version": "7.2.0",
"resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz",
"integrity": "sha512-TEB8ESquiLMc0lV8vcd5Ql/JAKAoyzHFXaStwjkzpOpC5Yv+pIzLfHvjTSdf3vpa2bMiUQrg9i6276yn8666aA==",
"license": "MIT",
"dependencies": {
"@tootallnate/quickjs-emscripten": "^0.23.0",
"agent-base": "^7.1.2",
"debug": "^4.3.4",
"get-uri": "^6.0.1",
"http-proxy-agent": "^7.0.0",
"https-proxy-agent": "^7.0.6",
"pac-resolver": "^7.0.1",
"socks-proxy-agent": "^8.0.5"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/pac-resolver": {
"version": "7.0.1",
"resolved": "https://registry.npmjs.org/pac-resolver/-/pac-resolver-7.0.1.tgz",
"integrity": "sha512-5NPgf87AT2STgwa2ntRMr45jTKrYBGkVU36yT0ig/n/GMAa3oPqhZfIQ2kMEimReg0+t9kZViDVZ83qfVUlckg==",
"license": "MIT",
"dependencies": {
"degenerator": "^5.0.0",
"netmask": "^2.0.2"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/pend": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
"integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==",
"license": "MIT"
},
"node_modules/progress": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz",
"integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==",
"license": "MIT",
"engines": {
"node": ">=0.4.0"
}
},
"node_modules/proxy-agent": {
"version": "6.5.0",
"resolved": "https://registry.npmjs.org/proxy-agent/-/proxy-agent-6.5.0.tgz",
"integrity": "sha512-TmatMXdr2KlRiA2CyDu8GqR8EjahTG3aY3nXjdzFyoZbmB8hrBsTyMezhULIXKnC0jpfjlmiZ3+EaCzoInSu/A==",
"license": "MIT",
"dependencies": {
"agent-base": "^7.1.2",
"debug": "^4.3.4",
"http-proxy-agent": "^7.0.1",
"https-proxy-agent": "^7.0.6",
"lru-cache": "^7.14.1",
"pac-proxy-agent": "^7.1.0",
"proxy-from-env": "^1.1.0",
"socks-proxy-agent": "^8.0.5"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/proxy-from-env": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz",
"integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==",
"license": "MIT"
},
"node_modules/pump": {
"version": "3.0.3",
"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.3.tgz",
"integrity": "sha512-todwxLMY7/heScKmntwQG8CXVkWUOdYxIvY2s0VWAAMh/nd8SoYiRaKjlr7+iCs984f2P8zvrfWcDDYVb73NfA==",
"license": "MIT",
"dependencies": {
"end-of-stream": "^1.1.0",
"once": "^1.3.1"
}
},
"node_modules/puppeteer-core": {
"version": "24.34.0",
"resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-24.34.0.tgz",
"integrity": "sha512-24evawO+mUGW4mvS2a2ivwLdX3gk8zRLZr9HP+7+VT2vBQnm0oh9jJEZmUE3ePJhRkYlZ93i7OMpdcoi2qNCLg==",
"license": "Apache-2.0",
"dependencies": {
"@puppeteer/browsers": "2.11.0",
"chromium-bidi": "12.0.1",
"debug": "^4.4.3",
"devtools-protocol": "0.0.1534754",
"typed-query-selector": "^2.12.0",
"webdriver-bidi-protocol": "0.3.10",
"ws": "^8.18.3"
},
"engines": {
"node": ">=18"
}
},
"node_modules/require-directory": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
"integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/semver": {
"version": "7.7.3",
"resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
"integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==",
"license": "ISC",
"bin": {
"semver": "bin/semver.js"
},
"engines": {
"node": ">=10"
}
},
"node_modules/smart-buffer": {
"version": "4.2.0",
"resolved": "https://registry.npmjs.org/smart-buffer/-/smart-buffer-4.2.0.tgz",
"integrity": "sha512-94hK0Hh8rPqQl2xXc3HsaBoOXKV20MToPkcXvwbISWLEs+64sBq5kFgn2kJDHb1Pry9yrP0dxrCI9RRci7RXKg==",
"license": "MIT",
"engines": {
"node": ">= 6.0.0",
"npm": ">= 3.0.0"
}
},
"node_modules/socks": {
"version": "2.8.7",
"resolved": "https://registry.npmjs.org/socks/-/socks-2.8.7.tgz",
"integrity": "sha512-HLpt+uLy/pxB+bum/9DzAgiKS8CX1EvbWxI4zlmgGCExImLdiad2iCwXT5Z4c9c3Eq8rP2318mPW2c+QbtjK8A==",
"license": "MIT",
"dependencies": {
"ip-address": "^10.0.1",
"smart-buffer": "^4.2.0"
},
"engines": {
"node": ">= 10.0.0",
"npm": ">= 3.0.0"
}
},
"node_modules/socks-proxy-agent": {
"version": "8.0.5",
"resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-8.0.5.tgz",
"integrity": "sha512-HehCEsotFqbPW9sJ8WVYB6UbmIMv7kUUORIF2Nncq4VQvBfNBLibW9YZR5dlYCSUhwcD628pRllm7n+E+YTzJw==",
"license": "MIT",
"dependencies": {
"agent-base": "^7.1.2",
"debug": "^4.3.4",
"socks": "^2.8.3"
},
"engines": {
"node": ">= 14"
}
},
"node_modules/source-map": {
"version": "0.6.1",
"resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
"integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==",
"license": "BSD-3-Clause",
"optional": true,
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/streamx": {
"version": "2.23.0",
"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.23.0.tgz",
"integrity": "sha512-kn+e44esVfn2Fa/O0CPFcex27fjIL6MkVae0Mm6q+E6f0hWv578YCERbv+4m02cjxvDsPKLnmxral/rR6lBMAg==",
"license": "MIT",
"dependencies": {
"events-universal": "^1.0.0",
"fast-fifo": "^1.3.2",
"text-decoder": "^1.1.0"
}
},
"node_modules/string-width": {
"version": "4.2.3",
"resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
"integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
"license": "MIT",
"dependencies": {
"emoji-regex": "^8.0.0",
"is-fullwidth-code-point": "^3.0.0",
"strip-ansi": "^6.0.1"
},
"engines": {
"node": ">=8"
}
},
"node_modules/strip-ansi": {
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
"integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
"license": "MIT",
"dependencies": {
"ansi-regex": "^5.0.1"
},
"engines": {
"node": ">=8"
}
},
"node_modules/tar-fs": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.1.1.tgz",
"integrity": "sha512-LZA0oaPOc2fVo82Txf3gw+AkEd38szODlptMYejQUhndHMLQ9M059uXR+AfS7DNo0NpINvSqDsvyaCrBVkptWg==",
"license": "MIT",
"dependencies": {
"pump": "^3.0.0",
"tar-stream": "^3.1.5"
},
"optionalDependencies": {
"bare-fs": "^4.0.1",
"bare-path": "^3.0.0"
}
},
"node_modules/tar-stream": {
"version": "3.1.7",
"resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz",
"integrity": "sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==",
"license": "MIT",
"dependencies": {
"b4a": "^1.6.4",
"fast-fifo": "^1.2.0",
"streamx": "^2.15.0"
}
},
"node_modules/text-decoder": {
"version": "1.2.3",
"resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.3.tgz",
"integrity": "sha512-3/o9z3X0X0fTupwsYvR03pJ/DjWuqqrfwBgTQzdWDiQSm9KitAyz/9WqsT2JQW7KV2m+bC2ol/zqpW37NHxLaA==",
"license": "Apache-2.0",
"dependencies": {
"b4a": "^1.6.4"
}
},
"node_modules/tslib": {
"version": "2.8.1",
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
"integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==",
"license": "0BSD"
},
"node_modules/typed-query-selector": {
"version": "2.12.0",
"resolved": "https://registry.npmjs.org/typed-query-selector/-/typed-query-selector-2.12.0.tgz",
"integrity": "sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg==",
"license": "MIT"
},
"node_modules/undici-types": {
"version": "7.16.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz",
"integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==",
"license": "MIT",
"optional": true
},
"node_modules/webdriver-bidi-protocol": {
"version": "0.3.10",
"resolved": "https://registry.npmjs.org/webdriver-bidi-protocol/-/webdriver-bidi-protocol-0.3.10.tgz",
"integrity": "sha512-5LAE43jAVLOhB/QqX4bwSiv0Hg1HBfMmOuwBSXHdvg4GMGu9Y0lIq7p4R/yySu6w74WmaR4GM4H9t2IwLW7hgw==",
"license": "Apache-2.0"
},
"node_modules/wrap-ansi": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
"integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
"license": "MIT",
"dependencies": {
"ansi-styles": "^4.0.0",
"string-width": "^4.1.0",
"strip-ansi": "^6.0.0"
},
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/chalk/wrap-ansi?sponsor=1"
}
},
"node_modules/wrappy": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
"integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
"license": "ISC"
},
"node_modules/ws": {
"version": "8.18.3",
"resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz",
"integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==",
"license": "MIT",
"engines": {
"node": ">=10.0.0"
},
"peerDependencies": {
"bufferutil": "^4.0.1",
"utf-8-validate": ">=5.0.2"
},
"peerDependenciesMeta": {
"bufferutil": {
"optional": true
},
"utf-8-validate": {
"optional": true
}
}
},
"node_modules/y18n": {
"version": "5.0.8",
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
"integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==",
"license": "ISC",
"engines": {
"node": ">=10"
}
},
"node_modules/yargs": {
"version": "17.7.2",
"resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz",
"integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==",
"license": "MIT",
"dependencies": {
"cliui": "^8.0.1",
"escalade": "^3.1.1",
"get-caller-file": "^2.0.5",
"require-directory": "^2.1.1",
"string-width": "^4.2.3",
"y18n": "^5.0.5",
"yargs-parser": "^21.1.1"
},
"engines": {
"node": ">=12"
}
},
"node_modules/yargs-parser": {
"version": "21.1.1",
"resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz",
"integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==",
"license": "ISC",
"engines": {
"node": ">=12"
}
},
"node_modules/yauzl": {
"version": "2.10.0",
"resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.10.0.tgz",
"integrity": "sha512-p4a9I6X6nu6IhoGmBqAcbJy1mlC4j27vEPZX9F4L4/vZT3Lyq1VkFHw/V/PUcB9Buo+DG3iHkT0x3Qya58zc3g==",
"license": "MIT",
"dependencies": {
"buffer-crc32": "~0.2.3",
"fd-slicer": "~1.1.0"
}
},
"node_modules/zod": {
"version": "3.25.76",
"resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz",
"integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==",
"license": "MIT",
"funding": {
"url": "https://github.com/sponsors/colinhacks"
}
}
}
}

View File

@@ -1 +0,0 @@
{"name":"archivebox-plugins","private":true,"dependencies":{"puppeteer-core":"^24.34.0"}}

View File

@@ -3,9 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_PAPERSDL": {
"PAPERSDL_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_PAPERSDL", "USE_PAPERSDL"],
"description": "Enable paper downloading with papers-dl"
},
"PAPERSDL_BINARY": {

View File

@@ -170,10 +170,6 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
if normalized != url:
urls_found.add(unescape(normalized))
if not urls_found:
click.echo('No URLs found', err=True)
sys.exit(1)
# Emit Snapshot records to stdout (JSONL)
for found_url in sorted(urls_found):
record = {
@@ -189,7 +185,17 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
print(json.dumps(record))
click.echo(f'Found {len(urls_found)} URLs', err=True)
# Emit ArchiveResult record to mark completion
status = 'succeeded' if urls_found else 'skipped'
output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'
ar_record = {
'type': 'ArchiveResult',
'status': status,
'output_str': output_str,
}
print(json.dumps(ar_record))
click.echo(output_str, err=True)
sys.exit(0)

View File

@@ -27,12 +27,13 @@ class TestParseHtmlUrls:
assert result.returncode == 0, f"Failed to parse example.com: {result.stderr}"
output_file = tmp_path / 'urls.jsonl'
assert output_file.exists(), "Output file not created"
# Verify stdout contains JSONL records for discovered URLs
# example.com links to iana.org
assert 'iana.org' in result.stdout or 'example' in result.stdout, "Expected links from example.com not found"
# Verify output contains IANA link (example.com links to iana.org)
content = output_file.read_text()
assert 'iana.org' in content or 'example' in content, "Expected links from example.com not found"
# Verify ArchiveResult record is present
assert '"type": "ArchiveResult"' in result.stdout, "Missing ArchiveResult record"
assert '"status": "succeeded"' in result.stdout, "Missing success status"
def test_extracts_href_urls(self, tmp_path):
"""Test extracting URLs from anchor tags."""
@@ -56,17 +57,16 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
assert 'Found 3 URLs' in result.stdout
assert 'Found 3 URLs' in result.stderr
output_file = tmp_path / 'urls.jsonl'
assert output_file.exists()
lines = output_file.read_text().strip().split('\n')
assert len(lines) == 3
# Parse Snapshot records from stdout
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
assert len(lines) == 3, f"Expected 3 Snapshot records, got {len(lines)}"
urls = set()
for line in lines:
entry = json.loads(line)
assert entry['type'] == 'Snapshot'
assert 'url' in entry
urls.add(entry['url'])
@@ -74,6 +74,10 @@ class TestParseHtmlUrls:
assert 'https://foo.bar/page' in urls
assert 'http://test.org' in urls
# Verify ArchiveResult record
assert '"type": "ArchiveResult"' in result.stdout
assert '"status": "succeeded"' in result.stdout
def test_ignores_non_http_schemes(self, tmp_path):
"""Test that non-http schemes are ignored."""
input_file = tmp_path / 'page.html'
@@ -96,9 +100,10 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
assert len(lines) == 1
# Parse Snapshot records from stdout
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
assert len(lines) == 1, f"Expected 1 Snapshot record, got {len(lines)}"
entry = json.loads(lines[0])
assert entry['url'] == 'https://valid.com'
@@ -122,8 +127,8 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com/page?a=1&b=2'
def test_deduplicates_urls(self, tmp_path):
@@ -147,8 +152,7 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
assert len(lines) == 1
def test_excludes_source_url(self, tmp_path):
@@ -172,14 +176,13 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
assert len(lines) == 1
entry = json.loads(lines[0])
assert entry['url'] == 'https://other.com'
def test_exits_1_when_no_urls_found(self, tmp_path):
"""Test that script exits with code 1 when no URLs found."""
def test_skips_when_no_urls_found(self, tmp_path):
"""Test that script returns skipped status when no URLs found."""
input_file = tmp_path / 'page.html'
input_file.write_text('<html><body>No links here</body></html>')
@@ -190,8 +193,9 @@ class TestParseHtmlUrls:
text=True,
)
assert result.returncode == 1
assert result.returncode == 0
assert 'No URLs found' in result.stderr
assert '"status": "skipped"' in result.stdout
def test_handles_malformed_html(self, tmp_path):
"""Test handling of malformed HTML."""
@@ -212,8 +216,7 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
assert len(lines) == 2
def test_output_is_valid_json(self, tmp_path):
@@ -229,11 +232,11 @@ class TestParseHtmlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com'
assert 'type' in entry
assert 'plugin' in entry
assert entry['type'] == 'Snapshot'
assert entry['plugin'] == 'parse_html_urls'
if __name__ == '__main__':

View File

@@ -170,10 +170,6 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
# Skip malformed lines
continue
if not urls_found:
click.echo('No URLs found', err=True)
sys.exit(1)
# Emit Tag records first (to stdout as JSONL)
for tag_name in sorted(all_tags):
print(json.dumps({
@@ -185,7 +181,17 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
for entry in urls_found:
print(json.dumps(entry))
click.echo(f'Found {len(urls_found)} URLs, {len(all_tags)} tags', err=True)
# Emit ArchiveResult record to mark completion
status = 'succeeded' if urls_found else 'skipped'
output_str = f'Found {len(urls_found)} URLs, {len(all_tags)} tags' if urls_found else 'No URLs found'
ar_record = {
'type': 'ArchiveResult',
'status': status,
'output_str': output_str,
}
print(json.dumps(ar_record))
click.echo(output_str, err=True)
sys.exit(0)

View File

@@ -34,10 +34,8 @@ class TestParseJsonlUrls:
assert result.returncode == 0
assert 'Found 3 URLs' in result.stdout
output_file = tmp_path / 'urls.jsonl'
assert output_file.exists()
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
assert len(lines) == 3
entries = [json.loads(line) for line in lines]
@@ -64,8 +62,9 @@ class TestParseJsonlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com'
def test_supports_description_as_title(self, tmp_path):
@@ -81,8 +80,9 @@ class TestParseJsonlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['title'] == 'A description'
def test_parses_various_timestamp_formats(self, tmp_path):
@@ -98,8 +98,9 @@ class TestParseJsonlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
# Parser converts timestamp to bookmarked_at
assert 'bookmarked_at' in entry
@@ -116,9 +117,9 @@ class TestParseJsonlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
# Output goes to stdout (JSONL)
# Parser converts tags to separate Tag objects in the output
content = output_file.read_text()
content = result.stdout
assert 'tech' in content or 'news' in content or 'Tag' in content
def test_parses_tags_as_list(self, tmp_path):
@@ -134,9 +135,9 @@ class TestParseJsonlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
# Output goes to stdout (JSONL)
# Parser converts tags to separate Tag objects in the output
content = output_file.read_text()
content = result.stdout
assert 'tech' in content or 'news' in content or 'Tag' in content
def test_skips_malformed_lines(self, tmp_path):
@@ -156,8 +157,8 @@ class TestParseJsonlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
assert len(lines) == 2
def test_skips_entries_without_url(self, tmp_path):
@@ -177,12 +178,12 @@ class TestParseJsonlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
assert len(lines) == 2
def test_exits_1_when_no_urls_found(self, tmp_path):
"""Test that script exits with code 1 when no URLs found."""
def test_skips_when_no_urls_found(self, tmp_path):
"""Test that script returns skipped status when no URLs found."""
input_file = tmp_path / 'empty.jsonl'
input_file.write_text('{"title": "No URL"}\n')
@@ -193,8 +194,9 @@ class TestParseJsonlUrls:
text=True,
)
assert result.returncode == 1
assert result.returncode == 0
assert 'No URLs found' in result.stderr
assert '"status": "skipped"' in result.stdout
def test_exits_1_when_file_not_found(self, tmp_path):
"""Test that script exits with code 1 when file doesn't exist."""
@@ -221,8 +223,9 @@ class TestParseJsonlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com/page?a=1&b=2'
assert entry['title'] == 'Test & Title'
@@ -244,8 +247,8 @@ class TestParseJsonlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
assert len(lines) == 2
def test_output_includes_required_fields(self, tmp_path):
@@ -261,8 +264,9 @@ class TestParseJsonlUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com'
assert 'type' in entry
assert 'plugin' in entry

View File

@@ -207,23 +207,28 @@ def main(url: str, snapshot_id: str = None):
urls_found.append(entry)
if not urls_found:
click.echo('No bookmarks found', err=True)
sys.exit(1)
# Emit Tag records first (to stdout as JSONL)
for tag_name in sorted(all_tags):
print(json.dumps({
'type': 'Tag',
'name': tag_name,
}))
# Write urls.jsonl
with open('urls.jsonl', 'w') as f:
# Write Tag records first
for tag_name in sorted(all_tags):
f.write(json.dumps({
'type': 'Tag',
'name': tag_name,
}) + '\n')
# Write Snapshot records
for entry in urls_found:
f.write(json.dumps(entry) + '\n')
# Emit Snapshot records (to stdout as JSONL)
for entry in urls_found:
print(json.dumps(entry))
click.echo(f'Found {len(urls_found)} URLs, {len(all_tags)} tags')
# Emit ArchiveResult record to mark completion
status = 'succeeded' if urls_found else 'skipped'
output_str = f'Found {len(urls_found)} URLs, {len(all_tags)} tags' if urls_found else 'No bookmarks found'
ar_record = {
'type': 'ArchiveResult',
'status': status,
'output_str': output_str,
}
print(json.dumps(ar_record))
click.echo(output_str, err=True)
sys.exit(0)

View File

@@ -39,10 +39,8 @@ class TestParseNetscapeUrls:
assert result.returncode == 0
assert 'Found 3 URLs' in result.stdout
output_file = tmp_path / 'urls.jsonl'
assert output_file.exists()
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
assert len(lines) == 3
entries = [json.loads(line) for line in lines]
@@ -71,8 +69,9 @@ class TestParseNetscapeUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
# Parser converts timestamp to bookmarked_at
assert 'bookmarked_at' in entry
@@ -91,8 +90,9 @@ class TestParseNetscapeUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert 'q=test+query' in entry['url']
assert 'page=1' in entry['url']
@@ -111,13 +111,14 @@ class TestParseNetscapeUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com/page?a=1&b=2'
assert entry['title'] == 'Test & Title'
def test_exits_1_when_no_bookmarks_found(self, tmp_path):
"""Test that script exits with code 1 when no bookmarks found."""
def test_skips_when_no_bookmarks_found(self, tmp_path):
"""Test that script returns skipped status when no bookmarks found."""
input_file = tmp_path / 'empty.html'
input_file.write_text('''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<TITLE>Bookmarks</TITLE>
@@ -133,8 +134,9 @@ class TestParseNetscapeUrls:
text=True,
)
assert result.returncode == 1
assert result.returncode == 0
assert 'No bookmarks found' in result.stderr
assert '"status": "skipped"' in result.stdout
def test_exits_1_when_file_not_found(self, tmp_path):
"""Test that script exits with code 1 when file doesn't exist."""
@@ -173,8 +175,8 @@ class TestParseNetscapeUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
urls = {json.loads(line)['url'] for line in lines}
assert 'https://example.com/nested1' in urls
@@ -196,8 +198,9 @@ class TestParseNetscapeUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com'

View File

@@ -40,8 +40,8 @@ class TestFirefoxFormat:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
entries = [json.loads(line) for line in lines]
assert len(entries) == 2
@@ -70,12 +70,13 @@ class TestFirefoxFormat:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL) - get all JSONL records
all_lines = [line for line in result.stdout.strip().split('\n') if line.strip() and line.startswith('{')]
records = [json.loads(line) for line in all_lines]
# Should have Tag records + Snapshot records
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
tags = [r for r in records if r.get('type') == 'Tag']
snapshots = [r for r in records if r.get('type') == 'Snapshot']
tag_names = {t['name'] for t in tags}
assert 'coding' in tag_names
@@ -112,8 +113,8 @@ class TestFirefoxFormat:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
entries = [json.loads(line) for line in lines]
urls = {e['url'] for e in entries}
@@ -141,8 +142,8 @@ class TestFirefoxFormat:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
entries = [json.loads(line) for line in lines]
assert entries[0]['url'] == 'https://example.com'
@@ -175,8 +176,8 @@ class TestChromeFormat:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
entries = [json.loads(line) for line in lines]
# Should correctly parse microsecond timestamps
@@ -212,8 +213,8 @@ class TestChromeFormat:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
entries = [json.loads(line) for line in lines]
urls = {e['url'] for e in entries}
@@ -248,8 +249,8 @@ class TestSafariFormat:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
entries = [json.loads(line) for line in lines]
urls = {e['url'] for e in entries}
@@ -279,8 +280,8 @@ class TestSafariFormat:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
entries = [json.loads(line) for line in lines]
urls = {e['url'] for e in entries}
@@ -312,8 +313,8 @@ class TestEdgeFormat:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
entries = [json.loads(line) for line in lines]
urls = {e['url'] for e in entries}
@@ -340,8 +341,9 @@ class TestTimestampFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
dt = datetime.fromisoformat(entry['bookmarked_at'])
assert dt.year == 2021
@@ -366,8 +368,9 @@ class TestTimestampFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
dt = datetime.fromisoformat(entry['bookmarked_at'])
# Should detect Mac epoch and convert correctly to 2021
@@ -389,8 +392,9 @@ class TestTimestampFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
dt = datetime.fromisoformat(entry['bookmarked_at'])
# Should detect Mac epoch and convert to 2024
@@ -412,8 +416,9 @@ class TestTimestampFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
dt = datetime.fromisoformat(entry['bookmarked_at'])
assert dt.year == 2021
@@ -437,8 +442,9 @@ class TestTimestampFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
dt = datetime.fromisoformat(entry['bookmarked_at'])
assert dt.year == 2021
@@ -461,8 +467,9 @@ class TestTimestampFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
dt = datetime.fromisoformat(entry['bookmarked_at'])
# Should detect Mac epoch with milliseconds and convert to 2021
@@ -487,8 +494,8 @@ class TestTimestampFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
entries = [json.loads(line) for line in lines]
# All should be parsed to reasonable dates (2020-2025)
@@ -512,8 +519,9 @@ class TestTimestampFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
dt = datetime.fromisoformat(entry['bookmarked_at'])
assert dt.year == 1996
@@ -534,8 +542,9 @@ class TestTimestampFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
dt = datetime.fromisoformat(entry['bookmarked_at'])
assert dt.year == 2024
@@ -555,8 +564,9 @@ class TestTimestampFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
# Should still extract URL but skip timestamp
assert entry['url'] == 'https://example.com'
@@ -577,8 +587,9 @@ class TestTimestampFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
# Timestamp 0 = 1970, which is before MIN_REASONABLE_YEAR (1995)
# Parser should skip it as unreasonable
@@ -603,8 +614,9 @@ class TestTimestampFormats:
# Should handle gracefully (extracts URL, may or may not include timestamp)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com'
# If timestamp is included, should be reasonable (1969)
if 'bookmarked_at' in entry:
@@ -632,8 +644,8 @@ class TestBookmarkAttributes:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
entries = [json.loads(line) for line in lines]
# Both should be extracted
@@ -654,8 +666,9 @@ class TestBookmarkAttributes:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert 'google.com' in entry['url']
@@ -674,8 +687,9 @@ class TestBookmarkAttributes:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com/login'
@@ -704,9 +718,9 @@ class TestEdgeCases:
# Current regex works line-by-line, so this might not match
# Document current behavior
if result.returncode == 0:
output_file = tmp_path / 'urls.jsonl'
# Output goes to stdout (JSONL)
if output_file.exists():
content = output_file.read_text().strip()
content = result.stdout.strip()
if content:
entry = json.loads(content)
assert 'example.com' in entry['url']
@@ -727,8 +741,9 @@ class TestEdgeCases:
# Should succeed and extract URL without timestamp
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com'
assert entry['title'] == 'No Date'
assert 'bookmarked_at' not in entry
@@ -768,8 +783,8 @@ class TestEdgeCases:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
entries = [json.loads(line) for line in lines]
assert len(entries) == 3
@@ -792,8 +807,8 @@ class TestEdgeCases:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
entries = [json.loads(line) for line in lines]
# Both should be extracted
@@ -815,8 +830,9 @@ class TestEdgeCases:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['url'].startswith('data:')
@@ -835,8 +851,9 @@ class TestEdgeCases:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['url'].startswith('file://')
@@ -856,8 +873,9 @@ class TestEdgeCases:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert len(entry['url']) > 1000
assert entry['url'].startswith('https://example.com')
@@ -881,7 +899,7 @@ class TestEdgeCases:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
# Output goes to stdout (JSONL)
lines = output_file.read_text(encoding='utf-8').strip().split('\n')
entries = [json.loads(line) for line in lines]
@@ -915,8 +933,8 @@ class TestEdgeCases:
assert result.returncode == 0
assert 'Found 1000 URLs' in result.stdout
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
# Should have 10 unique tags + 1000 snapshots
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']

View File

@@ -70,61 +70,57 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
# Parse the feed
feed = feedparser.parse(content)
if not feed.entries:
click.echo('No entries found in feed', err=True)
sys.exit(1)
urls_found = []
all_tags = set()
for item in feed.entries:
item_url = getattr(item, 'link', None)
if not item_url:
continue
if not feed.entries:
# No entries - will emit skipped status at end
pass
else:
for item in feed.entries:
item_url = getattr(item, 'link', None)
if not item_url:
continue
title = getattr(item, 'title', None)
title = getattr(item, 'title', None)
# Get bookmarked_at (published/updated date as ISO 8601)
bookmarked_at = None
if hasattr(item, 'published_parsed') and item.published_parsed:
bookmarked_at = datetime.fromtimestamp(mktime(item.published_parsed), tz=timezone.utc).isoformat()
elif hasattr(item, 'updated_parsed') and item.updated_parsed:
bookmarked_at = datetime.fromtimestamp(mktime(item.updated_parsed), tz=timezone.utc).isoformat()
# Get bookmarked_at (published/updated date as ISO 8601)
bookmarked_at = None
if hasattr(item, 'published_parsed') and item.published_parsed:
bookmarked_at = datetime.fromtimestamp(mktime(item.published_parsed), tz=timezone.utc).isoformat()
elif hasattr(item, 'updated_parsed') and item.updated_parsed:
bookmarked_at = datetime.fromtimestamp(mktime(item.updated_parsed), tz=timezone.utc).isoformat()
# Get tags
tags = ''
if hasattr(item, 'tags') and item.tags:
try:
tags = ','.join(tag.term for tag in item.tags if hasattr(tag, 'term'))
# Collect unique tags
for tag in tags.split(','):
tag = tag.strip()
if tag:
all_tags.add(tag)
except (AttributeError, TypeError):
pass
# Get tags
tags = ''
if hasattr(item, 'tags') and item.tags:
try:
tags = ','.join(tag.term for tag in item.tags if hasattr(tag, 'term'))
# Collect unique tags
for tag in tags.split(','):
tag = tag.strip()
if tag:
all_tags.add(tag)
except (AttributeError, TypeError):
pass
entry = {
'type': 'Snapshot',
'url': unescape(item_url),
'plugin': PLUGIN_NAME,
'depth': depth + 1,
}
if snapshot_id:
entry['parent_snapshot_id'] = snapshot_id
if crawl_id:
entry['crawl_id'] = crawl_id
if title:
entry['title'] = unescape(title)
if bookmarked_at:
entry['bookmarked_at'] = bookmarked_at
if tags:
entry['tags'] = tags
urls_found.append(entry)
if not urls_found:
click.echo('No valid URLs found in feed entries', err=True)
sys.exit(1)
entry = {
'type': 'Snapshot',
'url': unescape(item_url),
'plugin': PLUGIN_NAME,
'depth': depth + 1,
}
if snapshot_id:
entry['parent_snapshot_id'] = snapshot_id
if crawl_id:
entry['crawl_id'] = crawl_id
if title:
entry['title'] = unescape(title)
if bookmarked_at:
entry['bookmarked_at'] = bookmarked_at
if tags:
entry['tags'] = tags
urls_found.append(entry)
# Emit Tag records first (to stdout as JSONL)
for tag_name in sorted(all_tags):
@@ -137,7 +133,17 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
for entry in urls_found:
print(json.dumps(entry))
click.echo(f'Found {len(urls_found)} URLs, {len(all_tags)} tags', err=True)
# Emit ArchiveResult record to mark completion
status = 'succeeded' if urls_found else 'skipped'
output_str = f'Found {len(urls_found)} URLs, {len(all_tags)} tags' if urls_found else 'No URLs found'
ar_record = {
'type': 'ArchiveResult',
'status': status,
'output_str': output_str,
}
print(json.dumps(ar_record))
click.echo(output_str, err=True)
sys.exit(0)

View File

@@ -28,10 +28,8 @@ class TestParseRssUrls:
# HN RSS feed should parse successfully
if result.returncode == 0:
output_file = tmp_path / 'urls.jsonl'
assert output_file.exists(), "Output file not created"
content = output_file.read_text()
# Output goes to stdout (JSONL)
content = result.stdout
assert len(content) > 0, "No URLs extracted from real RSS feed"
# Verify at least one URL was extracted
@@ -70,10 +68,8 @@ class TestParseRssUrls:
assert result.returncode == 0
assert 'Found 2 URLs' in result.stdout
output_file = tmp_path / 'urls.jsonl'
assert output_file.exists()
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
assert len(lines) == 2
entries = [json.loads(line) for line in lines]
@@ -112,15 +108,15 @@ class TestParseRssUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
urls = {json.loads(line)['url'] for line in lines}
assert 'https://atom.example.com/entry/1' in urls
assert 'https://atom.example.com/entry/2' in urls
def test_exits_1_when_no_entries(self, tmp_path):
"""Test that script exits with code 1 when feed has no entries."""
def test_skips_when_no_entries(self, tmp_path):
"""Test that script returns skipped status when feed has no entries."""
input_file = tmp_path / 'empty.rss'
input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
@@ -137,8 +133,9 @@ class TestParseRssUrls:
text=True,
)
assert result.returncode == 1
assert 'No entries found' in result.stderr
assert result.returncode == 0
assert 'No URLs found' in result.stderr
assert '"status": "skipped"' in result.stdout
def test_exits_1_when_file_not_found(self, tmp_path):
"""Test that script exits with code 1 when file doesn't exist."""
@@ -174,8 +171,9 @@ class TestParseRssUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com/page?a=1&b=2'
def test_includes_optional_metadata(self, tmp_path):
@@ -201,8 +199,9 @@ class TestParseRssUrls:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com/test'
assert entry['title'] == 'Test Title'
# Parser converts timestamp to bookmarked_at

View File

@@ -41,8 +41,8 @@ class TestRssVariants:
)
assert result.returncode == 0, f"Failed: {result.stderr}"
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com/article1'
@@ -82,8 +82,8 @@ class TestRssVariants:
)
assert result.returncode == 0, f"Failed: {result.stderr}"
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
entries = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
urls = {e['url'] for e in entries}
@@ -122,8 +122,8 @@ class TestRssVariants:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
content = output_file.read_text().strip()
# Output goes to stdout (JSONL)
content = result.stdout.strip()
lines = content.split('\n')
# Check for Tag records
@@ -171,8 +171,8 @@ class TestAtomVariants:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
tag_names = {t['name'] for t in tags}
@@ -207,8 +207,9 @@ class TestAtomVariants:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
# feedparser should pick the alternate link
assert 'atom.example.com/article' in entry['url']
@@ -239,8 +240,9 @@ class TestDateFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert 'bookmarked_at' in entry
assert '2020-01-15' in entry['bookmarked_at']
@@ -265,8 +267,9 @@ class TestDateFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert 'bookmarked_at' in entry
assert '2024-01-15' in entry['bookmarked_at']
@@ -292,8 +295,9 @@ class TestDateFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
# Should use published date (Jan 10) not updated date (Jan 15)
assert '2024-01-10' in entry['bookmarked_at']
@@ -318,8 +322,9 @@ class TestDateFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert '2024-01-20' in entry['bookmarked_at']
def test_no_date(self, tmp_path):
@@ -344,8 +349,9 @@ class TestDateFormats:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert 'bookmarked_at' not in entry
@@ -377,8 +383,8 @@ class TestTagsAndCategories:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
tag_names = {t['name'] for t in tags}
@@ -414,8 +420,8 @@ class TestTagsAndCategories:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
tag_names = {t['name'] for t in tags}
@@ -445,8 +451,9 @@ class TestTagsAndCategories:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert 'tags' not in entry or entry['tags'] == ''
def test_duplicate_tags(self, tmp_path):
@@ -474,8 +481,8 @@ class TestTagsAndCategories:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
# Tag records should be unique
tag_names = [t['name'] for t in tags]
@@ -514,8 +521,8 @@ class TestCustomNamespaces:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
entry = snapshots[0]
@@ -550,8 +557,9 @@ class TestCustomNamespaces:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com/podcast/1'
assert entry['title'] == 'Podcast Episode 1'
@@ -583,8 +591,8 @@ class TestCustomNamespaces:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
entry = snapshots[0]
@@ -617,8 +625,9 @@ class TestEdgeCases:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com/notitle'
assert 'title' not in entry
@@ -649,8 +658,9 @@ class TestEdgeCases:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
# Should only have the entry with a link
assert entry['url'] == 'https://example.com/haslink'
@@ -678,8 +688,9 @@ class TestEdgeCases:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert entry['title'] == 'Using <div> & <span> tags'
@@ -708,8 +719,8 @@ class TestEdgeCases:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
tag_names = {t['name'] for t in tags}
@@ -740,8 +751,9 @@ class TestEdgeCases:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
# feedparser should strip HTML tags
assert 'HTML' in entry['title']
@@ -770,8 +782,9 @@ class TestEdgeCases:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
# feedparser may convert relative to absolute, or leave as-is
assert 'article/relative' in entry['url']
@@ -800,7 +813,7 @@ class TestEdgeCases:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
# Output goes to stdout (JSONL)
lines = output_file.read_text(encoding='utf-8').strip().split('\n')
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
@@ -831,8 +844,9 @@ class TestEdgeCases:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert len(entry['title']) == 1000
assert entry['title'] == long_title
@@ -870,8 +884,8 @@ class TestEdgeCases:
assert result.returncode == 0
assert 'Found 100 URLs' in result.stdout
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
# Should have 10 unique tags (Tag0-Tag9) + 100 snapshots
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
@@ -912,8 +926,8 @@ class TestRealWorldFeeds:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
entry = snapshots[0]
@@ -944,8 +958,8 @@ class TestRealWorldFeeds:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
entry = snapshots[0]
@@ -976,8 +990,9 @@ class TestRealWorldFeeds:
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
# Output goes to stdout (JSONL)
lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
entry = json.loads(lines[0])
assert 'youtube.com' in entry['url']
assert 'dQw4w9WgXcQ' in entry['url']

View File

@@ -117,20 +117,28 @@ def main(url: str, snapshot_id: str = None):
if cleaned_url != url:
urls_found.add(cleaned_url)
if not urls_found:
click.echo('No URLs found', err=True)
sys.exit(1)
# Emit Snapshot records to stdout (JSONL)
for found_url in sorted(urls_found):
record = {
'type': 'Snapshot',
'url': found_url,
'plugin': PLUGIN_NAME,
}
if snapshot_id:
record['parent_snapshot_id'] = snapshot_id
print(json.dumps(record))
# Write urls.jsonl
with open('urls.jsonl', 'w') as f:
for found_url in sorted(urls_found):
f.write(json.dumps({
'type': 'Snapshot',
'url': found_url,
'plugin': PLUGIN_NAME,
}) + '\n')
# Emit ArchiveResult record to mark completion
status = 'succeeded' if urls_found else 'skipped'
output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'
ar_record = {
'type': 'ArchiveResult',
'status': status,
'output_str': output_str,
}
print(json.dumps(ar_record))
click.echo(f'Found {len(urls_found)} URLs')
click.echo(output_str, err=True)
sys.exit(0)

View File

@@ -32,17 +32,16 @@ https://www.iana.org/domains/reserved
)
assert result.returncode == 0, f"Failed: {result.stderr}"
assert 'Found 3 URLs' in result.stdout
assert 'Found 3 URLs' in result.stderr
output_file = tmp_path / 'urls.jsonl'
assert output_file.exists()
lines = output_file.read_text().strip().split('\n')
# Parse Snapshot records from stdout
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
assert len(lines) == 3
urls = set()
for line in lines:
entry = json.loads(line)
assert entry['type'] == 'Snapshot'
assert 'url' in entry
urls.add(entry['url'])
@@ -51,6 +50,10 @@ https://www.iana.org/domains/reserved
assert 'https://example.com/page' in urls
assert 'https://www.iana.org/domains/reserved' in urls
# Verify ArchiveResult record
assert '"type": "ArchiveResult"' in result.stdout
assert '"status": "succeeded"' in result.stdout
def test_extracts_urls_from_mixed_content(self, tmp_path):
"""Test extracting URLs embedded in prose text."""
input_file = tmp_path / 'mixed.txt'
@@ -68,8 +71,7 @@ Also see https://github.com/user/repo for the code.
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
urls = {json.loads(line)['url'] for line in lines}
assert 'https://blog.example.com/post' in urls
@@ -92,15 +94,14 @@ Also see https://github.com/user/repo for the code.
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
urls = {json.loads(line)['url'] for line in lines}
assert 'https://example.com/page' in urls
assert any('wikipedia.org' in u for u in urls)
def test_exits_1_when_no_urls_found(self, tmp_path):
"""Test that script exits with code 1 when no URLs found."""
def test_skips_when_no_urls_found(self, tmp_path):
"""Test that script returns skipped status when no URLs found."""
input_file = tmp_path / 'empty.txt'
input_file.write_text('no urls here, just plain text')
@@ -111,8 +112,9 @@ Also see https://github.com/user/repo for the code.
text=True,
)
assert result.returncode == 1
assert result.returncode == 0
assert 'No URLs found' in result.stderr
assert '"status": "skipped"' in result.stdout
def test_exits_1_when_file_not_found(self, tmp_path):
"""Test that script exits with code 1 when file doesn't exist."""
@@ -144,12 +146,11 @@ https://other.com
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
assert len(lines) == 2
def test_appends_to_existing_file(self, tmp_path):
"""Test that output creates urls.jsonl with extracted URLs."""
def test_outputs_to_stdout(self, tmp_path):
"""Test that output goes to stdout in JSONL format."""
input_file = tmp_path / 'urls.txt'
input_file.write_text('https://new.com\nhttps://other.com')
@@ -161,8 +162,7 @@ https://other.com
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
assert len(lines) == 2
urls = {json.loads(line)['url'] for line in lines}
@@ -182,11 +182,11 @@ https://other.com
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
entry = json.loads(output_file.read_text().strip())
lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
entry = json.loads(lines[0])
assert entry['url'] == 'https://example.com'
assert 'type' in entry
assert 'plugin' in entry
assert entry['type'] == 'Snapshot'
assert entry['plugin'] == 'parse_txt_urls'
if __name__ == '__main__':

View File

@@ -0,0 +1,28 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"PDF_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_PDF", "USE_PDF"],
"description": "Enable PDF generation"
},
"PDF_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for PDF generation in seconds"
},
"PDF_RESOLUTION": {
"type": "string",
"default": "1440,2000",
"pattern": "^\\d+,\\d+$",
"x-fallback": "RESOLUTION",
"description": "PDF page resolution (width,height)"
}
}
}

View File

@@ -2,6 +2,7 @@
Integration tests for pdf plugin
Tests verify:
pass
1. Hook script exists
2. Dependencies installed via chrome validation hooks
3. Verify deps with abx-pkg
@@ -48,7 +49,9 @@ def test_chrome_validation_and_install():
# Parse Dependency request from JSONL
dependency_request = None
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
@@ -79,7 +82,9 @@ def test_chrome_validation_and_install():
# Verify installation via JSONL output
for line in install_result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
@@ -126,6 +131,7 @@ def test_extracts_pdf_from_example_com():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -138,8 +144,9 @@ def test_extracts_pdf_from_example_com():
# Skip verification if network failed
if result_json['status'] != 'succeeded':
pass
if 'TIMED_OUT' in result_json.get('output_str', '') or 'timeout' in result_json.get('output_str', '').lower():
pytest.skip(f"Network timeout occurred: {result_json['output_str']}")
pass
pytest.fail(f"Extraction failed: {result_json}")
assert result.returncode == 0, f"Should exit 0 on success: {result.stderr}"

View File

@@ -1,390 +0,0 @@
#!/usr/bin/env python3
"""
Shared utilities for extractor plugin hooks.
This module provides common functionality for all extractor plugins to ensure
consistent behavior, output format, error handling, and timing.
All extractor plugins should:
1. Import and use these utilities
2. Output consistent metadata (CMD, VERSION, OUTPUT, timing)
3. Write all files to $PWD
4. Return proper exit codes (0=success, 1=failure)
5. Be runnable standalone without any archivebox imports
"""
import json
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# File extensions treated as static assets that generally don't need
# browser-based extraction (documents, media, archives, installers).
STATIC_EXTENSIONS = (
    '.pdf', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.ico',
    '.mp4', '.mp3', '.m4a', '.webm', '.mkv', '.avi', '.mov',
    '.zip', '.tar', '.gz', '.bz2', '.xz', '.7z', '.rar',
    '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    '.exe', '.dmg', '.apk', '.deb', '.rpm',
)


def is_static_file(url: str) -> bool:
    """Return True if the URL path (query/fragment stripped) ends in a static-file extension."""
    path = url.lower()
    path = path.partition('?')[0]
    path = path.partition('#')[0]
    return path.endswith(STATIC_EXTENSIONS)
def get_env(name: str, default: str = '') -> str:
    """Return the environment variable *name* stripped of whitespace, or *default* (also stripped)."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret an environment variable as a boolean flag.

    Recognizes true/1/yes/on and false/0/no/off (case-insensitive);
    anything else (including unset) yields *default*.
    """
    flag = get_env(name, '').lower()
    if flag in ('true', '1', 'yes', 'on'):
        return True
    return False if flag in ('false', '0', 'no', 'off') else default


def get_env_int(name: str, default: int = 0) -> int:
    """Interpret an environment variable as an integer, falling back to *default* on parse failure."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
def find_binary(bin_name: str, env_var: str | None = None) -> str | None:
"""Find binary from environment variable or PATH."""
if env_var:
binary = get_env(env_var)
if binary and os.path.isfile(binary):
return binary
return shutil.which(bin_name)
def get_version(binary: str, version_args: list[str] | None = None) -> str:
"""Get binary version string."""
if not binary or not os.path.isfile(binary):
return ''
args = version_args or ['--version']
try:
result = subprocess.run(
[binary] + args,
capture_output=True,
text=True,
timeout=10
)
# Return first non-empty line, truncated
for line in result.stdout.split('\n'):
line = line.strip()
if line:
return line[:64]
return ''
except Exception:
return ''
class ExtractorResult:
"""
Tracks extractor plugin execution and produces consistent output.
Usage:
result = ExtractorResult(name='wget', url=url)
result.cmd = ['wget', url]
result.version = '1.21'
# ... do extraction ...
result.output_str = 'example.com/index.html'
result.status = 'succeeded'
result.finish()
sys.exit(result.exit_code)
"""
def __init__(self, name: str, url: str, snapshot_id: str = ''):
self.name = name
self.url = url
self.snapshot_id = snapshot_id
self.start_ts = datetime.now(timezone.utc)
self.end_ts: datetime | None = None
self.cmd: list[str] = []
self.version: str = ''
self.output_str: str = '' # Human-readable output summary
self.status: str = 'failed' # 'succeeded', 'failed', 'skipped'
self.stdout: str = ''
self.stderr: str = ''
self.returncode: int | None = None
self.error: str = ''
self.hints: list[str] = []
# Dependency info for missing binary
self.dependency_needed: str = ''
self.bin_providers: str = ''
@property
def duration(self) -> float:
"""Duration in seconds."""
if self.end_ts:
return (self.end_ts - self.start_ts).total_seconds()
return (datetime.now(timezone.utc) - self.start_ts).total_seconds()
@property
def exit_code(self) -> int:
"""Exit code based on status."""
if self.status == 'succeeded':
return 0
if self.status == 'skipped':
return 0 # Skipped is not a failure
return 1
def finish(self, status: str | None = None):
"""Mark extractor plugin execution as finished and print results."""
self.end_ts = datetime.now(timezone.utc)
if status:
self.status = status
self._print_results()
def _print_results(self):
"""Print consistent output for hooks.py to parse."""
import sys
# Print timing
print(f"START_TS={self.start_ts.isoformat()}")
print(f"END_TS={self.end_ts.isoformat() if self.end_ts else ''}")
print(f"DURATION={self.duration:.2f}")
# Print command info
if self.cmd:
print(f"CMD={' '.join(str(c) for c in self.cmd)}")
if self.version:
print(f"VERSION={self.version}")
# Print output path
if self.output_str:
print(f"OUTPUT={self.output_str}")
# Print status
print(f"STATUS={self.status}")
# Print dependency info if needed
if self.dependency_needed:
print(f"DEPENDENCY_NEEDED={self.dependency_needed}", file=sys.stderr)
if self.bin_providers:
print(f"BIN_PROVIDERS={self.bin_providers}", file=sys.stderr)
# Print error info
if self.error:
print(f"ERROR={self.error}", file=sys.stderr)
for hint in self.hints:
print(f"HINT={hint}", file=sys.stderr)
# Print clean JSONL result for hooks.py to parse
result_json = {
'type': 'ArchiveResult',
'status': self.status,
'output_str': self.output_str or self.error or '',
}
if self.cmd:
result_json['cmd'] = self.cmd
if self.version:
result_json['cmd_version'] = self.version
print(json.dumps(result_json))
def run_shell_command(
    cmd: list[str],
    cwd: str | Path | None = None,
    timeout: int = 60,
    result: ExtractorResult | None = None,
) -> subprocess.CompletedProcess:
    """
    Run a shell command with output capture and a timeout.

    When *result* is provided, mirrors stdout, stderr, and the return code
    onto it on success, or records an error message before re-raising on
    timeout or any other failure.
    """
    workdir = str(cwd or Path.cwd())
    try:
        proc = subprocess.run(
            cmd,
            cwd=workdir,
            capture_output=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:
        if result:
            result.error = f"Command timed out after {timeout} seconds"
            # TimeoutExpired may carry partial output captured before the kill.
            result.stdout = exc.stdout.decode('utf-8', errors='replace') if exc.stdout else ''
            result.stderr = exc.stderr.decode('utf-8', errors='replace') if exc.stderr else ''
        raise
    except Exception as exc:
        if result:
            result.error = f"{type(exc).__name__}: {exc}"
        raise
    if result:
        result.stdout = proc.stdout.decode('utf-8', errors='replace')
        result.stderr = proc.stderr.decode('utf-8', errors='replace')
        result.returncode = proc.returncode
    return proc
def chrome_args(
    headless: bool = True,
    sandbox: bool = False,
    resolution: str = '1440,900',
    user_agent: str = '',
    check_ssl: bool = True,
    user_data_dir: str = '',
    profile_name: str = 'Default',
    extra_args: list[str] | None = None,
) -> list[str]:
    """
    Build Chrome/Chromium command line arguments.

    Based on the old CHROME_CONFIG.chrome_args() implementation.

    Args:
        headless: run with the new headless mode (--headless=new).
        sandbox: when False, disables the Chrome sandbox (required in many
            container environments); also disables /dev/shm usage.
        resolution: window size as 'width,height'; empty string skips the flag.
        user_agent: custom User-Agent string; empty string skips the flag.
        check_ssl: when False, disables web security and ignores cert errors.
        user_data_dir: Chrome profile directory; when set, --profile-directory
            is also emitted using profile_name.
        profile_name: profile within user_data_dir (only used if user_data_dir set).
        extra_args: appended verbatim at the end, so they can override earlier flags.

    Returns:
        List of CLI argument strings (binary path NOT included).
    """
    args = [
        # Disable unnecessary features
        '--disable-sync',
        '--no-pings',
        '--no-first-run',
        '--no-default-browser-check',
        '--disable-default-apps',
        '--disable-infobars',
        '--disable-blink-features=AutomationControlled',
        # Deterministic behavior (fixed JS random seed for reproducible captures)
        '--js-flags=--random-seed=1157259159',
        '--deterministic-mode',
        '--deterministic-fetch',
        # Performance
        '--disable-background-networking',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        '--disable-ipc-flooding-protection',
        # Disable prompts/popups
        '--deny-permission-prompts',
        '--disable-notifications',
        '--disable-popup-blocking',
        '--noerrdialogs',
        # Security/privacy
        '--disable-client-side-phishing-detection',
        '--disable-domain-reliability',
        '--disable-component-update',
        '--safebrowsing-disable-auto-update',
        '--password-store=basic',
        '--use-mock-keychain',
        # GPU/rendering (stable fonts/colors across machines)
        '--force-gpu-mem-available-mb=4096',
        '--font-render-hinting=none',
        '--force-color-profile=srgb',
        '--disable-partial-raster',
        '--disable-skia-runtime-opts',
        '--disable-2d-canvas-clip-aa',
        '--disable-lazy-loading',
        # Media
        '--use-fake-device-for-media-stream',
        '--disable-gesture-requirement-for-media-playback',
    ]
    if headless:
        args.append('--headless=new')
    if not sandbox:
        # Needed when running as root / inside containers without user namespaces.
        args.extend([
            '--no-sandbox',
            '--no-zygote',
            '--disable-dev-shm-usage',
            '--disable-software-rasterizer',
        ])
    if resolution:
        args.append(f'--window-size={resolution}')
    if not check_ssl:
        args.extend([
            '--disable-web-security',
            '--ignore-certificate-errors',
        ])
    if user_agent:
        args.append(f'--user-agent={user_agent}')
    if user_data_dir:
        args.append(f'--user-data-dir={user_data_dir}')
        args.append(f'--profile-directory={profile_name}')
    if extra_args:
        args.extend(extra_args)
    return args
def chrome_cleanup_lockfile(user_data_dir: str | Path):
"""Remove Chrome SingletonLock file that can prevent browser from starting."""
if not user_data_dir:
return
lockfile = Path(user_data_dir) / 'SingletonLock'
try:
lockfile.unlink(missing_ok=True)
except Exception:
pass
# Common Chrome binary names to search for
CHROME_BINARY_NAMES = [
'google-chrome',
'google-chrome-stable',
'chromium',
'chromium-browser',
'chrome',
]
CHROME_BINARY_NAMES_MACOS = [
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
]
def find_chrome() -> str | None:
"""Find Chrome/Chromium binary."""
# Check environment first
chrome = get_env('CHROME_BINARY')
if chrome and os.path.isfile(chrome):
return chrome
# Search PATH
for name in CHROME_BINARY_NAMES:
binary = shutil.which(name)
if binary:
return binary
# Check macOS locations
for path in CHROME_BINARY_NAMES_MACOS:
if os.path.isfile(path):
return path
return None

View File

@@ -3,9 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_READABILITY": {
"READABILITY_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_READABILITY", "USE_READABILITY"],
"description": "Enable Readability text extraction"
},
"READABILITY_BINARY": {

View File

@@ -2,6 +2,7 @@
Integration tests for readability plugin
Tests verify:
pass
1. Validate hook checks for readability-extractor binary
2. Verify deps with abx-pkg
3. Plugin reports missing dependency correctly
@@ -115,7 +116,9 @@ def test_readability_install_hook():
# Binary found - verify Binary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
@@ -130,7 +133,9 @@ def test_readability_install_hook():
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
@@ -157,7 +162,7 @@ def test_verify_deps_with_abx_pkg():
if readability_loaded and readability_loaded.abspath:
assert True, "readability-extractor is available"
else:
pytest.skip("readability-extractor not available - Dependency record should have been emitted")
pass
def test_extracts_article_after_installation():
@@ -186,6 +191,7 @@ def test_extracts_article_after_installation():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':

View File

@@ -1,90 +0,0 @@
#!/bin/bash
# Run all plugin tests (node --test suites for browser-extension plugins).
#
# Usage: ./run_all_tests.sh
#
# Exits 0 when every suite passes, 1 otherwise.
set -e
echo "=========================================="
echo "Running All Plugin Tests"
echo "=========================================="
echo ""
# Color codes
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Track results
TOTAL_TESTS=0
PASSED_TESTS=0
FAILED_TESTS=0
# Run one node test file; the suite name is its parent directory name.
run_test_suite() {
    local test_file=$1
    # Quote expansions so paths containing spaces don't word-split, and
    # declare separately from assignment so set -e isn't masked by `local`.
    local test_name
    test_name=$(basename "$(dirname "$test_file")")
    echo -e "${YELLOW}[RUNNING]${NC} $test_name tests..."
    if node --test "$test_file" 2>&1; then
        echo -e "${GREEN}[PASSED]${NC} $test_name tests"
        PASSED_TESTS=$((PASSED_TESTS + 1))
    else
        echo -e "${RED}[FAILED]${NC} $test_name tests"
        FAILED_TESTS=$((FAILED_TESTS + 1))
    fi
    TOTAL_TESTS=$((TOTAL_TESTS + 1))
    echo ""
}
# Find and run all test files
echo "Finding test files..."
echo ""
# Chrome extensions utils tests
if [ -f "chrome_extensions/tests/test_chrome_extension_utils.js" ]; then
    run_test_suite "chrome_extensions/tests/test_chrome_extension_utils.js"
fi
# Captcha2 tests
if [ -f "captcha2/tests/test_captcha2_install.js" ]; then
    run_test_suite "captcha2/tests/test_captcha2_install.js"
fi
if [ -f "captcha2/tests/test_captcha2_config.js" ]; then
    run_test_suite "captcha2/tests/test_captcha2_config.js"
fi
# I Still Don't Care About Cookies tests
if [ -f "istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.js" ]; then
    run_test_suite "istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.js"
fi
# uBlock tests
if [ -f "ublock/tests/test_ublock.js" ]; then
    run_test_suite "ublock/tests/test_ublock.js"
fi
# SingleFile tests
if [ -f "singlefile/tests/test_singlefile.js" ]; then
    run_test_suite "singlefile/tests/test_singlefile.js"
fi
# Print summary
echo "=========================================="
echo "Test Summary"
echo "=========================================="
echo -e "Total test suites: $TOTAL_TESTS"
echo -e "${GREEN}Passed:${NC} $PASSED_TESTS"
echo -e "${RED}Failed:${NC} $FAILED_TESTS"
echo ""
if [ $FAILED_TESTS -eq 0 ]; then
    echo -e "${GREEN}✓ All tests passed!${NC}"
    exit 0
else
    echo -e "${RED}✗ Some tests failed${NC}"
    exit 1
fi

View File

@@ -1,29 +0,0 @@
#!/bin/bash
# Run all plugin tests
#
# Usage: ./run_tests.sh [plugin_name]
#
# Examples:
# ./run_tests.sh # Run all tests
# ./run_tests.sh captcha2 # Run only captcha2 tests
# ./run_tests.sh chrome_* # Run all chrome tests
set -e

# Print the banner separator used to frame the script's output.
divider() {
    echo "=========================================="
}

divider
echo "Running ArchiveBox Plugin Tests"
divider
echo ""

plugin="$1"
# With an argument, run just that plugin's test directory; otherwise run
# every plugin's test_*.py files.
if [ -n "$plugin" ]; then
    echo "Running tests for: $plugin"
    python -m pytest "$plugin"/tests/ -v
else
    echo "Running all plugin tests..."
    python -m pytest */tests/test_*.py -v
fi

echo ""
divider
echo "Tests Complete"
divider

View File

@@ -0,0 +1,28 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"SCREENSHOT_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_SCREENSHOT", "USE_SCREENSHOT"],
"description": "Enable screenshot capture"
},
"SCREENSHOT_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for screenshot capture in seconds"
},
"SCREENSHOT_RESOLUTION": {
"type": "string",
"default": "1440,2000",
"pattern": "^\\d+,\\d+$",
"x-fallback": "RESOLUTION",
"description": "Screenshot resolution (width,height)"
}
}
}

View File

@@ -3,21 +3,24 @@
"type": "object",
"additionalProperties": false,
"properties": {
"RIPGREP_BINARY": {
"SEARCH_BACKEND_RIPGREP_BINARY": {
"type": "string",
"default": "rg",
"x-aliases": ["RIPGREP_BINARY"],
"description": "Path to ripgrep binary"
},
"RIPGREP_IGNORE_EXTENSIONS": {
"SEARCH_BACKEND_RIPGREP_IGNORE_EXTENSIONS": {
"type": "string",
"default": "css,js,orig,svg",
"x-aliases": ["RIPGREP_IGNORE_EXTENSIONS"],
"description": "Comma-separated file extensions to ignore"
},
"SEARCH_BACKEND_TIMEOUT": {
"SEARCH_BACKEND_RIPGREP_TIMEOUT": {
"type": "integer",
"default": 90,
"minimum": 5,
"x-fallback": "TIMEOUT",
"x-aliases": ["SEARCH_BACKEND_TIMEOUT"],
"description": "Search timeout in seconds"
}
}

View File

@@ -3,6 +3,7 @@
Tests for ripgrep binary detection and archivebox install functionality.
Guards against regressions in:
pass
1. Machine.config overrides not being used in version command
2. Ripgrep hook not resolving binary names via shutil.which()
3. SEARCH_BACKEND_ENGINE not being passed to hook environment
@@ -26,7 +27,7 @@ def test_ripgrep_hook_detects_binary_from_path():
# Skip if rg is not installed
if not shutil.which('rg'):
pytest.skip("ripgrep (rg) not installed")
pass
# Set SEARCH_BACKEND_ENGINE to enable the hook
env = os.environ.copy()
@@ -85,7 +86,7 @@ def test_ripgrep_hook_handles_absolute_path():
rg_path = shutil.which('rg')
if not rg_path:
pytest.skip("ripgrep (rg) not installed")
pass
env = os.environ.copy()
env['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
@@ -114,7 +115,7 @@ def test_machine_config_overrides_base_config():
Guards against regression where archivebox version was showing binaries
as "not installed" even though they were detected and stored in Machine.config.
"""
from machine.models import Machine, Binary
from archivebox.machine.models import Machine, Binary
machine = Machine.current()
@@ -176,9 +177,8 @@ def test_install_creates_binary_records():
This is an integration test that verifies the full install flow.
"""
from machine.models import Machine, Binary
from crawls.models import Seed, Crawl
from crawls.statemachines import CrawlMachine
from archivebox.machine.models import Machine, Binary
from archivebox.crawls.models import Seed, Crawl, CrawlMachine
from archivebox.base_models.models import get_or_create_system_user_pk
machine = Machine.current()
@@ -213,6 +213,7 @@ def test_install_creates_binary_records():
common_binaries = ['git', 'wget', 'node']
detected = []
for bin_name in common_binaries:
pass
if Binary.objects.filter(machine=machine, name=bin_name).exists():
detected.append(bin_name)
@@ -220,6 +221,7 @@ def test_install_creates_binary_records():
# Verify detected binaries have valid paths and versions
for binary in Binary.objects.filter(machine=machine):
pass
if binary.abspath: # Only check non-empty paths
assert '/' in binary.abspath, \
f"{binary.name} should have full path, not just name: {binary.abspath}"
@@ -233,14 +235,13 @@ def test_ripgrep_only_detected_when_backend_enabled():
Guards against ripgrep being installed/detected when not needed.
"""
from machine.models import Machine, Binary
from crawls.models import Seed, Crawl
from crawls.statemachines import CrawlMachine
from archivebox.machine.models import Machine, Binary
from archivebox.crawls.models import Seed, Crawl, CrawlMachine
from archivebox.base_models.models import get_or_create_system_user_pk
from django.conf import settings
if not shutil.which('rg'):
pytest.skip("ripgrep (rg) not installed")
pass
machine = Machine.current()

View File

@@ -3,34 +3,36 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SEARCH_BACKEND_HOST_NAME": {
"SEARCH_BACKEND_SONIC_HOST_NAME": {
"type": "string",
"default": "127.0.0.1",
"x-aliases": ["SONIC_HOST"],
"x-aliases": ["SEARCH_BACKEND_HOST_NAME", "SONIC_HOST"],
"description": "Sonic server hostname"
},
"SEARCH_BACKEND_PORT": {
"SEARCH_BACKEND_SONIC_PORT": {
"type": "integer",
"default": 1491,
"minimum": 1,
"maximum": 65535,
"x-aliases": ["SONIC_PORT"],
"x-aliases": ["SEARCH_BACKEND_PORT", "SONIC_PORT"],
"description": "Sonic server port"
},
"SEARCH_BACKEND_PASSWORD": {
"SEARCH_BACKEND_SONIC_PASSWORD": {
"type": "string",
"default": "SecretPassword",
"x-aliases": ["SONIC_PASSWORD"],
"x-aliases": ["SEARCH_BACKEND_PASSWORD", "SONIC_PASSWORD"],
"description": "Sonic server password"
},
"SONIC_COLLECTION": {
"SEARCH_BACKEND_SONIC_COLLECTION": {
"type": "string",
"default": "archivebox",
"x-aliases": ["SONIC_COLLECTION"],
"description": "Sonic collection name"
},
"SONIC_BUCKET": {
"SEARCH_BACKEND_SONIC_BUCKET": {
"type": "string",
"default": "snapshots",
"x-aliases": ["SONIC_BUCKET"],
"description": "Sonic bucket name"
}
}

View File

@@ -3,21 +3,22 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SQLITEFTS_DB": {
"SEARCH_BACKEND_SQLITE_DB": {
"type": "string",
"default": "search.sqlite3",
"x-aliases": ["SQLITEFTS_DB"],
"description": "SQLite FTS database filename"
},
"FTS_SEPARATE_DATABASE": {
"SEARCH_BACKEND_SQLITE_SEPARATE_DATABASE": {
"type": "boolean",
"default": true,
"x-aliases": ["SQLITEFTS_SEPARATE_DATABASE"],
"x-aliases": ["FTS_SEPARATE_DATABASE", "SQLITEFTS_SEPARATE_DATABASE"],
"description": "Use separate database file for FTS index"
},
"FTS_TOKENIZERS": {
"SEARCH_BACKEND_SQLITE_TOKENIZERS": {
"type": "string",
"default": "porter unicode61 remove_diacritics 2",
"x-aliases": ["SQLITEFTS_TOKENIZERS"],
"x-aliases": ["FTS_TOKENIZERS", "SQLITEFTS_TOKENIZERS"],
"description": "FTS5 tokenizer configuration"
}
}

View File

@@ -3,9 +3,10 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_SINGLEFILE": {
"SINGLEFILE_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_SINGLEFILE", "USE_SINGLEFILE"],
"description": "Enable SingleFile archiving"
},
"SINGLEFILE_BINARY": {

View File

@@ -1,385 +0,0 @@
/**
* Unit tests for singlefile plugin
*
* Run with: node --test tests/test_singlefile.js
*/
const assert = require('assert');
const fs = require('fs');
const path = require('path');
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
// Test fixtures
const TEST_DIR = path.join(__dirname, '.test_fixtures');
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
const TEST_DOWNLOADS_DIR = path.join(TEST_DIR, 'chrome_downloads');
describe('singlefile plugin', () => {
before(() => {
if (!fs.existsSync(TEST_DIR)) {
fs.mkdirSync(TEST_DIR, { recursive: true });
}
});
after(() => {
if (fs.existsSync(TEST_DIR)) {
fs.rmSync(TEST_DIR, { recursive: true, force: true });
}
});
describe('EXTENSION metadata', () => {
it('should have correct webstore_id', () => {
const { EXTENSION } = require('../on_Snapshot__04_singlefile.js');
assert.strictEqual(EXTENSION.webstore_id, 'mpiodijhokgodhhofbcjdecpffjipkle');
});
it('should have correct name', () => {
const { EXTENSION } = require('../on_Snapshot__04_singlefile.js');
assert.strictEqual(EXTENSION.name, 'singlefile');
});
});
describe('installSinglefileExtension', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should use cached extension if available', async () => {
const { installSinglefileExtension } = require('../on_Snapshot__04_singlefile.js');
// Create fake cache
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'singlefile.extension.json');
const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_singlefile');
fs.mkdirSync(fakeExtensionDir, { recursive: true });
fs.writeFileSync(
path.join(fakeExtensionDir, 'manifest.json'),
JSON.stringify({ version: '1.22.90' })
);
const fakeCache = {
webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
name: 'singlefile',
unpacked_path: fakeExtensionDir,
version: '1.22.90'
};
fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
const result = await installSinglefileExtension();
assert.notStrictEqual(result, null);
assert.strictEqual(result.webstore_id, 'mpiodijhokgodhhofbcjdecpffjipkle');
});
});
describe('saveSinglefileWithExtension', () => {
beforeEach(() => {
process.env.CHROME_DOWNLOADS_DIR = TEST_DOWNLOADS_DIR;
if (!fs.existsSync(TEST_DOWNLOADS_DIR)) {
fs.mkdirSync(TEST_DOWNLOADS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_DOWNLOADS_DIR)) {
fs.rmSync(TEST_DOWNLOADS_DIR, { recursive: true });
}
delete process.env.CHROME_DOWNLOADS_DIR;
});
it('should require extension and version to be present', () => {
const mockExtension = {
name: 'singlefile',
version: '1.22.96',
id: 'test_id'
};
assert.ok(mockExtension.version);
assert.ok(mockExtension.id);
});
it('should filter unsupported URL schemes', () => {
const unsupportedSchemes = [
'about:',
'chrome:',
'chrome-extension:',
'data:',
'javascript:',
'blob:'
];
unsupportedSchemes.forEach(scheme => {
const testUrl = scheme + 'something';
const urlScheme = testUrl.split(':')[0];
assert.ok(unsupportedSchemes.some(s => s.startsWith(urlScheme)));
});
});
it('should wait for file to appear in downloads directory', async () => {
const checkDelay = 3000; // 3 seconds
const maxTries = 10;
// Total max wait time
const maxWaitTime = checkDelay * maxTries;
assert.strictEqual(maxWaitTime, 30000); // 30 seconds
});
it('should find downloaded file by checking URL in HTML header', () => {
const testUrl = 'https://example.com';
const mockHtml = `<!-- url: ${testUrl} --><html><head><meta charset="utf-8"></head></html>`;
// Should be able to extract URL from header
const headerPart = mockHtml.split('meta charset')[0];
assert.ok(headerPart.includes(`url: ${testUrl}`));
});
it('should move file from downloads to output directory', () => {
const downloadPath = path.join(TEST_DOWNLOADS_DIR, 'temp_file.html');
const outputDir = 'singlefile';
const outputFile = 'singlefile.html';
const outputPath = path.join(outputDir, outputFile);
// Verify paths are different
assert.notStrictEqual(downloadPath, outputPath);
});
});
describe('saveSinglefileWithCLI', () => {
it('should use single-file-cli as fallback', () => {
const cliCommand = 'single-file';
// Should check for CLI availability
assert.strictEqual(typeof cliCommand, 'string');
assert.ok(cliCommand.length > 0);
});
it('should pass correct arguments to CLI', () => {
const args = [
'--browser-headless',
'https://example.com',
'singlefile/singlefile.html'
];
assert.ok(args.includes('--browser-headless'));
assert.ok(args.some(arg => arg.startsWith('http')));
});
it('should handle optional CLI arguments', () => {
const options = {
userAgent: 'Mozilla/5.0...',
cookiesFile: '/path/to/cookies.txt',
ignoreSSL: true
};
// Optional args should be conditionally added
if (options.userAgent) {
assert.ok(options.userAgent.length > 0);
}
if (options.ignoreSSL) {
assert.strictEqual(options.ignoreSSL, true);
}
});
});
describe('priority and execution order', () => {
it('should have priority 04 (early)', () => {
const filename = 'on_Snapshot__04_singlefile.js';
const match = filename.match(/on_Snapshot__(\d+)_/);
assert.ok(match);
const priority = parseInt(match[1]);
assert.strictEqual(priority, 4);
});
it('should run before chrome (priority 20)', () => {
const extensionPriority = 4;
const chromeSessionPriority = 20;
assert.ok(extensionPriority < chromeSessionPriority);
});
it('should install extensions in correct order', () => {
const priorities = {
captcha2: 1,
istilldontcareaboutcookies: 2,
ublock: 3,
singlefile: 4
};
// Should be in ascending order
assert.ok(priorities.captcha2 < priorities.istilldontcareaboutcookies);
assert.ok(priorities.istilldontcareaboutcookies < priorities.ublock);
assert.ok(priorities.ublock < priorities.singlefile);
});
});
describe('output structure', () => {
it('should define output directory and file', () => {
const OUTPUT_DIR = 'singlefile';
const OUTPUT_FILE = 'singlefile.html';
assert.strictEqual(OUTPUT_DIR, 'singlefile');
assert.strictEqual(OUTPUT_FILE, 'singlefile.html');
});
it('should create output directory if not exists', () => {
const outputDir = path.join(TEST_DIR, 'singlefile');
// Should create directory
if (!fs.existsSync(outputDir)) {
fs.mkdirSync(outputDir, { recursive: true });
}
assert.ok(fs.existsSync(outputDir));
// Cleanup
fs.rmSync(outputDir, { recursive: true });
});
});
describe('extension vs CLI fallback', () => {
it('should prefer extension over CLI', () => {
const preferenceOrder = [
'extension',
'cli'
];
assert.strictEqual(preferenceOrder[0], 'extension');
assert.strictEqual(preferenceOrder[1], 'cli');
});
it('should fallback to CLI if extension unavailable', () => {
const extensionAvailable = false;
const cliAvailable = true;
let method;
if (extensionAvailable) {
method = 'extension';
} else if (cliAvailable) {
method = 'cli';
}
assert.strictEqual(method, 'cli');
});
it('should use extension if available', () => {
const extensionAvailable = true;
let method;
if (extensionAvailable) {
method = 'extension';
} else {
method = 'cli';
}
assert.strictEqual(method, 'extension');
});
});
describe('file matching and validation', () => {
beforeEach(() => {
if (!fs.existsSync(TEST_DOWNLOADS_DIR)) {
fs.mkdirSync(TEST_DOWNLOADS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_DOWNLOADS_DIR)) {
fs.rmSync(TEST_DOWNLOADS_DIR, { recursive: true });
}
});
it('should filter HTML files from downloads', () => {
// Create mock download files
const files = [
'example.html',
'test.pdf',
'image.png',
'page.html'
];
const htmlFiles = files.filter(f => f.endsWith('.html'));
assert.strictEqual(htmlFiles.length, 2);
assert.ok(htmlFiles.includes('example.html'));
assert.ok(htmlFiles.includes('page.html'));
});
it('should match URL in HTML header comment', () => {
const testUrl = 'https://example.com/page';
const htmlContent = `<!--
Page saved with SingleFile
url: ${testUrl}
saved date: 2024-01-01
-->
<html>...</html>`;
const headerSection = htmlContent.split('meta charset')[0] || htmlContent.split('<html>')[0];
assert.ok(headerSection.includes(`url: ${testUrl}`));
});
it('should handle multiple new files in downloads', () => {
const filesBefore = new Set(['old1.html', 'old2.html']);
const filesAfter = ['old1.html', 'old2.html', 'new1.html', 'new2.html'];
const filesNew = filesAfter.filter(f => !filesBefore.has(f));
assert.strictEqual(filesNew.length, 2);
assert.ok(filesNew.includes('new1.html'));
assert.ok(filesNew.includes('new2.html'));
});
});
describe('error handling', () => {
it('should timeout after max wait time', () => {
const checkDelay = 3000; // ms
const maxTries = 10;
const timeoutMs = checkDelay * maxTries;
assert.strictEqual(timeoutMs, 30000); // 30 seconds
});
it('should handle missing extension gracefully', () => {
const extension = null;
if (!extension || !extension.version) {
// Should throw error
assert.ok(true);
}
});
it('should handle file not found after waiting', () => {
const filesNew = [];
const maxWaitReached = true;
if (filesNew.length === 0 && maxWaitReached) {
// Should return null
const result = null;
assert.strictEqual(result, null);
}
});
});
});

View File

@@ -225,6 +225,7 @@ async function main() {
let status = 'failed';
let output = null;
let error = '';
let extractedTitle = null;
try {
const result = await extractTitle(url);
@@ -232,7 +233,8 @@ async function main() {
if (result.success) {
status = 'succeeded';
output = result.output;
console.log(`Title extracted (${result.method}): ${result.title}`);
extractedTitle = result.title;
console.error(`Title extracted (${result.method}): ${result.title}`);
} else {
status = 'failed';
error = result.error;
@@ -248,13 +250,22 @@ async function main() {
console.error(`ERROR: ${error}`);
}
// Output clean JSONL (no RESULT_JSON= prefix)
const result = {
// Update snapshot title via JSONL
if (status === 'succeeded' && extractedTitle) {
console.log(JSON.stringify({
type: 'Snapshot',
id: snapshotId,
title: extractedTitle
}));
}
// Output ArchiveResult JSONL
const archiveResult = {
type: 'ArchiveResult',
status,
output_str: output || error || '',
output_str: extractedTitle || error || '',
};
console.log(JSON.stringify(result));
console.log(JSON.stringify(archiveResult));
process.exit(status === 'succeeded' ? 0 : 1);
}

View File

@@ -2,6 +2,7 @@
Integration tests for title plugin
Tests verify:
pass
1. Plugin script exists
2. Node.js is available
3. Title extraction works for real example.com
@@ -35,7 +36,7 @@ def test_extracts_title_from_example_com():
# Check node is available
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -56,6 +57,7 @@ def test_extracts_title_from_example_com():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -84,7 +86,7 @@ def test_falls_back_to_http_when_chrome_unavailable():
"""Test that title plugin falls back to HTTP when chrome unavailable."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -107,6 +109,7 @@ def test_falls_back_to_http_when_chrome_unavailable():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -130,7 +133,7 @@ def test_config_timeout_honored():
"""Test that TIMEOUT config is respected."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -157,7 +160,7 @@ def test_config_user_agent():
"""Test that USER_AGENT config is used."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -183,6 +186,7 @@ def test_config_user_agent():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -199,7 +203,7 @@ def test_handles_https_urls():
"""Test that HTTPS URLs work correctly."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -229,7 +233,7 @@ def test_handles_404_gracefully():
"""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -251,7 +255,7 @@ def test_handles_redirects():
"""Test that title plugin handles redirects correctly."""
if not shutil.which('node'):
pytest.skip("node not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)

View File

@@ -1,321 +0,0 @@
/**
* Unit tests for ublock plugin
*
* Run with: node --test tests/test_ublock.js
*/
const assert = require('assert');
const fs = require('fs');
const path = require('path');
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
// Test fixtures
const TEST_DIR = path.join(__dirname, '.test_fixtures');
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
describe('ublock plugin', () => {
before(() => {
if (!fs.existsSync(TEST_DIR)) {
fs.mkdirSync(TEST_DIR, { recursive: true });
}
});
after(() => {
if (fs.existsSync(TEST_DIR)) {
fs.rmSync(TEST_DIR, { recursive: true, force: true });
}
});
describe('EXTENSION metadata', () => {
it('should have correct webstore_id for uBlock Origin', () => {
const { EXTENSION } = require('../on_Snapshot__03_ublock.js');
assert.strictEqual(EXTENSION.webstore_id, 'cjpalhdlnbpafiamejdnhcphjbkeiagm');
});
it('should have correct name', () => {
const { EXTENSION } = require('../on_Snapshot__03_ublock.js');
assert.strictEqual(EXTENSION.name, 'ublock');
});
});
describe('installUblockExtension', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should use cached extension if available', async () => {
const { installUblockExtension } = require('../on_Snapshot__03_ublock.js');
// Create fake cache
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'ublock.extension.json');
const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_ublock');
fs.mkdirSync(fakeExtensionDir, { recursive: true });
fs.writeFileSync(
path.join(fakeExtensionDir, 'manifest.json'),
JSON.stringify({ version: '1.67.0' })
);
const fakeCache = {
webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
name: 'ublock',
unpacked_path: fakeExtensionDir,
version: '1.67.0'
};
fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
const result = await installUblockExtension();
assert.notStrictEqual(result, null);
assert.strictEqual(result.webstore_id, 'cjpalhdlnbpafiamejdnhcphjbkeiagm');
});
it('should not require any configuration', async () => {
// uBlock Origin works out of the box with default filter lists
const { EXTENSION } = require('../on_Snapshot__03_ublock.js');
assert.ok(EXTENSION);
// No config fields should be required
});
it('should have large download size (filter lists)', () => {
// uBlock Origin is typically larger than other extensions
// due to included filter lists (usually 3-5 MB)
const typicalSize = 4 * 1024 * 1024; // ~4 MB
const minExpectedSize = 2 * 1024 * 1024; // Minimum 2 MB
// Just verify we understand the expected size
assert.ok(typicalSize > minExpectedSize);
});
});
describe('cache file creation', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should create cache file with correct structure', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'ublock.extension.json');
const mockExtension = {
webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
name: 'ublock',
version: '1.68.0',
unpacked_path: path.join(TEST_EXTENSIONS_DIR, 'test_ublock'),
crx_path: path.join(TEST_EXTENSIONS_DIR, 'test_ublock.crx')
};
await fs.promises.writeFile(cacheFile, JSON.stringify(mockExtension, null, 2));
assert.ok(fs.existsSync(cacheFile));
const cache = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
assert.strictEqual(cache.name, 'ublock');
assert.strictEqual(cache.webstore_id, 'cjpalhdlnbpafiamejdnhcphjbkeiagm');
});
});
describe('extension functionality', () => {
it('should work automatically with default filter lists', () => {
const features = {
automaticBlocking: true,
requiresConfiguration: false,
requiresApiKey: false,
defaultFilterLists: true,
blocksAds: true,
blocksTrackers: true,
blocksMalware: true
};
assert.strictEqual(features.automaticBlocking, true);
assert.strictEqual(features.requiresConfiguration, false);
assert.strictEqual(features.requiresApiKey, false);
assert.strictEqual(features.defaultFilterLists, true);
});
it('should not require runtime configuration', () => {
// uBlock Origin works purely via filter lists and content scripts
// No API keys or runtime configuration needed
const requiresRuntimeConfig = false;
const requiresApiKey = false;
assert.strictEqual(requiresRuntimeConfig, false);
assert.strictEqual(requiresApiKey, false);
});
it('should support standard filter list formats', () => {
const supportedFormats = [
'EasyList',
'EasyPrivacy',
'Malware Domains',
'Peter Lowe\'s List',
'uBlock Origin filters'
];
assert.ok(supportedFormats.length > 0);
// Should support multiple filter list formats
});
});
describe('priority and execution order', () => {
it('should have priority 03 (early)', () => {
const filename = 'on_Snapshot__03_ublock.js';
const match = filename.match(/on_Snapshot__(\d+)_/);
assert.ok(match);
const priority = parseInt(match[1]);
assert.strictEqual(priority, 3);
});
it('should run before chrome (priority 20)', () => {
const extensionPriority = 3;
const chromeSessionPriority = 20;
assert.ok(extensionPriority < chromeSessionPriority);
});
it('should run after cookie dismissal extension', () => {
const ublockPriority = 3;
const cookiesPriority = 2;
assert.ok(ublockPriority > cookiesPriority);
});
});
describe('performance considerations', () => {
it('should benefit from caching due to large size', () => {
// uBlock Origin's large size makes caching especially important
const averageDownloadTime = 10; // seconds
const averageCacheCheckTime = 0.01; // seconds
const performanceGain = averageDownloadTime / averageCacheCheckTime;
// Should be at least 100x faster with cache
assert.ok(performanceGain > 100);
});
it('should not impact page load time significantly', () => {
// While extension is large, it uses efficient blocking
const efficientBlocking = true;
const minimalOverhead = true;
assert.strictEqual(efficientBlocking, true);
assert.strictEqual(minimalOverhead, true);
});
});
describe('error handling', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should handle corrupted cache gracefully', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'ublock.extension.json');
// Create corrupted cache
fs.writeFileSync(cacheFile, 'invalid json content');
const { installUblockExtension } = require('../on_Snapshot__03_ublock.js');
// Mock loadOrInstallExtension to avoid actual download
const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
const originalFunc = extensionUtils.loadOrInstallExtension;
extensionUtils.loadOrInstallExtension = async () => ({
webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
name: 'ublock',
version: '1.68.0'
});
const result = await installUblockExtension();
extensionUtils.loadOrInstallExtension = originalFunc;
assert.notStrictEqual(result, null);
});
it('should handle download timeout gracefully', () => {
// For large extension like uBlock, timeout handling is important
const timeoutSeconds = 120; // 2 minutes
const minTimeout = 30; // Should allow at least 30 seconds
assert.ok(timeoutSeconds > minTimeout);
});
});
describe('filter list validation', () => {
it('should have valid filter list format', () => {
// Example filter list entry
const sampleFilters = [
'||ads.example.com^',
'||tracker.example.com^$third-party',
'##.advertisement'
];
// All filters should follow standard format
sampleFilters.forEach(filter => {
assert.ok(typeof filter === 'string');
assert.ok(filter.length > 0);
});
});
it('should support cosmetic filters', () => {
const cosmeticFilter = '##.banner-ad';
// Should start with ## for cosmetic filters
assert.ok(cosmeticFilter.startsWith('##'));
});
it('should support network filters', () => {
const networkFilter = '||ads.example.com^';
// Network filters typically start with || or contain ^
assert.ok(networkFilter.includes('||') || networkFilter.includes('^'));
});
});
});

View File

@@ -3,19 +3,22 @@
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_WGET": {
"WGET_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_WGET", "USE_WGET"],
"description": "Enable wget archiving"
},
"SAVE_WARC": {
"WGET_SAVE_WARC": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_WARC"],
"description": "Save WARC archive file"
},
"SAVE_WGET_REQUISITES": {
"WGET_SAVE_REQUISITES": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_WGET_REQUISITES"],
"description": "Download page requisites (CSS, JS, images)"
},
"WGET_BINARY": {

View File

@@ -2,6 +2,7 @@
Integration tests for wget plugin
Tests verify:
pass
1. Validate hook checks for wget binary
2. Verify deps with abx-pkg
3. Config options work (SAVE_WGET, SAVE_WARC, etc.)
@@ -51,7 +52,9 @@ def test_wget_install_hook():
# Binary found - verify Binary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
@@ -66,7 +69,9 @@ def test_wget_install_hook():
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
@@ -89,7 +94,7 @@ def test_verify_deps_with_abx_pkg():
if wget_loaded and wget_loaded.abspath:
assert True, "wget is available"
else:
pytest.skip("wget not available - Dependency record should have been emitted")
pass
def test_reports_missing_dependency_when_not_installed():
@@ -127,7 +132,7 @@ def test_can_install_wget_via_provider():
provider_hook = APT_HOOK
provider_name = 'apt'
else:
pytest.skip("Neither brew nor apt available on this system")
pass
assert provider_hook.exists(), f"Provider hook not found: {provider_hook}"
@@ -156,7 +161,9 @@ def test_can_install_wget_via_provider():
# Parse JSONL if present
if result.stdout.strip():
pass
for line in result.stdout.strip().split('\n'):
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
@@ -182,7 +189,7 @@ def test_archives_example_com():
elif shutil.which('apt-get'):
provider_hook = APT_HOOK
else:
pytest.skip("Neither brew nor apt available")
pass
# Run installation (idempotent - will succeed if already installed)
install_result = subprocess.run(
@@ -199,7 +206,7 @@ def test_archives_example_com():
)
if install_result.returncode != 0:
pytest.skip(f"Could not install wget: {install_result.stderr}")
pass
# Now test archiving
with tempfile.TemporaryDirectory() as tmpdir:
@@ -221,6 +228,7 @@ def test_archives_example_com():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -293,7 +301,7 @@ def test_config_save_warc():
# Ensure wget is available
if not shutil.which('wget'):
pytest.skip("wget not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -353,6 +361,7 @@ def test_staticfile_present_skips():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
@@ -370,7 +379,7 @@ def test_handles_404_gracefully():
"""Test that wget fails gracefully on 404."""
if not shutil.which('wget'):
pytest.skip("wget not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -395,7 +404,7 @@ def test_config_timeout_honored():
"""Test that WGET_TIMEOUT config is respected."""
if not shutil.which('wget'):
pytest.skip("wget not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -422,7 +431,7 @@ def test_config_user_agent():
"""Test that WGET_USER_AGENT config is used."""
if not shutil.which('wget'):
pytest.skip("wget not installed")
pass
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
@@ -447,6 +456,7 @@ def test_config_user_agent():
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
pass
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':