mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Stabilize plugin and crawl integration tests
This commit is contained in:
@@ -12,7 +12,7 @@ def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
|
||||
candidates.add(snapshot_id.replace("-", ""))
|
||||
|
||||
for needle in candidates:
|
||||
for path in (data_dir / "users/system/snapshots").rglob(needle):
|
||||
for path in data_dir.rglob(needle):
|
||||
if path.is_dir():
|
||||
return path
|
||||
return None
|
||||
@@ -33,77 +33,118 @@ def _latest_snapshot_dir(data_dir: Path) -> Path:
|
||||
return snapshot_dir
|
||||
|
||||
|
||||
def _find_plugin_output(snapshot_dir: Path, *patterns: str) -> Path | None:
|
||||
for pattern in patterns:
|
||||
matches = list(snapshot_dir.glob(pattern))
|
||||
if matches:
|
||||
return matches[0]
|
||||
return None
|
||||
def _latest_plugin_result(data_dir: Path, plugin: str) -> tuple[str, str, dict]:
    """Fetch the most recently created ArchiveResult row for *plugin*.

    Reads ``index.sqlite3`` inside *data_dir* and selects the newest
    ``core_archiveresult`` row (by ``created_at``) whose ``plugin`` column
    equals *plugin*.

    Returns:
        ``(snapshot_id, status, output_files)`` where the first two values are
        coerced to ``str`` and ``output_files`` is the decoded JSON value
        (an empty dict when the column is empty or null).

    Raises:
        AssertionError: if no row exists for *plugin*.
    """
    query = (
        "SELECT snapshot_id, status, output_files FROM core_archiveresult "
        "WHERE plugin = ? ORDER BY created_at DESC LIMIT 1"
    )
    db = sqlite3.connect(data_dir / "index.sqlite3")
    try:
        latest = db.execute(query, (plugin,)).fetchone()
    finally:
        # Close the handle even when the query itself fails.
        db.close()

    assert latest is not None, f"Expected an ArchiveResult row for plugin={plugin}"

    snapshot_id, status, recorded = latest
    if isinstance(recorded, str):
        # The column holds JSON text; an empty string is treated as "{}".
        recorded = pyjson.loads(recorded or "{}")
    return str(snapshot_id), str(status), recorded or {}
|
||||
|
||||
|
||||
def _plugin_output_paths(data_dir: Path, plugin: str) -> list[Path]:
    """Resolve the on-disk output files recorded for *plugin*'s latest result.

    Looks up the newest ArchiveResult for *plugin*, checks that it succeeded
    and recorded at least one output file, locates the snapshot directory and
    joins every recorded relative path onto ``<snapshot_dir>/<plugin>``.

    Returns:
        The resolved output paths, in the order the keys appear in the
        recorded ``output_files`` mapping.

    Raises:
        AssertionError: if the result did not succeed, recorded no outputs,
            the snapshot directory cannot be found, or any recorded output is
            missing on disk.
    """
    snapshot_id, status, output_files = _latest_plugin_result(data_dir, plugin)
    assert status == "succeeded", f"Expected {plugin} ArchiveResult to succeed, got {status}"
    assert output_files, f"Expected {plugin} ArchiveResult to record output_files"

    snapshot_dir = _find_snapshot_dir(data_dir, snapshot_id)
    assert snapshot_dir is not None, f"Snapshot output directory not found for {snapshot_id}"

    # Recorded keys are joined relative to the plugin's own sub-directory.
    plugin_dir = snapshot_dir / plugin
    output_paths = []
    missing_paths = []
    for rel_path in output_files.keys():
        resolved = plugin_dir / rel_path
        output_paths.append(resolved)
        if not resolved.exists():
            missing_paths.append(resolved)

    assert not missing_paths, f"Expected plugin outputs to exist on disk, missing: {missing_paths}"
    return output_paths
|
||||
|
||||
|
||||
def _archivebox_env(base_env: dict, data_dir: Path) -> dict:
    """Build a subprocess environment for running archivebox against *data_dir*.

    Copies *base_env* (the input is left untouched), creates a scratch
    directory ``/tmp/abx-<data_dir.name>`` if it does not exist yet, and sets
    ``TMP_DIR`` to that directory and ``ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS`` to
    ``"true"`` in the copy.

    Returns:
        The new environment mapping.
    """
    merged = base_env.copy()
    # NOTE(review): the fixed /tmp root presumably keeps the path short for
    # socket files -- confirm before switching to tempfile.gettempdir().
    scratch = Path("/tmp", f"abx-{data_dir.name}")
    scratch.mkdir(parents=True, exist_ok=True)
    merged.update(
        {
            "TMP_DIR": str(scratch),
            "ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS": "true",
        }
    )
    return merged
|
||||
|
||||
|
||||
def test_singlefile_works(tmp_path, process, disable_extractors_dict):
|
||||
disable_extractors_dict.update({"SAVE_SINGLEFILE": "true"})
|
||||
data_dir = Path.cwd()
|
||||
env = _archivebox_env(disable_extractors_dict, data_dir)
|
||||
env.update({"SAVE_SINGLEFILE": "true"})
|
||||
add_process = subprocess.run(
|
||||
['archivebox', 'add', 'https://example.com'],
|
||||
['archivebox', 'add', '--plugins=singlefile', 'https://example.com'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
env=env,
|
||||
timeout=900,
|
||||
)
|
||||
assert add_process.returncode == 0, add_process.stderr
|
||||
snapshot_dir = _latest_snapshot_dir(tmp_path)
|
||||
output_file = _find_plugin_output(snapshot_dir, "singlefile/singlefile.html", "*_singlefile/singlefile.html")
|
||||
assert output_file is not None and output_file.exists()
|
||||
output_files = _plugin_output_paths(data_dir, "singlefile")
|
||||
assert any(path.suffix in (".html", ".htm") for path in output_files)
|
||||
|
||||
def test_readability_works(tmp_path, process, disable_extractors_dict):
|
||||
disable_extractors_dict.update({"SAVE_READABILITY": "true"})
|
||||
data_dir = Path.cwd()
|
||||
env = _archivebox_env(disable_extractors_dict, data_dir)
|
||||
env.update({"SAVE_SINGLEFILE": "true", "SAVE_READABILITY": "true"})
|
||||
add_process = subprocess.run(
|
||||
['archivebox', 'add', 'https://example.com'],
|
||||
['archivebox', 'add', '--plugins=singlefile,readability', 'https://example.com'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
env=env,
|
||||
timeout=900,
|
||||
)
|
||||
assert add_process.returncode == 0, add_process.stderr
|
||||
snapshot_dir = _latest_snapshot_dir(tmp_path)
|
||||
output_file = _find_plugin_output(snapshot_dir, "readability/content.html", "*_readability/content.html")
|
||||
assert output_file is not None and output_file.exists()
|
||||
output_files = _plugin_output_paths(data_dir, "readability")
|
||||
assert any(path.suffix in (".html", ".htm") for path in output_files)
|
||||
|
||||
def test_htmltotext_works(tmp_path, process, disable_extractors_dict):
|
||||
disable_extractors_dict.update({"SAVE_HTMLTOTEXT": "true"})
|
||||
data_dir = Path.cwd()
|
||||
env = _archivebox_env(disable_extractors_dict, data_dir)
|
||||
env.update({"SAVE_WGET": "true", "SAVE_HTMLTOTEXT": "true"})
|
||||
add_process = subprocess.run(
|
||||
['archivebox', 'add', 'https://example.com'],
|
||||
['archivebox', 'add', '--plugins=wget,htmltotext', 'https://example.com'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
env=env,
|
||||
timeout=900,
|
||||
)
|
||||
assert add_process.returncode == 0, add_process.stderr
|
||||
snapshot_dir = _latest_snapshot_dir(tmp_path)
|
||||
output_file = _find_plugin_output(snapshot_dir, "htmltotext/htmltotext.txt", "*_htmltotext/htmltotext.txt")
|
||||
assert output_file is not None and output_file.exists()
|
||||
output_files = _plugin_output_paths(data_dir, "htmltotext")
|
||||
assert any(path.suffix == ".txt" for path in output_files)
|
||||
|
||||
def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict):
|
||||
disable_extractors_dict.update({"SAVE_READABILITY": "true", "SAVE_DOM": "true", "SAVE_SINGLEFILE": "true", "USE_NODE": "false"})
|
||||
add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
|
||||
capture_output=True, env=disable_extractors_dict)
|
||||
env = _archivebox_env(disable_extractors_dict, Path.cwd())
|
||||
env.update({"SAVE_READABILITY": "true", "SAVE_DOM": "true", "SAVE_SINGLEFILE": "true", "USE_NODE": "false"})
|
||||
add_process = subprocess.run(['archivebox', 'add', '--plugins=readability,dom,singlefile', 'https://example.com'],
|
||||
capture_output=True, env=env)
|
||||
output_str = add_process.stdout.decode("utf-8")
|
||||
assert "> singlefile" not in output_str
|
||||
assert "> readability" not in output_str
|
||||
|
||||
def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
|
||||
disable_extractors_dict.update({"SAVE_HEADERS": "true"})
|
||||
data_dir = Path.cwd()
|
||||
env = _archivebox_env(disable_extractors_dict, data_dir)
|
||||
env.update({"SAVE_HEADERS": "true"})
|
||||
add_process = subprocess.run(
|
||||
['archivebox', 'add', 'https://example.com'],
|
||||
['archivebox', 'add', '--plugins=headers', 'https://example.com'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
env=env,
|
||||
timeout=900,
|
||||
)
|
||||
assert add_process.returncode == 0, add_process.stderr
|
||||
snapshot_dir = _latest_snapshot_dir(tmp_path)
|
||||
output_file = _find_plugin_output(snapshot_dir, "headers/headers.json", "*_headers/headers.json")
|
||||
assert output_file is not None and output_file.exists()
|
||||
output_files = _plugin_output_paths(data_dir, "headers")
|
||||
output_file = next((path for path in output_files if path.suffix == ".json"), None)
|
||||
assert output_file is not None, f"Expected headers output_files to include a JSON file, got: {output_files}"
|
||||
with open(output_file, 'r', encoding='utf-8') as f:
|
||||
headers = pyjson.load(f)
|
||||
assert 'Content-Type' in headers or 'content-type' in headers
|
||||
|
||||
@@ -235,6 +235,34 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
|
||||
self.assertIn('env', plugins)
|
||||
|
||||
def test_discover_binary_hooks_ignores_plugins_whitelist(self):
    """Binary hooks from other plugins stay discoverable when PLUGINS is restricted.

    Sets up a ``singlefile`` plugin (config only) and an ``npm`` plugin that
    ships an ``on_Binary__10_npm_install.py`` hook, then discovers ``Binary``
    hooks with ``PLUGINS`` limited to ``singlefile`` and expects the npm hook
    to be returned anyway.
    """
    singlefile_config = {
        "type": "object",
        "required_plugins": ["chrome"],
        "properties": {},
    }
    singlefile_dir = self.plugins_dir / 'singlefile'
    singlefile_dir.mkdir()
    (singlefile_dir / 'config.json').write_text(json.dumps(singlefile_config))

    npm_dir = self.plugins_dir / 'npm'
    npm_dir.mkdir()
    (npm_dir / 'on_Binary__10_npm_install.py').write_text('# npm binary hook')
    (npm_dir / 'config.json').write_text('{"type": "object", "properties": {}}')

    from archivebox import hooks as hooks_module

    # Clear the cache on get_plugins so the patched plugin directories are used.
    hooks_module.get_plugins.cache_clear()
    with patch.object(hooks_module, 'BUILTIN_PLUGINS_DIR', self.plugins_dir):
        with patch.object(hooks_module, 'USER_PLUGINS_DIR', self.test_dir / 'user_plugins'):
            discovered = hooks_module.discover_hooks('Binary', config={'PLUGINS': 'singlefile'})

    self.assertIn('on_Binary__10_npm_install.py', [found.name for found in discovered])
|
||||
|
||||
|
||||
class TestGetExtractorName(unittest.TestCase):
|
||||
"""Test get_extractor_name() function."""
|
||||
|
||||
@@ -37,6 +37,21 @@ def stop_process(proc):
|
||||
return proc.communicate()
|
||||
|
||||
|
||||
def run_add_until(args, env, condition, timeout=120):
    """Run a command in the background until a database condition is met.

    Starts *args* as a child process with captured text-mode stdout/stderr,
    waits via ``wait_for_db_condition`` for *condition* to hold, then stops
    the child with ``stop_process``.

    The child is stopped in every case -- including when the wait times out
    or raises -- so a failing test does not leave the command running.

    Args:
        args: Command line to run, as a list of strings.
        env: Environment mapping passed to the child process.
        condition: Callable handed to ``wait_for_db_condition``; the callers
            in this file pass a function taking a DB cursor/connection.
        timeout: Maximum time to wait, forwarded to ``wait_for_db_condition``
            (presumably seconds -- confirm against that helper).

    Returns:
        Whatever ``stop_process`` returns for the child (callers unpack it as
        ``(stdout, stderr)``).

    Raises:
        AssertionError: if the condition was not met within *timeout*.
    """
    proc = subprocess.Popen(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=env,
    )

    condition_met = False
    try:
        condition_met = wait_for_db_condition(timeout=timeout, condition=condition)
    finally:
        # Stop the child before asserting so that a timeout (or an error
        # raised while polling) never leaks the running process or its pipes.
        output = stop_process(proc)

    assert condition_met, (
        f"Timed out waiting for condition while running: {' '.join(args)}"
    )
    return output
|
||||
|
||||
|
||||
def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recursive_test_site):
|
||||
"""Test that background hooks (.bg.) don't block other extractors from running."""
|
||||
os.chdir(tmp_path)
|
||||
@@ -202,15 +217,15 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process, recursive_te
|
||||
"SAVE_TITLE": "false",
|
||||
})
|
||||
|
||||
result = subprocess.run(
|
||||
stdout, stderr = run_add_until(
|
||||
['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=60,
|
||||
timeout=120,
|
||||
condition=lambda c: (
|
||||
c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 0").fetchone()[0] >= 1
|
||||
and c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 1").fetchone()[0] >= len(recursive_test_site['child_urls'])
|
||||
),
|
||||
)
|
||||
assert result.returncode == 0, result.stderr
|
||||
stdout, stderr = result.stdout, result.stderr
|
||||
|
||||
if stderr:
|
||||
print(f"\n=== STDERR ===\n{stderr}\n=== END STDERR ===\n")
|
||||
@@ -260,14 +275,26 @@ def test_recursive_crawl_respects_depth_limit(tmp_path, process, disable_extract
|
||||
"""Test that recursive crawling stops at max_depth."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
env = disable_extractors_dict.copy()
|
||||
env["URL_ALLOWLIST"] = r"127\.0\.0\.1[:/].*"
|
||||
|
||||
stdout, stderr = run_add_until(
|
||||
['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=60,
|
||||
env=env,
|
||||
timeout=120,
|
||||
condition=lambda c: (
|
||||
c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 0").fetchone()[0] >= 1
|
||||
and c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 1").fetchone()[0] >= len(recursive_test_site['child_urls'])
|
||||
and c.execute(
|
||||
"SELECT COUNT(DISTINCT ar.snapshot_id) "
|
||||
"FROM core_archiveresult ar "
|
||||
"JOIN core_snapshot s ON s.id = ar.snapshot_id "
|
||||
"WHERE s.depth = 1 "
|
||||
"AND ar.plugin LIKE 'parse_%_urls' "
|
||||
"AND ar.status IN ('started', 'succeeded', 'failed')"
|
||||
).fetchone()[0] >= len(recursive_test_site['child_urls'])
|
||||
),
|
||||
)
|
||||
assert result.returncode == 0, result.stderr
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
c = conn.cursor()
|
||||
@@ -324,14 +351,18 @@ def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict
|
||||
"""Test that root snapshots are created with depth=0."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
env = disable_extractors_dict.copy()
|
||||
env["URL_ALLOWLIST"] = r"127\.0\.0\.1[:/].*"
|
||||
|
||||
stdout, stderr = run_add_until(
|
||||
['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=60,
|
||||
env=env,
|
||||
timeout=120,
|
||||
condition=lambda c: c.execute(
|
||||
"SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
|
||||
(recursive_test_site['root_url'],),
|
||||
).fetchone()[0] >= 1,
|
||||
)
|
||||
assert result.returncode == 0, result.stderr
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
c = conn.cursor()
|
||||
@@ -360,14 +391,14 @@ def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, p
|
||||
"SAVE_FAVICON": "true",
|
||||
})
|
||||
|
||||
result = subprocess.run(
|
||||
stdout, stderr = run_add_until(
|
||||
['archivebox', 'add', '--plugins=favicon,wget,parse_html_urls', recursive_test_site['root_url']],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=60,
|
||||
timeout=120,
|
||||
condition=lambda c: c.execute(
|
||||
"SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')"
|
||||
).fetchone()[0] > 0,
|
||||
)
|
||||
assert result.returncode == 0, result.stderr
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
c = conn.cursor()
|
||||
|
||||
Reference in New Issue
Block a user