mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Fix CI workflows and migration tests
This commit is contained in:
@@ -99,7 +99,7 @@ def get_installed_binary_change_url(name: str, binary: Binary | None) -> str | N
|
||||
if binary is None or not binary.id:
|
||||
return None
|
||||
|
||||
base_url = binary.admin_change_url or f"{INSTALLED_BINARIES_BASE_URL}{binary.id}/change/"
|
||||
base_url = getattr(binary, "admin_change_url", None) or f"{INSTALLED_BINARIES_BASE_URL}{binary.id}/change/"
|
||||
changelist_filters = urlencode({"q": name})
|
||||
return f"{base_url}?{urlencode({'_changelist_filters': changelist_filters})}"
|
||||
|
||||
@@ -360,8 +360,12 @@ def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]:
|
||||
|
||||
def get_db_binaries_by_name() -> dict[str, Binary]:
|
||||
grouped: dict[str, list[Binary]] = {}
|
||||
binary_name_aliases = {
|
||||
"youtube-dl": "yt-dlp",
|
||||
}
|
||||
for binary in Binary.objects.all():
|
||||
grouped.setdefault(binary.name, []).append(binary)
|
||||
canonical_name = binary_name_aliases.get(binary.name, binary.name)
|
||||
grouped.setdefault(canonical_name, []).append(binary)
|
||||
|
||||
return {name: max(records, key=_binary_sort_key) for name, records in grouped.items()}
|
||||
|
||||
@@ -424,10 +428,11 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
|
||||
for name in all_binary_names:
|
||||
binary = db_binaries.get(name)
|
||||
binary_is_valid = bool(binary and getattr(binary, "is_valid", getattr(binary, "abspath", None)))
|
||||
|
||||
rows["Binary Name"].append(ItemLink(name, key=name))
|
||||
|
||||
if binary and binary.is_valid:
|
||||
if binary_is_valid:
|
||||
rows["Found Version"].append(f"✅ {binary.version}" if binary.version else "✅ found")
|
||||
rows["Provided By"].append(binary.binprovider or "-")
|
||||
rows["Found Abspath"].append(binary.abspath or "-")
|
||||
@@ -446,9 +451,13 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
assert is_superuser(request), "Must be a superuser to view configuration settings."
|
||||
|
||||
key = {
|
||||
"youtube-dl": "yt-dlp",
|
||||
}.get(key, key)
|
||||
db_binary = get_db_binaries_by_name().get(key)
|
||||
if db_binary and db_binary.is_valid:
|
||||
binary_data = db_binary.to_json()
|
||||
binary_is_valid = bool(db_binary and getattr(db_binary, "is_valid", getattr(db_binary, "abspath", None)))
|
||||
if binary_is_valid:
|
||||
binary_data = db_binary.to_json() if hasattr(db_binary, "to_json") else db_binary.__dict__
|
||||
section: SectionData = {
|
||||
"name": key,
|
||||
"description": mark_safe(render_binary_detail_description(key, binary_data, db_binary)),
|
||||
|
||||
@@ -381,9 +381,7 @@ print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "output_str":
|
||||
|
||||
def test_js_hook_execution(self):
|
||||
"""JavaScript hook should execute and output JSONL."""
|
||||
# Skip if node not available
|
||||
if shutil.which("node") is None:
|
||||
self.skipTest("Node.js not available")
|
||||
self.assertIsNotNone(shutil.which("node"), "Node.js not available")
|
||||
|
||||
hook_path = self.work_dir / "test_hook.js"
|
||||
hook_path.write_text("""#!/usr/bin/env node
|
||||
|
||||
@@ -14,11 +14,14 @@ import shutil
|
||||
import sqlite3
|
||||
import tempfile
|
||||
import unittest
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from .migrations_helpers import (
|
||||
SCHEMA_0_7,
|
||||
SCHEMA_0_8,
|
||||
seed_0_8_data,
|
||||
seed_0_7_data,
|
||||
run_archivebox,
|
||||
create_data_dir_structure,
|
||||
verify_snapshot_count,
|
||||
@@ -525,19 +528,47 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
4. All files are moved (no data loss)
|
||||
5. Old archive/timestamp/ directories are cleaned up
|
||||
"""
|
||||
# Use the real 0.7.2 database which has actual ArchiveResults with files
|
||||
gold_db = Path("/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data")
|
||||
if not gold_db.exists():
|
||||
self.skipTest(f"Gold standard database not found at {gold_db}")
|
||||
create_data_dir_structure(self.work_dir)
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
conn.executescript(SCHEMA_0_7)
|
||||
conn.close()
|
||||
original_data = seed_0_7_data(self.db_path)
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
for i, snapshot in enumerate(original_data["snapshots"]):
|
||||
legacy_timestamp = str(1704110400 + (i * 86400))
|
||||
cursor.execute(
|
||||
"UPDATE core_snapshot SET timestamp = ? WHERE id = ?",
|
||||
(legacy_timestamp, snapshot["id"]),
|
||||
)
|
||||
cursor.execute(
|
||||
"UPDATE core_archiveresult SET pwd = ? WHERE snapshot_id = ?",
|
||||
(f"/data/archive/{legacy_timestamp}", snapshot["id"]),
|
||||
)
|
||||
snapshot["timestamp"] = legacy_timestamp
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
# Copy gold database to test directory
|
||||
import shutil
|
||||
|
||||
for item in gold_db.iterdir():
|
||||
if item.is_dir():
|
||||
shutil.copytree(item, self.work_dir / item.name, dirs_exist_ok=True)
|
||||
else:
|
||||
shutil.copy2(item, self.work_dir / item.name)
|
||||
sample_files = [
|
||||
"favicon.ico",
|
||||
"screenshot.png",
|
||||
"singlefile.html",
|
||||
"headers.json",
|
||||
]
|
||||
for snapshot in original_data["snapshots"]:
|
||||
snapshot_dir = self.work_dir / "archive" / snapshot["timestamp"]
|
||||
snapshot_dir.mkdir(parents=True, exist_ok=True)
|
||||
(snapshot_dir / "index.json").write_text(
|
||||
json.dumps(
|
||||
{
|
||||
"url": snapshot["url"],
|
||||
"timestamp": snapshot["timestamp"],
|
||||
"title": snapshot["title"],
|
||||
},
|
||||
),
|
||||
)
|
||||
for sample_file in sample_files:
|
||||
(snapshot_dir / sample_file).write_text(f"{snapshot['url']}::{sample_file}")
|
||||
|
||||
# Count archive directories and files BEFORE migration
|
||||
archive_dir = self.work_dir / "archive"
|
||||
@@ -552,12 +583,6 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
files_before_count = len(files_before)
|
||||
|
||||
# Sample some specific files to check they're preserved
|
||||
sample_files = [
|
||||
"favicon.ico",
|
||||
"screenshot.png",
|
||||
"singlefile.html",
|
||||
"headers.json",
|
||||
]
|
||||
sample_paths_before = {}
|
||||
for d in dirs_before:
|
||||
if d.is_dir():
|
||||
@@ -742,32 +767,30 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
print(f"[*] ArchiveResults linked to Process: {linked_count}")
|
||||
|
||||
# Verify data migration happened correctly
|
||||
# The 0.7.2 gold database has 44 ArchiveResults
|
||||
self.assertEqual(
|
||||
archiveresult_count,
|
||||
44,
|
||||
f"Expected 44 ArchiveResults from 0.7.2 database, got {archiveresult_count}",
|
||||
len(original_data["archiveresults"]),
|
||||
f"Expected {len(original_data['archiveresults'])} ArchiveResults after migration, got {archiveresult_count}",
|
||||
)
|
||||
|
||||
# Each ArchiveResult should create one Process record
|
||||
self.assertEqual(
|
||||
process_count,
|
||||
44,
|
||||
f"Expected 44 Process records (1 per ArchiveResult), got {process_count}",
|
||||
len(original_data["archiveresults"]),
|
||||
f"Expected {len(original_data['archiveresults'])} Process records (1 per ArchiveResult), got {process_count}",
|
||||
)
|
||||
|
||||
# The 44 ArchiveResults use 7 unique binaries (curl, wget, etc.)
|
||||
self.assertEqual(
|
||||
binary_count,
|
||||
7,
|
||||
f"Expected 7 unique Binary records, got {binary_count}",
|
||||
5,
|
||||
f"Expected 5 unique Binary records, got {binary_count}",
|
||||
)
|
||||
|
||||
# ALL ArchiveResults should be linked to Process records
|
||||
self.assertEqual(
|
||||
linked_count,
|
||||
44,
|
||||
f"Expected all 44 ArchiveResults linked to Process, got {linked_count}",
|
||||
len(original_data["archiveresults"]),
|
||||
f"Expected all {len(original_data['archiveresults'])} ArchiveResults linked to Process, got {linked_count}",
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -77,7 +77,6 @@ def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
|
||||
monkeypatch.setattr(runner_module, "create_bus", fake_create_bus)
|
||||
monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
|
||||
monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "MachineService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "TagService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
||||
@@ -232,7 +231,6 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
|
||||
monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
|
||||
monkeypatch.setattr(runner_module, "create_bus", lambda **kwargs: _DummyBus(kwargs["name"]))
|
||||
monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "MachineService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "TagService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
||||
@@ -405,8 +403,8 @@ def test_installed_binary_config_overrides_do_not_map_hardcoded_artifacts_to_con
|
||||
)
|
||||
|
||||
assert "SINGLEFILE_BINARY" not in overrides
|
||||
assert overrides["LIB_DIR"] == "/tmp/shared-lib"
|
||||
assert overrides["LIB_BIN_DIR"] == "/tmp/shared-lib/bin"
|
||||
assert "LIB_DIR" not in overrides
|
||||
assert "LIB_BIN_DIR" not in overrides
|
||||
|
||||
|
||||
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch):
|
||||
@@ -425,7 +423,6 @@ def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch
|
||||
monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
|
||||
monkeypatch.setattr(runner_module, "create_bus", lambda **kwargs: _DummyBus(kwargs["name"]))
|
||||
monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "MachineService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "TagService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
||||
@@ -616,7 +613,6 @@ def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
|
||||
monkeypatch.setattr(runner_module, "create_bus", lambda *args, **kwargs: _DummyBus("runner"))
|
||||
monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
|
||||
monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "MachineService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "TagService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
||||
|
||||
@@ -174,15 +174,14 @@ def _resolve_browser(shared_lib: Path) -> Path | None:
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def browser_runtime(tmp_path_factory):
|
||||
if shutil.which("node") is None or shutil.which("npm") is None:
|
||||
pytest.skip("Node.js and npm are required for browser security tests")
|
||||
assert shutil.which("node") is not None, "Node.js is required for browser security tests"
|
||||
assert shutil.which("npm") is not None, "npm is required for browser security tests"
|
||||
|
||||
shared_lib = tmp_path_factory.mktemp("archivebox_browser_lib")
|
||||
_ensure_puppeteer(shared_lib)
|
||||
|
||||
browser = _resolve_browser(shared_lib)
|
||||
if not browser:
|
||||
pytest.skip("No Chrome/Chromium binary available for browser security tests")
|
||||
assert browser, "No Chrome/Chromium binary available for browser security tests"
|
||||
|
||||
return {
|
||||
"node_modules_dir": shared_lib / "npm" / "node_modules",
|
||||
|
||||
Reference in New Issue
Block a user