Fix CI workflows and migration tests

This commit is contained in:
Nick Sweeting
2026-03-24 13:37:02 -07:00
parent 50286d3c38
commit ed1ddbc95e
14 changed files with 197 additions and 319 deletions

View File

@@ -99,7 +99,7 @@ def get_installed_binary_change_url(name: str, binary: Binary | None) -> str | N
if binary is None or not binary.id:
return None
base_url = binary.admin_change_url or f"{INSTALLED_BINARIES_BASE_URL}{binary.id}/change/"
base_url = getattr(binary, "admin_change_url", None) or f"{INSTALLED_BINARIES_BASE_URL}{binary.id}/change/"
changelist_filters = urlencode({"q": name})
return f"{base_url}?{urlencode({'_changelist_filters': changelist_filters})}"
@@ -360,8 +360,12 @@ def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]:
def get_db_binaries_by_name() -> dict[str, Binary]:
grouped: dict[str, list[Binary]] = {}
binary_name_aliases = {
"youtube-dl": "yt-dlp",
}
for binary in Binary.objects.all():
grouped.setdefault(binary.name, []).append(binary)
canonical_name = binary_name_aliases.get(binary.name, binary.name)
grouped.setdefault(canonical_name, []).append(binary)
return {name: max(records, key=_binary_sort_key) for name, records in grouped.items()}
@@ -424,10 +428,11 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
for name in all_binary_names:
binary = db_binaries.get(name)
binary_is_valid = bool(binary and getattr(binary, "is_valid", getattr(binary, "abspath", None)))
rows["Binary Name"].append(ItemLink(name, key=name))
if binary and binary.is_valid:
if binary_is_valid:
rows["Found Version"].append(f"{binary.version}" if binary.version else "✅ found")
rows["Provided By"].append(binary.binprovider or "-")
rows["Found Abspath"].append(binary.abspath or "-")
@@ -446,9 +451,13 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert is_superuser(request), "Must be a superuser to view configuration settings."
key = {
"youtube-dl": "yt-dlp",
}.get(key, key)
db_binary = get_db_binaries_by_name().get(key)
if db_binary and db_binary.is_valid:
binary_data = db_binary.to_json()
binary_is_valid = bool(db_binary and getattr(db_binary, "is_valid", getattr(db_binary, "abspath", None)))
if binary_is_valid:
binary_data = db_binary.to_json() if hasattr(db_binary, "to_json") else db_binary.__dict__
section: SectionData = {
"name": key,
"description": mark_safe(render_binary_detail_description(key, binary_data, db_binary)),

View File

@@ -381,9 +381,7 @@ print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "output_str":
def test_js_hook_execution(self):
"""JavaScript hook should execute and output JSONL."""
# Skip if node not available
if shutil.which("node") is None:
self.skipTest("Node.js not available")
self.assertIsNotNone(shutil.which("node"), "Node.js not available")
hook_path = self.work_dir / "test_hook.js"
hook_path.write_text("""#!/usr/bin/env node

View File

@@ -14,11 +14,14 @@ import shutil
import sqlite3
import tempfile
import unittest
import json
from pathlib import Path
from .migrations_helpers import (
SCHEMA_0_7,
SCHEMA_0_8,
seed_0_8_data,
seed_0_7_data,
run_archivebox,
create_data_dir_structure,
verify_snapshot_count,
@@ -525,19 +528,47 @@ class TestFilesystemMigration08to09(unittest.TestCase):
4. All files are moved (no data loss)
5. Old archive/timestamp/ directories are cleaned up
"""
# Use the real 0.7.2 database which has actual ArchiveResults with files
gold_db = Path("/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data")
if not gold_db.exists():
self.skipTest(f"Gold standard database not found at {gold_db}")
create_data_dir_structure(self.work_dir)
conn = sqlite3.connect(str(self.db_path))
conn.executescript(SCHEMA_0_7)
conn.close()
original_data = seed_0_7_data(self.db_path)
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
for i, snapshot in enumerate(original_data["snapshots"]):
legacy_timestamp = str(1704110400 + (i * 86400))
cursor.execute(
"UPDATE core_snapshot SET timestamp = ? WHERE id = ?",
(legacy_timestamp, snapshot["id"]),
)
cursor.execute(
"UPDATE core_archiveresult SET pwd = ? WHERE snapshot_id = ?",
(f"/data/archive/{legacy_timestamp}", snapshot["id"]),
)
snapshot["timestamp"] = legacy_timestamp
conn.commit()
conn.close()
# Copy gold database to test directory
import shutil
for item in gold_db.iterdir():
if item.is_dir():
shutil.copytree(item, self.work_dir / item.name, dirs_exist_ok=True)
else:
shutil.copy2(item, self.work_dir / item.name)
sample_files = [
"favicon.ico",
"screenshot.png",
"singlefile.html",
"headers.json",
]
for snapshot in original_data["snapshots"]:
snapshot_dir = self.work_dir / "archive" / snapshot["timestamp"]
snapshot_dir.mkdir(parents=True, exist_ok=True)
(snapshot_dir / "index.json").write_text(
json.dumps(
{
"url": snapshot["url"],
"timestamp": snapshot["timestamp"],
"title": snapshot["title"],
},
),
)
for sample_file in sample_files:
(snapshot_dir / sample_file).write_text(f"{snapshot['url']}::{sample_file}")
# Count archive directories and files BEFORE migration
archive_dir = self.work_dir / "archive"
@@ -552,12 +583,6 @@ class TestFilesystemMigration08to09(unittest.TestCase):
files_before_count = len(files_before)
# Sample some specific files to check they're preserved
sample_files = [
"favicon.ico",
"screenshot.png",
"singlefile.html",
"headers.json",
]
sample_paths_before = {}
for d in dirs_before:
if d.is_dir():
@@ -742,32 +767,30 @@ class TestFilesystemMigration08to09(unittest.TestCase):
print(f"[*] ArchiveResults linked to Process: {linked_count}")
# Verify data migration happened correctly
# All seeded ArchiveResults should survive the migration with no data loss
self.assertEqual(
archiveresult_count,
44,
f"Expected 44 ArchiveResults from 0.7.2 database, got {archiveresult_count}",
len(original_data["archiveresults"]),
f"Expected {len(original_data['archiveresults'])} ArchiveResults after migration, got {archiveresult_count}",
)
# Each ArchiveResult should create one Process record
self.assertEqual(
process_count,
44,
f"Expected 44 Process records (1 per ArchiveResult), got {process_count}",
len(original_data["archiveresults"]),
f"Expected {len(original_data['archiveresults'])} Process records (1 per ArchiveResult), got {process_count}",
)
# The seeded ArchiveResults use 5 unique binaries (curl, wget, etc.)
self.assertEqual(
binary_count,
7,
f"Expected 7 unique Binary records, got {binary_count}",
5,
f"Expected 5 unique Binary records, got {binary_count}",
)
# ALL ArchiveResults should be linked to Process records
self.assertEqual(
linked_count,
44,
f"Expected all 44 ArchiveResults linked to Process, got {linked_count}",
len(original_data["archiveresults"]),
f"Expected all {len(original_data['archiveresults'])} ArchiveResults linked to Process, got {linked_count}",
)

View File

@@ -77,7 +77,6 @@ def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
monkeypatch.setattr(runner_module, "create_bus", fake_create_bus)
monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
monkeypatch.setattr(runner_module, "MachineService", _DummyService)
monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
monkeypatch.setattr(runner_module, "TagService", _DummyService)
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
@@ -232,7 +231,6 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
monkeypatch.setattr(runner_module, "create_bus", lambda **kwargs: _DummyBus(kwargs["name"]))
monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
monkeypatch.setattr(runner_module, "MachineService", _DummyService)
monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
monkeypatch.setattr(runner_module, "TagService", _DummyService)
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
@@ -405,8 +403,8 @@ def test_installed_binary_config_overrides_do_not_map_hardcoded_artifacts_to_con
)
assert "SINGLEFILE_BINARY" not in overrides
assert overrides["LIB_DIR"] == "/tmp/shared-lib"
assert overrides["LIB_BIN_DIR"] == "/tmp/shared-lib/bin"
assert "LIB_DIR" not in overrides
assert "LIB_BIN_DIR" not in overrides
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch):
@@ -425,7 +423,6 @@ def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch
monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
monkeypatch.setattr(runner_module, "create_bus", lambda **kwargs: _DummyBus(kwargs["name"]))
monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
monkeypatch.setattr(runner_module, "MachineService", _DummyService)
monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
monkeypatch.setattr(runner_module, "TagService", _DummyService)
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
@@ -616,7 +613,6 @@ def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
monkeypatch.setattr(runner_module, "create_bus", lambda *args, **kwargs: _DummyBus("runner"))
monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
monkeypatch.setattr(runner_module, "MachineService", _DummyService)
monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
monkeypatch.setattr(runner_module, "TagService", _DummyService)
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)

View File

@@ -174,15 +174,14 @@ def _resolve_browser(shared_lib: Path) -> Path | None:
@pytest.fixture(scope="session")
def browser_runtime(tmp_path_factory):
if shutil.which("node") is None or shutil.which("npm") is None:
pytest.skip("Node.js and npm are required for browser security tests")
assert shutil.which("node") is not None, "Node.js is required for browser security tests"
assert shutil.which("npm") is not None, "npm is required for browser security tests"
shared_lib = tmp_path_factory.mktemp("archivebox_browser_lib")
_ensure_puppeteer(shared_lib)
browser = _resolve_browser(shared_lib)
if not browser:
pytest.skip("No Chrome/Chromium binary available for browser security tests")
assert browser, "No Chrome/Chromium binary available for browser security tests"
return {
"node_modules_dir": shared_lib / "npm" / "node_modules",