Fix CI workflows and migration tests

2026-04-06 07:47:53 +10:00 · 2026-03-24 13:37:02 -07:00
parent 50286d3c38
commit ed1ddbc95e
14 changed files with 197 additions and 319 deletions
--- a/archivebox/config/views.py
+++ b/archivebox/config/views.py
@@ -99,7 +99,7 @@ def get_installed_binary_change_url(name: str, binary: Binary | None) -> str | N
    if binary is None or not binary.id:
        return None

-    base_url = binary.admin_change_url or f"{INSTALLED_BINARIES_BASE_URL}{binary.id}/change/"
+    base_url = getattr(binary, "admin_change_url", None) or f"{INSTALLED_BINARIES_BASE_URL}{binary.id}/change/"
    changelist_filters = urlencode({"q": name})
    return f"{base_url}?{urlencode({'_changelist_filters': changelist_filters})}"

@@ -360,8 +360,12 @@ def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]:

 def get_db_binaries_by_name() -> dict[str, Binary]:
    grouped: dict[str, list[Binary]] = {}
+    binary_name_aliases = {
+        "youtube-dl": "yt-dlp",
+    }
    for binary in Binary.objects.all():
-        grouped.setdefault(binary.name, []).append(binary)
+        canonical_name = binary_name_aliases.get(binary.name, binary.name)
+        grouped.setdefault(canonical_name, []).append(binary)

    return {name: max(records, key=_binary_sort_key) for name, records in grouped.items()}

@@ -424,10 +428,11 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:

    for name in all_binary_names:
        binary = db_binaries.get(name)
+        binary_is_valid = bool(binary and getattr(binary, "is_valid", getattr(binary, "abspath", None)))

        rows["Binary Name"].append(ItemLink(name, key=name))

-        if binary and binary.is_valid:
+        if binary_is_valid:
            rows["Found Version"].append(f"✅ {binary.version}" if binary.version else "✅ found")
            rows["Provided By"].append(binary.binprovider or "-")
            rows["Found Abspath"].append(binary.abspath or "-")
@@ -446,9 +451,13 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
 def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
    assert is_superuser(request), "Must be a superuser to view configuration settings."

+    key = {
+        "youtube-dl": "yt-dlp",
+    }.get(key, key)
    db_binary = get_db_binaries_by_name().get(key)
-    if db_binary and db_binary.is_valid:
-        binary_data = db_binary.to_json()
+    binary_is_valid = bool(db_binary and getattr(db_binary, "is_valid", getattr(db_binary, "abspath", None)))
+    if binary_is_valid:
+        binary_data = db_binary.to_json() if hasattr(db_binary, "to_json") else db_binary.__dict__
        section: SectionData = {
            "name": key,
            "description": mark_safe(render_binary_detail_description(key, binary_data, db_binary)),
--- a/archivebox/tests/test_hooks.py
+++ b/archivebox/tests/test_hooks.py
@@ -381,9 +381,7 @@ print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "output_str":

    def test_js_hook_execution(self):
        """JavaScript hook should execute and output JSONL."""
-        # Skip if node not available
-        if shutil.which("node") is None:
-            self.skipTest("Node.js not available")
+        self.assertIsNotNone(shutil.which("node"), "Node.js not available")

        hook_path = self.work_dir / "test_hook.js"
        hook_path.write_text("""#!/usr/bin/env node
--- a/archivebox/tests/test_migrations_08_to_09.py
+++ b/archivebox/tests/test_migrations_08_to_09.py
@@ -14,11 +14,14 @@ import shutil
 import sqlite3
 import tempfile
 import unittest
+import json
 from pathlib import Path

 from .migrations_helpers import (
+    SCHEMA_0_7,
    SCHEMA_0_8,
    seed_0_8_data,
+    seed_0_7_data,
    run_archivebox,
    create_data_dir_structure,
    verify_snapshot_count,
@@ -525,19 +528,47 @@ class TestFilesystemMigration08to09(unittest.TestCase):
        4. All files are moved (no data loss)
        5. Old archive/timestamp/ directories are cleaned up
        """
-        # Use the real 0.7.2 database which has actual ArchiveResults with files
-        gold_db = Path("/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data")
-        if not gold_db.exists():
-            self.skipTest(f"Gold standard database not found at {gold_db}")
+        create_data_dir_structure(self.work_dir)
+        conn = sqlite3.connect(str(self.db_path))
+        conn.executescript(SCHEMA_0_7)
+        conn.close()
+        original_data = seed_0_7_data(self.db_path)
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+        for i, snapshot in enumerate(original_data["snapshots"]):
+            legacy_timestamp = str(1704110400 + (i * 86400))
+            cursor.execute(
+                "UPDATE core_snapshot SET timestamp = ? WHERE id = ?",
+                (legacy_timestamp, snapshot["id"]),
+            )
+            cursor.execute(
+                "UPDATE core_archiveresult SET pwd = ? WHERE snapshot_id = ?",
+                (f"/data/archive/{legacy_timestamp}", snapshot["id"]),
+            )
+            snapshot["timestamp"] = legacy_timestamp
+        conn.commit()
+        conn.close()

-        # Copy gold database to test directory
-        import shutil
-
-        for item in gold_db.iterdir():
-            if item.is_dir():
-                shutil.copytree(item, self.work_dir / item.name, dirs_exist_ok=True)
-            else:
-                shutil.copy2(item, self.work_dir / item.name)
+        sample_files = [
+            "favicon.ico",
+            "screenshot.png",
+            "singlefile.html",
+            "headers.json",
+        ]
+        for snapshot in original_data["snapshots"]:
+            snapshot_dir = self.work_dir / "archive" / snapshot["timestamp"]
+            snapshot_dir.mkdir(parents=True, exist_ok=True)
+            (snapshot_dir / "index.json").write_text(
+                json.dumps(
+                    {
+                        "url": snapshot["url"],
+                        "timestamp": snapshot["timestamp"],
+                        "title": snapshot["title"],
+                    },
+                ),
+            )
+            for sample_file in sample_files:
+                (snapshot_dir / sample_file).write_text(f"{snapshot['url']}::{sample_file}")

        # Count archive directories and files BEFORE migration
        archive_dir = self.work_dir / "archive"
@@ -552,12 +583,6 @@ class TestFilesystemMigration08to09(unittest.TestCase):
        files_before_count = len(files_before)

        # Sample some specific files to check they're preserved
-        sample_files = [
-            "favicon.ico",
-            "screenshot.png",
-            "singlefile.html",
-            "headers.json",
-        ]
        sample_paths_before = {}
        for d in dirs_before:
            if d.is_dir():
@@ -742,32 +767,30 @@ class TestFilesystemMigration08to09(unittest.TestCase):
        print(f"[*] ArchiveResults linked to Process: {linked_count}")

        # Verify data migration happened correctly
-        # The 0.7.2 gold database has 44 ArchiveResults
        self.assertEqual(
            archiveresult_count,
-            44,
-            f"Expected 44 ArchiveResults from 0.7.2 database, got {archiveresult_count}",
+            len(original_data["archiveresults"]),
+            f"Expected {len(original_data['archiveresults'])} ArchiveResults after migration, got {archiveresult_count}",
        )

        # Each ArchiveResult should create one Process record
        self.assertEqual(
            process_count,
-            44,
-            f"Expected 44 Process records (1 per ArchiveResult), got {process_count}",
+            len(original_data["archiveresults"]),
+            f"Expected {len(original_data['archiveresults'])} Process records (1 per ArchiveResult), got {process_count}",
        )

-        # The 44 ArchiveResults use 7 unique binaries (curl, wget, etc.)
        self.assertEqual(
            binary_count,
-            7,
-            f"Expected 7 unique Binary records, got {binary_count}",
+            5,
+            f"Expected 5 unique Binary records, got {binary_count}",
        )

        # ALL ArchiveResults should be linked to Process records
        self.assertEqual(
            linked_count,
-            44,
-            f"Expected all 44 ArchiveResults linked to Process, got {linked_count}",
+            len(original_data["archiveresults"]),
+            f"Expected all {len(original_data['archiveresults'])} ArchiveResults linked to Process, got {linked_count}",
        )


--- a/archivebox/tests/test_runner.py
+++ b/archivebox/tests/test_runner.py
@@ -77,7 +77,6 @@ def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
    monkeypatch.setattr(runner_module, "create_bus", fake_create_bus)
    monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
    monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
-    monkeypatch.setattr(runner_module, "MachineService", _DummyService)
    monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
    monkeypatch.setattr(runner_module, "TagService", _DummyService)
    monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
@@ -232,7 +231,6 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
    monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
    monkeypatch.setattr(runner_module, "create_bus", lambda **kwargs: _DummyBus(kwargs["name"]))
    monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
-    monkeypatch.setattr(runner_module, "MachineService", _DummyService)
    monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
    monkeypatch.setattr(runner_module, "TagService", _DummyService)
    monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
@@ -405,8 +403,8 @@ def test_installed_binary_config_overrides_do_not_map_hardcoded_artifacts_to_con
    )

    assert "SINGLEFILE_BINARY" not in overrides
-    assert overrides["LIB_DIR"] == "/tmp/shared-lib"
-    assert overrides["LIB_BIN_DIR"] == "/tmp/shared-lib/bin"
+    assert "LIB_DIR" not in overrides
+    assert "LIB_BIN_DIR" not in overrides


 def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch):
@@ -425,7 +423,6 @@ def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch
    monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
    monkeypatch.setattr(runner_module, "create_bus", lambda **kwargs: _DummyBus(kwargs["name"]))
    monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
-    monkeypatch.setattr(runner_module, "MachineService", _DummyService)
    monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
    monkeypatch.setattr(runner_module, "TagService", _DummyService)
    monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
@@ -616,7 +613,6 @@ def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
    monkeypatch.setattr(runner_module, "create_bus", lambda *args, **kwargs: _DummyBus("runner"))
    monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
    monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
-    monkeypatch.setattr(runner_module, "MachineService", _DummyService)
    monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
    monkeypatch.setattr(runner_module, "TagService", _DummyService)
    monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
--- a/archivebox/tests/test_server_security_browser.py
+++ b/archivebox/tests/test_server_security_browser.py
@@ -174,15 +174,14 @@ def _resolve_browser(shared_lib: Path) -> Path | None:

@pytest.fixture(scope="session")
 def browser_runtime(tmp_path_factory):
-    if shutil.which("node") is None or shutil.which("npm") is None:
-        pytest.skip("Node.js and npm are required for browser security tests")
+    assert shutil.which("node") is not None, "Node.js is required for browser security tests"
+    assert shutil.which("npm") is not None, "npm is required for browser security tests"

    shared_lib = tmp_path_factory.mktemp("archivebox_browser_lib")
    _ensure_puppeteer(shared_lib)

    browser = _resolve_browser(shared_lib)
-    if not browser:
-        pytest.skip("No Chrome/Chromium binary available for browser security tests")
+    assert browser, "No Chrome/Chromium binary available for browser security tests"

    return {
        "node_modules_dir": shared_lib / "npm" / "node_modules",