Improve scheduling, runtime paths, and API behavior

This commit is contained in:
Nick Sweeting
2026-03-15 18:31:56 -07:00
parent 7d42c6c8b5
commit 70c9358cf9
37 changed files with 1058 additions and 398 deletions

View File

@@ -7,6 +7,7 @@ Verify install detects and records binary dependencies in DB.
import os
import subprocess
import sqlite3
from pathlib import Path
from .fixtures import *
@@ -94,24 +95,41 @@ def test_install_shows_binary_status(tmp_path, process):
assert len(output) > 50
def test_install_updates_binary_table(tmp_path, process, disable_extractors_dict):
"""Test that install command runs successfully.
Binary records are created lazily when binaries are first used, not during install.
"""
def test_install_updates_binary_table(tmp_path, process):
"""Test that install completes and only mutates dependency state."""
os.chdir(tmp_path)
env = os.environ.copy()
tmp_short = Path('/tmp') / f'abx-install-{tmp_path.name}'
tmp_short.mkdir(parents=True, exist_ok=True)
env.update({
'TMP_DIR': str(tmp_short),
'ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS': 'true',
})
# Run install - it should complete without errors or timeout (which is expected)
# The install command starts the orchestrator which runs continuously
try:
result = subprocess.run(
['archivebox', 'install'],
capture_output=True,
timeout=30,
env=disable_extractors_dict,
)
# If it completes, should be successful
assert result.returncode == 0
except subprocess.TimeoutExpired:
# Timeout is expected since orchestrator runs continuously
pass
result = subprocess.run(
['archivebox', 'install'],
capture_output=True,
text=True,
timeout=420,
env=env,
)
output = result.stdout + result.stderr
assert result.returncode == 0, output
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
binary_counts = dict(c.execute(
"SELECT status, COUNT(*) FROM machine_binary GROUP BY status"
).fetchall())
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
sealed_crawls = c.execute(
"SELECT COUNT(*) FROM crawls_crawl WHERE status='sealed'"
).fetchone()[0]
conn.close()
assert sealed_crawls >= 1
assert snapshot_count == 0
assert binary_counts.get('queued', 0) == 0
assert binary_counts.get('installed', 0) > 0

View File

@@ -99,6 +99,8 @@ def test_remove_yes_flag_skips_confirmation(tmp_path, process, disable_extractor
)
assert result.returncode == 0
output = result.stdout.decode("utf-8") + result.stderr.decode("utf-8")
assert "Index now contains 0 links." in output
def test_remove_multiple_snapshots(tmp_path, process, disable_extractors_dict):
@@ -173,6 +175,30 @@ def test_remove_nonexistent_url_fails_gracefully(tmp_path, process, disable_extr
assert result.returncode != 0 or 'not found' in result.stdout.lower() or 'no matches' in result.stdout.lower()
def test_remove_reports_remaining_link_count_correctly(tmp_path, process, disable_extractors_dict):
    """Test remove reports the remaining snapshot count after deletion."""
    os.chdir(tmp_path)
    # Seed the collection with two index-only snapshots.
    for url in ('https://example.com', 'https://example.org'):
        subprocess.run(
            ['archivebox', 'add', '--index-only', '--depth=0', url],
            capture_output=True,
            env=disable_extractors_dict,
            check=True,
        )
    removal = subprocess.run(
        ['archivebox', 'remove', 'https://example.org', '--yes'],
        capture_output=True,
        env=disable_extractors_dict,
        check=True,
    )
    combined = removal.stdout.decode("utf-8") + removal.stderr.decode("utf-8")
    # Exactly one of the two links should be removed, leaving one behind.
    assert "Removed 1 out of 2 links" in combined
    assert "Index now contains 1 links." in combined
def test_remove_after_flag(tmp_path, process, disable_extractors_dict):
"""Test remove --after flag removes snapshots after date."""
os.chdir(tmp_path)

View File

@@ -1,56 +1,62 @@
#!/usr/bin/env python3
"""
Tests for archivebox schedule command.
Verify schedule creates scheduled crawl records.
"""
"""CLI-specific tests for archivebox schedule."""
import os
import subprocess
import sqlite3
import subprocess
from .fixtures import *
from .fixtures import process, disable_extractors_dict
def test_schedule_creates_scheduled_crawl(tmp_path, process, disable_extractors_dict):
"""Test that schedule command creates a scheduled crawl."""
def test_schedule_run_all_enqueues_scheduled_crawl(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'schedule', '--every=day', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
# Should complete (creating schedule or showing usage)
assert result.returncode in [0, 1, 2]
def test_schedule_with_every_flag(tmp_path, process, disable_extractors_dict):
"""Test schedule with --every flag."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'schedule', '--every=week', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
assert result.returncode in [0, 1, 2]
def test_schedule_list_shows_schedules(tmp_path, process):
"""Test that schedule can list existing schedules."""
os.chdir(tmp_path)
# Try to list schedules
result = subprocess.run(
['archivebox', 'schedule', '--list'],
subprocess.run(
['archivebox', 'schedule', '--every=daily', '--depth=0', 'https://example.com'],
capture_output=True,
text=True,
timeout=30,
check=True,
)
# Should show schedules or empty list
assert result.returncode in [0, 1, 2]
result = subprocess.run(
['archivebox', 'schedule', '--run-all'],
capture_output=True,
text=True,
env=disable_extractors_dict,
)
assert result.returncode == 0
assert 'Enqueued 1 scheduled crawl' in result.stdout
conn = sqlite3.connect(tmp_path / "index.sqlite3")
try:
crawl_count = conn.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
queued_count = conn.execute("SELECT COUNT(*) FROM crawls_crawl WHERE status = 'queued'").fetchone()[0]
finally:
conn.close()
assert crawl_count >= 2
assert queued_count >= 1
def test_schedule_without_import_path_creates_maintenance_schedule(tmp_path, process):
    """`schedule --every=day` with no import URL creates a maintenance update schedule."""
    os.chdir(tmp_path)
    proc = subprocess.run(
        ['archivebox', 'schedule', '--every=day'],
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    assert 'Created scheduled maintenance update' in proc.stdout
    db = sqlite3.connect(tmp_path / "index.sqlite3")
    try:
        # The newest crawl should be the sealed maintenance-update template.
        latest = db.execute(
            "SELECT urls, status FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
        ).fetchone()
    finally:
        db.close()
    assert latest == ('archivebox://update', 'sealed')

View File

@@ -7,10 +7,25 @@ Verify status reports accurate collection state from DB and filesystem.
import os
import subprocess
import sqlite3
from pathlib import Path
from .fixtures import *
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
candidates = {snapshot_id}
if len(snapshot_id) == 32:
candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}")
elif len(snapshot_id) == 36 and '-' in snapshot_id:
candidates.add(snapshot_id.replace('-', ''))
for needle in candidates:
for path in data_dir.rglob(needle):
if path.is_dir():
return path
return None
def test_status_runs_successfully(tmp_path, process):
"""Test that status command runs without error."""
os.chdir(tmp_path)
@@ -117,6 +132,37 @@ def test_status_detects_orphaned_directories(tmp_path, process, disable_extracto
assert 'orphan' in result.stdout.lower() or '1' in result.stdout
def test_status_counts_new_snapshot_output_dirs_as_archived(tmp_path, process, disable_extractors_dict):
    """Test status reads archived/present counts from the current snapshot output layout."""
    os.chdir(tmp_path)
    env = {**disable_extractors_dict, "ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS": "true"}
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=env,
        check=True,
    )
    db = sqlite3.connect("index.sqlite3")
    snapshot_id = db.execute(
        "SELECT id FROM core_snapshot WHERE url = ?", ('https://example.com',)
    ).fetchone()[0]
    db.close()
    snapshot_dir = _find_snapshot_dir(tmp_path, str(snapshot_id))
    assert snapshot_dir is not None, f"Snapshot output directory not found for {snapshot_id}"
    # Simulate a completed title extractor by writing its output file.
    output_file = snapshot_dir / "title" / "title.txt"
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text("Example Domain")
    result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True, env=env)
    assert result.returncode == 0, result.stdout + result.stderr
    assert 'archived: 1' in result.stdout
    assert 'present: 1' in result.stdout
def test_status_shows_user_info(tmp_path, process):
"""Test status shows user/login information."""
os.chdir(tmp_path)

View File

@@ -5,12 +5,63 @@ Verify version output and system information reporting.
"""
import os
import re
import sys
import tempfile
import subprocess
import sqlite3
from pathlib import Path
from .fixtures import *
def _archivebox_cli() -> str:
cli = Path(sys.executable).with_name("archivebox")
return str(cli if cli.exists() else "archivebox")
def _run_real_cli(
    args: list[str],
    cwd: Path,
    *,
    home_dir: Path,
    timeout: int = 180,
    extra_env: dict[str, str] | None = None,
) -> subprocess.CompletedProcess[str]:
    """Invoke the real archivebox CLI in *cwd* with an isolated environment.

    HOME is redirected to *home_dir*, DATA_DIR is dropped so the CLI derives
    it from cwd, and color/progress output is disabled for stable assertions.
    """
    env = {
        **os.environ,
        "HOME": str(home_dir),
        "USE_COLOR": "False",
        "SHOW_PROGRESS": "False",
    }
    env.pop("DATA_DIR", None)
    env.update(extra_env or {})
    return subprocess.run(
        [_archivebox_cli(), *args],
        capture_output=True,
        text=True,
        cwd=cwd,
        env=env,
        timeout=timeout,
    )
def _make_deep_collection_dir(tmp_path: Path) -> Path:
deep_dir = tmp_path / "deep-collection"
for idx in range(6):
deep_dir /= f"segment-{idx}-1234567890abcdef"
deep_dir.mkdir(parents=True)
return deep_dir
def _extract_location_path(output: str, key: str) -> Path:
for line in output.splitlines():
if key not in line:
continue
columns = [column for column in re.split(r"\s{2,}", line.strip()) if column]
if len(columns) >= 5 and columns[1] == key:
return Path(os.path.expanduser(columns[-1]))
raise AssertionError(f"Did not find a {key} location line in output:\n{output}")
def test_version_quiet_outputs_version_number(tmp_path):
"""Test that version --quiet outputs just the version number."""
os.chdir(tmp_path)
@@ -66,3 +117,32 @@ def test_version_in_uninitialized_dir_still_works(tmp_path):
# Should still output version
assert result.returncode == 0
assert len(result.stdout.strip()) > 0
def test_version_auto_selects_short_tmp_dir_for_deep_collection_path(tmp_path):
    """Test the real CLI init/version flow auto-selects a short TMP_DIR outside deep collections."""
    collection_dir = _make_deep_collection_dir(tmp_path)
    default_tmp_dir = collection_dir / "tmp"
    socket_env = {"ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS": "true"}
    with tempfile.TemporaryDirectory(prefix="abx-home-") as home_tmp:
        fake_home = Path(home_tmp)
        init_result = _run_real_cli(["init", "--quick"], cwd=collection_dir, home_dir=fake_home, extra_env=socket_env)
        assert init_result.returncode == 0, init_result.stdout + init_result.stderr
        version_result = _run_real_cli(["version"], cwd=collection_dir, home_dir=fake_home, extra_env=socket_env)
        output = version_result.stdout + version_result.stderr
        assert version_result.returncode == 0, output
        assert "ArchiveBox" in output
        assert "TMP_DIR" in output
        assert "Error with configured TMP_DIR" not in output
        # The locations table may report TMP_DIR relative to the collection dir.
        reported_tmp_dir = _extract_location_path(output, "TMP_DIR")
        if not reported_tmp_dir.is_absolute():
            reported_tmp_dir = (collection_dir / reported_tmp_dir).resolve()
        assert reported_tmp_dir.exists()
        # Must live outside the deep default tmp location and be short enough
        # that a supervisord unix socket URL fits in the ~96-char limit.
        assert not reported_tmp_dir.is_relative_to(default_tmp_dir)
        assert len(f"file://{reported_tmp_dir / 'supervisord.sock'}") <= 96

View File

@@ -0,0 +1,38 @@
import os
import unittest
from pathlib import Path
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
from archivebox.machine.models import Process
class TestProcessRuntimePaths(unittest.TestCase):
    """Runtime file layout for hook vs non-hook Process records."""

    def _assert_runtime_layout(self, process, expected_dir):
        # Every process keeps stdout/stderr/pid files inside its runtime_dir.
        self.assertEqual(process.runtime_dir, expected_dir)
        self.assertEqual(process.stdout_file, expected_dir / 'stdout.log')
        self.assertEqual(process.stderr_file, expected_dir / 'stderr.log')
        self.assertEqual(process.pid_file, expected_dir / 'process.pid')

    def test_hook_processes_use_isolated_runtime_dir(self):
        # Hook processes get a per-hook-script subdirectory under pwd/.hooks/.
        process = Process(
            process_type=Process.TypeChoices.HOOK,
            pwd='/tmp/archive/example/chrome',
            cmd=['node', '/plugins/chrome/on_Snapshot__11_chrome_wait.js', '--url=https://example.com'],
        )
        self._assert_runtime_layout(
            process,
            Path('/tmp/archive/example/chrome/.hooks/on_Snapshot__11_chrome_wait.js'),
        )

    def test_non_hook_processes_keep_runtime_files_in_pwd(self):
        # Non-hook processes (e.g. workers) write runtime files directly in pwd.
        process = Process(
            process_type=Process.TypeChoices.WORKER,
            pwd='/tmp/archive/example',
            cmd=['archivebox', 'run', '--snapshot-id', '123'],
        )
        self._assert_runtime_layout(process, Path('/tmp/archive/example'))

View File

@@ -1,44 +1,102 @@
#!/usr/bin/env python3
"""Integration tests for archivebox schedule command."""
"""Integration tests for the database-backed archivebox schedule command."""
import os
import sqlite3
import subprocess
import pytest
from .fixtures import process, disable_extractors_dict
from .fixtures import process
def test_schedule_show_lists_jobs(tmp_path, process):
"""Test that --show lists current scheduled jobs."""
def _fetchone(tmp_path, query):
conn = sqlite3.connect(tmp_path / "index.sqlite3")
try:
return conn.execute(query).fetchone()
finally:
conn.close()
def test_schedule_creates_enabled_db_schedule(tmp_path, process):
    """Scheduling an import URL stores an enabled CrawlSchedule and a sealed template crawl."""
    os.chdir(tmp_path)
    proc = subprocess.run(
        ['archivebox', 'schedule', '--every=daily', '--depth=1', 'https://example.com/feed.xml'],
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    crawl_row = _fetchone(
        tmp_path,
        "SELECT urls, status, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1",
    )
    schedule_row = _fetchone(
        tmp_path,
        "SELECT schedule, is_enabled, label FROM crawls_crawlschedule ORDER BY created_at DESC LIMIT 1",
    )
    assert schedule_row == ('daily', 1, 'Scheduled import: https://example.com/feed.xml')
    assert crawl_row == ('https://example.com/feed.xml', 'sealed', 1)
def test_schedule_show_lists_enabled_schedules(tmp_path, process):
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'schedule', '--every=weekly', 'https://example.com/feed.xml'],
capture_output=True,
text=True,
check=True,
)
result = subprocess.run(
['archivebox', 'schedule', '--show'],
capture_output=True,
text=True,
)
# Should either show jobs or indicate no jobs
assert 'no' in result.stdout.lower() or 'archivebox' in result.stdout.lower() or result.returncode == 0
assert result.returncode == 0
assert 'Active scheduled crawls' in result.stdout
assert 'https://example.com/feed.xml' in result.stdout
assert 'weekly' in result.stdout
def test_schedule_clear_removes_jobs(tmp_path, process):
"""Test that --clear removes scheduled jobs."""
def test_schedule_clear_disables_existing_schedules(tmp_path, process):
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'schedule', '--every=daily', 'https://example.com/feed.xml'],
capture_output=True,
text=True,
check=True,
)
result = subprocess.run(
['archivebox', 'schedule', '--clear'],
capture_output=True,
text=True,
)
# Should complete successfully (may have no jobs to clear)
assert result.returncode == 0
assert 'Disabled 1 scheduled crawl' in result.stdout
disabled_count = _fetchone(
tmp_path,
"SELECT COUNT(*) FROM crawls_crawlschedule WHERE is_enabled = 0",
)[0]
enabled_count = _fetchone(
tmp_path,
"SELECT COUNT(*) FROM crawls_crawlschedule WHERE is_enabled = 1",
)[0]
assert disabled_count == 1
assert enabled_count == 0
def test_schedule_every_requires_valid_period(tmp_path, process):
"""Test that --every requires valid time period."""
os.chdir(tmp_path)
result = subprocess.run(
@@ -47,15 +105,12 @@ def test_schedule_every_requires_valid_period(tmp_path, process):
text=True,
)
# Should fail with invalid period
assert result.returncode != 0 or 'invalid' in result.stdout.lower()
assert result.returncode != 0
assert 'Invalid schedule' in result.stderr or 'Invalid schedule' in result.stdout
class TestScheduleCLI:
"""Test the CLI interface for schedule command."""
def test_cli_help(self, tmp_path, process):
"""Test that --help works for schedule command."""
os.chdir(tmp_path)
result = subprocess.run(
@@ -68,7 +123,7 @@ class TestScheduleCLI:
assert '--every' in result.stdout
assert '--show' in result.stdout
assert '--clear' in result.stdout
assert '--depth' in result.stdout
assert '--run-all' in result.stdout
if __name__ == '__main__':