Improve scheduling, runtime paths, and API behavior

This commit is contained in:
Nick Sweeting
2026-03-15 18:31:56 -07:00
parent 7d42c6c8b5
commit 70c9358cf9
37 changed files with 1058 additions and 398 deletions

View File

@@ -7,6 +7,7 @@ Verify install detects and records binary dependencies in DB.
import os
import subprocess
import sqlite3
from pathlib import Path
from .fixtures import *
@@ -94,24 +95,41 @@ def test_install_shows_binary_status(tmp_path, process):
assert len(output) > 50
def test_install_updates_binary_table(tmp_path, process, disable_extractors_dict):
"""Test that install command runs successfully.
Binary records are created lazily when binaries are first used, not during install.
"""
def test_install_updates_binary_table(tmp_path, process):
"""Test that install completes and only mutates dependency state."""
os.chdir(tmp_path)
env = os.environ.copy()
tmp_short = Path('/tmp') / f'abx-install-{tmp_path.name}'
tmp_short.mkdir(parents=True, exist_ok=True)
env.update({
'TMP_DIR': str(tmp_short),
'ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS': 'true',
})
# Run install - it should complete without errors or timeout (which is expected)
# The install command starts the orchestrator which runs continuously
try:
result = subprocess.run(
['archivebox', 'install'],
capture_output=True,
timeout=30,
env=disable_extractors_dict,
)
# If it completes, should be successful
assert result.returncode == 0
except subprocess.TimeoutExpired:
# Timeout is expected since orchestrator runs continuously
pass
result = subprocess.run(
['archivebox', 'install'],
capture_output=True,
text=True,
timeout=420,
env=env,
)
output = result.stdout + result.stderr
assert result.returncode == 0, output
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
binary_counts = dict(c.execute(
"SELECT status, COUNT(*) FROM machine_binary GROUP BY status"
).fetchall())
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
sealed_crawls = c.execute(
"SELECT COUNT(*) FROM crawls_crawl WHERE status='sealed'"
).fetchone()[0]
conn.close()
assert sealed_crawls >= 1
assert snapshot_count == 0
assert binary_counts.get('queued', 0) == 0
assert binary_counts.get('installed', 0) > 0

View File

@@ -99,6 +99,8 @@ def test_remove_yes_flag_skips_confirmation(tmp_path, process, disable_extractor
)
assert result.returncode == 0
output = result.stdout.decode("utf-8") + result.stderr.decode("utf-8")
assert "Index now contains 0 links." in output
def test_remove_multiple_snapshots(tmp_path, process, disable_extractors_dict):
@@ -173,6 +175,30 @@ def test_remove_nonexistent_url_fails_gracefully(tmp_path, process, disable_extr
assert result.returncode != 0 or 'not found' in result.stdout.lower() or 'no matches' in result.stdout.lower()
def test_remove_reports_remaining_link_count_correctly(tmp_path, process, disable_extractors_dict):
    """Test remove reports the remaining snapshot count after deletion."""
    os.chdir(tmp_path)
    # Seed the collection with two index-only snapshots.
    for url in ('https://example.com', 'https://example.org'):
        subprocess.run(
            ['archivebox', 'add', '--index-only', '--depth=0', url],
            capture_output=True,
            env=disable_extractors_dict,
            check=True,
        )
    removal = subprocess.run(
        ['archivebox', 'remove', 'https://example.org', '--yes'],
        capture_output=True,
        env=disable_extractors_dict,
        check=True,
    )
    combined = removal.stdout.decode("utf-8") + removal.stderr.decode("utf-8")
    # Exactly one of the two links should be removed, leaving one behind.
    assert "Removed 1 out of 2 links" in combined
    assert "Index now contains 1 links." in combined
def test_remove_after_flag(tmp_path, process, disable_extractors_dict):
"""Test remove --after flag removes snapshots after date."""
os.chdir(tmp_path)

View File

@@ -1,56 +1,62 @@
#!/usr/bin/env python3
"""
Tests for archivebox schedule command.
Verify schedule creates scheduled crawl records.
"""
"""CLI-specific tests for archivebox schedule."""
import os
import subprocess
import sqlite3
import subprocess
from .fixtures import *
from .fixtures import process, disable_extractors_dict
def test_schedule_creates_scheduled_crawl(tmp_path, process, disable_extractors_dict):
"""Test that schedule command creates a scheduled crawl."""
def test_schedule_run_all_enqueues_scheduled_crawl(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'schedule', '--every=day', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
# Should complete (creating schedule or showing usage)
assert result.returncode in [0, 1, 2]
def test_schedule_with_every_flag(tmp_path, process, disable_extractors_dict):
"""Test schedule with --every flag."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'schedule', '--every=week', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
assert result.returncode in [0, 1, 2]
def test_schedule_list_shows_schedules(tmp_path, process):
"""Test that schedule can list existing schedules."""
os.chdir(tmp_path)
# Try to list schedules
result = subprocess.run(
['archivebox', 'schedule', '--list'],
subprocess.run(
['archivebox', 'schedule', '--every=daily', '--depth=0', 'https://example.com'],
capture_output=True,
text=True,
timeout=30,
check=True,
)
# Should show schedules or empty list
assert result.returncode in [0, 1, 2]
result = subprocess.run(
['archivebox', 'schedule', '--run-all'],
capture_output=True,
text=True,
env=disable_extractors_dict,
)
assert result.returncode == 0
assert 'Enqueued 1 scheduled crawl' in result.stdout
conn = sqlite3.connect(tmp_path / "index.sqlite3")
try:
crawl_count = conn.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
queued_count = conn.execute("SELECT COUNT(*) FROM crawls_crawl WHERE status = 'queued'").fetchone()[0]
finally:
conn.close()
assert crawl_count >= 2
assert queued_count >= 1
def test_schedule_without_import_path_creates_maintenance_schedule(tmp_path, process):
    """`schedule --every=day` with no import URL creates a maintenance update schedule."""
    os.chdir(tmp_path)
    proc = subprocess.run(
        ['archivebox', 'schedule', '--every=day'],
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    assert 'Created scheduled maintenance update' in proc.stdout
    db = sqlite3.connect(tmp_path / "index.sqlite3")
    try:
        # The newest crawl should be the sealed maintenance-update template.
        latest = db.execute(
            "SELECT urls, status FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
        ).fetchone()
    finally:
        db.close()
    assert latest == ('archivebox://update', 'sealed')

View File

@@ -7,10 +7,25 @@ Verify status reports accurate collection state from DB and filesystem.
import os
import subprocess
import sqlite3
from pathlib import Path
from .fixtures import *
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
candidates = {snapshot_id}
if len(snapshot_id) == 32:
candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}")
elif len(snapshot_id) == 36 and '-' in snapshot_id:
candidates.add(snapshot_id.replace('-', ''))
for needle in candidates:
for path in data_dir.rglob(needle):
if path.is_dir():
return path
return None
def test_status_runs_successfully(tmp_path, process):
"""Test that status command runs without error."""
os.chdir(tmp_path)
@@ -117,6 +132,37 @@ def test_status_detects_orphaned_directories(tmp_path, process, disable_extracto
assert 'orphan' in result.stdout.lower() or '1' in result.stdout
def test_status_counts_new_snapshot_output_dirs_as_archived(tmp_path, process, disable_extractors_dict):
    """Test status reads archived/present counts from the current snapshot output layout."""
    os.chdir(tmp_path)
    env = {**disable_extractors_dict, "ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS": "true"}
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=env,
        check=True,
    )
    db = sqlite3.connect("index.sqlite3")
    snapshot_id = db.execute(
        "SELECT id FROM core_snapshot WHERE url = ?", ('https://example.com',)
    ).fetchone()[0]
    db.close()
    snapshot_dir = _find_snapshot_dir(tmp_path, str(snapshot_id))
    assert snapshot_dir is not None, f"Snapshot output directory not found for {snapshot_id}"
    # Simulate a completed title extractor by writing its output file.
    output_file = snapshot_dir / "title" / "title.txt"
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text("Example Domain")
    result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True, env=env)
    assert result.returncode == 0, result.stdout + result.stderr
    assert 'archived: 1' in result.stdout
    assert 'present: 1' in result.stdout
def test_status_shows_user_info(tmp_path, process):
"""Test status shows user/login information."""
os.chdir(tmp_path)

View File

@@ -5,12 +5,63 @@ Verify version output and system information reporting.
"""
import os
import re
import sys
import tempfile
import subprocess
import sqlite3
from pathlib import Path
from .fixtures import *
def _archivebox_cli() -> str:
cli = Path(sys.executable).with_name("archivebox")
return str(cli if cli.exists() else "archivebox")
def _run_real_cli(
    args: list[str],
    cwd: Path,
    *,
    home_dir: Path,
    timeout: int = 180,
    extra_env: dict[str, str] | None = None,
) -> subprocess.CompletedProcess[str]:
    """Invoke the real archivebox CLI in *cwd* with an isolated environment.

    HOME is redirected to *home_dir*, DATA_DIR is dropped so the CLI derives
    it from cwd, and color/progress output is disabled for stable assertions.
    """
    env = {
        **os.environ,
        "HOME": str(home_dir),
        "USE_COLOR": "False",
        "SHOW_PROGRESS": "False",
    }
    env.pop("DATA_DIR", None)
    env.update(extra_env or {})
    return subprocess.run(
        [_archivebox_cli(), *args],
        capture_output=True,
        text=True,
        cwd=cwd,
        env=env,
        timeout=timeout,
    )
def _make_deep_collection_dir(tmp_path: Path) -> Path:
deep_dir = tmp_path / "deep-collection"
for idx in range(6):
deep_dir /= f"segment-{idx}-1234567890abcdef"
deep_dir.mkdir(parents=True)
return deep_dir
def _extract_location_path(output: str, key: str) -> Path:
for line in output.splitlines():
if key not in line:
continue
columns = [column for column in re.split(r"\s{2,}", line.strip()) if column]
if len(columns) >= 5 and columns[1] == key:
return Path(os.path.expanduser(columns[-1]))
raise AssertionError(f"Did not find a {key} location line in output:\n{output}")
def test_version_quiet_outputs_version_number(tmp_path):
"""Test that version --quiet outputs just the version number."""
os.chdir(tmp_path)
@@ -66,3 +117,32 @@ def test_version_in_uninitialized_dir_still_works(tmp_path):
# Should still output version
assert result.returncode == 0
assert len(result.stdout.strip()) > 0
def test_version_auto_selects_short_tmp_dir_for_deep_collection_path(tmp_path):
    """Test the real CLI init/version flow auto-selects a short TMP_DIR outside deep collections."""
    collection_dir = _make_deep_collection_dir(tmp_path)
    default_tmp_dir = collection_dir / "tmp"
    socket_env = {"ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS": "true"}
    with tempfile.TemporaryDirectory(prefix="abx-home-") as home_tmp:
        fake_home = Path(home_tmp)
        init_result = _run_real_cli(["init", "--quick"], cwd=collection_dir, home_dir=fake_home, extra_env=socket_env)
        assert init_result.returncode == 0, init_result.stdout + init_result.stderr
        version_result = _run_real_cli(["version"], cwd=collection_dir, home_dir=fake_home, extra_env=socket_env)
        output = version_result.stdout + version_result.stderr
        assert version_result.returncode == 0, output
        assert "ArchiveBox" in output
        assert "TMP_DIR" in output
        assert "Error with configured TMP_DIR" not in output
        # The locations table may report TMP_DIR relative to the collection dir.
        reported_tmp_dir = _extract_location_path(output, "TMP_DIR")
        if not reported_tmp_dir.is_absolute():
            reported_tmp_dir = (collection_dir / reported_tmp_dir).resolve()
        assert reported_tmp_dir.exists()
        # Must live outside the deep default tmp location and be short enough
        # that a supervisord unix socket URL fits in the ~96-char limit.
        assert not reported_tmp_dir.is_relative_to(default_tmp_dir)
        assert len(f"file://{reported_tmp_dir / 'supervisord.sock'}") <= 96

View File

@@ -0,0 +1,38 @@
import os
import unittest
from pathlib import Path
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
from archivebox.machine.models import Process
class TestProcessRuntimePaths(unittest.TestCase):
    """Runtime file layout for hook vs non-hook Process records."""

    def _assert_runtime_layout(self, process, expected_dir):
        # Every process keeps stdout/stderr/pid files inside its runtime_dir.
        self.assertEqual(process.runtime_dir, expected_dir)
        self.assertEqual(process.stdout_file, expected_dir / 'stdout.log')
        self.assertEqual(process.stderr_file, expected_dir / 'stderr.log')
        self.assertEqual(process.pid_file, expected_dir / 'process.pid')

    def test_hook_processes_use_isolated_runtime_dir(self):
        # Hook processes get a per-hook-script subdirectory under pwd/.hooks/.
        process = Process(
            process_type=Process.TypeChoices.HOOK,
            pwd='/tmp/archive/example/chrome',
            cmd=['node', '/plugins/chrome/on_Snapshot__11_chrome_wait.js', '--url=https://example.com'],
        )
        self._assert_runtime_layout(
            process,
            Path('/tmp/archive/example/chrome/.hooks/on_Snapshot__11_chrome_wait.js'),
        )

    def test_non_hook_processes_keep_runtime_files_in_pwd(self):
        # Non-hook processes (e.g. workers) write runtime files directly in pwd.
        process = Process(
            process_type=Process.TypeChoices.WORKER,
            pwd='/tmp/archive/example',
            cmd=['archivebox', 'run', '--snapshot-id', '123'],
        )
        self._assert_runtime_layout(process, Path('/tmp/archive/example'))

View File

@@ -1,44 +1,102 @@
#!/usr/bin/env python3
"""Integration tests for archivebox schedule command."""
"""Integration tests for the database-backed archivebox schedule command."""
import os
import sqlite3
import subprocess
import pytest
from .fixtures import process, disable_extractors_dict
from .fixtures import process
def test_schedule_show_lists_jobs(tmp_path, process):
"""Test that --show lists current scheduled jobs."""
def _fetchone(tmp_path, query):
conn = sqlite3.connect(tmp_path / "index.sqlite3")
try:
return conn.execute(query).fetchone()
finally:
conn.close()
def test_schedule_creates_enabled_db_schedule(tmp_path, process):
    """Scheduling an import URL stores an enabled CrawlSchedule and a sealed template crawl."""
    os.chdir(tmp_path)
    proc = subprocess.run(
        ['archivebox', 'schedule', '--every=daily', '--depth=1', 'https://example.com/feed.xml'],
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    crawl_row = _fetchone(
        tmp_path,
        "SELECT urls, status, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1",
    )
    schedule_row = _fetchone(
        tmp_path,
        "SELECT schedule, is_enabled, label FROM crawls_crawlschedule ORDER BY created_at DESC LIMIT 1",
    )
    assert schedule_row == ('daily', 1, 'Scheduled import: https://example.com/feed.xml')
    assert crawl_row == ('https://example.com/feed.xml', 'sealed', 1)
def test_schedule_show_lists_enabled_schedules(tmp_path, process):
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'schedule', '--every=weekly', 'https://example.com/feed.xml'],
capture_output=True,
text=True,
check=True,
)
result = subprocess.run(
['archivebox', 'schedule', '--show'],
capture_output=True,
text=True,
)
# Should either show jobs or indicate no jobs
assert 'no' in result.stdout.lower() or 'archivebox' in result.stdout.lower() or result.returncode == 0
assert result.returncode == 0
assert 'Active scheduled crawls' in result.stdout
assert 'https://example.com/feed.xml' in result.stdout
assert 'weekly' in result.stdout
def test_schedule_clear_removes_jobs(tmp_path, process):
"""Test that --clear removes scheduled jobs."""
def test_schedule_clear_disables_existing_schedules(tmp_path, process):
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'schedule', '--every=daily', 'https://example.com/feed.xml'],
capture_output=True,
text=True,
check=True,
)
result = subprocess.run(
['archivebox', 'schedule', '--clear'],
capture_output=True,
text=True,
)
# Should complete successfully (may have no jobs to clear)
assert result.returncode == 0
assert 'Disabled 1 scheduled crawl' in result.stdout
disabled_count = _fetchone(
tmp_path,
"SELECT COUNT(*) FROM crawls_crawlschedule WHERE is_enabled = 0",
)[0]
enabled_count = _fetchone(
tmp_path,
"SELECT COUNT(*) FROM crawls_crawlschedule WHERE is_enabled = 1",
)[0]
assert disabled_count == 1
assert enabled_count == 0
def test_schedule_every_requires_valid_period(tmp_path, process):
"""Test that --every requires valid time period."""
os.chdir(tmp_path)
result = subprocess.run(
@@ -47,15 +105,12 @@ def test_schedule_every_requires_valid_period(tmp_path, process):
text=True,
)
# Should fail with invalid period
assert result.returncode != 0 or 'invalid' in result.stdout.lower()
assert result.returncode != 0
assert 'Invalid schedule' in result.stderr or 'Invalid schedule' in result.stdout
class TestScheduleCLI:
"""Test the CLI interface for schedule command."""
def test_cli_help(self, tmp_path, process):
"""Test that --help works for schedule command."""
os.chdir(tmp_path)
result = subprocess.run(
@@ -68,7 +123,7 @@ class TestScheduleCLI:
assert '--every' in result.stdout
assert '--show' in result.stdout
assert '--clear' in result.stdout
assert '--depth' in result.stdout
assert '--run-all' in result.stdout
if __name__ == '__main__':