mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
wip
This commit is contained in:
@@ -8,7 +8,7 @@ import textwrap
|
||||
import time
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
@@ -24,13 +24,14 @@ os.environ.setdefault("DATA_DIR", str(SESSION_DATA_DIR))
|
||||
# CLI Helpers (defined before fixtures that use them)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def run_archivebox_cmd(
|
||||
args: List[str],
|
||||
args: list[str],
|
||||
data_dir: Path,
|
||||
stdin: Optional[str] = None,
|
||||
stdin: str | None = None,
|
||||
timeout: int = 60,
|
||||
env: Optional[Dict[str, str]] = None,
|
||||
) -> Tuple[str, str, int]:
|
||||
env: dict[str, str] | None = None,
|
||||
) -> tuple[str, str, int]:
|
||||
"""
|
||||
Run archivebox command via subprocess, return (stdout, stderr, returncode).
|
||||
|
||||
@@ -44,28 +45,28 @@ def run_archivebox_cmd(
|
||||
Returns:
|
||||
Tuple of (stdout, stderr, returncode)
|
||||
"""
|
||||
cmd = [sys.executable, '-m', 'archivebox'] + args
|
||||
cmd = [sys.executable, "-m", "archivebox"] + args
|
||||
|
||||
base_env = os.environ.copy()
|
||||
base_env['DATA_DIR'] = str(data_dir)
|
||||
base_env['USE_COLOR'] = 'False'
|
||||
base_env['SHOW_PROGRESS'] = 'False'
|
||||
base_env["DATA_DIR"] = str(data_dir)
|
||||
base_env["USE_COLOR"] = "False"
|
||||
base_env["SHOW_PROGRESS"] = "False"
|
||||
# Disable slow extractors for faster tests
|
||||
base_env['SAVE_ARCHIVEDOTORG'] = 'False'
|
||||
base_env['SAVE_TITLE'] = 'False'
|
||||
base_env['SAVE_FAVICON'] = 'False'
|
||||
base_env['SAVE_WGET'] = 'False'
|
||||
base_env['SAVE_WARC'] = 'False'
|
||||
base_env['SAVE_PDF'] = 'False'
|
||||
base_env['SAVE_SCREENSHOT'] = 'False'
|
||||
base_env['SAVE_DOM'] = 'False'
|
||||
base_env['SAVE_SINGLEFILE'] = 'False'
|
||||
base_env['SAVE_READABILITY'] = 'False'
|
||||
base_env['SAVE_MERCURY'] = 'False'
|
||||
base_env['SAVE_GIT'] = 'False'
|
||||
base_env['SAVE_YTDLP'] = 'False'
|
||||
base_env['SAVE_HEADERS'] = 'False'
|
||||
base_env['SAVE_HTMLTOTEXT'] = 'False'
|
||||
base_env["SAVE_ARCHIVEDOTORG"] = "False"
|
||||
base_env["SAVE_TITLE"] = "False"
|
||||
base_env["SAVE_FAVICON"] = "False"
|
||||
base_env["SAVE_WGET"] = "False"
|
||||
base_env["SAVE_WARC"] = "False"
|
||||
base_env["SAVE_PDF"] = "False"
|
||||
base_env["SAVE_SCREENSHOT"] = "False"
|
||||
base_env["SAVE_DOM"] = "False"
|
||||
base_env["SAVE_SINGLEFILE"] = "False"
|
||||
base_env["SAVE_READABILITY"] = "False"
|
||||
base_env["SAVE_MERCURY"] = "False"
|
||||
base_env["SAVE_GIT"] = "False"
|
||||
base_env["SAVE_YTDLP"] = "False"
|
||||
base_env["SAVE_HEADERS"] = "False"
|
||||
base_env["SAVE_HTMLTOTEXT"] = "False"
|
||||
|
||||
if env:
|
||||
base_env.update(env)
|
||||
@@ -87,6 +88,7 @@ def run_archivebox_cmd(
|
||||
# Fixtures
|
||||
# =============================================================================
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def isolate_test_runtime(tmp_path):
|
||||
"""
|
||||
@@ -117,6 +119,7 @@ def isolate_test_runtime(tmp_path):
|
||||
def pytest_sessionfinish(session, exitstatus):
|
||||
shutil.rmtree(SESSION_DATA_DIR, ignore_errors=True)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def isolated_data_dir(tmp_path):
|
||||
"""
|
||||
@@ -124,7 +127,7 @@ def isolated_data_dir(tmp_path):
|
||||
|
||||
Uses tmp_path for complete isolation.
|
||||
"""
|
||||
data_dir = tmp_path / 'archivebox_data'
|
||||
data_dir = tmp_path / "archivebox_data"
|
||||
data_dir.mkdir()
|
||||
return data_dir
|
||||
|
||||
@@ -137,7 +140,7 @@ def initialized_archive(isolated_data_dir):
|
||||
Runs `archivebox init` via subprocess to set up database and directories.
|
||||
"""
|
||||
stdout, stderr, returncode = run_archivebox_cmd(
|
||||
['init', '--quick'],
|
||||
["init", "--quick"],
|
||||
data_dir=isolated_data_dir,
|
||||
timeout=60,
|
||||
)
|
||||
@@ -149,23 +152,24 @@ def initialized_archive(isolated_data_dir):
|
||||
# CWD-based CLI Helpers (no DATA_DIR env)
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def run_archivebox_cmd_cwd(
|
||||
args: List[str],
|
||||
args: list[str],
|
||||
cwd: Path,
|
||||
stdin: Optional[str] = None,
|
||||
stdin: str | None = None,
|
||||
timeout: int = 60,
|
||||
env: Optional[Dict[str, str]] = None,
|
||||
) -> Tuple[str, str, int]:
|
||||
env: dict[str, str] | None = None,
|
||||
) -> tuple[str, str, int]:
|
||||
"""
|
||||
Run archivebox command via subprocess using cwd as DATA_DIR (no DATA_DIR env).
|
||||
Returns (stdout, stderr, returncode).
|
||||
"""
|
||||
cmd = [sys.executable, '-m', 'archivebox'] + args
|
||||
cmd = [sys.executable, "-m", "archivebox"] + args
|
||||
|
||||
base_env = os.environ.copy()
|
||||
base_env.pop('DATA_DIR', None)
|
||||
base_env['USE_COLOR'] = 'False'
|
||||
base_env['SHOW_PROGRESS'] = 'False'
|
||||
base_env.pop("DATA_DIR", None)
|
||||
base_env["USE_COLOR"] = "False"
|
||||
base_env["SHOW_PROGRESS"] = "False"
|
||||
|
||||
if env:
|
||||
base_env.update(env)
|
||||
@@ -183,7 +187,7 @@ def run_archivebox_cmd_cwd(
|
||||
return result.stdout, result.stderr, result.returncode
|
||||
|
||||
|
||||
def stop_process(proc: subprocess.Popen[str]) -> Tuple[str, str]:
|
||||
def stop_process(proc: subprocess.Popen[str]) -> tuple[str, str]:
|
||||
if proc.poll() is None:
|
||||
proc.terminate()
|
||||
try:
|
||||
@@ -197,11 +201,11 @@ def run_python_cwd(
|
||||
script: str,
|
||||
cwd: Path,
|
||||
timeout: int = 60,
|
||||
) -> Tuple[str, str, int]:
|
||||
) -> tuple[str, str, int]:
|
||||
base_env = os.environ.copy()
|
||||
base_env.pop('DATA_DIR', None)
|
||||
base_env.pop("DATA_DIR", None)
|
||||
result = subprocess.run(
|
||||
[sys.executable, '-'],
|
||||
[sys.executable, "-"],
|
||||
input=script,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -253,7 +257,7 @@ def wait_for_archive_outputs(
|
||||
rel_path = candidate.relative_to(snapshot_dir)
|
||||
if rel_path.parts and rel_path.parts[0] == 'responses':
|
||||
continue
|
||||
if rel_path.name in {'stdout.log', 'stderr.log', 'cmd.sh'}:
|
||||
if rel_path.name in {"stdout.log", "stderr.log", "cmd.sh"}:
|
||||
continue
|
||||
output_rel = str(rel_path)
|
||||
break
|
||||
@@ -267,64 +271,68 @@ def wait_for_archive_outputs(
|
||||
raise SystemExit(1)
|
||||
|
||||
print('READY')
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
deadline = time.time() + timeout
|
||||
while time.time() < deadline:
|
||||
stdout, _stderr, returncode = run_python_cwd(script, cwd=cwd, timeout=30)
|
||||
if returncode == 0 and 'READY' in stdout:
|
||||
if returncode == 0 and "READY" in stdout:
|
||||
return True
|
||||
time.sleep(interval)
|
||||
return False
|
||||
|
||||
|
||||
def _get_machine_type() -> str:
|
||||
import platform
|
||||
|
||||
os_name = platform.system().lower()
|
||||
arch = platform.machine().lower()
|
||||
in_docker = os.environ.get('IN_DOCKER', '').lower() in ('1', 'true', 'yes')
|
||||
suffix = '-docker' if in_docker else ''
|
||||
return f'{arch}-{os_name}{suffix}'
|
||||
in_docker = os.environ.get("IN_DOCKER", "").lower() in ("1", "true", "yes")
|
||||
suffix = "-docker" if in_docker else ""
|
||||
return f"{arch}-{os_name}{suffix}"
|
||||
|
||||
def _find_cached_chromium(lib_dir: Path) -> Optional[Path]:
|
||||
|
||||
def _find_cached_chromium(lib_dir: Path) -> Path | None:
|
||||
candidates = [
|
||||
lib_dir / 'puppeteer',
|
||||
lib_dir / 'npm' / 'node_modules' / 'puppeteer' / '.local-chromium',
|
||||
lib_dir / "puppeteer",
|
||||
lib_dir / "npm" / "node_modules" / "puppeteer" / ".local-chromium",
|
||||
]
|
||||
for base in candidates:
|
||||
if not base.exists():
|
||||
continue
|
||||
for path in base.rglob('Chromium.app/Contents/MacOS/Chromium'):
|
||||
for path in base.rglob("Chromium.app/Contents/MacOS/Chromium"):
|
||||
return path
|
||||
for path in base.rglob('chrome-linux/chrome'):
|
||||
for path in base.rglob("chrome-linux/chrome"):
|
||||
return path
|
||||
for path in base.rglob('chrome-linux64/chrome'):
|
||||
for path in base.rglob("chrome-linux64/chrome"):
|
||||
return path
|
||||
return None
|
||||
|
||||
def _find_system_browser() -> Optional[Path]:
|
||||
|
||||
def _find_system_browser() -> Path | None:
|
||||
candidates = [
|
||||
Path('/Applications/Chromium.app/Contents/MacOS/Chromium'),
|
||||
Path('/usr/bin/chromium'),
|
||||
Path('/usr/bin/chromium-browser'),
|
||||
Path("/Applications/Chromium.app/Contents/MacOS/Chromium"),
|
||||
Path("/usr/bin/chromium"),
|
||||
Path("/usr/bin/chromium-browser"),
|
||||
]
|
||||
for candidate in candidates:
|
||||
if candidate.exists():
|
||||
return candidate
|
||||
return None
|
||||
|
||||
|
||||
def _ensure_puppeteer(shared_lib: Path) -> None:
|
||||
npm_prefix = shared_lib / 'npm'
|
||||
node_modules = npm_prefix / 'node_modules'
|
||||
puppeteer_dir = node_modules / 'puppeteer'
|
||||
npm_prefix = shared_lib / "npm"
|
||||
node_modules = npm_prefix / "node_modules"
|
||||
puppeteer_dir = node_modules / "puppeteer"
|
||||
if puppeteer_dir.exists():
|
||||
return
|
||||
npm_prefix.mkdir(parents=True, exist_ok=True)
|
||||
env = os.environ.copy()
|
||||
env['PUPPETEER_SKIP_DOWNLOAD'] = '1'
|
||||
env["PUPPETEER_SKIP_DOWNLOAD"] = "1"
|
||||
subprocess.run(
|
||||
['npm', 'install', 'puppeteer'],
|
||||
["npm", "install", "puppeteer"],
|
||||
cwd=str(npm_prefix),
|
||||
env=env,
|
||||
check=True,
|
||||
@@ -345,7 +353,7 @@ def real_archive_with_example(tmp_path_factory, request):
|
||||
request.cls.data_dir = tmp_path
|
||||
|
||||
stdout, stderr, returncode = run_archivebox_cmd_cwd(
|
||||
['init', '--quick'],
|
||||
["init", "--quick"],
|
||||
cwd=tmp_path,
|
||||
timeout=120,
|
||||
)
|
||||
@@ -353,28 +361,28 @@ def real_archive_with_example(tmp_path_factory, request):
|
||||
|
||||
stdout, stderr, returncode = run_archivebox_cmd_cwd(
|
||||
[
|
||||
'config',
|
||||
'--set',
|
||||
'LISTEN_HOST=archivebox.localhost:8000',
|
||||
'PUBLIC_INDEX=True',
|
||||
'PUBLIC_SNAPSHOTS=True',
|
||||
'PUBLIC_ADD_VIEW=True',
|
||||
"config",
|
||||
"--set",
|
||||
"LISTEN_HOST=archivebox.localhost:8000",
|
||||
"PUBLIC_INDEX=True",
|
||||
"PUBLIC_SNAPSHOTS=True",
|
||||
"PUBLIC_ADD_VIEW=True",
|
||||
],
|
||||
cwd=tmp_path,
|
||||
)
|
||||
assert returncode == 0, f"archivebox config failed: {stderr}"
|
||||
|
||||
add_env = {
|
||||
'RESPONSES_ENABLED': 'True',
|
||||
'SHOW_PROGRESS': 'False',
|
||||
'USE_COLOR': 'False',
|
||||
'RESPONSES_TIMEOUT': '30',
|
||||
"RESPONSES_ENABLED": "True",
|
||||
"SHOW_PROGRESS": "False",
|
||||
"USE_COLOR": "False",
|
||||
"RESPONSES_TIMEOUT": "30",
|
||||
}
|
||||
cmd = [sys.executable, '-m', 'archivebox', 'add', '--depth=0', '--plugins=responses', 'https://example.com']
|
||||
cmd = [sys.executable, "-m", "archivebox", "add", "--depth=0", "--plugins=responses", "https://example.com"]
|
||||
base_env = os.environ.copy()
|
||||
base_env.pop('DATA_DIR', None)
|
||||
base_env['USE_COLOR'] = 'False'
|
||||
base_env['SHOW_PROGRESS'] = 'False'
|
||||
base_env.pop("DATA_DIR", None)
|
||||
base_env["USE_COLOR"] = "False"
|
||||
base_env["SHOW_PROGRESS"] = "False"
|
||||
base_env.update(add_env)
|
||||
|
||||
proc = subprocess.Popen(
|
||||
@@ -386,7 +394,7 @@ def real_archive_with_example(tmp_path_factory, request):
|
||||
env=base_env,
|
||||
)
|
||||
|
||||
ready = wait_for_archive_outputs(tmp_path, 'https://example.com', timeout=600)
|
||||
ready = wait_for_archive_outputs(tmp_path, "https://example.com", timeout=600)
|
||||
stdout, stderr = stop_process(proc)
|
||||
assert ready, f"archivebox add did not produce required outputs within timeout:\nSTDOUT:\n{stdout}\nSTDERR:\n{stderr}"
|
||||
|
||||
@@ -397,34 +405,34 @@ def real_archive_with_example(tmp_path_factory, request):
|
||||
# Output Assertions
|
||||
# =============================================================================
|
||||
|
||||
def parse_jsonl_output(stdout: str) -> List[Dict[str, Any]]:
|
||||
|
||||
def parse_jsonl_output(stdout: str) -> list[dict[str, Any]]:
|
||||
"""Parse JSONL output into list of dicts via Process parser."""
|
||||
from archivebox.machine.models import Process
|
||||
return Process.parse_records_from_text(stdout or '')
|
||||
|
||||
return Process.parse_records_from_text(stdout or "")
|
||||
|
||||
|
||||
def assert_jsonl_contains_type(stdout: str, record_type: str, min_count: int = 1):
|
||||
"""Assert output contains at least min_count records of type."""
|
||||
records = parse_jsonl_output(stdout)
|
||||
matching = [r for r in records if r.get('type') == record_type]
|
||||
assert len(matching) >= min_count, \
|
||||
f"Expected >= {min_count} {record_type}, got {len(matching)}"
|
||||
matching = [r for r in records if r.get("type") == record_type]
|
||||
assert len(matching) >= min_count, f"Expected >= {min_count} {record_type}, got {len(matching)}"
|
||||
return matching
|
||||
|
||||
|
||||
def assert_jsonl_pass_through(stdout: str, input_records: List[Dict[str, Any]]):
|
||||
def assert_jsonl_pass_through(stdout: str, input_records: list[dict[str, Any]]):
|
||||
"""Assert that input records appear in output (pass-through behavior)."""
|
||||
output_records = parse_jsonl_output(stdout)
|
||||
output_ids = {r.get('id') for r in output_records if r.get('id')}
|
||||
output_ids = {r.get("id") for r in output_records if r.get("id")}
|
||||
|
||||
for input_rec in input_records:
|
||||
input_id = input_rec.get('id')
|
||||
input_id = input_rec.get("id")
|
||||
if input_id:
|
||||
assert input_id in output_ids, \
|
||||
f"Input record {input_id} not found in output (pass-through failed)"
|
||||
assert input_id in output_ids, f"Input record {input_id} not found in output (pass-through failed)"
|
||||
|
||||
|
||||
def assert_record_has_fields(record: Dict[str, Any], required_fields: List[str]):
|
||||
def assert_record_has_fields(record: dict[str, Any], required_fields: list[str]):
|
||||
"""Assert record has all required fields with non-None values."""
|
||||
for field in required_fields:
|
||||
assert field in record, f"Record missing field: {field}"
|
||||
@@ -435,31 +443,32 @@ def assert_record_has_fields(record: Dict[str, Any], required_fields: List[str])
|
||||
# Test Data Factories
|
||||
# =============================================================================
|
||||
|
||||
def create_test_url(domain: str = 'example.com', path: str | None = None) -> str:
|
||||
|
||||
def create_test_url(domain: str = "example.com", path: str | None = None) -> str:
|
||||
"""Generate unique test URL."""
|
||||
path = path or uuid7().hex[:8]
|
||||
return f'https://{domain}/{path}'
|
||||
return f"https://{domain}/{path}"
|
||||
|
||||
|
||||
def create_test_crawl_json(urls: List[str] | None = None, **kwargs) -> Dict[str, Any]:
|
||||
def create_test_crawl_json(urls: list[str] | None = None, **kwargs) -> dict[str, Any]:
|
||||
"""Create Crawl JSONL record for testing."""
|
||||
urls = urls or [create_test_url()]
|
||||
return {
|
||||
'type': 'Crawl',
|
||||
'urls': '\n'.join(urls),
|
||||
'max_depth': kwargs.get('max_depth', 0),
|
||||
'tags_str': kwargs.get('tags_str', ''),
|
||||
'status': kwargs.get('status', 'queued'),
|
||||
**{k: v for k, v in kwargs.items() if k not in ('max_depth', 'tags_str', 'status')},
|
||||
"type": "Crawl",
|
||||
"urls": "\n".join(urls),
|
||||
"max_depth": kwargs.get("max_depth", 0),
|
||||
"tags_str": kwargs.get("tags_str", ""),
|
||||
"status": kwargs.get("status", "queued"),
|
||||
**{k: v for k, v in kwargs.items() if k not in ("max_depth", "tags_str", "status")},
|
||||
}
|
||||
|
||||
|
||||
def create_test_snapshot_json(url: str | None = None, **kwargs) -> Dict[str, Any]:
|
||||
def create_test_snapshot_json(url: str | None = None, **kwargs) -> dict[str, Any]:
|
||||
"""Create Snapshot JSONL record for testing."""
|
||||
return {
|
||||
'type': 'Snapshot',
|
||||
'url': url or create_test_url(),
|
||||
'tags_str': kwargs.get('tags_str', ''),
|
||||
'status': kwargs.get('status', 'queued'),
|
||||
**{k: v for k, v in kwargs.items() if k not in ('tags_str', 'status')},
|
||||
"type": "Snapshot",
|
||||
"url": url or create_test_url(),
|
||||
"tags_str": kwargs.get("tags_str", ""),
|
||||
"status": kwargs.get("status", "queued"),
|
||||
**{k: v for k, v in kwargs.items() if k not in ("tags_str", "status")},
|
||||
}
|
||||
|
||||
@@ -5,34 +5,38 @@ from threading import Thread
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def process(tmp_path):
|
||||
process = subprocess.run(
|
||||
['archivebox', 'init'],
|
||||
["archivebox", "init"],
|
||||
capture_output=True,
|
||||
cwd=tmp_path,
|
||||
)
|
||||
return process
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def disable_extractors_dict():
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
"SAVE_WGET": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_HTMLTOTEXT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_DOM": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
"SAVE_FAVICON": "false",
|
||||
})
|
||||
env.update(
|
||||
{
|
||||
"SAVE_WGET": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_HTMLTOTEXT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_DOM": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
"SAVE_FAVICON": "false",
|
||||
},
|
||||
)
|
||||
return env
|
||||
|
||||
|
||||
|
||||
@@ -15,7 +15,6 @@ import sqlite3
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
from archivebox.uuid_compat import uuid7
|
||||
|
||||
@@ -494,6 +493,7 @@ INSERT INTO django_content_type (app_label, model) VALUES
|
||||
# Test Data Generators
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def generate_uuid() -> str:
|
||||
"""Generate a UUID string without dashes for SQLite."""
|
||||
return uuid7().hex
|
||||
@@ -501,45 +501,50 @@ def generate_uuid() -> str:
|
||||
|
||||
def generate_timestamp() -> str:
|
||||
"""Generate a timestamp string like ArchiveBox uses."""
|
||||
return datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S') + '.000000'
|
||||
return datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S") + ".000000"
|
||||
|
||||
|
||||
def seed_0_4_data(db_path: Path) -> Dict[str, List[Dict]]:
|
||||
def seed_0_4_data(db_path: Path) -> dict[str, list[dict]]:
|
||||
"""Seed a 0.4.x database with realistic test data."""
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
cursor = conn.cursor()
|
||||
|
||||
created_data = {
|
||||
'snapshots': [],
|
||||
'tags_str': [],
|
||||
"snapshots": [],
|
||||
"tags_str": [],
|
||||
}
|
||||
|
||||
test_urls = [
|
||||
('https://example.com/page1', 'Example Page 1', 'news,tech'),
|
||||
('https://example.org/article', 'Article Title', 'blog,reading'),
|
||||
('https://github.com/user/repo', 'GitHub Repository', 'code,github'),
|
||||
('https://news.ycombinator.com/item?id=12345', 'HN Discussion', 'news,discussion'),
|
||||
('https://en.wikipedia.org/wiki/Test', 'Wikipedia Test', 'reference,wiki'),
|
||||
("https://example.com/page1", "Example Page 1", "news,tech"),
|
||||
("https://example.org/article", "Article Title", "blog,reading"),
|
||||
("https://github.com/user/repo", "GitHub Repository", "code,github"),
|
||||
("https://news.ycombinator.com/item?id=12345", "HN Discussion", "news,discussion"),
|
||||
("https://en.wikipedia.org/wiki/Test", "Wikipedia Test", "reference,wiki"),
|
||||
]
|
||||
|
||||
for i, (url, title, tags) in enumerate(test_urls):
|
||||
snapshot_id = generate_uuid()
|
||||
timestamp = f'2024010{i+1}120000.000000'
|
||||
added = f'2024-01-0{i+1} 12:00:00'
|
||||
timestamp = f"2024010{i + 1}120000.000000"
|
||||
added = f"2024-01-0{i + 1} 12:00:00"
|
||||
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO core_snapshot (id, url, timestamp, title, tags, added, updated)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
""", (snapshot_id, url, timestamp, title, tags, added, added))
|
||||
""",
|
||||
(snapshot_id, url, timestamp, title, tags, added, added),
|
||||
)
|
||||
|
||||
created_data['snapshots'].append({
|
||||
'id': snapshot_id,
|
||||
'url': url,
|
||||
'timestamp': timestamp,
|
||||
'title': title,
|
||||
'tags': tags,
|
||||
})
|
||||
created_data['tags_str'].append(tags)
|
||||
created_data["snapshots"].append(
|
||||
{
|
||||
"id": snapshot_id,
|
||||
"url": url,
|
||||
"timestamp": timestamp,
|
||||
"title": title,
|
||||
"tags": tags,
|
||||
},
|
||||
)
|
||||
created_data["tags_str"].append(tags)
|
||||
|
||||
cursor.execute("""
|
||||
INSERT INTO django_migrations (app, name, applied)
|
||||
@@ -552,16 +557,16 @@ def seed_0_4_data(db_path: Path) -> Dict[str, List[Dict]]:
|
||||
return created_data
|
||||
|
||||
|
||||
def seed_0_7_data(db_path: Path) -> Dict[str, List[Dict]]:
|
||||
def seed_0_7_data(db_path: Path) -> dict[str, list[dict]]:
|
||||
"""Seed a 0.7.x database with realistic test data."""
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
cursor = conn.cursor()
|
||||
|
||||
created_data = {
|
||||
'users': [],
|
||||
'snapshots': [],
|
||||
'tags': [],
|
||||
'archiveresults': [],
|
||||
"users": [],
|
||||
"snapshots": [],
|
||||
"tags": [],
|
||||
"archiveresults": [],
|
||||
}
|
||||
|
||||
# Create a user
|
||||
@@ -572,125 +577,145 @@ def seed_0_7_data(db_path: Path) -> Dict[str, List[Dict]]:
|
||||
'admin@example.com', 1, 1, datetime('now'))
|
||||
""")
|
||||
user_id = cursor.lastrowid
|
||||
created_data['users'].append({'id': user_id, 'username': 'admin'})
|
||||
created_data["users"].append({"id": user_id, "username": "admin"})
|
||||
|
||||
# Create 5 tags
|
||||
tag_names = ['news', 'tech', 'blog', 'reference', 'code']
|
||||
tag_names = ["news", "tech", "blog", "reference", "code"]
|
||||
for name in tag_names:
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO core_tag (name, slug) VALUES (?, ?)
|
||||
""", (name, name.lower()))
|
||||
""",
|
||||
(name, name.lower()),
|
||||
)
|
||||
tag_id = cursor.lastrowid
|
||||
created_data['tags'].append({'id': tag_id, 'name': name, 'slug': name.lower()})
|
||||
created_data["tags"].append({"id": tag_id, "name": name, "slug": name.lower()})
|
||||
|
||||
# Create 5 snapshots
|
||||
test_urls = [
|
||||
('https://example.com/page1', 'Example Page 1'),
|
||||
('https://example.org/article', 'Article Title'),
|
||||
('https://github.com/user/repo', 'GitHub Repository'),
|
||||
('https://news.ycombinator.com/item?id=12345', 'HN Discussion'),
|
||||
('https://en.wikipedia.org/wiki/Test', 'Wikipedia Test'),
|
||||
("https://example.com/page1", "Example Page 1"),
|
||||
("https://example.org/article", "Article Title"),
|
||||
("https://github.com/user/repo", "GitHub Repository"),
|
||||
("https://news.ycombinator.com/item?id=12345", "HN Discussion"),
|
||||
("https://en.wikipedia.org/wiki/Test", "Wikipedia Test"),
|
||||
]
|
||||
|
||||
for i, (url, title) in enumerate(test_urls):
|
||||
snapshot_id = generate_uuid()
|
||||
timestamp = f'2024010{i+1}120000.000000'
|
||||
added = f'2024-01-0{i+1} 12:00:00'
|
||||
timestamp = f"2024010{i + 1}120000.000000"
|
||||
added = f"2024-01-0{i + 1} 12:00:00"
|
||||
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO core_snapshot (id, url, timestamp, title, added, updated)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
""", (snapshot_id, url, timestamp, title, added, added))
|
||||
""",
|
||||
(snapshot_id, url, timestamp, title, added, added),
|
||||
)
|
||||
|
||||
created_data['snapshots'].append({
|
||||
'id': snapshot_id,
|
||||
'url': url,
|
||||
'timestamp': timestamp,
|
||||
'title': title,
|
||||
})
|
||||
created_data["snapshots"].append(
|
||||
{
|
||||
"id": snapshot_id,
|
||||
"url": url,
|
||||
"timestamp": timestamp,
|
||||
"title": title,
|
||||
},
|
||||
)
|
||||
|
||||
# Assign 2 tags to each snapshot
|
||||
tag_ids = [created_data['tags'][i % 5]['id'], created_data['tags'][(i + 1) % 5]['id']]
|
||||
tag_ids = [created_data["tags"][i % 5]["id"], created_data["tags"][(i + 1) % 5]["id"]]
|
||||
for tag_id in tag_ids:
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO core_snapshot_tags (snapshot_id, tag_id) VALUES (?, ?)
|
||||
""", (snapshot_id, tag_id))
|
||||
""",
|
||||
(snapshot_id, tag_id),
|
||||
)
|
||||
|
||||
# Create 5 archive results for each snapshot
|
||||
extractors = ['title', 'favicon', 'screenshot', 'singlefile', 'wget']
|
||||
statuses = ['succeeded', 'succeeded', 'failed', 'succeeded', 'skipped']
|
||||
extractors = ["title", "favicon", "screenshot", "singlefile", "wget"]
|
||||
statuses = ["succeeded", "succeeded", "failed", "succeeded", "skipped"]
|
||||
|
||||
for j, (extractor, status) in enumerate(zip(extractors, statuses)):
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO core_archiveresult
|
||||
(snapshot_id, extractor, cmd, pwd, cmd_version, output, start_ts, end_ts, status)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""", (
|
||||
snapshot_id, extractor,
|
||||
json.dumps([extractor, '--version']),
|
||||
f'/data/archive/{timestamp}',
|
||||
'1.0.0',
|
||||
f'{extractor}/index.html' if status == 'succeeded' else '',
|
||||
f'2024-01-0{i+1} 12:00:0{j}',
|
||||
f'2024-01-0{i+1} 12:00:1{j}',
|
||||
status
|
||||
))
|
||||
""",
|
||||
(
|
||||
snapshot_id,
|
||||
extractor,
|
||||
json.dumps([extractor, "--version"]),
|
||||
f"/data/archive/{timestamp}",
|
||||
"1.0.0",
|
||||
f"{extractor}/index.html" if status == "succeeded" else "",
|
||||
f"2024-01-0{i + 1} 12:00:0{j}",
|
||||
f"2024-01-0{i + 1} 12:00:1{j}",
|
||||
status,
|
||||
),
|
||||
)
|
||||
|
||||
created_data['archiveresults'].append({
|
||||
'snapshot_id': snapshot_id,
|
||||
'extractor': extractor,
|
||||
'status': status,
|
||||
})
|
||||
created_data["archiveresults"].append(
|
||||
{
|
||||
"snapshot_id": snapshot_id,
|
||||
"extractor": extractor,
|
||||
"status": status,
|
||||
},
|
||||
)
|
||||
|
||||
# Record migrations as applied (0.7.x migrations up to 0022)
|
||||
migrations = [
|
||||
('contenttypes', '0001_initial'),
|
||||
('contenttypes', '0002_remove_content_type_name'),
|
||||
('auth', '0001_initial'),
|
||||
('auth', '0002_alter_permission_name_max_length'),
|
||||
('auth', '0003_alter_user_email_max_length'),
|
||||
('auth', '0004_alter_user_username_opts'),
|
||||
('auth', '0005_alter_user_last_login_null'),
|
||||
('auth', '0006_require_contenttypes_0002'),
|
||||
('auth', '0007_alter_validators_add_error_messages'),
|
||||
('auth', '0008_alter_user_username_max_length'),
|
||||
('auth', '0009_alter_user_last_name_max_length'),
|
||||
('auth', '0010_alter_group_name_max_length'),
|
||||
('auth', '0011_update_proxy_permissions'),
|
||||
('auth', '0012_alter_user_first_name_max_length'),
|
||||
('admin', '0001_initial'),
|
||||
('admin', '0002_logentry_remove_auto_add'),
|
||||
('admin', '0003_logentry_add_action_flag_choices'),
|
||||
('sessions', '0001_initial'),
|
||||
('core', '0001_initial'),
|
||||
('core', '0002_auto_20200625_1521'),
|
||||
('core', '0003_auto_20200630_1034'),
|
||||
('core', '0004_auto_20200713_1552'),
|
||||
('core', '0005_auto_20200728_0326'),
|
||||
('core', '0006_auto_20201012_1520'),
|
||||
('core', '0007_archiveresult'),
|
||||
('core', '0008_auto_20210105_1421'),
|
||||
('core', '0009_auto_20210216_1038'),
|
||||
('core', '0010_auto_20210216_1055'),
|
||||
('core', '0011_auto_20210216_1331'),
|
||||
('core', '0012_auto_20210216_1425'),
|
||||
('core', '0013_auto_20210218_0729'),
|
||||
('core', '0014_auto_20210218_0729'),
|
||||
('core', '0015_auto_20210218_0730'),
|
||||
('core', '0016_auto_20210218_1204'),
|
||||
('core', '0017_auto_20210219_0211'),
|
||||
('core', '0018_auto_20210327_0952'),
|
||||
('core', '0019_auto_20210401_0654'),
|
||||
('core', '0020_auto_20210410_1031'),
|
||||
('core', '0021_auto_20220914_0934'),
|
||||
('core', '0022_auto_20231023_2008'),
|
||||
("contenttypes", "0001_initial"),
|
||||
("contenttypes", "0002_remove_content_type_name"),
|
||||
("auth", "0001_initial"),
|
||||
("auth", "0002_alter_permission_name_max_length"),
|
||||
("auth", "0003_alter_user_email_max_length"),
|
||||
("auth", "0004_alter_user_username_opts"),
|
||||
("auth", "0005_alter_user_last_login_null"),
|
||||
("auth", "0006_require_contenttypes_0002"),
|
||||
("auth", "0007_alter_validators_add_error_messages"),
|
||||
("auth", "0008_alter_user_username_max_length"),
|
||||
("auth", "0009_alter_user_last_name_max_length"),
|
||||
("auth", "0010_alter_group_name_max_length"),
|
||||
("auth", "0011_update_proxy_permissions"),
|
||||
("auth", "0012_alter_user_first_name_max_length"),
|
||||
("admin", "0001_initial"),
|
||||
("admin", "0002_logentry_remove_auto_add"),
|
||||
("admin", "0003_logentry_add_action_flag_choices"),
|
||||
("sessions", "0001_initial"),
|
||||
("core", "0001_initial"),
|
||||
("core", "0002_auto_20200625_1521"),
|
||||
("core", "0003_auto_20200630_1034"),
|
||||
("core", "0004_auto_20200713_1552"),
|
||||
("core", "0005_auto_20200728_0326"),
|
||||
("core", "0006_auto_20201012_1520"),
|
||||
("core", "0007_archiveresult"),
|
||||
("core", "0008_auto_20210105_1421"),
|
||||
("core", "0009_auto_20210216_1038"),
|
||||
("core", "0010_auto_20210216_1055"),
|
||||
("core", "0011_auto_20210216_1331"),
|
||||
("core", "0012_auto_20210216_1425"),
|
||||
("core", "0013_auto_20210218_0729"),
|
||||
("core", "0014_auto_20210218_0729"),
|
||||
("core", "0015_auto_20210218_0730"),
|
||||
("core", "0016_auto_20210218_1204"),
|
||||
("core", "0017_auto_20210219_0211"),
|
||||
("core", "0018_auto_20210327_0952"),
|
||||
("core", "0019_auto_20210401_0654"),
|
||||
("core", "0020_auto_20210410_1031"),
|
||||
("core", "0021_auto_20220914_0934"),
|
||||
("core", "0022_auto_20231023_2008"),
|
||||
]
|
||||
|
||||
for app, name in migrations:
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO django_migrations (app, name, applied)
|
||||
VALUES (?, ?, datetime('now'))
|
||||
""", (app, name))
|
||||
""",
|
||||
(app, name),
|
||||
)
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
@@ -698,17 +723,17 @@ def seed_0_7_data(db_path: Path) -> Dict[str, List[Dict]]:
|
||||
return created_data
|
||||
|
||||
|
||||
def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
|
||||
def seed_0_8_data(db_path: Path) -> dict[str, list[dict]]:
|
||||
"""Seed a 0.8.x database with realistic test data including Crawls."""
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
cursor = conn.cursor()
|
||||
|
||||
created_data = {
|
||||
'users': [],
|
||||
'crawls': [],
|
||||
'snapshots': [],
|
||||
'tags': [],
|
||||
'archiveresults': [],
|
||||
"users": [],
|
||||
"crawls": [],
|
||||
"snapshots": [],
|
||||
"tags": [],
|
||||
"archiveresults": [],
|
||||
}
|
||||
|
||||
# Create a user
|
||||
@@ -719,243 +744,271 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
|
||||
'admin@example.com', 1, 1, datetime('now'))
|
||||
""")
|
||||
user_id = cursor.lastrowid
|
||||
created_data['users'].append({'id': user_id, 'username': 'admin'})
|
||||
created_data["users"].append({"id": user_id, "username": "admin"})
|
||||
|
||||
# Create 5 tags
|
||||
tag_names = ['news', 'tech', 'blog', 'reference', 'code']
|
||||
tag_names = ["news", "tech", "blog", "reference", "code"]
|
||||
for name in tag_names:
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO core_tag (name, slug, created_at, modified_at, created_by_id)
|
||||
VALUES (?, ?, datetime('now'), datetime('now'), ?)
|
||||
""", (name, name.lower(), user_id))
|
||||
""",
|
||||
(name, name.lower(), user_id),
|
||||
)
|
||||
tag_id = cursor.lastrowid
|
||||
created_data['tags'].append({'id': tag_id, 'name': name, 'slug': name.lower()})
|
||||
created_data["tags"].append({"id": tag_id, "name": name, "slug": name.lower()})
|
||||
|
||||
# Create 2 Crawls (0.9.0 schema - no seeds)
|
||||
test_crawls = [
|
||||
('https://example.com\nhttps://example.org', 0, 'Example Crawl'),
|
||||
('https://github.com/ArchiveBox', 1, 'GitHub Crawl'),
|
||||
("https://example.com\nhttps://example.org", 0, "Example Crawl"),
|
||||
("https://github.com/ArchiveBox", 1, "GitHub Crawl"),
|
||||
]
|
||||
|
||||
for i, (urls, max_depth, label) in enumerate(test_crawls):
|
||||
crawl_id = generate_uuid()
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, urls,
|
||||
config, max_depth, tags_str, label, status, retry_at,
|
||||
num_uses_failed, num_uses_succeeded)
|
||||
VALUES (?, datetime('now'), ?, datetime('now'), ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0)
|
||||
""", (crawl_id, user_id, urls, max_depth, label))
|
||||
""",
|
||||
(crawl_id, user_id, urls, max_depth, label),
|
||||
)
|
||||
|
||||
created_data['crawls'].append({
|
||||
'id': crawl_id,
|
||||
'urls': urls,
|
||||
'max_depth': max_depth,
|
||||
'label': label,
|
||||
})
|
||||
created_data["crawls"].append(
|
||||
{
|
||||
"id": crawl_id,
|
||||
"urls": urls,
|
||||
"max_depth": max_depth,
|
||||
"label": label,
|
||||
},
|
||||
)
|
||||
|
||||
# Create 5 snapshots linked to crawls
|
||||
test_urls = [
|
||||
('https://example.com/page1', 'Example Page 1', created_data['crawls'][0]['id']),
|
||||
('https://example.org/article', 'Article Title', created_data['crawls'][0]['id']),
|
||||
('https://github.com/user/repo', 'GitHub Repository', created_data['crawls'][1]['id']),
|
||||
('https://news.ycombinator.com/item?id=12345', 'HN Discussion', None),
|
||||
('https://en.wikipedia.org/wiki/Test', 'Wikipedia Test', None),
|
||||
("https://example.com/page1", "Example Page 1", created_data["crawls"][0]["id"]),
|
||||
("https://example.org/article", "Article Title", created_data["crawls"][0]["id"]),
|
||||
("https://github.com/user/repo", "GitHub Repository", created_data["crawls"][1]["id"]),
|
||||
("https://news.ycombinator.com/item?id=12345", "HN Discussion", None),
|
||||
("https://en.wikipedia.org/wiki/Test", "Wikipedia Test", None),
|
||||
]
|
||||
|
||||
for i, (url, title, crawl_id) in enumerate(test_urls):
|
||||
snapshot_id = generate_uuid()
|
||||
timestamp = f'2024010{i+1}120000.000000'
|
||||
created_at = f'2024-01-0{i+1} 12:00:00'
|
||||
timestamp = f"2024010{i + 1}120000.000000"
|
||||
created_at = f"2024-01-0{i + 1} 12:00:00"
|
||||
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO core_snapshot (id, created_by_id, created_at, modified_at, url, timestamp,
|
||||
bookmarked_at, crawl_id, title, depth, status, config, notes)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 0, 'queued', '{}', '')
|
||||
""", (snapshot_id, user_id, created_at, created_at, url, timestamp, created_at, crawl_id, title))
|
||||
""",
|
||||
(snapshot_id, user_id, created_at, created_at, url, timestamp, created_at, crawl_id, title),
|
||||
)
|
||||
|
||||
created_data['snapshots'].append({
|
||||
'id': snapshot_id,
|
||||
'url': url,
|
||||
'timestamp': timestamp,
|
||||
'title': title,
|
||||
'crawl_id': crawl_id,
|
||||
})
|
||||
created_data["snapshots"].append(
|
||||
{
|
||||
"id": snapshot_id,
|
||||
"url": url,
|
||||
"timestamp": timestamp,
|
||||
"title": title,
|
||||
"crawl_id": crawl_id,
|
||||
},
|
||||
)
|
||||
|
||||
# Assign 2 tags to each snapshot
|
||||
tag_ids = [created_data['tags'][i % 5]['id'], created_data['tags'][(i + 1) % 5]['id']]
|
||||
tag_ids = [created_data["tags"][i % 5]["id"], created_data["tags"][(i + 1) % 5]["id"]]
|
||||
for tag_id in tag_ids:
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO core_snapshot_tags (snapshot_id, tag_id) VALUES (?, ?)
|
||||
""", (snapshot_id, tag_id))
|
||||
""",
|
||||
(snapshot_id, tag_id),
|
||||
)
|
||||
|
||||
# Create 5 archive results for each snapshot
|
||||
extractors = ['title', 'favicon', 'screenshot', 'singlefile', 'wget']
|
||||
statuses = ['succeeded', 'succeeded', 'failed', 'succeeded', 'skipped']
|
||||
extractors = ["title", "favicon", "screenshot", "singlefile", "wget"]
|
||||
statuses = ["succeeded", "succeeded", "failed", "succeeded", "skipped"]
|
||||
|
||||
for j, (extractor, status) in enumerate(zip(extractors, statuses)):
|
||||
result_uuid = generate_uuid()
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO core_archiveresult
|
||||
(uuid, created_by_id, created_at, modified_at, snapshot_id, extractor, pwd,
|
||||
cmd, cmd_version, output, start_ts, end_ts, status, retry_at, notes, output_dir)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'), '', ?)
|
||||
""", (
|
||||
result_uuid, user_id, f'2024-01-0{i+1} 12:00:0{j}', f'2024-01-0{i+1} 12:00:1{j}',
|
||||
snapshot_id, extractor,
|
||||
f'/data/archive/{timestamp}',
|
||||
json.dumps([extractor, '--version']),
|
||||
'1.0.0',
|
||||
f'{extractor}/index.html' if status == 'succeeded' else '',
|
||||
f'2024-01-0{i+1} 12:00:0{j}',
|
||||
f'2024-01-0{i+1} 12:00:1{j}',
|
||||
status,
|
||||
f'{extractor}',
|
||||
))
|
||||
""",
|
||||
(
|
||||
result_uuid,
|
||||
user_id,
|
||||
f"2024-01-0{i + 1} 12:00:0{j}",
|
||||
f"2024-01-0{i + 1} 12:00:1{j}",
|
||||
snapshot_id,
|
||||
extractor,
|
||||
f"/data/archive/{timestamp}",
|
||||
json.dumps([extractor, "--version"]),
|
||||
"1.0.0",
|
||||
f"{extractor}/index.html" if status == "succeeded" else "",
|
||||
f"2024-01-0{i + 1} 12:00:0{j}",
|
||||
f"2024-01-0{i + 1} 12:00:1{j}",
|
||||
status,
|
||||
f"{extractor}",
|
||||
),
|
||||
)
|
||||
|
||||
created_data['archiveresults'].append({
|
||||
'uuid': result_uuid,
|
||||
'snapshot_id': snapshot_id,
|
||||
'extractor': extractor,
|
||||
'status': status,
|
||||
})
|
||||
created_data["archiveresults"].append(
|
||||
{
|
||||
"uuid": result_uuid,
|
||||
"snapshot_id": snapshot_id,
|
||||
"extractor": extractor,
|
||||
"status": status,
|
||||
},
|
||||
)
|
||||
|
||||
# Record migrations as applied (0.8.x migrations)
|
||||
migrations = [
|
||||
('contenttypes', '0001_initial'),
|
||||
('contenttypes', '0002_remove_content_type_name'),
|
||||
('auth', '0001_initial'),
|
||||
('auth', '0002_alter_permission_name_max_length'),
|
||||
('auth', '0003_alter_user_email_max_length'),
|
||||
('auth', '0004_alter_user_username_opts'),
|
||||
('auth', '0005_alter_user_last_login_null'),
|
||||
('auth', '0006_require_contenttypes_0002'),
|
||||
('auth', '0007_alter_validators_add_error_messages'),
|
||||
('auth', '0008_alter_user_username_max_length'),
|
||||
('auth', '0009_alter_user_last_name_max_length'),
|
||||
('auth', '0010_alter_group_name_max_length'),
|
||||
('auth', '0011_update_proxy_permissions'),
|
||||
('auth', '0012_alter_user_first_name_max_length'),
|
||||
('admin', '0001_initial'),
|
||||
('admin', '0002_logentry_remove_auto_add'),
|
||||
('admin', '0003_logentry_add_action_flag_choices'),
|
||||
('sessions', '0001_initial'),
|
||||
('core', '0001_initial'),
|
||||
('core', '0002_auto_20200625_1521'),
|
||||
('core', '0003_auto_20200630_1034'),
|
||||
('core', '0004_auto_20200713_1552'),
|
||||
('core', '0005_auto_20200728_0326'),
|
||||
('core', '0006_auto_20201012_1520'),
|
||||
('core', '0007_archiveresult'),
|
||||
('core', '0008_auto_20210105_1421'),
|
||||
('core', '0009_auto_20210216_1038'),
|
||||
('core', '0010_auto_20210216_1055'),
|
||||
('core', '0011_auto_20210216_1331'),
|
||||
('core', '0012_auto_20210216_1425'),
|
||||
('core', '0013_auto_20210218_0729'),
|
||||
('core', '0014_auto_20210218_0729'),
|
||||
('core', '0015_auto_20210218_0730'),
|
||||
('core', '0016_auto_20210218_1204'),
|
||||
('core', '0017_auto_20210219_0211'),
|
||||
('core', '0018_auto_20210327_0952'),
|
||||
('core', '0019_auto_20210401_0654'),
|
||||
('core', '0020_auto_20210410_1031'),
|
||||
('core', '0021_auto_20220914_0934'),
|
||||
('core', '0022_auto_20231023_2008'),
|
||||
("contenttypes", "0001_initial"),
|
||||
("contenttypes", "0002_remove_content_type_name"),
|
||||
("auth", "0001_initial"),
|
||||
("auth", "0002_alter_permission_name_max_length"),
|
||||
("auth", "0003_alter_user_email_max_length"),
|
||||
("auth", "0004_alter_user_username_opts"),
|
||||
("auth", "0005_alter_user_last_login_null"),
|
||||
("auth", "0006_require_contenttypes_0002"),
|
||||
("auth", "0007_alter_validators_add_error_messages"),
|
||||
("auth", "0008_alter_user_username_max_length"),
|
||||
("auth", "0009_alter_user_last_name_max_length"),
|
||||
("auth", "0010_alter_group_name_max_length"),
|
||||
("auth", "0011_update_proxy_permissions"),
|
||||
("auth", "0012_alter_user_first_name_max_length"),
|
||||
("admin", "0001_initial"),
|
||||
("admin", "0002_logentry_remove_auto_add"),
|
||||
("admin", "0003_logentry_add_action_flag_choices"),
|
||||
("sessions", "0001_initial"),
|
||||
("core", "0001_initial"),
|
||||
("core", "0002_auto_20200625_1521"),
|
||||
("core", "0003_auto_20200630_1034"),
|
||||
("core", "0004_auto_20200713_1552"),
|
||||
("core", "0005_auto_20200728_0326"),
|
||||
("core", "0006_auto_20201012_1520"),
|
||||
("core", "0007_archiveresult"),
|
||||
("core", "0008_auto_20210105_1421"),
|
||||
("core", "0009_auto_20210216_1038"),
|
||||
("core", "0010_auto_20210216_1055"),
|
||||
("core", "0011_auto_20210216_1331"),
|
||||
("core", "0012_auto_20210216_1425"),
|
||||
("core", "0013_auto_20210218_0729"),
|
||||
("core", "0014_auto_20210218_0729"),
|
||||
("core", "0015_auto_20210218_0730"),
|
||||
("core", "0016_auto_20210218_1204"),
|
||||
("core", "0017_auto_20210219_0211"),
|
||||
("core", "0018_auto_20210327_0952"),
|
||||
("core", "0019_auto_20210401_0654"),
|
||||
("core", "0020_auto_20210410_1031"),
|
||||
("core", "0021_auto_20220914_0934"),
|
||||
("core", "0022_auto_20231023_2008"),
|
||||
# For 0.8.x (dev branch), record the migrations that 0023_new_schema replaces
|
||||
('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
|
||||
('core', '0024_auto_20240513_1143'),
|
||||
('core', '0025_alter_archiveresult_uuid'),
|
||||
('core', '0026_archiveresult_created_archiveresult_created_by_and_more'),
|
||||
('core', '0027_update_snapshot_ids'),
|
||||
('core', '0028_alter_archiveresult_uuid'),
|
||||
('core', '0029_alter_archiveresult_id'),
|
||||
('core', '0030_alter_archiveresult_uuid'),
|
||||
('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'),
|
||||
('core', '0032_alter_archiveresult_id'),
|
||||
('core', '0033_rename_id_archiveresult_old_id'),
|
||||
('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'),
|
||||
('core', '0035_remove_archiveresult_uuid_archiveresult_id'),
|
||||
('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'),
|
||||
('core', '0037_rename_id_snapshot_old_id'),
|
||||
('core', '0038_rename_uuid_snapshot_id'),
|
||||
('core', '0039_rename_snapshot_archiveresult_snapshot_old'),
|
||||
('core', '0040_archiveresult_snapshot'),
|
||||
('core', '0041_alter_archiveresult_snapshot_and_more'),
|
||||
('core', '0042_remove_archiveresult_snapshot_old'),
|
||||
('core', '0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
|
||||
('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'),
|
||||
('core', '0045_alter_snapshot_old_id'),
|
||||
('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
|
||||
('core', '0047_alter_snapshottag_unique_together_and_more'),
|
||||
('core', '0048_alter_archiveresult_snapshot_and_more'),
|
||||
('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'),
|
||||
('core', '0050_alter_snapshottag_snapshot_old'),
|
||||
('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'),
|
||||
('core', '0052_alter_snapshottag_unique_together_and_more'),
|
||||
('core', '0053_remove_snapshottag_snapshot_old'),
|
||||
('core', '0054_alter_snapshot_timestamp'),
|
||||
('core', '0055_alter_tag_slug'),
|
||||
('core', '0056_remove_tag_uuid'),
|
||||
('core', '0057_rename_id_tag_old_id'),
|
||||
('core', '0058_alter_tag_old_id'),
|
||||
('core', '0059_tag_id'),
|
||||
('core', '0060_alter_tag_id'),
|
||||
('core', '0061_rename_tag_snapshottag_old_tag_and_more'),
|
||||
('core', '0062_alter_snapshottag_old_tag'),
|
||||
('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'),
|
||||
('core', '0064_alter_snapshottag_unique_together_and_more'),
|
||||
('core', '0065_remove_snapshottag_old_tag'),
|
||||
('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'),
|
||||
('core', '0067_alter_snapshottag_tag'),
|
||||
('core', '0068_alter_archiveresult_options'),
|
||||
('core', '0069_alter_archiveresult_created_alter_snapshot_added_and_more'),
|
||||
('core', '0070_alter_archiveresult_created_by_alter_snapshot_added_and_more'),
|
||||
('core', '0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more'),
|
||||
('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
|
||||
('core', '0073_rename_created_archiveresult_created_at_and_more'),
|
||||
('core', '0074_alter_snapshot_downloaded_at'),
|
||||
("core", "0023_alter_archiveresult_options_archiveresult_abid_and_more"),
|
||||
("core", "0024_auto_20240513_1143"),
|
||||
("core", "0025_alter_archiveresult_uuid"),
|
||||
("core", "0026_archiveresult_created_archiveresult_created_by_and_more"),
|
||||
("core", "0027_update_snapshot_ids"),
|
||||
("core", "0028_alter_archiveresult_uuid"),
|
||||
("core", "0029_alter_archiveresult_id"),
|
||||
("core", "0030_alter_archiveresult_uuid"),
|
||||
("core", "0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more"),
|
||||
("core", "0032_alter_archiveresult_id"),
|
||||
("core", "0033_rename_id_archiveresult_old_id"),
|
||||
("core", "0034_alter_archiveresult_old_id_alter_archiveresult_uuid"),
|
||||
("core", "0035_remove_archiveresult_uuid_archiveresult_id"),
|
||||
("core", "0036_alter_archiveresult_id_alter_archiveresult_old_id"),
|
||||
("core", "0037_rename_id_snapshot_old_id"),
|
||||
("core", "0038_rename_uuid_snapshot_id"),
|
||||
("core", "0039_rename_snapshot_archiveresult_snapshot_old"),
|
||||
("core", "0040_archiveresult_snapshot"),
|
||||
("core", "0041_alter_archiveresult_snapshot_and_more"),
|
||||
("core", "0042_remove_archiveresult_snapshot_old"),
|
||||
("core", "0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more"),
|
||||
("core", "0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more"),
|
||||
("core", "0045_alter_snapshot_old_id"),
|
||||
("core", "0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more"),
|
||||
("core", "0047_alter_snapshottag_unique_together_and_more"),
|
||||
("core", "0048_alter_archiveresult_snapshot_and_more"),
|
||||
("core", "0049_rename_snapshot_snapshottag_snapshot_old_and_more"),
|
||||
("core", "0050_alter_snapshottag_snapshot_old"),
|
||||
("core", "0051_snapshottag_snapshot_alter_snapshottag_snapshot_old"),
|
||||
("core", "0052_alter_snapshottag_unique_together_and_more"),
|
||||
("core", "0053_remove_snapshottag_snapshot_old"),
|
||||
("core", "0054_alter_snapshot_timestamp"),
|
||||
("core", "0055_alter_tag_slug"),
|
||||
("core", "0056_remove_tag_uuid"),
|
||||
("core", "0057_rename_id_tag_old_id"),
|
||||
("core", "0058_alter_tag_old_id"),
|
||||
("core", "0059_tag_id"),
|
||||
("core", "0060_alter_tag_id"),
|
||||
("core", "0061_rename_tag_snapshottag_old_tag_and_more"),
|
||||
("core", "0062_alter_snapshottag_old_tag"),
|
||||
("core", "0063_snapshottag_tag_alter_snapshottag_old_tag"),
|
||||
("core", "0064_alter_snapshottag_unique_together_and_more"),
|
||||
("core", "0065_remove_snapshottag_old_tag"),
|
||||
("core", "0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id"),
|
||||
("core", "0067_alter_snapshottag_tag"),
|
||||
("core", "0068_alter_archiveresult_options"),
|
||||
("core", "0069_alter_archiveresult_created_alter_snapshot_added_and_more"),
|
||||
("core", "0070_alter_archiveresult_created_by_alter_snapshot_added_and_more"),
|
||||
("core", "0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more"),
|
||||
("core", "0072_rename_added_snapshot_bookmarked_at_and_more"),
|
||||
("core", "0073_rename_created_archiveresult_created_at_and_more"),
|
||||
("core", "0074_alter_snapshot_downloaded_at"),
|
||||
# For 0.8.x: DO NOT record 0023_new_schema - it replaces 0023-0074 for fresh installs
|
||||
# We already recorded 0023-0074 above, so Django will know the state
|
||||
# For 0.8.x: Record original machine migrations (before squashing)
|
||||
# DO NOT record 0001_squashed here - it replaces 0001-0004 for fresh installs
|
||||
('machine', '0001_initial'),
|
||||
('machine', '0002_alter_machine_stats_installedbinary'),
|
||||
('machine', '0003_alter_installedbinary_options_and_more'),
|
||||
('machine', '0004_alter_installedbinary_abspath_and_more'),
|
||||
("machine", "0001_initial"),
|
||||
("machine", "0002_alter_machine_stats_installedbinary"),
|
||||
("machine", "0003_alter_installedbinary_options_and_more"),
|
||||
("machine", "0004_alter_installedbinary_abspath_and_more"),
|
||||
# Then the new migrations after squashing
|
||||
('machine', '0002_rename_custom_cmds_to_overrides'),
|
||||
('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
|
||||
('machine', '0004_drop_dependency_table'),
|
||||
("machine", "0002_rename_custom_cmds_to_overrides"),
|
||||
("machine", "0003_alter_dependency_id_alter_installedbinary_dependency_and_more"),
|
||||
("machine", "0004_drop_dependency_table"),
|
||||
# Crawls must come before core.0024 because 0024_b depends on it
|
||||
('crawls', '0001_initial'),
|
||||
("crawls", "0001_initial"),
|
||||
# Core 0024 migrations chain (in dependency order)
|
||||
('core', '0024_b_clear_config_fields'),
|
||||
('core', '0024_c_disable_fk_checks'),
|
||||
('core', '0024_d_fix_crawls_config'),
|
||||
('core', '0024_snapshot_crawl'),
|
||||
('core', '0024_f_add_snapshot_config'),
|
||||
('core', '0025_allow_duplicate_urls_per_crawl'),
|
||||
("core", "0024_b_clear_config_fields"),
|
||||
("core", "0024_c_disable_fk_checks"),
|
||||
("core", "0024_d_fix_crawls_config"),
|
||||
("core", "0024_snapshot_crawl"),
|
||||
("core", "0024_f_add_snapshot_config"),
|
||||
("core", "0025_allow_duplicate_urls_per_crawl"),
|
||||
# For 0.8.x: Record original api migration (before squashing)
|
||||
# DO NOT record 0001_squashed here - it replaces 0001 for fresh installs
|
||||
('api', '0001_initial'),
|
||||
('api', '0002_alter_apitoken_options'),
|
||||
('api', '0003_rename_user_apitoken_created_by_apitoken_abid_and_more'),
|
||||
('api', '0004_alter_apitoken_id_alter_apitoken_uuid'),
|
||||
('api', '0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more'),
|
||||
('api', '0006_remove_outboundwebhook_uuid_apitoken_id_and_more'),
|
||||
('api', '0007_alter_apitoken_created_by'),
|
||||
('api', '0008_alter_apitoken_created_alter_apitoken_created_by_and_more'),
|
||||
('api', '0009_rename_created_apitoken_created_at_and_more'),
|
||||
("api", "0001_initial"),
|
||||
("api", "0002_alter_apitoken_options"),
|
||||
("api", "0003_rename_user_apitoken_created_by_apitoken_abid_and_more"),
|
||||
("api", "0004_alter_apitoken_id_alter_apitoken_uuid"),
|
||||
("api", "0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more"),
|
||||
("api", "0006_remove_outboundwebhook_uuid_apitoken_id_and_more"),
|
||||
("api", "0007_alter_apitoken_created_by"),
|
||||
("api", "0008_alter_apitoken_created_alter_apitoken_created_by_and_more"),
|
||||
("api", "0009_rename_created_apitoken_created_at_and_more"),
|
||||
# Note: crawls.0001_initial moved earlier (before core.0024) due to dependencies
|
||||
# Stop here - 0.8.x ends at core.0025, crawls.0001, and we want to TEST the later migrations
|
||||
# Do NOT record 0026+ as they need to be tested during migration
|
||||
]
|
||||
|
||||
for app, name in migrations:
|
||||
cursor.execute("""
|
||||
cursor.execute(
|
||||
"""
|
||||
INSERT INTO django_migrations (app, name, applied)
|
||||
VALUES (?, ?, datetime('now'))
|
||||
""", (app, name))
|
||||
""",
|
||||
(app, name),
|
||||
)
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
@@ -967,33 +1020,34 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
|
||||
# Helper Functions
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def run_archivebox(data_dir: Path, args: list, timeout: int = 60, env: dict | None = None) -> subprocess.CompletedProcess:
|
||||
"""Run archivebox command in subprocess with given data directory."""
|
||||
base_env = os.environ.copy()
|
||||
base_env['DATA_DIR'] = str(data_dir)
|
||||
base_env['USE_COLOR'] = 'False'
|
||||
base_env['SHOW_PROGRESS'] = 'False'
|
||||
base_env["DATA_DIR"] = str(data_dir)
|
||||
base_env["USE_COLOR"] = "False"
|
||||
base_env["SHOW_PROGRESS"] = "False"
|
||||
# Disable ALL extractors for faster tests (can be overridden by env parameter)
|
||||
base_env['SAVE_ARCHIVEDOTORG'] = 'False'
|
||||
base_env['SAVE_TITLE'] = 'False'
|
||||
base_env['SAVE_FAVICON'] = 'False'
|
||||
base_env['SAVE_WGET'] = 'False'
|
||||
base_env['SAVE_SINGLEFILE'] = 'False'
|
||||
base_env['SAVE_SCREENSHOT'] = 'False'
|
||||
base_env['SAVE_PDF'] = 'False'
|
||||
base_env['SAVE_DOM'] = 'False'
|
||||
base_env['SAVE_READABILITY'] = 'False'
|
||||
base_env['SAVE_MERCURY'] = 'False'
|
||||
base_env['SAVE_GIT'] = 'False'
|
||||
base_env['SAVE_YTDLP'] = 'False'
|
||||
base_env['SAVE_HEADERS'] = 'False'
|
||||
base_env['SAVE_HTMLTOTEXT'] = 'False'
|
||||
base_env["SAVE_ARCHIVEDOTORG"] = "False"
|
||||
base_env["SAVE_TITLE"] = "False"
|
||||
base_env["SAVE_FAVICON"] = "False"
|
||||
base_env["SAVE_WGET"] = "False"
|
||||
base_env["SAVE_SINGLEFILE"] = "False"
|
||||
base_env["SAVE_SCREENSHOT"] = "False"
|
||||
base_env["SAVE_PDF"] = "False"
|
||||
base_env["SAVE_DOM"] = "False"
|
||||
base_env["SAVE_READABILITY"] = "False"
|
||||
base_env["SAVE_MERCURY"] = "False"
|
||||
base_env["SAVE_GIT"] = "False"
|
||||
base_env["SAVE_YTDLP"] = "False"
|
||||
base_env["SAVE_HEADERS"] = "False"
|
||||
base_env["SAVE_HTMLTOTEXT"] = "False"
|
||||
|
||||
# Override with any custom env vars
|
||||
if env:
|
||||
base_env.update(env)
|
||||
|
||||
cmd = [sys.executable, '-m', 'archivebox'] + args
|
||||
cmd = [sys.executable, "-m", "archivebox"] + args
|
||||
|
||||
return subprocess.run(
|
||||
cmd,
|
||||
@@ -1007,12 +1061,12 @@ def run_archivebox(data_dir: Path, args: list, timeout: int = 60, env: dict | No
|
||||
|
||||
def create_data_dir_structure(data_dir: Path):
    """Ensure the basic ArchiveBox directory layout exists under ``data_dir``.

    Creates the ``archive``, ``sources`` and ``logs`` subdirectories,
    including any missing parent directories. Directories that already
    exist are left untouched, so the call is safe to repeat.

    Args:
        data_dir: Root of the ArchiveBox data directory.
    """
    for subdir_name in ("archive", "sources", "logs"):
        (data_dir / subdir_name).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def verify_snapshot_count(db_path: Path, expected: int) -> Tuple[bool, str]:
|
||||
def verify_snapshot_count(db_path: Path, expected: int) -> tuple[bool, str]:
|
||||
"""Verify the number of snapshots in the database."""
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
cursor = conn.cursor()
|
||||
@@ -1025,7 +1079,7 @@ def verify_snapshot_count(db_path: Path, expected: int) -> Tuple[bool, str]:
|
||||
return False, f"Snapshot count mismatch: expected {expected}, got {count}"
|
||||
|
||||
|
||||
def verify_tag_count(db_path: Path, expected: int) -> Tuple[bool, str]:
|
||||
def verify_tag_count(db_path: Path, expected: int) -> tuple[bool, str]:
|
||||
"""Verify the number of tags in the database (exact match)."""
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
cursor = conn.cursor()
|
||||
@@ -1038,7 +1092,7 @@ def verify_tag_count(db_path: Path, expected: int) -> Tuple[bool, str]:
|
||||
return False, f"Tag count mismatch: expected {expected}, got {count}"
|
||||
|
||||
|
||||
def verify_archiveresult_count(db_path: Path, expected: int) -> Tuple[bool, str]:
|
||||
def verify_archiveresult_count(db_path: Path, expected: int) -> tuple[bool, str]:
|
||||
"""Verify the number of archive results in the database."""
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
cursor = conn.cursor()
|
||||
@@ -1051,7 +1105,7 @@ def verify_archiveresult_count(db_path: Path, expected: int) -> Tuple[bool, str]
|
||||
return False, f"ArchiveResult count mismatch: expected {expected}, got {count}"
|
||||
|
||||
|
||||
def verify_snapshot_urls(db_path: Path, expected_urls: List[str]) -> Tuple[bool, str]:
|
||||
def verify_snapshot_urls(db_path: Path, expected_urls: list[str]) -> tuple[bool, str]:
|
||||
"""Verify ALL expected URLs exist in snapshots."""
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
cursor = conn.cursor()
|
||||
@@ -1065,7 +1119,7 @@ def verify_snapshot_urls(db_path: Path, expected_urls: List[str]) -> Tuple[bool,
|
||||
return False, f"Missing URLs: {missing}"
|
||||
|
||||
|
||||
def verify_snapshot_titles(db_path: Path, expected_titles: Dict[str, str]) -> Tuple[bool, str]:
|
||||
def verify_snapshot_titles(db_path: Path, expected_titles: dict[str, str]) -> tuple[bool, str]:
|
||||
"""Verify ALL snapshot titles are preserved."""
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
cursor = conn.cursor()
|
||||
@@ -1085,7 +1139,7 @@ def verify_snapshot_titles(db_path: Path, expected_titles: Dict[str, str]) -> Tu
|
||||
return False, f"Title mismatches: {mismatches}"
|
||||
|
||||
|
||||
def verify_foreign_keys(db_path: Path) -> Tuple[bool, str]:
|
||||
def verify_foreign_keys(db_path: Path) -> tuple[bool, str]:
|
||||
"""Verify foreign key relationships are intact."""
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
cursor = conn.cursor()
|
||||
@@ -1104,21 +1158,21 @@ def verify_foreign_keys(db_path: Path) -> Tuple[bool, str]:
|
||||
return False, f"Found {orphaned_results} orphaned ArchiveResults"
|
||||
|
||||
|
||||
def verify_all_snapshots_in_output(output: str, snapshots: list[dict]) -> tuple[bool, str]:
    """Check that every snapshot is mentioned somewhere in a command's output.

    A snapshot counts as present when either the first 30 characters of its
    ``url`` or its non-empty ``title`` occurs in ``output``.

    Args:
        output: Text captured from a command.
        snapshots: Dicts with a required ``"url"`` key and an optional
            ``"title"`` key.

    Returns:
        ``(True, message)`` when all snapshots are present, otherwise
        ``(False, message)`` where the message lists the full URLs of the
        snapshots that were not found.
    """

    def _is_mentioned(entry: dict) -> bool:
        # Only a URL prefix is compared; the title is the fallback match.
        if entry["url"][:30] in output:
            return True
        label = entry.get("title", "")
        return bool(label) and label in output

    absent = [entry["url"] for entry in snapshots if not _is_mentioned(entry)]
    if absent:
        return False, f"Missing snapshots in output: {absent}"
    return True, "All snapshots found in output"
|
||||
|
||||
|
||||
def verify_crawl_count(db_path: Path, expected: int) -> Tuple[bool, str]:
|
||||
def verify_crawl_count(db_path: Path, expected: int) -> tuple[bool, str]:
|
||||
"""Verify the number of crawls in the database."""
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
cursor = conn.cursor()
|
||||
@@ -1131,7 +1185,7 @@ def verify_crawl_count(db_path: Path, expected: int) -> Tuple[bool, str]:
|
||||
return False, f"Crawl count mismatch: expected {expected}, got {count}"
|
||||
|
||||
|
||||
def verify_process_migration(db_path: Path, expected_archiveresult_count: int) -> Tuple[bool, str]:
|
||||
def verify_process_migration(db_path: Path, expected_archiveresult_count: int) -> tuple[bool, str]:
|
||||
"""
|
||||
Verify that ArchiveResults were properly migrated to Process records.
|
||||
|
||||
@@ -1170,13 +1224,13 @@ def verify_process_migration(db_path: Path, expected_archiveresult_count: int) -
|
||||
status_errors = []
|
||||
for ar_status, p_status, p_exit_code in cursor.fetchall():
|
||||
expected_p_status, expected_exit_code = {
|
||||
'queued': ('queued', None),
|
||||
'started': ('running', None),
|
||||
'backoff': ('queued', None),
|
||||
'succeeded': ('exited', 0),
|
||||
'failed': ('exited', 1),
|
||||
'skipped': ('exited', None),
|
||||
}.get(ar_status, ('queued', None))
|
||||
"queued": ("queued", None),
|
||||
"started": ("running", None),
|
||||
"backoff": ("queued", None),
|
||||
"succeeded": ("exited", 0),
|
||||
"failed": ("exited", 1),
|
||||
"skipped": ("exited", None),
|
||||
}.get(ar_status, ("queued", None))
|
||||
|
||||
if p_status != expected_p_status:
|
||||
status_errors.append(f"AR status {ar_status} → Process {p_status}, expected {expected_p_status}")
|
||||
|
||||
@@ -12,48 +12,50 @@ from archivebox.crawls.models import Crawl
|
||||
pytestmark = pytest.mark.django_db
|
||||
|
||||
User = get_user_model()
|
||||
WEB_HOST = 'web.archivebox.localhost:8000'
|
||||
ADMIN_HOST = 'admin.archivebox.localhost:8000'
|
||||
WEB_HOST = "web.archivebox.localhost:8000"
|
||||
ADMIN_HOST = "admin.archivebox.localhost:8000"
|
||||
|
||||
|
||||
@pytest.fixture
def admin_user(db):
    """Create and return the superuser account used by the add-view tests.

    The ``db`` argument is not used in the body; presumably it is
    pytest-django's database-access fixture (confirm against the test setup).
    """
    credentials = {
        "username": "addviewadmin",
        "email": "addviewadmin@test.com",
        "password": "testpassword",
    }
    return User.objects.create_superuser(**credentials)
|
||||
|
||||
|
||||
def test_add_view_renders_tag_editor_and_url_filter_fields(client, admin_user, monkeypatch):
    """GET the add view with PUBLIC_ADD_VIEW enabled and check the rendered form.

    The response must be a 200 whose HTML contains the tag editor, the URL
    filter fields, the persona/notes/limit inputs and the detected-URL widgets,
    and must not contain the overwrite/update options. The client is not
    logged in here; ``admin_user`` is only requested as a fixture.
    """
    monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)

    response = client.get(reverse("add"), HTTP_HOST=WEB_HOST)
    body = response.content.decode()

    assert response.status_code == 200

    expected_fragments = (
        "tag-editor-container",
        'name="url_filters_allowlist"',
        'name="url_filters_denylist"',
        "Same domain only",
        'name="persona"',
        # NOTE(review): the diff view this was recovered from may have decoded
        # an HTML entity in the apostrophe below - confirm against the template output.
        "Index only dry run (add crawl but don't archive yet)",
        'name="notes"',
        'name="max_urls"',
        'name="max_size"',
        '<input type="text" name="notes"',
        "data-url-regex=",
        'id="url-highlight-layer"',
        'id="detected-urls-list"',
        "detected-url-toggle-btn",
    )
    unexpected_fragments = (
        "Overwrite existing snapshots",
        "Update/retry previously failed URLs",
    )

    for fragment in expected_fragments:
        assert fragment in body
    for fragment in unexpected_fragments:
        assert fragment not in body

    # The persona field has to be rendered above the "Crawl Plugins" section.
    persona_pos = body.index('name="persona"')
    plugins_heading_pos = body.index("<h3>Crawl Plugins</h3>")
    assert persona_pos < plugins_heading_pos
|
||||
|
||||
|
||||
def test_add_view_checks_configured_search_backend_by_default(client, monkeypatch):
|
||||
monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
|
||||
monkeypatch.setattr(SEARCH_BACKEND_CONFIG, 'SEARCH_BACKEND_ENGINE', 'sqlite')
|
||||
monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
|
||||
monkeypatch.setattr(SEARCH_BACKEND_CONFIG, "SEARCH_BACKEND_ENGINE", "sqlite")
|
||||
|
||||
response = client.get(reverse('add'), HTTP_HOST=WEB_HOST)
|
||||
response = client.get(reverse("add"), HTTP_HOST=WEB_HOST)
|
||||
body = response.content.decode()
|
||||
|
||||
assert response.status_code == 200
|
||||
@@ -65,99 +67,181 @@ def test_add_view_checks_configured_search_backend_by_default(client, monkeypatc
|
||||
|
||||
|
||||
def test_add_view_creates_crawl_with_tag_and_url_filter_overrides(client, admin_user, monkeypatch):
|
||||
monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
|
||||
monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
|
||||
client.force_login(admin_user)
|
||||
|
||||
response = client.post(
|
||||
reverse('add'),
|
||||
reverse("add"),
|
||||
data={
|
||||
'url': 'https://example.com\nhttps://cdn.example.com/asset.js',
|
||||
'tag': 'alpha,beta',
|
||||
'depth': '1',
|
||||
'url_filters_allowlist': 'example.com\n*.example.com',
|
||||
'url_filters_denylist': 'cdn.example.com',
|
||||
'notes': 'Created from /add/',
|
||||
'schedule': '',
|
||||
'persona': 'Default',
|
||||
'index_only': '',
|
||||
'config': '{}',
|
||||
"url": "https://example.com\nhttps://cdn.example.com/asset.js",
|
||||
"tag": "alpha,beta",
|
||||
"depth": "1",
|
||||
"max_urls": "3",
|
||||
"max_size": "45mb",
|
||||
"url_filters_allowlist": "example.com\n*.example.com",
|
||||
"url_filters_denylist": "cdn.example.com",
|
||||
"notes": "Created from /add/",
|
||||
"schedule": "",
|
||||
"persona": "Default",
|
||||
"index_only": "",
|
||||
"config": "{}",
|
||||
},
|
||||
HTTP_HOST=WEB_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 302
|
||||
|
||||
crawl = Crawl.objects.order_by('-created_at').first()
|
||||
crawl = Crawl.objects.order_by("-created_at").first()
|
||||
assert crawl is not None
|
||||
assert crawl.tags_str == 'alpha,beta'
|
||||
assert crawl.notes == 'Created from /add/'
|
||||
assert crawl.config.get('DEFAULT_PERSONA') == 'Default'
|
||||
assert crawl.config['URL_ALLOWLIST'] == 'example.com\n*.example.com'
|
||||
assert crawl.config['URL_DENYLIST'] == 'cdn.example.com'
|
||||
assert 'OVERWRITE' not in crawl.config
|
||||
assert 'ONLY_NEW' not in crawl.config
|
||||
assert crawl.tags_str == "alpha,beta"
|
||||
assert crawl.notes == "Created from /add/"
|
||||
assert crawl.max_urls == 3
|
||||
assert crawl.max_size == 45 * 1024 * 1024
|
||||
assert crawl.config.get("DEFAULT_PERSONA") == "Default"
|
||||
assert crawl.config["MAX_URLS"] == 3
|
||||
assert crawl.config["MAX_SIZE"] == 45 * 1024 * 1024
|
||||
assert crawl.config["URL_ALLOWLIST"] == "example.com\n*.example.com"
|
||||
assert crawl.config["URL_DENYLIST"] == "cdn.example.com"
|
||||
assert "OVERWRITE" not in crawl.config
|
||||
assert "ONLY_NEW" not in crawl.config
|
||||
|
||||
|
||||
def test_add_view_starts_background_runner_after_creating_crawl(client, admin_user, monkeypatch):
|
||||
monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
|
||||
client.force_login(admin_user)
|
||||
|
||||
runner_calls = []
|
||||
monkeypatch.setattr("archivebox.services.runner.ensure_background_runner", lambda: runner_calls.append(True) or True)
|
||||
|
||||
response = client.post(
|
||||
reverse("add"),
|
||||
data={
|
||||
"url": "https://example.com",
|
||||
"tag": "",
|
||||
"depth": "0",
|
||||
"max_urls": "0",
|
||||
"max_size": "0",
|
||||
"url_filters_allowlist": "",
|
||||
"url_filters_denylist": "",
|
||||
"notes": "",
|
||||
"schedule": "",
|
||||
"persona": "Default",
|
||||
"index_only": "",
|
||||
"config": "{}",
|
||||
},
|
||||
HTTP_HOST=WEB_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 302
|
||||
assert runner_calls == [True]
|
||||
|
||||
|
||||
def test_add_view_extracts_urls_from_mixed_text_input(client, admin_user, monkeypatch):
|
||||
monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
|
||||
monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
|
||||
client.force_login(admin_user)
|
||||
|
||||
response = client.post(
|
||||
reverse('add'),
|
||||
reverse("add"),
|
||||
data={
|
||||
'url': '\n'.join([
|
||||
'https://sweeting.me,https://google.com',
|
||||
'Notes: [ArchiveBox](https://github.com/ArchiveBox/ArchiveBox), https://news.ycombinator.com',
|
||||
'[Wiki](https://en.wikipedia.org/wiki/Classification_(machine_learning))',
|
||||
'{"items":["https://example.com/three"]}',
|
||||
'csv,https://example.com/four',
|
||||
]),
|
||||
'tag': '',
|
||||
'depth': '0',
|
||||
'url_filters_allowlist': '',
|
||||
'url_filters_denylist': '',
|
||||
'notes': '',
|
||||
'schedule': '',
|
||||
'persona': 'Default',
|
||||
'index_only': '',
|
||||
'config': '{}',
|
||||
"url": "\n".join(
|
||||
[
|
||||
"https://sweeting.me,https://google.com",
|
||||
"Notes: [ArchiveBox](https://github.com/ArchiveBox/ArchiveBox), https://news.ycombinator.com",
|
||||
"[Wiki](https://en.wikipedia.org/wiki/Classification_(machine_learning))",
|
||||
'{"items":["https://example.com/three"]}',
|
||||
"csv,https://example.com/four",
|
||||
],
|
||||
),
|
||||
"tag": "",
|
||||
"depth": "0",
|
||||
"max_urls": "0",
|
||||
"max_size": "0",
|
||||
"url_filters_allowlist": "",
|
||||
"url_filters_denylist": "",
|
||||
"notes": "",
|
||||
"schedule": "",
|
||||
"persona": "Default",
|
||||
"index_only": "",
|
||||
"config": "{}",
|
||||
},
|
||||
HTTP_HOST=WEB_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 302
|
||||
|
||||
crawl = Crawl.objects.order_by('-created_at').first()
|
||||
crawl = Crawl.objects.order_by("-created_at").first()
|
||||
assert crawl is not None
|
||||
assert crawl.urls == '\n'.join([
|
||||
'https://sweeting.me',
|
||||
'https://google.com',
|
||||
'https://github.com/ArchiveBox/ArchiveBox',
|
||||
'https://news.ycombinator.com',
|
||||
'https://en.wikipedia.org/wiki/Classification_(machine_learning)',
|
||||
'https://example.com/three',
|
||||
'https://example.com/four',
|
||||
])
|
||||
assert crawl.urls == "\n".join(
|
||||
[
|
||||
"https://sweeting.me",
|
||||
"https://google.com",
|
||||
"https://github.com/ArchiveBox/ArchiveBox",
|
||||
"https://news.ycombinator.com",
|
||||
"https://en.wikipedia.org/wiki/Classification_(machine_learning)",
|
||||
"https://example.com/three",
|
||||
"https://example.com/four",
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
def test_add_view_trims_trailing_punctuation_from_markdown_urls(client, admin_user, monkeypatch):
|
||||
monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
|
||||
client.force_login(admin_user)
|
||||
|
||||
response = client.post(
|
||||
reverse("add"),
|
||||
data={
|
||||
"url": "\n".join(
|
||||
[
|
||||
"Docs: https://github.com/ArchiveBox/ArchiveBox.",
|
||||
"Issue: https://github.com/abc?abc#234234?.",
|
||||
],
|
||||
),
|
||||
"tag": "",
|
||||
"depth": "0",
|
||||
"max_urls": "0",
|
||||
"max_size": "0",
|
||||
"url_filters_allowlist": "",
|
||||
"url_filters_denylist": "",
|
||||
"notes": "",
|
||||
"schedule": "",
|
||||
"persona": "Default",
|
||||
"index_only": "",
|
||||
"config": "{}",
|
||||
},
|
||||
HTTP_HOST=WEB_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 302
|
||||
|
||||
crawl = Crawl.objects.order_by("-created_at").first()
|
||||
assert crawl is not None
|
||||
assert crawl.urls == "\n".join(
|
||||
[
|
||||
"https://github.com/ArchiveBox/ArchiveBox",
|
||||
"https://github.com/abc?abc#234234",
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
def test_add_view_exposes_api_token_for_tag_widget_autocomplete(client, admin_user, monkeypatch):
|
||||
monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
|
||||
monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
|
||||
client.force_login(admin_user)
|
||||
|
||||
response = client.get(reverse('add'), HTTP_HOST=WEB_HOST)
|
||||
response = client.get(reverse("add"), HTTP_HOST=WEB_HOST)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert b'window.ARCHIVEBOX_API_KEY' in response.content
|
||||
assert b"window.ARCHIVEBOX_API_KEY" in response.content
|
||||
|
||||
|
||||
def test_tags_autocomplete_requires_auth_when_public_snapshots_list_disabled(client, settings):
|
||||
settings.PUBLIC_SNAPSHOTS_LIST = False
|
||||
settings.PUBLIC_INDEX = False
|
||||
Tag.objects.create(name='archive')
|
||||
Tag.objects.create(name="archive")
|
||||
|
||||
response = client.get(
|
||||
reverse('api-1:tags_autocomplete'),
|
||||
{'q': 'a'},
|
||||
reverse("api-1:tags_autocomplete"),
|
||||
{"q": "a"},
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
@@ -167,29 +251,29 @@ def test_tags_autocomplete_requires_auth_when_public_snapshots_list_disabled(cli
|
||||
def test_tags_autocomplete_allows_public_access_when_public_snapshots_list_enabled(client, settings):
|
||||
settings.PUBLIC_SNAPSHOTS_LIST = True
|
||||
settings.PUBLIC_INDEX = False
|
||||
Tag.objects.create(name='archive')
|
||||
Tag.objects.create(name="archive")
|
||||
|
||||
response = client.get(
|
||||
reverse('api-1:tags_autocomplete'),
|
||||
{'q': 'a'},
|
||||
reverse("api-1:tags_autocomplete"),
|
||||
{"q": "a"},
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert response.json()['tags'][0]['name'] == 'archive'
|
||||
assert response.json()["tags"][0]["name"] == "archive"
|
||||
|
||||
|
||||
def test_tags_autocomplete_allows_authenticated_user_when_public_snapshots_list_disabled(client, admin_user, settings):
|
||||
settings.PUBLIC_SNAPSHOTS_LIST = False
|
||||
settings.PUBLIC_INDEX = False
|
||||
Tag.objects.create(name='archive')
|
||||
Tag.objects.create(name="archive")
|
||||
client.force_login(admin_user)
|
||||
|
||||
response = client.get(
|
||||
reverse('api-1:tags_autocomplete'),
|
||||
{'q': 'a'},
|
||||
reverse("api-1:tags_autocomplete"),
|
||||
{"q": "a"},
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert response.json()['tags'][0]['name'] == 'archive'
|
||||
assert response.json()["tags"][0]["name"] == "archive"
|
||||
|
||||
@@ -4,83 +4,83 @@ from archivebox.base_models.admin import KeyValueWidget
|
||||
def test_key_value_widget_renders_enum_autocomplete_metadata(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
KeyValueWidget,
|
||||
'_get_config_options',
|
||||
"_get_config_options",
|
||||
lambda self: {
|
||||
'CHROME_WAIT_FOR': {
|
||||
'plugin': 'chrome',
|
||||
'type': 'string',
|
||||
'default': 'networkidle2',
|
||||
'description': 'Page load completion condition',
|
||||
'enum': ['domcontentloaded', 'load', 'networkidle0', 'networkidle2'],
|
||||
"CHROME_WAIT_FOR": {
|
||||
"plugin": "chrome",
|
||||
"type": "string",
|
||||
"default": "networkidle2",
|
||||
"description": "Page load completion condition",
|
||||
"enum": ["domcontentloaded", "load", "networkidle0", "networkidle2"],
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
html = str(
|
||||
KeyValueWidget().render(
|
||||
'config',
|
||||
{'CHROME_WAIT_FOR': 'load'},
|
||||
attrs={'id': 'id_config'},
|
||||
)
|
||||
"config",
|
||||
{"CHROME_WAIT_FOR": "load"},
|
||||
attrs={"id": "id_config"},
|
||||
),
|
||||
)
|
||||
|
||||
assert '"enum": ["domcontentloaded", "load", "networkidle0", "networkidle2"]' in html
|
||||
assert 'class="kv-value-options"' in html
|
||||
assert 'class="kv-help"' in html
|
||||
assert 'configureValueInput_id_config' in html
|
||||
assert 'describeMeta_id_config' in html
|
||||
assert 'validateValueAgainstMeta_id_config' in html
|
||||
assert "configureValueInput_id_config" in html
|
||||
assert "describeMeta_id_config" in html
|
||||
assert "validateValueAgainstMeta_id_config" in html
|
||||
|
||||
|
||||
def test_key_value_widget_renders_numeric_and_pattern_constraints(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
KeyValueWidget,
|
||||
'_get_config_options',
|
||||
"_get_config_options",
|
||||
lambda self: {
|
||||
'TIMEOUT': {
|
||||
'plugin': 'base',
|
||||
'type': 'integer',
|
||||
'default': 60,
|
||||
'description': 'Timeout in seconds',
|
||||
'minimum': 5,
|
||||
'maximum': 120,
|
||||
"TIMEOUT": {
|
||||
"plugin": "base",
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"description": "Timeout in seconds",
|
||||
"minimum": 5,
|
||||
"maximum": 120,
|
||||
},
|
||||
'CHROME_RESOLUTION': {
|
||||
'plugin': 'chrome',
|
||||
'type': 'string',
|
||||
'default': '1440,2000',
|
||||
'description': 'Viewport resolution',
|
||||
'pattern': '^\\d+,\\d+$',
|
||||
"CHROME_RESOLUTION": {
|
||||
"plugin": "chrome",
|
||||
"type": "string",
|
||||
"default": "1440,2000",
|
||||
"description": "Viewport resolution",
|
||||
"pattern": "^\\d+,\\d+$",
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
html = str(KeyValueWidget().render('config', {}, attrs={'id': 'id_config'}))
|
||||
html = str(KeyValueWidget().render("config", {}, attrs={"id": "id_config"}))
|
||||
|
||||
assert '"minimum": 5' in html
|
||||
assert '"maximum": 120' in html
|
||||
assert '"pattern": "^\\\\d+,\\\\d+$"' in html
|
||||
assert 'Expected: ' in html
|
||||
assert 'Example: ' in html
|
||||
assert 'setValueValidationState_id_config' in html
|
||||
assert 'coerceValueForStorage_id_config' in html
|
||||
assert "Expected: " in html
|
||||
assert "Example: " in html
|
||||
assert "setValueValidationState_id_config" in html
|
||||
assert "coerceValueForStorage_id_config" in html
|
||||
|
||||
|
||||
def test_key_value_widget_accepts_common_boolean_spellings(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
KeyValueWidget,
|
||||
'_get_config_options',
|
||||
"_get_config_options",
|
||||
lambda self: {
|
||||
'DEBUG': {
|
||||
'plugin': 'base',
|
||||
'type': 'boolean',
|
||||
'default': False,
|
||||
'description': 'Enable debug mode',
|
||||
"DEBUG": {
|
||||
"plugin": "base",
|
||||
"type": "boolean",
|
||||
"default": False,
|
||||
"description": "Enable debug mode",
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
html = str(KeyValueWidget().render('config', {'DEBUG': 'True'}, attrs={'id': 'id_config'}))
|
||||
html = str(KeyValueWidget().render("config", {"DEBUG": "True"}, attrs={"id": "id_config"}))
|
||||
|
||||
assert "enumValues = ['True', 'False']" in html
|
||||
assert "raw.toLowerCase()" in html
|
||||
@@ -91,35 +91,35 @@ def test_key_value_widget_accepts_common_boolean_spellings(monkeypatch):
|
||||
def test_key_value_widget_shows_array_and_object_examples_and_binary_rules(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
KeyValueWidget,
|
||||
'_get_config_options',
|
||||
"_get_config_options",
|
||||
lambda self: {
|
||||
'WGET_ARGS_EXTRA': {
|
||||
'plugin': 'wget',
|
||||
'type': 'array',
|
||||
'default': [],
|
||||
'description': 'Extra arguments to append to wget command',
|
||||
"WGET_ARGS_EXTRA": {
|
||||
"plugin": "wget",
|
||||
"type": "array",
|
||||
"default": [],
|
||||
"description": "Extra arguments to append to wget command",
|
||||
},
|
||||
'SAVE_ALLOWLIST': {
|
||||
'plugin': 'base',
|
||||
'type': 'object',
|
||||
'default': {},
|
||||
'description': 'Regex allowlist mapped to enabled methods',
|
||||
"SAVE_ALLOWLIST": {
|
||||
"plugin": "base",
|
||||
"type": "object",
|
||||
"default": {},
|
||||
"description": "Regex allowlist mapped to enabled methods",
|
||||
},
|
||||
'WGET_BINARY': {
|
||||
'plugin': 'wget',
|
||||
'type': 'string',
|
||||
'default': 'wget',
|
||||
'description': 'Path to wget binary',
|
||||
"WGET_BINARY": {
|
||||
"plugin": "wget",
|
||||
"type": "string",
|
||||
"default": "wget",
|
||||
"description": "Path to wget binary",
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
html = str(KeyValueWidget().render('config', {}, attrs={'id': 'id_config'}))
|
||||
html = str(KeyValueWidget().render("config", {}, attrs={"id": "id_config"}))
|
||||
|
||||
assert 'Example: ["--extra-arg"]' in html
|
||||
assert 'Example: {"^https://example\\\\.com": ["wget"]}' in html
|
||||
assert 'Example: wget or /usr/bin/wget' in html
|
||||
assert 'validateBinaryValue_id_config' in html
|
||||
assert "Example: wget or /usr/bin/wget" in html
|
||||
assert "validateBinaryValue_id_config" in html
|
||||
assert "meta.key.endsWith('_BINARY')" in html
|
||||
assert "Binary paths cannot contain quotes" in html
|
||||
|
||||
@@ -127,25 +127,25 @@ def test_key_value_widget_shows_array_and_object_examples_and_binary_rules(monke
|
||||
def test_key_value_widget_falls_back_to_binary_validation_for_unknown_binary_keys(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
KeyValueWidget,
|
||||
'_get_config_options',
|
||||
"_get_config_options",
|
||||
lambda self: {
|
||||
'CHROME_BINARY': {
|
||||
'plugin': 'base',
|
||||
'type': 'string',
|
||||
'default': '',
|
||||
'description': 'Resolved Chromium/Chrome binary path shared across plugins',
|
||||
"CHROME_BINARY": {
|
||||
"plugin": "base",
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Resolved Chromium/Chrome binary path shared across plugins",
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
html = str(
|
||||
KeyValueWidget().render(
|
||||
'config',
|
||||
{'NODE_BINARY': '/opt/homebrew/bin/node'},
|
||||
attrs={'id': 'id_config'},
|
||||
)
|
||||
"config",
|
||||
{"NODE_BINARY": "/opt/homebrew/bin/node"},
|
||||
attrs={"id": "id_config"},
|
||||
),
|
||||
)
|
||||
|
||||
assert 'function getMetaForKey_id_config' in html
|
||||
assert "function getMetaForKey_id_config" in html
|
||||
assert "if (key.endsWith('_BINARY'))" in html
|
||||
assert 'Path to binary executable' in html
|
||||
assert "Path to binary executable" in html
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
import pytest
|
||||
from django.contrib.admin.sites import AdminSite
|
||||
from django.test import RequestFactory
|
||||
from django.urls import reverse
|
||||
import html
|
||||
from uuid import uuid4
|
||||
|
||||
|
||||
@@ -26,18 +29,18 @@ def _create_machine():
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
return Machine.objects.create(
|
||||
guid=f'test-guid-{uuid4()}',
|
||||
hostname='test-host',
|
||||
guid=f"test-guid-{uuid4()}",
|
||||
hostname="test-host",
|
||||
hw_in_docker=False,
|
||||
hw_in_vm=False,
|
||||
hw_manufacturer='Test',
|
||||
hw_product='Test Product',
|
||||
hw_uuid=f'test-hw-{uuid4()}',
|
||||
os_arch='arm64',
|
||||
os_family='darwin',
|
||||
os_platform='macOS',
|
||||
os_release='14.0',
|
||||
os_kernel='Darwin',
|
||||
hw_manufacturer="Test",
|
||||
hw_product="Test Product",
|
||||
hw_uuid=f"test-hw-{uuid4()}",
|
||||
os_arch="arm64",
|
||||
os_family="darwin",
|
||||
os_platform="macOS",
|
||||
os_release="14.0",
|
||||
os_kernel="Darwin",
|
||||
stats={},
|
||||
config={},
|
||||
)
|
||||
@@ -48,16 +51,16 @@ def _create_iface(machine):
|
||||
|
||||
return NetworkInterface.objects.create(
|
||||
machine=machine,
|
||||
mac_address='00:11:22:33:44:66',
|
||||
ip_public='203.0.113.11',
|
||||
ip_local='10.0.0.11',
|
||||
dns_server='1.1.1.1',
|
||||
hostname='test-host',
|
||||
iface='en0',
|
||||
isp='Test ISP',
|
||||
city='Test City',
|
||||
region='Test Region',
|
||||
country='Test Country',
|
||||
mac_address="00:11:22:33:44:66",
|
||||
ip_public="203.0.113.11",
|
||||
ip_local="10.0.0.11",
|
||||
dns_server="1.1.1.1",
|
||||
hostname="test-host",
|
||||
iface="en0",
|
||||
isp="Test ISP",
|
||||
city="Test City",
|
||||
region="Test Region",
|
||||
country="Test Country",
|
||||
)
|
||||
|
||||
|
||||
@@ -72,14 +75,14 @@ def test_archiveresult_admin_links_plugin_and_process():
|
||||
machine=iface.machine,
|
||||
iface=iface,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd=str(snapshot.output_dir / 'wget'),
|
||||
cmd=['/tmp/on_Snapshot__06_wget.finite.bg.py', '--url=https://example.com'],
|
||||
pwd=str(snapshot.output_dir / "wget"),
|
||||
cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
|
||||
status=Process.StatusChoices.EXITED,
|
||||
)
|
||||
result = ArchiveResult.objects.create(
|
||||
snapshot=snapshot,
|
||||
plugin='wget',
|
||||
hook_name='on_Snapshot__06_wget.finite.bg.py',
|
||||
plugin="wget",
|
||||
hook_name="on_Snapshot__06_wget.finite.bg.py",
|
||||
process=process,
|
||||
status=ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
)
|
||||
@@ -89,8 +92,85 @@ def test_archiveresult_admin_links_plugin_and_process():
|
||||
plugin_html = str(admin.plugin_with_icon(result))
|
||||
process_html = str(admin.process_link(result))
|
||||
|
||||
assert '/admin/environment/plugins/builtin.wget/' in plugin_html
|
||||
assert f'/admin/machine/process/{process.id}/change' in process_html
|
||||
assert "/admin/environment/plugins/builtin.wget/" in plugin_html
|
||||
assert f"/admin/machine/process/{process.id}/change" in process_html
|
||||
|
||||
|
||||
def test_snapshot_admin_zip_links():
|
||||
from archivebox.core.admin_snapshots import SnapshotAdmin
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
snapshot = _create_snapshot()
|
||||
admin = SnapshotAdmin(Snapshot, AdminSite())
|
||||
|
||||
zip_url = admin.get_snapshot_zip_url(snapshot)
|
||||
|
||||
assert html.escape(zip_url, quote=True) not in str(admin.files(snapshot))
|
||||
assert html.escape(zip_url, quote=True) in str(admin.size_with_stats(snapshot))
|
||||
assert html.escape(zip_url, quote=True) in str(admin.admin_actions(snapshot))
|
||||
|
||||
|
||||
def test_archiveresult_admin_zip_links():
|
||||
from archivebox.core.admin_archiveresults import ArchiveResultAdmin
|
||||
from archivebox.core.models import ArchiveResult
|
||||
|
||||
snapshot = _create_snapshot()
|
||||
result = ArchiveResult.objects.create(
|
||||
snapshot=snapshot,
|
||||
plugin="wget",
|
||||
hook_name="on_Snapshot__06_wget.finite.bg.py",
|
||||
status=ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
output_str="Saved output",
|
||||
)
|
||||
|
||||
admin = ArchiveResultAdmin(ArchiveResult, AdminSite())
|
||||
zip_url = admin.get_output_zip_url(result)
|
||||
|
||||
assert html.escape(zip_url, quote=True) in str(admin.zip_link(result))
|
||||
assert html.escape(zip_url, quote=True) in str(admin.admin_actions(result))
|
||||
|
||||
|
||||
def test_archiveresult_admin_copy_command_redacts_sensitive_env_keys():
|
||||
from archivebox.core.admin_archiveresults import ArchiveResultAdmin
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
snapshot = _create_snapshot()
|
||||
iface = _create_iface(_create_machine())
|
||||
process = Process.objects.create(
|
||||
machine=iface.machine,
|
||||
iface=iface,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd=str(snapshot.output_dir / "wget"),
|
||||
cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
|
||||
env={
|
||||
"SOURCE_URL": "https://example.com",
|
||||
"SAFE_FLAG": "1",
|
||||
"API_KEY": "super-secret-key",
|
||||
"ACCESS_TOKEN": "super-secret-token",
|
||||
"SHARED_SECRET": "super-secret-secret",
|
||||
},
|
||||
status=Process.StatusChoices.EXITED,
|
||||
)
|
||||
result = ArchiveResult.objects.create(
|
||||
snapshot=snapshot,
|
||||
plugin="wget",
|
||||
hook_name="on_Snapshot__06_wget.finite.bg.py",
|
||||
process=process,
|
||||
status=ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
)
|
||||
|
||||
admin = ArchiveResultAdmin(ArchiveResult, AdminSite())
|
||||
cmd_html = str(admin.cmd_str(result))
|
||||
|
||||
assert "SAFE_FLAG=1" in cmd_html
|
||||
assert "SOURCE_URL=https://example.com" in cmd_html
|
||||
assert "API_KEY" not in cmd_html
|
||||
assert "ACCESS_TOKEN" not in cmd_html
|
||||
assert "SHARED_SECRET" not in cmd_html
|
||||
assert "super-secret-key" not in cmd_html
|
||||
assert "super-secret-token" not in cmd_html
|
||||
assert "super-secret-secret" not in cmd_html
|
||||
|
||||
|
||||
def test_process_admin_links_binary_and_iface():
|
||||
@@ -101,11 +181,11 @@ def test_process_admin_links_binary_and_iface():
|
||||
iface = _create_iface(machine)
|
||||
binary = Binary.objects.create(
|
||||
machine=machine,
|
||||
name='wget',
|
||||
abspath='/usr/local/bin/wget',
|
||||
version='1.21.2',
|
||||
binprovider='env',
|
||||
binproviders='env',
|
||||
name="wget",
|
||||
abspath="/usr/local/bin/wget",
|
||||
version="1.21.2",
|
||||
binprovider="env",
|
||||
binproviders="env",
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
process = Process.objects.create(
|
||||
@@ -113,8 +193,8 @@ def test_process_admin_links_binary_and_iface():
|
||||
iface=iface,
|
||||
binary=binary,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd='/tmp/wget',
|
||||
cmd=['/tmp/on_Snapshot__06_wget.finite.bg.py', '--url=https://example.com'],
|
||||
pwd="/tmp/wget",
|
||||
cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
|
||||
status=Process.StatusChoices.EXITED,
|
||||
)
|
||||
|
||||
@@ -123,5 +203,107 @@ def test_process_admin_links_binary_and_iface():
|
||||
binary_html = str(admin.binary_link(process))
|
||||
iface_html = str(admin.iface_link(process))
|
||||
|
||||
assert f'/admin/machine/binary/{binary.id}/change' in binary_html
|
||||
assert f'/admin/machine/networkinterface/{iface.id}/change' in iface_html
|
||||
assert f"/admin/machine/binary/{binary.id}/change" in binary_html
|
||||
assert f"/admin/machine/networkinterface/{iface.id}/change" in iface_html
|
||||
|
||||
|
||||
def test_process_admin_kill_actions_only_terminate_running_processes(monkeypatch):
|
||||
from archivebox.machine.admin import ProcessAdmin
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
machine = _create_machine()
|
||||
running = Process.objects.create(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd="/tmp/running",
|
||||
cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
)
|
||||
exited = Process.objects.create(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd="/tmp/exited",
|
||||
cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
|
||||
status=Process.StatusChoices.EXITED,
|
||||
)
|
||||
|
||||
admin = ProcessAdmin(Process, AdminSite())
|
||||
request = RequestFactory().post("/admin/machine/process/")
|
||||
|
||||
terminated = []
|
||||
flashed = []
|
||||
|
||||
monkeypatch.setattr(Process, "is_running", property(lambda self: self.pk == running.pk), raising=False)
|
||||
monkeypatch.setattr(Process, "terminate", lambda self, graceful_timeout=5.0: terminated.append(self.pk) or True)
|
||||
monkeypatch.setattr(admin, "message_user", lambda req, msg, level=None: flashed.append((msg, level)))
|
||||
|
||||
admin.kill_processes(request, Process.objects.filter(pk__in=[running.pk, exited.pk]).order_by("created_at"))
|
||||
|
||||
assert terminated == [running.pk]
|
||||
assert any("Killed 1 running process" in msg for msg, _level in flashed)
|
||||
assert any("Skipped 1 process" in msg for msg, _level in flashed)
|
||||
|
||||
|
||||
def test_process_admin_object_kill_action_redirects_and_skips_exited(monkeypatch):
|
||||
from archivebox.machine.admin import ProcessAdmin
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
machine = _create_machine()
|
||||
process = Process.objects.create(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd="/tmp/exited",
|
||||
cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
|
||||
status=Process.StatusChoices.EXITED,
|
||||
)
|
||||
|
||||
admin = ProcessAdmin(Process, AdminSite())
|
||||
request = RequestFactory().post(f"/admin/machine/process/{process.pk}/change/")
|
||||
|
||||
terminated = []
|
||||
flashed = []
|
||||
|
||||
monkeypatch.setattr(Process, "is_running", property(lambda self: False), raising=False)
|
||||
monkeypatch.setattr(Process, "terminate", lambda self, graceful_timeout=5.0: terminated.append(self.pk) or True)
|
||||
monkeypatch.setattr(admin, "message_user", lambda req, msg, level=None: flashed.append((msg, level)))
|
||||
|
||||
response = admin.kill_process(request, process)
|
||||
|
||||
assert response.status_code == 302
|
||||
assert response.url == reverse("admin:machine_process_change", args=[process.pk])
|
||||
assert terminated == []
|
||||
assert any("Skipped 1 process" in msg for msg, _level in flashed)
|
||||
|
||||
|
||||
def test_process_admin_output_summary_uses_archiveresult_output_files():
|
||||
from archivebox.core.models import ArchiveResult
|
||||
from archivebox.machine.admin import ProcessAdmin
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
snapshot = _create_snapshot()
|
||||
machine = _create_machine()
|
||||
process = Process.objects.create(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd=str(snapshot.output_dir / "wget"),
|
||||
cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
|
||||
status=Process.StatusChoices.EXITED,
|
||||
)
|
||||
ArchiveResult.objects.create(
|
||||
snapshot=snapshot,
|
||||
plugin="wget",
|
||||
hook_name="on_Snapshot__06_wget.finite.bg.py",
|
||||
process=process,
|
||||
status=ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
output_files={
|
||||
"index.html": {"extension": "html", "mimetype": "text/html", "size": 1024},
|
||||
"title.txt": {"extension": "txt", "mimetype": "text/plain", "size": "512"},
|
||||
},
|
||||
)
|
||||
|
||||
admin = ProcessAdmin(Process, AdminSite())
|
||||
|
||||
output_html = str(admin.output_summary(process))
|
||||
|
||||
assert "2 files" in output_html
|
||||
assert "1.5 KB" in output_html
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -12,25 +12,25 @@ User = get_user_model()
|
||||
class CLIScheduleAPITests(TestCase):
|
||||
def setUp(self):
|
||||
self.user = User.objects.create_user(
|
||||
username='api-user',
|
||||
password='testpass123',
|
||||
email='api@example.com',
|
||||
username="api-user",
|
||||
password="testpass123",
|
||||
email="api@example.com",
|
||||
)
|
||||
|
||||
def test_schedule_api_creates_schedule(self):
|
||||
request = RequestFactory().post('/api/v1/cli/schedule')
|
||||
request = RequestFactory().post("/api/v1/cli/schedule")
|
||||
request.user = self.user
|
||||
setattr(request, 'stdout', StringIO())
|
||||
setattr(request, 'stderr', StringIO())
|
||||
setattr(request, "stdout", StringIO())
|
||||
setattr(request, "stderr", StringIO())
|
||||
args = ScheduleCommandSchema(
|
||||
every='daily',
|
||||
import_path='https://example.com/feed.xml',
|
||||
every="daily",
|
||||
import_path="https://example.com/feed.xml",
|
||||
quiet=True,
|
||||
)
|
||||
|
||||
response = cli_schedule(request, args)
|
||||
|
||||
self.assertTrue(response['success'])
|
||||
self.assertEqual(response['result_format'], 'json')
|
||||
self.assertTrue(response["success"])
|
||||
self.assertEqual(response["result_format"], "json")
|
||||
self.assertEqual(CrawlSchedule.objects.count(), 1)
|
||||
self.assertEqual(len(response['result']['created_schedule_ids']), 1)
|
||||
self.assertEqual(len(response["result"]["created_schedule_ids"]), 1)
|
||||
|
||||
@@ -4,8 +4,10 @@ from uuid import uuid4
|
||||
import pytest
|
||||
from django.db import connection
|
||||
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
|
||||
|
||||
from abx_dl.events import BinaryEvent, ProcessCompletedEvent, ProcessStartedEvent
|
||||
from abx_dl.orchestrator import create_bus
|
||||
from abx_dl.output_files import OutputFile
|
||||
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
@@ -36,18 +38,18 @@ def _create_machine():
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
return Machine.objects.create(
|
||||
guid=f'test-guid-{uuid4()}',
|
||||
hostname='test-host',
|
||||
guid=f"test-guid-{uuid4()}",
|
||||
hostname="test-host",
|
||||
hw_in_docker=False,
|
||||
hw_in_vm=False,
|
||||
hw_manufacturer='Test',
|
||||
hw_product='Test Product',
|
||||
hw_uuid=f'test-hw-{uuid4()}',
|
||||
os_arch='arm64',
|
||||
os_family='darwin',
|
||||
os_platform='macOS',
|
||||
os_release='14.0',
|
||||
os_kernel='Darwin',
|
||||
hw_manufacturer="Test",
|
||||
hw_product="Test Product",
|
||||
hw_uuid=f"test-hw-{uuid4()}",
|
||||
os_arch="arm64",
|
||||
os_family="darwin",
|
||||
os_platform="macOS",
|
||||
os_release="14.0",
|
||||
os_kernel="Darwin",
|
||||
stats={},
|
||||
config={},
|
||||
)
|
||||
@@ -58,16 +60,16 @@ def _create_iface(machine):
|
||||
|
||||
return NetworkInterface.objects.create(
|
||||
machine=machine,
|
||||
mac_address='00:11:22:33:44:55',
|
||||
ip_public='203.0.113.10',
|
||||
ip_local='10.0.0.10',
|
||||
dns_server='1.1.1.1',
|
||||
hostname='test-host',
|
||||
iface='en0',
|
||||
isp='Test ISP',
|
||||
city='Test City',
|
||||
region='Test Region',
|
||||
country='Test Country',
|
||||
mac_address="00:11:22:33:44:55",
|
||||
ip_public="203.0.113.10",
|
||||
ip_local="10.0.0.10",
|
||||
dns_server="1.1.1.1",
|
||||
hostname="test-host",
|
||||
iface="en0",
|
||||
isp="Test ISP",
|
||||
city="Test City",
|
||||
region="Test Region",
|
||||
country="Test Country",
|
||||
)
|
||||
|
||||
|
||||
@@ -92,7 +94,7 @@ def test_process_completed_projects_inline_archiveresult():
|
||||
stderr="",
|
||||
exit_code=0,
|
||||
output_dir=str(plugin_dir),
|
||||
output_files=["index.html"],
|
||||
output_files=[OutputFile(path="index.html", extension="html", mimetype="text/html", size=15)],
|
||||
process_id="proc-inline",
|
||||
snapshot_id=str(snapshot.id),
|
||||
start_ts="2026-03-22T12:00:00+00:00",
|
||||
@@ -118,6 +120,8 @@ def test_process_completed_projects_inline_archiveresult():
|
||||
assert result.status == ArchiveResult.StatusChoices.SUCCEEDED
|
||||
assert result.output_str == "wget/index.html"
|
||||
assert "index.html" in result.output_files
|
||||
assert result.output_files["index.html"] == {"extension": "html", "mimetype": "text/html", "size": 15}
|
||||
assert result.output_size == 15
|
||||
_cleanup_machine_process_rows()
|
||||
|
||||
|
||||
@@ -215,24 +219,212 @@ def test_process_completed_projects_noresults_archiveresult():
|
||||
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="title", hook_name="on_Snapshot__54_title.js")
|
||||
assert result.status == ArchiveResult.StatusChoices.NORESULTS
|
||||
assert result.output_str == "No title found"
|
||||
|
||||
|
||||
def test_retry_failed_archiveresults_requeues_snapshot_in_queued_state():
    """Retrying a failed ArchiveResult clears its output and re-queues the snapshot.

    A FAILED result with leftover output is seeded, then
    `Snapshot.retry_failed_archiveresults()` is called. The test checks that
    the call reports one reset row, that the snapshot is QUEUED at step 0 with
    a `retry_at` set, and that every output/timestamp field on the result is
    back to its empty value.
    """
    from archivebox.core.models import ArchiveResult, Snapshot

    hook = "on_Snapshot__11_chrome_wait"
    snapshot = _create_snapshot()

    # Seed a failed result that still carries output from the failed run.
    failed_fields = dict(
        snapshot=snapshot,
        plugin="chrome",
        hook_name=hook,
        status=ArchiveResult.StatusChoices.FAILED,
        output_str="timed out",
        output_files={"stderr.log": {}},
        output_size=123,
        output_mimetypes="text/plain",
    )
    ArchiveResult.objects.create(**failed_fields)

    reset_count = snapshot.retry_failed_archiveresults()

    snapshot.refresh_from_db()
    result = ArchiveResult.objects.get(snapshot=snapshot, plugin="chrome", hook_name=hook)

    # Snapshot side: one row reset, snapshot queued again from the first step.
    assert reset_count == 1
    assert snapshot.status == Snapshot.StatusChoices.QUEUED
    assert snapshot.retry_at is not None
    assert snapshot.current_step == 0

    # Result side: status queued and all output/timing fields wiped.
    assert result.status == ArchiveResult.StatusChoices.QUEUED
    assert result.output_str == ""
    assert result.output_json is None
    assert result.output_files == {}
    assert result.output_size == 0
    assert result.output_mimetypes == ""
    assert result.start_ts is None
    assert result.end_ts is None

    snapshot.refresh_from_db()
    assert snapshot.title in (None, "")
    _cleanup_machine_process_rows()
|
||||
|
||||
|
||||
def test_process_completed_projects_snapshot_title_from_output_str():
    """A succeeded title hook's `output_str` ends up as the snapshot title.

    The title plugin directory is left empty, so the only source for the title
    is the "Example Domain" string carried in the hook record.
    """
    from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
    from archivebox.services.process_service import ProcessService

    snapshot = _create_snapshot()
    title_dir = Path(snapshot.output_dir) / "title"
    title_dir.mkdir(parents=True, exist_ok=True)

    bus = create_bus(name="test_snapshot_title_output_str")
    service = ArchiveResultService(bus, process_service=ProcessService(bus))

    hook_stdout = '{"snapshot_id":"%s","type":"ArchiveResult","status":"succeeded","output_str":"Example Domain"}\n' % snapshot.id
    event = ProcessCompletedEvent(
        plugin_name="title",
        hook_name="on_Snapshot__54_title.js",
        stdout=hook_stdout,
        stderr="",
        exit_code=0,
        output_dir=str(title_dir),
        output_files=[],
        process_id="proc-title-output-str",
        snapshot_id=str(snapshot.id),
        start_ts="2026-03-22T12:00:00+00:00",
        end_ts="2026-03-22T12:00:01+00:00",
    )
    # Same payload as the JSON line in stdout, handed over as an already-parsed record.
    record = {
        "snapshot_id": str(snapshot.id),
        "plugin": "title",
        "hook_name": "on_Snapshot__54_title.js",
        "status": "succeeded",
        "output_str": "Example Domain",
    }

    files, total_size, mimetypes = _collect_output_metadata(title_dir)
    service._project_from_process_completed(event, record, files, total_size, mimetypes)

    snapshot.refresh_from_db()
    assert snapshot.title == "Example Domain"
    _cleanup_machine_process_rows()
|
||||
|
||||
|
||||
def test_process_completed_projects_snapshot_title_from_title_file():
    """The snapshot title is taken from `title.txt` when the hook reports no result.

    The hook record says "noresults" / "No title found", but a `title.txt`
    containing "Example Domain" exists in the plugin directory; after
    projection the snapshot title must be "Example Domain".
    """
    from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
    from archivebox.services.process_service import ProcessService

    snapshot = _create_snapshot()
    title_dir = Path(snapshot.output_dir) / "title"
    title_dir.mkdir(parents=True, exist_ok=True)
    title_file = title_dir / "title.txt"
    title_file.write_text("Example Domain")

    bus = create_bus(name="test_snapshot_title_file")
    service = ArchiveResultService(bus, process_service=ProcessService(bus))

    hook_stdout = '{"snapshot_id":"%s","type":"ArchiveResult","status":"noresults","output_str":"No title found"}\n' % snapshot.id
    # size=14 matches the length of the text written to title.txt above.
    declared_files = [OutputFile(path="title.txt", extension="txt", mimetype="text/plain", size=14)]
    event = ProcessCompletedEvent(
        plugin_name="title",
        hook_name="on_Snapshot__54_title.js",
        stdout=hook_stdout,
        stderr="",
        exit_code=0,
        output_dir=str(title_dir),
        output_files=declared_files,
        process_id="proc-title-file",
        snapshot_id=str(snapshot.id),
        start_ts="2026-03-22T12:00:00+00:00",
        end_ts="2026-03-22T12:00:01+00:00",
    )
    record = {
        "snapshot_id": str(snapshot.id),
        "plugin": "title",
        "hook_name": "on_Snapshot__54_title.js",
        "status": "noresults",
        "output_str": "No title found",
    }

    files, total_size, mimetypes = _collect_output_metadata(title_dir)
    service._project_from_process_completed(event, record, files, total_size, mimetypes)

    snapshot.refresh_from_db()
    assert snapshot.title == "Example Domain"
    _cleanup_machine_process_rows()
|
||||
|
||||
|
||||
def test_snapshot_resolved_title_falls_back_to_title_file_without_db_title():
    """`resolved_title` reads `title/title.txt` while the stored title stays empty.

    Only a "noresults" title ArchiveResult row and a `title.txt` file exist;
    the `title` column must remain unset while `resolved_title` returns the
    file's content.
    """
    from archivebox.core.models import ArchiveResult

    snapshot = _create_snapshot()
    title_dir = Path(snapshot.output_dir) / "title"
    title_dir.mkdir(parents=True, exist_ok=True)
    title_file = title_dir / "title.txt"
    title_file.write_text("Example Domain")

    result_fields = dict(
        snapshot=snapshot,
        plugin="title",
        hook_name="on_Snapshot__54_title.js",
        status="noresults",
        output_str="No title found",
        output_files={"title.txt": {}},
    )
    ArchiveResult.objects.create(**result_fields)

    snapshot.refresh_from_db()
    # The database column is untouched; only the resolved value sees the file.
    assert snapshot.title in (None, "")
    assert snapshot.resolved_title == "Example Domain"
    _cleanup_machine_process_rows()
|
||||
|
||||
|
||||
def test_collect_output_metadata_preserves_file_metadata():
    """`_resolve_output_metadata` passes declared OutputFile metadata through unchanged.

    The directory argument points at a path that is not created by this test,
    so the expected values can only come from the declared `OutputFile`.
    """
    from archivebox.services.archive_result_service import _resolve_output_metadata

    declared = [OutputFile(path="index.html", extension="html", mimetype="text/html", size=42)]
    missing_dir = Path("/tmp/does-not-need-to-exist")

    files, total_size, mimetypes = _resolve_output_metadata(declared, missing_dir)

    expected_files = {
        "index.html": {"extension": "html", "mimetype": "text/html", "size": 42},
    }
    assert files == expected_files
    assert total_size == 42
    assert mimetypes == "text/html"
|
||||
|
||||
|
||||
def test_collect_output_metadata_detects_warc_gz_mimetype(tmp_path):
    """A nested `*.warc.gz` file is reported with the `application/warc` mimetype.

    The file is keyed by its path relative to the plugin directory, its
    extension is recorded as "gz", and its size is the 10 bytes written here.
    """
    from archivebox.services.archive_result_service import _collect_output_metadata

    wget_dir = tmp_path / "wget"
    capture = wget_dir / "warc" / "capture.warc.gz"
    capture.parent.mkdir(parents=True, exist_ok=True)
    capture.write_bytes(b"warc-bytes")

    files, total_size, mimetypes = _collect_output_metadata(wget_dir)

    expected_entry = {"extension": "gz", "mimetype": "application/warc", "size": 10}
    assert files["warc/capture.warc.gz"] == expected_entry
    assert total_size == 10
    assert mimetypes == "application/warc"
|
||||
|
||||
|
||||
def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(monkeypatch):
|
||||
from archivebox.machine.models import Binary, NetworkInterface
|
||||
from archivebox.services.process_service import ProcessService
|
||||
|
||||
machine = _create_machine()
|
||||
iface = _create_iface(machine)
|
||||
monkeypatch.setattr(NetworkInterface, 'current', classmethod(lambda cls, refresh=False: iface))
|
||||
monkeypatch.setattr(NetworkInterface, "current", classmethod(lambda cls, refresh=False: iface))
|
||||
|
||||
binary = Binary.objects.create(
|
||||
machine=machine,
|
||||
name='postlight-parser',
|
||||
abspath='/tmp/postlight-parser',
|
||||
version='2.2.3',
|
||||
binprovider='npm',
|
||||
binproviders='npm',
|
||||
name="postlight-parser",
|
||||
abspath="/tmp/postlight-parser",
|
||||
version="2.2.3",
|
||||
binprovider="npm",
|
||||
binproviders="npm",
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
|
||||
@@ -268,15 +460,15 @@ def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(mon
|
||||
|
||||
machine = _create_machine()
|
||||
iface = _create_iface(machine)
|
||||
monkeypatch.setattr(NetworkInterface, 'current', classmethod(lambda cls, refresh=False: iface))
|
||||
monkeypatch.setattr(NetworkInterface, "current", classmethod(lambda cls, refresh=False: iface))
|
||||
|
||||
node = Binary.objects.create(
|
||||
machine=machine,
|
||||
name='node',
|
||||
abspath='/tmp/node',
|
||||
version='22.0.0',
|
||||
binprovider='env',
|
||||
binproviders='env',
|
||||
name="node",
|
||||
abspath="/tmp/node",
|
||||
version="22.0.0",
|
||||
binprovider="env",
|
||||
binproviders="env",
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
|
||||
@@ -303,3 +495,40 @@ def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(mon
|
||||
process = service._get_or_create_process(event)
|
||||
assert process.binary_id == node.id
|
||||
assert process.iface_id == iface.id
|
||||
|
||||
|
||||
def test_binary_event_reuses_existing_installed_binary_row(monkeypatch):
    """A BinaryEvent for an already-installed binary updates that row in place.

    After projection there is still exactly one `wget` row for the machine;
    its status, abspath, version and binprovider are unchanged, and only
    `binproviders` takes the value carried by the event.
    """
    from archivebox.machine.models import Binary, Machine
    from archivebox.services.binary_service import BinaryService as ArchiveBoxBinaryService

    machine = _create_machine()
    # Make the service resolve "the current machine" to the one created above.
    monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))

    installed_fields = dict(
        machine=machine,
        name="wget",
        abspath="/bin/sh",
        version="9.9.9",
        binprovider="env",
        binproviders="env,apt,brew",
        status=Binary.StatusChoices.INSTALLED,
    )
    binary = Binary.objects.create(**installed_fields)

    bus = create_bus(name="test_binary_event_reuses_existing_installed_binary_row")
    service = ArchiveBoxBinaryService(bus)
    event = BinaryEvent(
        name="wget",
        plugin_name="wget",
        hook_name="on_Crawl__10_wget_install.finite.bg",
        output_dir="/tmp/wget",
        binproviders="provider",
    )

    service._project_binary(event)

    binary.refresh_from_db()
    wget_rows = Binary.objects.filter(machine=machine, name="wget")
    assert wget_rows.count() == 1
    assert binary.status == Binary.StatusChoices.INSTALLED
    assert binary.abspath == "/bin/sh"
    assert binary.version == "9.9.9"
    assert binary.binprovider == "env"
    assert binary.binproviders == "provider"
|
||||
|
||||
@@ -78,8 +78,8 @@ class TestLDAPConfig(unittest.TestCase):
|
||||
from archivebox.config import get_CONFIG
|
||||
|
||||
all_config = get_CONFIG()
|
||||
self.assertIn('LDAP_CONFIG', all_config)
|
||||
self.assertEqual(all_config['LDAP_CONFIG'].__class__.__name__, 'LDAPConfig')
|
||||
self.assertIn("LDAP_CONFIG", all_config)
|
||||
self.assertEqual(all_config["LDAP_CONFIG"].__class__.__name__, "LDAPConfig")
|
||||
|
||||
|
||||
class TestLDAPIntegration(unittest.TestCase):
|
||||
@@ -95,7 +95,7 @@ class TestLDAPIntegration(unittest.TestCase):
|
||||
self.assertIn("django.contrib.auth.backends.ModelBackend", settings.AUTHENTICATION_BACKENDS)
|
||||
|
||||
# LDAP backend should not be present when disabled
|
||||
ldap_backends = [b for b in settings.AUTHENTICATION_BACKENDS if 'ldap' in b.lower()]
|
||||
ldap_backends = [b for b in settings.AUTHENTICATION_BACKENDS if "ldap" in b.lower()]
|
||||
self.assertEqual(len(ldap_backends), 0, "LDAP backend should not be present when LDAP_ENABLED=False")
|
||||
|
||||
def test_django_settings_with_ldap_library_check(self):
|
||||
@@ -106,7 +106,8 @@ class TestLDAPIntegration(unittest.TestCase):
|
||||
if not ldap_available:
|
||||
# Settings should have loaded without LDAP backend
|
||||
from django.conf import settings
|
||||
ldap_backends = [b for b in settings.AUTHENTICATION_BACKENDS if 'ldap' in b.lower()]
|
||||
|
||||
ldap_backends = [b for b in settings.AUTHENTICATION_BACKENDS if "ldap" in b.lower()]
|
||||
self.assertEqual(len(ldap_backends), 0, "LDAP backend should not be present when libraries unavailable")
|
||||
|
||||
|
||||
@@ -117,14 +118,14 @@ class TestLDAPAuthBackend(unittest.TestCase):
|
||||
"""Test that ArchiveBoxLDAPBackend class is defined."""
|
||||
from archivebox.ldap.auth import ArchiveBoxLDAPBackend
|
||||
|
||||
self.assertTrue(hasattr(ArchiveBoxLDAPBackend, 'authenticate_ldap_user'))
|
||||
self.assertTrue(hasattr(ArchiveBoxLDAPBackend, "authenticate_ldap_user"))
|
||||
|
||||
def test_ldap_backend_inherits_correctly(self):
    """Test that ArchiveBoxLDAPBackend exposes a callable authenticate_ldap_user."""
    from archivebox.ldap.auth import ArchiveBoxLDAPBackend

    # The method may come from a base class or be overridden on the backend
    # itself; either way it has to resolve to something callable.
    auth_hook = getattr(ArchiveBoxLDAPBackend, "authenticate_ldap_user", None)
    self.assertTrue(callable(auth_hook))
|
||||
|
||||
|
||||
class TestArchiveBoxWithLDAP(unittest.TestCase):
|
||||
@@ -132,7 +133,7 @@ class TestArchiveBoxWithLDAP(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test environment."""
|
||||
self.work_dir = tempfile.mkdtemp(prefix='archivebox-ldap-test-')
|
||||
self.work_dir = tempfile.mkdtemp(prefix="archivebox-ldap-test-")
|
||||
|
||||
def test_archivebox_init_without_ldap(self):
|
||||
"""Test that archivebox init works without LDAP enabled."""
|
||||
@@ -140,15 +141,15 @@ class TestArchiveBoxWithLDAP(unittest.TestCase):
|
||||
|
||||
# Run archivebox init
|
||||
result = subprocess.run(
|
||||
[sys.executable, '-m', 'archivebox', 'init'],
|
||||
[sys.executable, "-m", "archivebox", "init"],
|
||||
cwd=self.work_dir,
|
||||
capture_output=True,
|
||||
timeout=45,
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': self.work_dir,
|
||||
'LDAP_ENABLED': 'False',
|
||||
}
|
||||
"DATA_DIR": self.work_dir,
|
||||
"LDAP_ENABLED": "False",
|
||||
},
|
||||
)
|
||||
|
||||
# Should succeed
|
||||
@@ -160,16 +161,16 @@ class TestArchiveBoxWithLDAP(unittest.TestCase):
|
||||
|
||||
# Run archivebox version with LDAP config env vars
|
||||
result = subprocess.run(
|
||||
[sys.executable, '-m', 'archivebox', 'version'],
|
||||
[sys.executable, "-m", "archivebox", "version"],
|
||||
cwd=self.work_dir,
|
||||
capture_output=True,
|
||||
timeout=10,
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': self.work_dir,
|
||||
'LDAP_ENABLED': 'False',
|
||||
'LDAP_SERVER_URI': 'ldap://ldap-test.localhost:389',
|
||||
}
|
||||
"DATA_DIR": self.work_dir,
|
||||
"LDAP_ENABLED": "False",
|
||||
"LDAP_SERVER_URI": "ldap://ldap-test.localhost:389",
|
||||
},
|
||||
)
|
||||
|
||||
# Should succeed
|
||||
@@ -181,7 +182,7 @@ class TestLDAPConfigValidationInArchiveBox(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test environment."""
|
||||
self.work_dir = tempfile.mkdtemp(prefix='archivebox-ldap-validation-')
|
||||
self.work_dir = tempfile.mkdtemp(prefix="archivebox-ldap-validation-")
|
||||
|
||||
def test_archivebox_init_with_incomplete_ldap_config(self):
|
||||
"""Test that archivebox init fails with helpful error when LDAP config is incomplete."""
|
||||
@@ -189,16 +190,16 @@ class TestLDAPConfigValidationInArchiveBox(unittest.TestCase):
|
||||
|
||||
# Run archivebox init with LDAP enabled but missing required fields
|
||||
result = subprocess.run(
|
||||
[sys.executable, '-m', 'archivebox', 'init'],
|
||||
[sys.executable, "-m", "archivebox", "init"],
|
||||
cwd=self.work_dir,
|
||||
capture_output=True,
|
||||
timeout=45,
|
||||
env={
|
||||
**os.environ,
|
||||
'DATA_DIR': self.work_dir,
|
||||
'LDAP_ENABLED': 'True',
|
||||
"DATA_DIR": self.work_dir,
|
||||
"LDAP_ENABLED": "True",
|
||||
# Missing: LDAP_SERVER_URI, LDAP_BIND_DN, etc.
|
||||
}
|
||||
},
|
||||
)
|
||||
|
||||
# Should fail with validation error
|
||||
@@ -206,9 +207,12 @@ class TestLDAPConfigValidationInArchiveBox(unittest.TestCase):
|
||||
|
||||
# Check error message
|
||||
stderr = result.stderr.decode()
|
||||
self.assertIn("LDAP_* config options must all be set", stderr,
|
||||
f"Expected validation error message in: {stderr}")
|
||||
self.assertIn(
|
||||
"LDAP_* config options must all be set",
|
||||
stderr,
|
||||
f"Expected validation error message in: {stderr}",
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -14,8 +14,8 @@ def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
|
||||
candidates = {snapshot_id}
|
||||
if len(snapshot_id) == 32:
|
||||
candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}")
|
||||
elif len(snapshot_id) == 36 and '-' in snapshot_id:
|
||||
candidates.add(snapshot_id.replace('-', ''))
|
||||
elif len(snapshot_id) == 36 and "-" in snapshot_id:
|
||||
candidates.add(snapshot_id.replace("-", ""))
|
||||
|
||||
for needle in candidates:
|
||||
for path in data_dir.rglob(needle):
|
||||
@@ -28,7 +28,7 @@ def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extrac
|
||||
"""Test that adding a single URL creates a snapshot in the database."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -41,14 +41,14 @@ def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extrac
|
||||
conn.close()
|
||||
|
||||
assert len(snapshots) == 1
|
||||
assert snapshots[0][0] == 'https://example.com'
|
||||
assert snapshots[0][0] == "https://example.com"
|
||||
|
||||
|
||||
def test_add_bg_creates_root_snapshot_rows_immediately(tmp_path, process, disable_extractors_dict):
|
||||
"""Background add should create root snapshots immediately so the queue is visible in the DB."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'add', '--bg', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--bg", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -61,15 +61,15 @@ def test_add_bg_creates_root_snapshot_rows_immediately(tmp_path, process, disabl
|
||||
conn.close()
|
||||
|
||||
assert len(snapshots) == 1
|
||||
assert snapshots[0][0] == 'https://example.com'
|
||||
assert snapshots[0][1] == 'queued'
|
||||
assert snapshots[0][0] == "https://example.com"
|
||||
assert snapshots[0][1] == "queued"
|
||||
|
||||
|
||||
def test_add_creates_crawl_record(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that add command creates a Crawl record in the database."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -86,7 +86,7 @@ def test_add_creates_source_file(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that add creates a source file with the URL."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -105,7 +105,7 @@ def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_
|
||||
"""Test adding multiple URLs in a single command."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com', 'https://example.org'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com", "https://example.org"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -119,8 +119,8 @@ def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_
|
||||
conn.close()
|
||||
|
||||
assert snapshot_count == 2
|
||||
assert urls[0][0] == 'https://example.com'
|
||||
assert urls[1][0] == 'https://example.org'
|
||||
assert urls[0][0] == "https://example.com"
|
||||
assert urls[1][0] == "https://example.org"
|
||||
|
||||
|
||||
def test_add_from_file(tmp_path, process, disable_extractors_dict):
|
||||
@@ -136,7 +136,7 @@ def test_add_from_file(tmp_path, process, disable_extractors_dict):
|
||||
urls_file.write_text("https://example.com\nhttps://example.org\n")
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', str(urls_file)],
|
||||
["archivebox", "add", "--index-only", "--depth=0", str(urls_file)],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -158,41 +158,41 @@ def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that --depth=0 flag is accepted and works."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'unrecognized arguments: --depth' not in result.stderr.decode('utf-8')
|
||||
assert "unrecognized arguments: --depth" not in result.stderr.decode("utf-8")
|
||||
|
||||
|
||||
def test_add_with_depth_1_flag(tmp_path, process, disable_extractors_dict):
    """Check that `archivebox add` accepts --depth=1 and exits cleanly."""
    os.chdir(tmp_path)

    command = ["archivebox", "add", "--index-only", "--depth=1", "https://example.com"]
    completed = subprocess.run(command, capture_output=True, env=disable_extractors_dict)

    assert completed.returncode == 0
    # The argument parser must not have rejected the flag.
    stderr_text = completed.stderr.decode("utf-8")
    assert "unrecognized arguments: --depth" not in stderr_text
|
||||
|
||||
|
||||
def test_add_rejects_invalid_depth_values(tmp_path, process, disable_extractors_dict):
    """Check that `archivebox add` fails for the depth values 5 and -1."""
    os.chdir(tmp_path)

    rejection_markers = ("invalid", "not one of")
    for bad_depth in ("5", "-1"):
        command = ["archivebox", "add", "--index-only", f"--depth={bad_depth}", "https://example.com"]
        completed = subprocess.run(command, capture_output=True, env=disable_extractors_dict)
        stderr_text = completed.stderr.decode("utf-8").lower()

        assert completed.returncode != 0
        # Either wording is accepted as the rejection message.
        assert any(marker in stderr_text for marker in rejection_markers)
|
||||
|
||||
|
||||
def test_add_with_tags(tmp_path, process, disable_extractors_dict):
|
||||
@@ -203,7 +203,7 @@ def test_add_with_tags(tmp_path, process, disable_extractors_dict):
|
||||
"""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', '--tag=test,example', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "--tag=test,example", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -214,14 +214,14 @@ def test_add_with_tags(tmp_path, process, disable_extractors_dict):
|
||||
conn.close()
|
||||
|
||||
# Tags are stored as a comma-separated string in crawl
|
||||
assert 'test' in tags_str or 'example' in tags_str
|
||||
assert "test" in tags_str or "example" in tags_str
|
||||
|
||||
|
||||
def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extractors_dict):
|
||||
"""Test add persists the selected persona so browser config derives from it later."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', '--persona=Default', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "--persona=Default", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -231,12 +231,12 @@ def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extrac
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
persona_id, default_persona = c.execute(
|
||||
"SELECT persona_id, json_extract(config, '$.DEFAULT_PERSONA') FROM crawls_crawl LIMIT 1"
|
||||
"SELECT persona_id, json_extract(config, '$.DEFAULT_PERSONA') FROM crawls_crawl LIMIT 1",
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
assert persona_id
|
||||
assert default_persona == 'Default'
|
||||
assert default_persona == "Default"
|
||||
assert (tmp_path / "personas" / "Default" / "chrome_user_data").is_dir()
|
||||
|
||||
|
||||
@@ -244,10 +244,13 @@ def test_add_records_url_filter_overrides_on_crawl(tmp_path, process, disable_ex
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
[
|
||||
'archivebox', 'add', '--index-only', '--depth=0',
|
||||
'--domain-allowlist=example.com,*.example.com',
|
||||
'--domain-denylist=static.example.com',
|
||||
'https://example.com',
|
||||
"archivebox",
|
||||
"add",
|
||||
"--index-only",
|
||||
"--depth=0",
|
||||
"--domain-allowlist=example.com,*.example.com",
|
||||
"--domain-denylist=static.example.com",
|
||||
"https://example.com",
|
||||
],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
@@ -258,12 +261,12 @@ def test_add_records_url_filter_overrides_on_crawl(tmp_path, process, disable_ex
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
allowlist, denylist = c.execute(
|
||||
"SELECT json_extract(config, '$.URL_ALLOWLIST'), json_extract(config, '$.URL_DENYLIST') FROM crawls_crawl LIMIT 1"
|
||||
"SELECT json_extract(config, '$.URL_ALLOWLIST'), json_extract(config, '$.URL_DENYLIST') FROM crawls_crawl LIMIT 1",
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
assert allowlist == 'example.com,*.example.com'
|
||||
assert denylist == 'static.example.com'
|
||||
assert allowlist == "example.com,*.example.com"
|
||||
assert denylist == "static.example.com"
|
||||
assert (tmp_path / "personas" / "Default" / "chrome_extensions").is_dir()
|
||||
|
||||
|
||||
@@ -277,14 +280,14 @@ def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_ex
|
||||
|
||||
# Add URL first time
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Add same URL second time
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -306,27 +309,27 @@ def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
# Add URL first time
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Add with overwrite
|
||||
result = subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--overwrite', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--overwrite", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'unrecognized arguments: --overwrite' not in result.stderr.decode('utf-8')
|
||||
assert "unrecognized arguments: --overwrite" not in result.stderr.decode("utf-8")
|
||||
|
||||
|
||||
def test_add_creates_snapshot_output_directory(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that add creates the current snapshot output directory on disk."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -346,14 +349,39 @@ def test_add_help_shows_depth_and_tag_options(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'add', '--help'],
|
||||
["archivebox", "add", "--help"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert '--depth' in result.stdout
|
||||
assert '--tag' in result.stdout
|
||||
assert "--depth" in result.stdout
|
||||
assert "--max-urls" in result.stdout
|
||||
assert "--max-size" in result.stdout
|
||||
assert "--tag" in result.stdout
|
||||
|
||||
|
||||
def test_add_records_max_url_and_size_limits_on_crawl(tmp_path, process, disable_extractors_dict):
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
["archivebox", "add", "--index-only", "--depth=1", "--max-urls=3", "--max-size=45mb", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
max_urls, max_size, config_max_urls, config_max_size = c.execute(
|
||||
"SELECT max_urls, max_size, json_extract(config, '$.MAX_URLS'), json_extract(config, '$.MAX_SIZE') FROM crawls_crawl LIMIT 1",
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
assert max_urls == 3
|
||||
assert max_size == 45 * 1024 * 1024
|
||||
assert config_max_urls == 3
|
||||
assert config_max_size == 45 * 1024 * 1024
|
||||
|
||||
|
||||
def test_add_without_args_shows_usage(tmp_path, process):
|
||||
@@ -361,21 +389,21 @@ def test_add_without_args_shows_usage(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'add'],
|
||||
["archivebox", "add"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
combined = result.stdout + result.stderr
|
||||
assert result.returncode != 0
|
||||
assert 'usage' in combined.lower() or 'url' in combined.lower()
|
||||
assert "usage" in combined.lower() or "url" in combined.lower()
|
||||
|
||||
|
||||
def test_add_index_only_skips_extraction(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that --index-only flag skips extraction (fast)."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=30, # Should be fast
|
||||
@@ -396,7 +424,7 @@ def test_add_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dict)
|
||||
"""Test that add links the snapshot to the crawl via crawl_id."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -419,7 +447,7 @@ def test_add_sets_snapshot_timestamp(tmp_path, process, disable_extractors_dict)
|
||||
"""Test that add sets a timestamp on the snapshot."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
@@ -17,10 +17,10 @@ from archivebox.tests.conftest import (
|
||||
)
|
||||
|
||||
PROJECTOR_TEST_ENV = {
|
||||
'PLUGINS': 'favicon',
|
||||
'SAVE_FAVICON': 'True',
|
||||
'USE_COLOR': 'False',
|
||||
'SHOW_PROGRESS': 'False',
|
||||
"PLUGINS": "favicon",
|
||||
"SAVE_FAVICON": "True",
|
||||
"USE_COLOR": "False",
|
||||
"SHOW_PROGRESS": "False",
|
||||
}
|
||||
|
||||
|
||||
@@ -32,12 +32,12 @@ class TestArchiveResultCreate:
|
||||
url = create_test_url()
|
||||
|
||||
# Create a snapshot first
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
# Pipe snapshot to archiveresult create
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
["archiveresult", "create", "--plugin=title"],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
@@ -46,49 +46,49 @@ class TestArchiveResultCreate:
|
||||
|
||||
records = parse_jsonl_output(stdout2)
|
||||
# Should have the Snapshot passed through and an ArchiveResult request emitted
|
||||
types = [r.get('type') for r in records]
|
||||
assert 'Snapshot' in types
|
||||
assert 'ArchiveResult' in types
|
||||
types = [r.get("type") for r in records]
|
||||
assert "Snapshot" in types
|
||||
assert "ArchiveResult" in types
|
||||
|
||||
ar = next(r for r in records if r['type'] == 'ArchiveResult')
|
||||
assert ar['plugin'] == 'title'
|
||||
assert 'id' not in ar
|
||||
ar = next(r for r in records if r["type"] == "ArchiveResult")
|
||||
assert ar["plugin"] == "title"
|
||||
assert "id" not in ar
|
||||
|
||||
def test_create_with_specific_plugin(self, initialized_archive):
|
||||
"""Create archive result for specific plugin."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=screenshot'],
|
||||
["archiveresult", "create", "--plugin=screenshot"],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout2)
|
||||
ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
|
||||
ar_records = [r for r in records if r.get("type") == "ArchiveResult"]
|
||||
assert len(ar_records) >= 1
|
||||
assert ar_records[0]['plugin'] == 'screenshot'
|
||||
assert ar_records[0]["plugin"] == "screenshot"
|
||||
|
||||
def test_create_pass_through_crawl(self, initialized_archive):
|
||||
"""Pass-through Crawl records unchanged."""
|
||||
url = create_test_url()
|
||||
|
||||
# Create crawl and snapshot
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['snapshot', 'create'],
|
||||
["snapshot", "create"],
|
||||
stdin=json.dumps(crawl),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
# Now pipe all to archiveresult create
|
||||
stdout3, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=title'],
|
||||
["archiveresult", "create", "--plugin=title"],
|
||||
stdin=stdout2,
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
@@ -96,23 +96,23 @@ class TestArchiveResultCreate:
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout3)
|
||||
|
||||
types = [r.get('type') for r in records]
|
||||
assert 'Crawl' in types
|
||||
assert 'Snapshot' in types
|
||||
assert 'ArchiveResult' in types
|
||||
types = [r.get("type") for r in records]
|
||||
assert "Crawl" in types
|
||||
assert "Snapshot" in types
|
||||
assert "ArchiveResult" in types
|
||||
|
||||
def test_create_pass_through_only_when_no_snapshots(self, initialized_archive):
|
||||
"""Only pass-through records but no new snapshots returns success."""
|
||||
crawl_record = {'type': 'Crawl', 'id': 'fake-id', 'urls': 'https://example.com'}
|
||||
crawl_record = {"type": "Crawl", "id": "fake-id", "urls": "https://example.com"}
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'create'],
|
||||
["archiveresult", "create"],
|
||||
stdin=json.dumps(crawl_record),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Passed through' in stderr
|
||||
assert "Passed through" in stderr
|
||||
|
||||
|
||||
class TestArchiveResultList:
|
||||
@@ -121,26 +121,26 @@ class TestArchiveResultList:
|
||||
def test_list_empty(self, initialized_archive):
|
||||
"""List with no archive results returns empty."""
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'list'],
|
||||
["archiveresult", "list"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Listed 0 archive results' in stderr
|
||||
assert "Listed 0 archive results" in stderr
|
||||
|
||||
def test_list_filter_by_status(self, initialized_archive):
|
||||
"""Filter archive results by status."""
|
||||
# Create snapshot and materialize an archive result via the runner
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=favicon'],
|
||||
["archiveresult", "create", "--plugin=favicon"],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=stdout2,
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
@@ -148,38 +148,38 @@ class TestArchiveResultList:
|
||||
)
|
||||
created = parse_jsonl_output(
|
||||
run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--plugin=favicon'],
|
||||
["archiveresult", "list", "--plugin=favicon"],
|
||||
data_dir=initialized_archive,
|
||||
)[0]
|
||||
)[0],
|
||||
)[0]
|
||||
run_archivebox_cmd(
|
||||
['archiveresult', 'update', '--status=queued'],
|
||||
["archiveresult", "update", "--status=queued"],
|
||||
stdin=json.dumps(created),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--status=queued'],
|
||||
["archiveresult", "list", "--status=queued"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
for r in records:
|
||||
assert r['status'] == 'queued'
|
||||
assert r["status"] == "queued"
|
||||
|
||||
def test_list_filter_by_plugin(self, initialized_archive):
|
||||
"""Filter archive results by plugin."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=favicon'],
|
||||
["archiveresult", "create", "--plugin=favicon"],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=stdout2,
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
@@ -187,29 +187,29 @@ class TestArchiveResultList:
|
||||
)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--plugin=favicon'],
|
||||
["archiveresult", "list", "--plugin=favicon"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
for r in records:
|
||||
assert r['plugin'] == 'favicon'
|
||||
assert r["plugin"] == "favicon"
|
||||
|
||||
def test_list_with_limit(self, initialized_archive):
|
||||
"""Limit number of results."""
|
||||
# Create multiple archive results
|
||||
for _ in range(3):
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=favicon'],
|
||||
["archiveresult", "create", "--plugin=favicon"],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=stdout2,
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
@@ -217,7 +217,7 @@ class TestArchiveResultList:
|
||||
)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--limit=2'],
|
||||
["archiveresult", "list", "--limit=2"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
@@ -232,38 +232,38 @@ class TestArchiveResultUpdate:
|
||||
def test_update_status(self, initialized_archive):
|
||||
"""Update archive result status."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=favicon'],
|
||||
["archiveresult", "create", "--plugin=favicon"],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
stdout_run, _, _ = run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=stdout2,
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
env=PROJECTOR_TEST_ENV,
|
||||
)
|
||||
stdout_list, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--plugin=favicon'],
|
||||
["archiveresult", "list", "--plugin=favicon"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
ar = parse_jsonl_output(stdout_list)[0]
|
||||
|
||||
stdout3, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'update', '--status=failed'],
|
||||
["archiveresult", "update", "--status=failed"],
|
||||
stdin=json.dumps(ar),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Updated 1 archive results' in stderr
|
||||
assert "Updated 1 archive results" in stderr
|
||||
|
||||
records = parse_jsonl_output(stdout3)
|
||||
assert records[0]['status'] == 'failed'
|
||||
assert records[0]["status"] == "failed"
|
||||
|
||||
|
||||
class TestArchiveResultDelete:
|
||||
@@ -272,65 +272,65 @@ class TestArchiveResultDelete:
|
||||
def test_delete_requires_yes(self, initialized_archive):
|
||||
"""Delete requires --yes flag."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=favicon'],
|
||||
["archiveresult", "create", "--plugin=favicon"],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
stdout_run, _, _ = run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=stdout2,
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
env=PROJECTOR_TEST_ENV,
|
||||
)
|
||||
stdout_list, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--plugin=favicon'],
|
||||
["archiveresult", "list", "--plugin=favicon"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
ar = parse_jsonl_output(stdout_list)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'delete'],
|
||||
["archiveresult", "delete"],
|
||||
stdin=json.dumps(ar),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 1
|
||||
assert '--yes' in stderr
|
||||
assert "--yes" in stderr
|
||||
|
||||
def test_delete_with_yes(self, initialized_archive):
|
||||
"""Delete with --yes flag works."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=favicon'],
|
||||
["archiveresult", "create", "--plugin=favicon"],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
stdout_run, _, _ = run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=stdout2,
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
env=PROJECTOR_TEST_ENV,
|
||||
)
|
||||
stdout_list, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'list', '--plugin=favicon'],
|
||||
["archiveresult", "list", "--plugin=favicon"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
ar = parse_jsonl_output(stdout_list)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['archiveresult', 'delete', '--yes'],
|
||||
["archiveresult", "delete", "--yes"],
|
||||
stdin=json.dumps(ar),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Deleted 1 archive results' in stderr
|
||||
assert "Deleted 1 archive results" in stderr
|
||||
|
||||
@@ -11,27 +11,27 @@ import subprocess
|
||||
def test_config_displays_all_config(tmp_path, process):
|
||||
"""Test that config without args displays all configuration."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'config'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "config"], capture_output=True, text=True)
|
||||
|
||||
assert result.returncode == 0
|
||||
output = result.stdout
|
||||
# Should show config sections
|
||||
assert len(output) > 100
|
||||
# Should show at least some standard config keys
|
||||
assert 'TIMEOUT' in output or 'OUTPUT_PERMISSIONS' in output
|
||||
assert "TIMEOUT" in output or "OUTPUT_PERMISSIONS" in output
|
||||
|
||||
|
||||
def test_config_get_specific_key(tmp_path, process):
|
||||
"""Test that config --get KEY retrieves specific value."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'config', '--get', 'TIMEOUT'],
|
||||
["archivebox", "config", "--get", "TIMEOUT"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'TIMEOUT' in result.stdout
|
||||
assert "TIMEOUT" in result.stdout
|
||||
|
||||
|
||||
def test_config_set_writes_to_file(tmp_path, process):
|
||||
@@ -39,7 +39,7 @@ def test_config_set_writes_to_file(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'config', '--set', 'TIMEOUT=120'],
|
||||
["archivebox", "config", "--set", "TIMEOUT=120"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
@@ -47,11 +47,11 @@ def test_config_set_writes_to_file(tmp_path, process):
|
||||
assert result.returncode == 0
|
||||
|
||||
# Verify config file was updated
|
||||
config_file = tmp_path / 'ArchiveBox.conf'
|
||||
config_file = tmp_path / "ArchiveBox.conf"
|
||||
assert config_file.exists()
|
||||
|
||||
content = config_file.read_text()
|
||||
assert 'TIMEOUT' in content or '120' in content
|
||||
assert "TIMEOUT" in content or "120" in content
|
||||
|
||||
|
||||
def test_config_set_and_get_roundtrip(tmp_path, process):
|
||||
@@ -60,19 +60,19 @@ def test_config_set_and_get_roundtrip(tmp_path, process):
|
||||
|
||||
# Set a unique value
|
||||
subprocess.run(
|
||||
['archivebox', 'config', '--set', 'TIMEOUT=987'],
|
||||
["archivebox", "config", "--set", "TIMEOUT=987"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
# Get the value back
|
||||
result = subprocess.run(
|
||||
['archivebox', 'config', '--get', 'TIMEOUT'],
|
||||
["archivebox", "config", "--get", "TIMEOUT"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert '987' in result.stdout
|
||||
assert "987" in result.stdout
|
||||
|
||||
|
||||
def test_config_set_multiple_values(tmp_path, process):
|
||||
@@ -80,7 +80,7 @@ def test_config_set_multiple_values(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'config', '--set', 'TIMEOUT=111', 'YTDLP_TIMEOUT=222'],
|
||||
["archivebox", "config", "--set", "TIMEOUT=111", "YTDLP_TIMEOUT=222"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
@@ -88,10 +88,10 @@ def test_config_set_multiple_values(tmp_path, process):
|
||||
assert result.returncode == 0
|
||||
|
||||
# Verify both were written
|
||||
config_file = tmp_path / 'ArchiveBox.conf'
|
||||
config_file = tmp_path / "ArchiveBox.conf"
|
||||
content = config_file.read_text()
|
||||
assert '111' in content
|
||||
assert '222' in content
|
||||
assert "111" in content
|
||||
assert "222" in content
|
||||
|
||||
|
||||
def test_config_set_invalid_key_fails(tmp_path, process):
|
||||
@@ -99,7 +99,7 @@ def test_config_set_invalid_key_fails(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'config', '--set', 'TOTALLY_INVALID_KEY_XYZ=value'],
|
||||
["archivebox", "config", "--set", "TOTALLY_INVALID_KEY_XYZ=value"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
@@ -112,7 +112,7 @@ def test_config_set_requires_equals_sign(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'config', '--set', 'TIMEOUT'],
|
||||
["archivebox", "config", "--set", "TIMEOUT"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
@@ -125,13 +125,13 @@ def test_config_search_finds_keys(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'config', '--search', 'TIMEOUT'],
|
||||
["archivebox", "config", "--search", "TIMEOUT"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
# Should find timeout-related config
|
||||
assert 'TIMEOUT' in result.stdout
|
||||
assert "TIMEOUT" in result.stdout
|
||||
|
||||
|
||||
def test_config_preserves_existing_values(tmp_path, process):
|
||||
@@ -140,21 +140,21 @@ def test_config_preserves_existing_values(tmp_path, process):
|
||||
|
||||
# Set first value
|
||||
subprocess.run(
|
||||
['archivebox', 'config', '--set', 'TIMEOUT=100'],
|
||||
["archivebox", "config", "--set", "TIMEOUT=100"],
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
# Set second value
|
||||
subprocess.run(
|
||||
['archivebox', 'config', '--set', 'YTDLP_TIMEOUT=200'],
|
||||
["archivebox", "config", "--set", "YTDLP_TIMEOUT=200"],
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
# Verify both are in config file
|
||||
config_file = tmp_path / 'ArchiveBox.conf'
|
||||
config_file = tmp_path / "ArchiveBox.conf"
|
||||
content = config_file.read_text()
|
||||
assert 'TIMEOUT' in content
|
||||
assert 'YTDLP_TIMEOUT' in content
|
||||
assert "TIMEOUT" in content
|
||||
assert "YTDLP_TIMEOUT" in content
|
||||
|
||||
|
||||
def test_config_file_is_valid_toml(tmp_path, process):
|
||||
@@ -162,15 +162,15 @@ def test_config_file_is_valid_toml(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'config', '--set', 'TIMEOUT=150'],
|
||||
["archivebox", "config", "--set", "TIMEOUT=150"],
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
config_file = tmp_path / 'ArchiveBox.conf'
|
||||
config_file = tmp_path / "ArchiveBox.conf"
|
||||
content = config_file.read_text()
|
||||
|
||||
# Basic TOML validation - should have sections and key=value pairs
|
||||
assert '[' in content or '=' in content
|
||||
assert "[" in content or "=" in content
|
||||
|
||||
|
||||
def test_config_updates_existing_value(tmp_path, process):
|
||||
@@ -179,22 +179,22 @@ def test_config_updates_existing_value(tmp_path, process):
|
||||
|
||||
# Set initial value
|
||||
subprocess.run(
|
||||
['archivebox', 'config', '--set', 'TIMEOUT=100'],
|
||||
["archivebox", "config", "--set", "TIMEOUT=100"],
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
# Update to new value
|
||||
subprocess.run(
|
||||
['archivebox', 'config', '--set', 'TIMEOUT=200'],
|
||||
["archivebox", "config", "--set", "TIMEOUT=200"],
|
||||
capture_output=True,
|
||||
)
|
||||
|
||||
# Get current value
|
||||
result = subprocess.run(
|
||||
['archivebox', 'config', '--get', 'TIMEOUT'],
|
||||
["archivebox", "config", "--get", "TIMEOUT"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
# Should show updated value
|
||||
assert '200' in result.stdout
|
||||
assert "200" in result.stdout
|
||||
|
||||
@@ -25,26 +25,26 @@ class TestCrawlCreate:
|
||||
url = create_test_url()
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'create', url],
|
||||
["crawl", "create", url],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0, f"Command failed: {stderr}"
|
||||
assert 'Created crawl' in stderr
|
||||
assert "Created crawl" in stderr
|
||||
|
||||
# Check JSONL output
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 1
|
||||
assert records[0]['type'] == 'Crawl'
|
||||
assert url in records[0]['urls']
|
||||
assert records[0]["type"] == "Crawl"
|
||||
assert url in records[0]["urls"]
|
||||
|
||||
def test_create_from_stdin_urls(self, initialized_archive):
|
||||
"""Create crawl from stdin URLs (one per line)."""
|
||||
urls = [create_test_url() for _ in range(3)]
|
||||
stdin = '\n'.join(urls)
|
||||
stdin = "\n".join(urls)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'create'],
|
||||
["crawl", "create"],
|
||||
stdin=stdin,
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
@@ -54,45 +54,45 @@ class TestCrawlCreate:
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 1
|
||||
crawl = records[0]
|
||||
assert crawl['type'] == 'Crawl'
|
||||
assert crawl["type"] == "Crawl"
|
||||
# All URLs should be in the crawl
|
||||
for url in urls:
|
||||
assert url in crawl['urls']
|
||||
assert url in crawl["urls"]
|
||||
|
||||
def test_create_with_depth(self, initialized_archive):
|
||||
"""Create crawl with --depth flag."""
|
||||
url = create_test_url()
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'create', '--depth=2', url],
|
||||
["crawl", "create", "--depth=2", url],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert records[0]['max_depth'] == 2
|
||||
assert records[0]["max_depth"] == 2
|
||||
|
||||
def test_create_with_tag(self, initialized_archive):
|
||||
"""Create crawl with --tag flag."""
|
||||
url = create_test_url()
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'create', '--tag=test-tag', url],
|
||||
["crawl", "create", "--tag=test-tag", url],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert 'test-tag' in records[0].get('tags', '')
|
||||
assert "test-tag" in records[0].get("tags_str", "")
|
||||
|
||||
def test_create_pass_through_other_types(self, initialized_archive):
|
||||
"""Pass-through records of other types unchanged."""
|
||||
tag_record = {'type': 'Tag', 'id': 'fake-tag-id', 'name': 'test'}
|
||||
tag_record = {"type": "Tag", "id": "fake-tag-id", "name": "test"}
|
||||
url = create_test_url()
|
||||
stdin = json.dumps(tag_record) + '\n' + json.dumps({'url': url})
|
||||
stdin = json.dumps(tag_record) + "\n" + json.dumps({"url": url})
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'create'],
|
||||
["crawl", "create"],
|
||||
stdin=stdin,
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
@@ -101,20 +101,20 @@ class TestCrawlCreate:
|
||||
records = parse_jsonl_output(stdout)
|
||||
|
||||
# Should have both the passed-through Tag and the new Crawl
|
||||
types = [r.get('type') for r in records]
|
||||
assert 'Tag' in types
|
||||
assert 'Crawl' in types
|
||||
types = [r.get("type") for r in records]
|
||||
assert "Tag" in types
|
||||
assert "Crawl" in types
|
||||
|
||||
def test_create_pass_through_existing_crawl(self, initialized_archive):
|
||||
"""Existing Crawl records (with id) are passed through."""
|
||||
# First create a crawl
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
# Now pipe it back - should pass through
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'create'],
|
||||
["crawl", "create"],
|
||||
stdin=json.dumps(crawl),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
@@ -122,7 +122,7 @@ class TestCrawlCreate:
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout2)
|
||||
assert len(records) == 1
|
||||
assert records[0]['id'] == crawl['id']
|
||||
assert records[0]["id"] == crawl["id"]
|
||||
|
||||
|
||||
class TestCrawlList:
|
||||
@@ -131,51 +131,51 @@ class TestCrawlList:
|
||||
def test_list_empty(self, initialized_archive):
|
||||
"""List with no crawls returns empty."""
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'list'],
|
||||
["crawl", "list"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Listed 0 crawls' in stderr
|
||||
assert "Listed 0 crawls" in stderr
|
||||
|
||||
def test_list_returns_created(self, initialized_archive):
|
||||
"""List returns previously created crawls."""
|
||||
url = create_test_url()
|
||||
run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
|
||||
run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'list'],
|
||||
["crawl", "list"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) >= 1
|
||||
assert any(url in r.get('urls', '') for r in records)
|
||||
assert any(url in r.get("urls", "") for r in records)
|
||||
|
||||
def test_list_filter_by_status(self, initialized_archive):
|
||||
"""Filter crawls by status."""
|
||||
url = create_test_url()
|
||||
run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
|
||||
run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'list', '--status=queued'],
|
||||
["crawl", "list", "--status=queued"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
for r in records:
|
||||
assert r['status'] == 'queued'
|
||||
assert r["status"] == "queued"
|
||||
|
||||
def test_list_with_limit(self, initialized_archive):
|
||||
"""Limit number of results."""
|
||||
# Create multiple crawls
|
||||
for _ in range(3):
|
||||
run_archivebox_cmd(['crawl', 'create', create_test_url()], data_dir=initialized_archive)
|
||||
run_archivebox_cmd(["crawl", "create", create_test_url()], data_dir=initialized_archive)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'list', '--limit=2'],
|
||||
["crawl", "list", "--limit=2"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
@@ -191,21 +191,21 @@ class TestCrawlUpdate:
|
||||
"""Update crawl status."""
|
||||
# Create a crawl
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
# Update it
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'update', '--status=started'],
|
||||
["crawl", "update", "--status=started"],
|
||||
stdin=json.dumps(crawl),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Updated 1 crawls' in stderr
|
||||
assert "Updated 1 crawls" in stderr
|
||||
|
||||
records = parse_jsonl_output(stdout2)
|
||||
assert records[0]['status'] == 'started'
|
||||
assert records[0]["status"] == "started"
|
||||
|
||||
|
||||
class TestCrawlDelete:
|
||||
@@ -214,45 +214,45 @@ class TestCrawlDelete:
|
||||
def test_delete_requires_yes(self, initialized_archive):
|
||||
"""Delete requires --yes flag."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'delete'],
|
||||
["crawl", "delete"],
|
||||
stdin=json.dumps(crawl),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 1
|
||||
assert '--yes' in stderr
|
||||
assert "--yes" in stderr
|
||||
|
||||
def test_delete_with_yes(self, initialized_archive):
|
||||
"""Delete with --yes flag works."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'delete', '--yes'],
|
||||
["crawl", "delete", "--yes"],
|
||||
stdin=json.dumps(crawl),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Deleted 1 crawls' in stderr
|
||||
assert "Deleted 1 crawls" in stderr
|
||||
|
||||
def test_delete_dry_run(self, initialized_archive):
|
||||
"""Dry run shows what would be deleted."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['crawl', 'delete', '--dry-run'],
|
||||
["crawl", "delete", "--dry-run"],
|
||||
stdin=json.dumps(crawl),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Would delete' in stderr
|
||||
assert 'dry run' in stderr.lower()
|
||||
assert "Would delete" in stderr
|
||||
assert "dry run" in stderr.lower()
|
||||
|
||||
@@ -15,14 +15,14 @@ def test_extract_runs_on_existing_snapshots(tmp_path, process, disable_extractor
|
||||
|
||||
# Add a snapshot first
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Run extract
|
||||
result = subprocess.run(
|
||||
['archivebox', 'extract'],
|
||||
["archivebox", "extract"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=30,
|
||||
@@ -38,7 +38,7 @@ def test_extract_preserves_snapshot_count(tmp_path, process, disable_extractors_
|
||||
|
||||
# Add snapshot
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -50,7 +50,7 @@ def test_extract_preserves_snapshot_count(tmp_path, process, disable_extractors_
|
||||
|
||||
# Run extract
|
||||
subprocess.run(
|
||||
['archivebox', 'extract', '--overwrite'],
|
||||
["archivebox", "extract", "--overwrite"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=30,
|
||||
|
||||
@@ -6,34 +6,33 @@ import sqlite3
|
||||
import json
|
||||
|
||||
|
||||
|
||||
def test_extract_runs_on_snapshot_id(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that extract command accepts a snapshot ID."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
# First create a snapshot
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Get the snapshot ID
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
snapshot_id = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
# Run extract on the snapshot
|
||||
result = subprocess.run(
|
||||
['archivebox', 'extract', '--no-wait', str(snapshot_id)],
|
||||
["archivebox", "extract", "--no-wait", str(snapshot_id)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Should not error about invalid snapshot ID
|
||||
assert 'not found' not in result.stderr.lower()
|
||||
assert "not found" not in result.stderr.lower()
|
||||
|
||||
|
||||
def test_extract_with_enabled_extractor_creates_archiveresult(tmp_path, process, disable_extractors_dict):
|
||||
@@ -42,33 +41,35 @@ def test_extract_with_enabled_extractor_creates_archiveresult(tmp_path, process,
|
||||
|
||||
# First create a snapshot
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Get the snapshot ID
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
snapshot_id = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
# Run extract with title extractor enabled
|
||||
env = disable_extractors_dict.copy()
|
||||
env['SAVE_TITLE'] = 'true'
|
||||
env["SAVE_TITLE"] = "true"
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'extract', '--no-wait', str(snapshot_id)],
|
||||
["archivebox", "extract", "--no-wait", str(snapshot_id)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
)
|
||||
|
||||
# Check for archiveresults (may be queued, not completed with --no-wait)
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
count = c.execute("SELECT COUNT(*) FROM core_archiveresult WHERE snapshot_id = ?",
|
||||
(snapshot_id,)).fetchone()[0]
|
||||
count = c.execute(
|
||||
"SELECT COUNT(*) FROM core_archiveresult WHERE snapshot_id = ?",
|
||||
(snapshot_id,),
|
||||
).fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
# May or may not have results depending on timing
|
||||
@@ -81,25 +82,25 @@ def test_extract_plugin_option_accepted(tmp_path, process, disable_extractors_di
|
||||
|
||||
# First create a snapshot
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Get the snapshot ID
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
snapshot_id = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'extract', '--plugin=title', '--no-wait', str(snapshot_id)],
|
||||
["archivebox", "extract", "--plugin=title", "--no-wait", str(snapshot_id)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
assert 'unrecognized arguments: --plugin' not in result.stderr
|
||||
assert "unrecognized arguments: --plugin" not in result.stderr
|
||||
|
||||
|
||||
def test_extract_stdin_snapshot_id(tmp_path, process, disable_extractors_dict):
|
||||
@@ -108,27 +109,27 @@ def test_extract_stdin_snapshot_id(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
# First create a snapshot
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Get the snapshot ID
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
snapshot_id = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'extract', '--no-wait'],
|
||||
input=f'{snapshot_id}\n',
|
||||
["archivebox", "extract", "--no-wait"],
|
||||
input=f"{snapshot_id}\n",
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Should not show "not found" error
|
||||
assert 'not found' not in result.stderr.lower() or result.returncode == 0
|
||||
assert "not found" not in result.stderr.lower() or result.returncode == 0
|
||||
|
||||
|
||||
def test_extract_stdin_jsonl_input(tmp_path, process, disable_extractors_dict):
|
||||
@@ -137,21 +138,21 @@ def test_extract_stdin_jsonl_input(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
# First create a snapshot
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Get the snapshot ID
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
snapshot_id = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
jsonl_input = json.dumps({"type": "Snapshot", "id": str(snapshot_id)}) + '\n'
|
||||
jsonl_input = json.dumps({"type": "Snapshot", "id": str(snapshot_id)}) + "\n"
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'extract', '--no-wait'],
|
||||
["archivebox", "extract", "--no-wait"],
|
||||
input=jsonl_input,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -159,7 +160,7 @@ def test_extract_stdin_jsonl_input(tmp_path, process, disable_extractors_dict):
|
||||
)
|
||||
|
||||
# Should not show "not found" error
|
||||
assert 'not found' not in result.stderr.lower() or result.returncode == 0
|
||||
assert "not found" not in result.stderr.lower() or result.returncode == 0
|
||||
|
||||
|
||||
def test_extract_pipeline_from_snapshot(tmp_path, process, disable_extractors_dict):
|
||||
@@ -168,14 +169,14 @@ def test_extract_pipeline_from_snapshot(tmp_path, process, disable_extractors_di
|
||||
|
||||
# Create snapshot and pipe to extract
|
||||
snapshot_proc = subprocess.Popen(
|
||||
['archivebox', 'snapshot', 'https://example.com'],
|
||||
["archivebox", "snapshot", "https://example.com"],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'extract', '--no-wait'],
|
||||
["archivebox", "extract", "--no-wait"],
|
||||
stdin=snapshot_proc.stdout,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -185,10 +186,12 @@ def test_extract_pipeline_from_snapshot(tmp_path, process, disable_extractors_di
|
||||
snapshot_proc.wait()
|
||||
|
||||
# Check database for snapshot
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
snapshot = c.execute("SELECT id, url FROM core_snapshot WHERE url = ?",
|
||||
('https://example.com',)).fetchone()
|
||||
snapshot = c.execute(
|
||||
"SELECT id, url FROM core_snapshot WHERE url = ?",
|
||||
("https://example.com",),
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
assert snapshot is not None, "Snapshot should be created by pipeline"
|
||||
@@ -200,18 +203,18 @@ def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
# Create multiple snapshots one at a time to avoid deduplication issues
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', 'https://iana.org'],
|
||||
["archivebox", "add", "--index-only", "https://iana.org"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Get all snapshot IDs
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
snapshot_ids = c.execute("SELECT id FROM core_snapshot").fetchall()
|
||||
conn.close()
|
||||
@@ -219,9 +222,9 @@ def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
assert len(snapshot_ids) >= 2, "Should have at least 2 snapshots"
|
||||
|
||||
# Extract from all snapshots
|
||||
ids_input = '\n'.join(str(s[0]) for s in snapshot_ids) + '\n'
|
||||
ids_input = "\n".join(str(s[0]) for s in snapshot_ids) + "\n"
|
||||
result = subprocess.run(
|
||||
['archivebox', 'extract', '--no-wait'],
|
||||
["archivebox", "extract", "--no-wait"],
|
||||
input=ids_input,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -230,7 +233,7 @@ def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
assert result.returncode == 0, result.stderr
|
||||
|
||||
# Should not error
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
|
||||
conn.close()
|
||||
@@ -246,25 +249,25 @@ class TestExtractCLI:
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'extract', '--help'],
|
||||
["archivebox", "extract", "--help"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert '--plugin' in result.stdout or '-p' in result.stdout
|
||||
assert '--wait' in result.stdout or '--no-wait' in result.stdout
|
||||
assert "--plugin" in result.stdout or "-p" in result.stdout
|
||||
assert "--wait" in result.stdout or "--no-wait" in result.stdout
|
||||
|
||||
def test_cli_no_snapshots_shows_warning(self, tmp_path, process):
|
||||
"""Test that running without snapshots shows a warning."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'extract', '--no-wait'],
|
||||
input='',
|
||||
["archivebox", "extract", "--no-wait"],
|
||||
input="",
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
# Should show warning about no snapshots or exit normally (empty input)
|
||||
assert result.returncode == 0 or 'No' in result.stderr
|
||||
assert result.returncode == 0 or "No" in result.stderr
|
||||
|
||||
@@ -11,20 +11,20 @@ import subprocess
|
||||
def test_help_runs_successfully(tmp_path):
|
||||
"""Test that help command runs and produces output."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'help'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "help"], capture_output=True, text=True)
|
||||
|
||||
assert result.returncode == 0
|
||||
combined = result.stdout + result.stderr
|
||||
assert len(combined) > 100
|
||||
assert 'archivebox' in combined.lower()
|
||||
assert "archivebox" in combined.lower()
|
||||
|
||||
|
||||
def test_help_in_initialized_dir(tmp_path, process):
|
||||
"""Test help command in initialized data directory."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'help'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "help"], capture_output=True, text=True)
|
||||
|
||||
assert result.returncode == 0
|
||||
combined = result.stdout + result.stderr
|
||||
assert 'init' in combined
|
||||
assert 'add' in combined
|
||||
assert "init" in combined
|
||||
assert "add" in combined
|
||||
|
||||
@@ -11,13 +11,13 @@ import subprocess
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
|
||||
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
|
||||
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace("6", "7").replace("4", "5")
|
||||
|
||||
|
||||
def test_init_creates_database_file(tmp_path):
|
||||
"""Test that init creates index.sqlite3 database file."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
result = subprocess.run(["archivebox", "init"], capture_output=True)
|
||||
|
||||
assert result.returncode == 0
|
||||
db_path = tmp_path / "index.sqlite3"
|
||||
@@ -28,7 +28,7 @@ def test_init_creates_database_file(tmp_path):
|
||||
def test_init_creates_archive_directory(tmp_path):
|
||||
"""Test that init creates archive directory."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
subprocess.run(["archivebox", "init"], capture_output=True)
|
||||
|
||||
archive_dir = tmp_path / "archive"
|
||||
assert archive_dir.exists()
|
||||
@@ -38,7 +38,7 @@ def test_init_creates_archive_directory(tmp_path):
|
||||
def test_init_creates_sources_directory(tmp_path):
|
||||
"""Test that init creates sources directory."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
subprocess.run(["archivebox", "init"], capture_output=True)
|
||||
|
||||
sources_dir = tmp_path / "sources"
|
||||
assert sources_dir.exists()
|
||||
@@ -48,7 +48,7 @@ def test_init_creates_sources_directory(tmp_path):
|
||||
def test_init_creates_logs_directory(tmp_path):
|
||||
"""Test that init creates logs directory."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
subprocess.run(["archivebox", "init"], capture_output=True)
|
||||
|
||||
logs_dir = tmp_path / "logs"
|
||||
assert logs_dir.exists()
|
||||
@@ -58,7 +58,7 @@ def test_init_creates_logs_directory(tmp_path):
|
||||
def test_init_creates_config_file(tmp_path):
|
||||
"""Test that init creates ArchiveBox.conf config file."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
subprocess.run(["archivebox", "init"], capture_output=True)
|
||||
|
||||
config_file = tmp_path / "ArchiveBox.conf"
|
||||
assert config_file.exists()
|
||||
@@ -68,7 +68,7 @@ def test_init_creates_config_file(tmp_path):
|
||||
def test_init_runs_migrations(tmp_path):
|
||||
"""Test that init runs Django migrations and creates core tables."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
subprocess.run(["archivebox", "init"], capture_output=True)
|
||||
|
||||
# Check that migrations were applied
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
@@ -76,7 +76,7 @@ def test_init_runs_migrations(tmp_path):
|
||||
|
||||
# Check django_migrations table exists
|
||||
migrations = c.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='django_migrations'"
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='django_migrations'",
|
||||
).fetchall()
|
||||
assert len(migrations) == 1
|
||||
|
||||
@@ -90,14 +90,14 @@ def test_init_runs_migrations(tmp_path):
|
||||
def test_init_creates_core_snapshot_table(tmp_path):
|
||||
"""Test that init creates core_snapshot table."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
subprocess.run(["archivebox", "init"], capture_output=True)
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
# Check core_snapshot table exists
|
||||
tables = c.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot'"
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot'",
|
||||
).fetchall()
|
||||
assert len(tables) == 1
|
||||
|
||||
@@ -107,14 +107,14 @@ def test_init_creates_core_snapshot_table(tmp_path):
|
||||
def test_init_creates_crawls_crawl_table(tmp_path):
|
||||
"""Test that init creates crawls_crawl table."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
subprocess.run(["archivebox", "init"], capture_output=True)
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
# Check crawls_crawl table exists
|
||||
tables = c.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_crawl'"
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_crawl'",
|
||||
).fetchall()
|
||||
assert len(tables) == 1
|
||||
|
||||
@@ -124,14 +124,14 @@ def test_init_creates_crawls_crawl_table(tmp_path):
|
||||
def test_init_creates_core_archiveresult_table(tmp_path):
|
||||
"""Test that init creates core_archiveresult table."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
subprocess.run(["archivebox", "init"], capture_output=True)
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
# Check core_archiveresult table exists
|
||||
tables = c.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'"
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'",
|
||||
).fetchall()
|
||||
assert len(tables) == 1
|
||||
|
||||
@@ -141,7 +141,7 @@ def test_init_creates_core_archiveresult_table(tmp_path):
|
||||
def test_init_sets_correct_file_permissions(tmp_path):
|
||||
"""Test that init sets correct permissions on created files."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
subprocess.run(["archivebox", "init"], capture_output=True)
|
||||
|
||||
# Check database permissions
|
||||
db_path = tmp_path / "index.sqlite3"
|
||||
@@ -157,12 +157,12 @@ def test_init_is_idempotent(tmp_path):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
# First init
|
||||
result1 = subprocess.run(['archivebox', 'init'], capture_output=True, text=True)
|
||||
result1 = subprocess.run(["archivebox", "init"], capture_output=True, text=True)
|
||||
assert result1.returncode == 0
|
||||
assert "Initializing a new ArchiveBox" in result1.stdout
|
||||
|
||||
# Second init should update, not fail
|
||||
result2 = subprocess.run(['archivebox', 'init'], capture_output=True, text=True)
|
||||
result2 = subprocess.run(["archivebox", "init"], capture_output=True, text=True)
|
||||
assert result2.returncode == 0
|
||||
assert "updating existing ArchiveBox" in result2.stdout or "up-to-date" in result2.stdout.lower()
|
||||
|
||||
@@ -180,7 +180,7 @@ def test_init_with_existing_data_preserves_snapshots(tmp_path, process, disable_
|
||||
|
||||
# Add a snapshot
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -193,7 +193,7 @@ def test_init_with_existing_data_preserves_snapshots(tmp_path, process, disable_
|
||||
conn.close()
|
||||
|
||||
# Run init again
|
||||
result = subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
result = subprocess.run(["archivebox", "init"], capture_output=True)
|
||||
assert result.returncode == 0
|
||||
|
||||
# Snapshot should still exist
|
||||
@@ -208,7 +208,7 @@ def test_init_quick_flag_skips_checks(tmp_path):
|
||||
"""Test that init --quick runs faster by skipping some checks."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(['archivebox', 'init', '--quick'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "init", "--quick"], capture_output=True, text=True)
|
||||
|
||||
assert result.returncode == 0
|
||||
# Database should still be created
|
||||
@@ -219,14 +219,14 @@ def test_init_quick_flag_skips_checks(tmp_path):
|
||||
def test_init_creates_machine_table(tmp_path):
|
||||
"""Test that init creates the machine_machine table."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
subprocess.run(["archivebox", "init"], capture_output=True)
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
# Check machine_machine table exists
|
||||
tables = c.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='machine_machine'"
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='machine_machine'",
|
||||
).fetchall()
|
||||
conn.close()
|
||||
|
||||
@@ -236,18 +236,18 @@ def test_init_creates_machine_table(tmp_path):
|
||||
def test_init_output_shows_collection_info(tmp_path):
|
||||
"""Test that init output shows helpful collection information."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'init'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "init"], capture_output=True, text=True)
|
||||
|
||||
output = result.stdout
|
||||
# Should show some helpful info about the collection
|
||||
assert 'ArchiveBox' in output or 'collection' in output.lower() or 'Initializing' in output
|
||||
assert "ArchiveBox" in output or "collection" in output.lower() or "Initializing" in output
|
||||
|
||||
|
||||
def test_init_ignores_unrecognized_archive_directories(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that init upgrades existing dirs without choking on extra folders."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
@@ -255,7 +255,7 @@ def test_init_ignores_unrecognized_archive_directories(tmp_path, process, disabl
|
||||
(tmp_path / "archive" / "some_random_folder").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'init'],
|
||||
["archivebox", "init"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
|
||||
@@ -14,7 +14,7 @@ def test_install_runs_successfully(tmp_path, process):
|
||||
"""Test that install command runs without error."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'install', '--dry-run'],
|
||||
["archivebox", "install", "--dry-run"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
@@ -29,7 +29,7 @@ def test_install_creates_binary_records_in_db(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'install', '--dry-run'],
|
||||
["archivebox", "install", "--dry-run"],
|
||||
capture_output=True,
|
||||
timeout=60,
|
||||
)
|
||||
@@ -40,7 +40,7 @@ def test_install_creates_binary_records_in_db(tmp_path, process):
|
||||
|
||||
# Check machine_binary table exists
|
||||
tables = c.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='machine_binary'"
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='machine_binary'",
|
||||
).fetchall()
|
||||
conn.close()
|
||||
|
||||
@@ -52,14 +52,14 @@ def test_install_dry_run_does_not_install(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'install', '--dry-run'],
|
||||
["archivebox", "install", "--dry-run"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
)
|
||||
|
||||
# Should complete without actually installing
|
||||
assert 'dry' in result.stdout.lower() or result.returncode in [0, 1]
|
||||
assert "dry" in result.stdout.lower() or result.returncode in [0, 1]
|
||||
|
||||
|
||||
def test_install_detects_system_binaries(tmp_path, process):
|
||||
@@ -67,7 +67,7 @@ def test_install_detects_system_binaries(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'install', '--dry-run'],
|
||||
["archivebox", "install", "--dry-run"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
@@ -82,7 +82,7 @@ def test_install_shows_binary_status(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'install', '--dry-run'],
|
||||
["archivebox", "install", "--dry-run"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
@@ -97,34 +97,34 @@ def test_install_dry_run_prints_dry_run_message(tmp_path, process):
|
||||
"""Test that install --dry-run clearly reports that no changes will be made."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'install', '--dry-run'],
|
||||
["archivebox", "install", "--dry-run"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'dry run' in result.stdout.lower()
|
||||
assert "dry run" in result.stdout.lower()
|
||||
|
||||
|
||||
def test_install_help_lists_dry_run_flag(tmp_path):
|
||||
"""Test that install --help documents the dry-run option."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'install', '--help'],
|
||||
["archivebox", "install", "--help"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert '--dry-run' in result.stdout or '-d' in result.stdout
|
||||
assert "--dry-run" in result.stdout or "-d" in result.stdout
|
||||
|
||||
|
||||
def test_install_invalid_option_fails(tmp_path):
|
||||
"""Test that invalid install options fail cleanly."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'install', '--invalid-option'],
|
||||
["archivebox", "install", "--invalid-option"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
@@ -136,29 +136,31 @@ def test_install_from_empty_dir_initializes_collection(tmp_path):
|
||||
"""Test that install bootstraps an empty dir before performing work."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'install', '--dry-run'],
|
||||
["archivebox", "install", "--dry-run"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
output = result.stdout + result.stderr
|
||||
assert result.returncode == 0
|
||||
assert 'Initializing' in output or 'Dry run' in output or 'init' in output.lower()
|
||||
assert "Initializing" in output or "Dry run" in output or "init" in output.lower()
|
||||
|
||||
|
||||
def test_install_updates_binary_table(tmp_path, process):
|
||||
"""Test that install completes and only mutates dependency state."""
|
||||
os.chdir(tmp_path)
|
||||
env = os.environ.copy()
|
||||
tmp_short = Path('/tmp') / f'abx-install-{tmp_path.name}'
|
||||
tmp_short = Path("/tmp") / f"abx-install-{tmp_path.name}"
|
||||
tmp_short.mkdir(parents=True, exist_ok=True)
|
||||
env.update({
|
||||
'TMP_DIR': str(tmp_short),
|
||||
'ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS': 'true',
|
||||
})
|
||||
env.update(
|
||||
{
|
||||
"TMP_DIR": str(tmp_short),
|
||||
"ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS": "true",
|
||||
},
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'install'],
|
||||
["archivebox", "install"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=420,
|
||||
@@ -171,16 +173,18 @@ def test_install_updates_binary_table(tmp_path, process):
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
binary_counts = dict(c.execute(
|
||||
"SELECT status, COUNT(*) FROM machine_binary GROUP BY status"
|
||||
).fetchall())
|
||||
binary_counts = dict(
|
||||
c.execute(
|
||||
"SELECT status, COUNT(*) FROM machine_binary GROUP BY status",
|
||||
).fetchall(),
|
||||
)
|
||||
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
|
||||
sealed_crawls = c.execute(
|
||||
"SELECT COUNT(*) FROM crawls_crawl WHERE status='sealed'"
|
||||
"SELECT COUNT(*) FROM crawls_crawl WHERE status='sealed'",
|
||||
).fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
assert sealed_crawls >= 1
|
||||
assert snapshot_count == 0
|
||||
assert binary_counts.get('queued', 0) == 0
|
||||
assert binary_counts.get('installed', 0) > 0
|
||||
assert binary_counts.get("queued", 0) == 0
|
||||
assert binary_counts.get("installed", 0) > 0
|
||||
|
||||
@@ -11,52 +11,48 @@ import subprocess
|
||||
|
||||
|
||||
def _parse_jsonl(stdout: str) -> list[dict]:
|
||||
return [
|
||||
json.loads(line)
|
||||
for line in stdout.splitlines()
|
||||
if line.strip().startswith('{')
|
||||
]
|
||||
return [json.loads(line) for line in stdout.splitlines() if line.strip().startswith("{")]
|
||||
|
||||
|
||||
def test_list_outputs_existing_snapshots_as_jsonl(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that list prints one JSON object per stored snapshot."""
|
||||
os.chdir(tmp_path)
|
||||
for url in ['https://example.com', 'https://iana.org']:
|
||||
for url in ["https://example.com", "https://iana.org"]:
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', url],
|
||||
["archivebox", "add", "--index-only", "--depth=0", url],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'list'],
|
||||
["archivebox", "list"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
rows = _parse_jsonl(result.stdout)
|
||||
urls = {row['url'] for row in rows}
|
||||
urls = {row["url"] for row in rows}
|
||||
|
||||
assert result.returncode == 0, result.stderr
|
||||
assert 'https://example.com' in urls
|
||||
assert 'https://iana.org' in urls
|
||||
assert "https://example.com" in urls
|
||||
assert "https://iana.org" in urls
|
||||
|
||||
|
||||
def test_list_filters_by_url_icontains(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that list --url__icontains returns only matching snapshots."""
|
||||
os.chdir(tmp_path)
|
||||
for url in ['https://example.com', 'https://iana.org']:
|
||||
for url in ["https://example.com", "https://iana.org"]:
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', url],
|
||||
["archivebox", "add", "--index-only", "--depth=0", url],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'list', '--url__icontains', 'example.com'],
|
||||
["archivebox", "list", "--url__icontains", "example.com"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
@@ -65,15 +61,15 @@ def test_list_filters_by_url_icontains(tmp_path, process, disable_extractors_dic
|
||||
rows = _parse_jsonl(result.stdout)
|
||||
assert result.returncode == 0, result.stderr
|
||||
assert len(rows) == 1
|
||||
assert rows[0]['url'] == 'https://example.com'
|
||||
assert rows[0]["url"] == "https://example.com"
|
||||
|
||||
|
||||
def test_list_filters_by_crawl_id_and_limit(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that crawl-id and limit filters constrain the result set."""
|
||||
os.chdir(tmp_path)
|
||||
for url in ['https://example.com', 'https://iana.org']:
|
||||
for url in ["https://example.com", "https://iana.org"]:
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', url],
|
||||
["archivebox", "add", "--index-only", "--depth=0", url],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
@@ -81,14 +77,16 @@ def test_list_filters_by_crawl_id_and_limit(tmp_path, process, disable_extractor
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
crawl_id = str(c.execute(
|
||||
"SELECT crawl_id FROM core_snapshot WHERE url = ?",
|
||||
('https://example.com',),
|
||||
).fetchone()[0])
|
||||
crawl_id = str(
|
||||
c.execute(
|
||||
"SELECT crawl_id FROM core_snapshot WHERE url = ?",
|
||||
("https://example.com",),
|
||||
).fetchone()[0],
|
||||
)
|
||||
conn.close()
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'list', '--crawl-id', crawl_id, '--limit', '1'],
|
||||
["archivebox", "list", "--crawl-id", crawl_id, "--limit", "1"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
@@ -97,15 +95,15 @@ def test_list_filters_by_crawl_id_and_limit(tmp_path, process, disable_extractor
|
||||
rows = _parse_jsonl(result.stdout)
|
||||
assert result.returncode == 0, result.stderr
|
||||
assert len(rows) == 1
|
||||
assert rows[0]['crawl_id'].replace('-', '') == crawl_id.replace('-', '')
|
||||
assert rows[0]['url'] == 'https://example.com'
|
||||
assert rows[0]["crawl_id"].replace("-", "") == crawl_id.replace("-", "")
|
||||
assert rows[0]["url"] == "https://example.com"
|
||||
|
||||
|
||||
def test_list_filters_by_status(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that list can filter using the current snapshot status."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
@@ -117,7 +115,7 @@ def test_list_filters_by_status(tmp_path, process, disable_extractors_dict):
|
||||
conn.close()
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'list', '--status', status],
|
||||
["archivebox", "list", "--status", status],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
@@ -126,7 +124,7 @@ def test_list_filters_by_status(tmp_path, process, disable_extractors_dict):
|
||||
rows = _parse_jsonl(result.stdout)
|
||||
assert result.returncode == 0, result.stderr
|
||||
assert len(rows) == 1
|
||||
assert rows[0]['status'] == status
|
||||
assert rows[0]["status"] == status
|
||||
|
||||
|
||||
def test_list_help_lists_filter_options(tmp_path, process):
|
||||
@@ -134,13 +132,60 @@ def test_list_help_lists_filter_options(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'list', '--help'],
|
||||
["archivebox", "list", "--help"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert '--url__icontains' in result.stdout
|
||||
assert '--crawl-id' in result.stdout
|
||||
assert '--limit' in result.stdout
|
||||
assert "--url__icontains" in result.stdout
|
||||
assert "--crawl-id" in result.stdout
|
||||
assert "--limit" in result.stdout
|
||||
assert "--search" in result.stdout
|
||||
|
||||
|
||||
def test_list_allows_sort_with_limit(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that list can sort and then apply limit without queryset slicing errors."""
|
||||
os.chdir(tmp_path)
|
||||
for url in ["https://example.com", "https://iana.org", "https://example.net"]:
|
||||
subprocess.run(
|
||||
["archivebox", "add", "--index-only", "--depth=0", url],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
["archivebox", "list", "--limit", "2", "--sort", "-created_at"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
rows = _parse_jsonl(result.stdout)
|
||||
assert result.returncode == 0, result.stderr
|
||||
assert len(rows) == 2
|
||||
|
||||
|
||||
def test_list_search_meta_matches_metadata(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that list --search=meta applies metadata search to the queryset."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
["archivebox", "list", "--search=meta", "example.com"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
rows = _parse_jsonl(result.stdout)
|
||||
assert result.returncode == 0, result.stderr
|
||||
assert len(rows) == 1
|
||||
assert rows[0]["url"] == "https://example.com"
|
||||
|
||||
@@ -13,7 +13,7 @@ def test_manage_help_works(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'manage', 'help'],
|
||||
["archivebox", "manage", "help"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
@@ -28,7 +28,7 @@ def test_manage_showmigrations_works(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'manage', 'showmigrations'],
|
||||
["archivebox", "manage", "showmigrations"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
@@ -36,7 +36,7 @@ def test_manage_showmigrations_works(tmp_path, process):
|
||||
|
||||
assert result.returncode == 0
|
||||
# Should show migration status
|
||||
assert 'core' in result.stdout or '[' in result.stdout
|
||||
assert "core" in result.stdout or "[" in result.stdout
|
||||
|
||||
|
||||
def test_manage_dbshell_command_exists(tmp_path, process):
|
||||
@@ -44,7 +44,7 @@ def test_manage_dbshell_command_exists(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'manage', 'help', 'dbshell'],
|
||||
["archivebox", "manage", "help", "dbshell"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
@@ -52,7 +52,7 @@ def test_manage_dbshell_command_exists(tmp_path, process):
|
||||
|
||||
# Should show help for dbshell
|
||||
assert result.returncode == 0
|
||||
assert 'dbshell' in result.stdout or 'database' in result.stdout.lower()
|
||||
assert "dbshell" in result.stdout or "database" in result.stdout.lower()
|
||||
|
||||
|
||||
def test_manage_check_works(tmp_path, process):
|
||||
@@ -60,7 +60,7 @@ def test_manage_check_works(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'manage', 'check'],
|
||||
["archivebox", "manage", "check"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
|
||||
@@ -111,14 +111,14 @@ def test_read_args_or_stdin_handles_args_stdin_and_mixed_jsonl():
|
||||
read_args_or_stdin(
|
||||
(),
|
||||
stream=MockTTYStringIO(
|
||||
'https://plain-url.com\n'
|
||||
"https://plain-url.com\n"
|
||||
'{"type":"Snapshot","url":"https://jsonl-url.com","tags":"test"}\n'
|
||||
'{"type":"Tag","id":"tag-1","name":"example"}\n'
|
||||
'01234567-89ab-cdef-0123-456789abcdef\n'
|
||||
'not valid json\n',
|
||||
"01234567-89ab-cdef-0123-456789abcdef\n"
|
||||
"not valid json\n",
|
||||
is_tty=False,
|
||||
),
|
||||
)
|
||||
),
|
||||
)
|
||||
assert len(stdin_records) == 4
|
||||
assert stdin_records[0]["url"] == "https://plain-url.com"
|
||||
@@ -135,7 +135,7 @@ def test_read_args_or_stdin_handles_args_stdin_and_mixed_jsonl():
|
||||
'{"type":"Crawl","id":"crawl-1","urls":"https://example.com\\nhttps://foo.com"}\n',
|
||||
is_tty=False,
|
||||
),
|
||||
)
|
||||
),
|
||||
)
|
||||
assert len(crawl_records) == 1
|
||||
assert crawl_records[0]["type"] == TYPE_CRAWL
|
||||
@@ -151,14 +151,12 @@ def test_collect_urls_from_plugins_reads_only_parser_outputs(tmp_path):
|
||||
|
||||
(tmp_path / "wget").mkdir()
|
||||
(tmp_path / "wget" / "urls.jsonl").write_text(
|
||||
'{"url":"https://wget-link-1.com"}\n'
|
||||
'{"url":"https://wget-link-2.com"}\n',
|
||||
'{"url":"https://wget-link-1.com"}\n{"url":"https://wget-link-2.com"}\n',
|
||||
encoding="utf-8",
|
||||
)
|
||||
(tmp_path / "parse_html_urls").mkdir()
|
||||
(tmp_path / "parse_html_urls" / "urls.jsonl").write_text(
|
||||
'{"url":"https://html-link-1.com"}\n'
|
||||
'{"url":"https://html-link-2.com","title":"HTML Link 2"}\n',
|
||||
'{"url":"https://html-link-1.com"}\n{"url":"https://html-link-2.com","title":"HTML Link 2"}\n',
|
||||
encoding="utf-8",
|
||||
)
|
||||
(tmp_path / "screenshot").mkdir()
|
||||
@@ -187,6 +185,22 @@ def test_collect_urls_from_plugins_trims_markdown_suffixes(tmp_path):
|
||||
assert urls[0]["url"] == "https://docs.sweeting.me/s/youtube-favorites"
|
||||
|
||||
|
||||
def test_collect_urls_from_plugins_trims_trailing_punctuation(tmp_path):
|
||||
from archivebox.hooks import collect_urls_from_plugins
|
||||
|
||||
(tmp_path / "parse_html_urls").mkdir()
|
||||
(tmp_path / "parse_html_urls" / "urls.jsonl").write_text(
|
||||
('{"url":"https://github.com/ArchiveBox/ArchiveBox."}\n{"url":"https://github.com/abc?abc#234234?."}\n'),
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
urls = collect_urls_from_plugins(tmp_path)
|
||||
assert [url["url"] for url in urls] == [
|
||||
"https://github.com/ArchiveBox/ArchiveBox",
|
||||
"https://github.com/abc?abc#234234",
|
||||
]
|
||||
|
||||
|
||||
def test_crawl_create_stdout_pipes_into_run(initialized_archive):
|
||||
"""`archivebox crawl create | archivebox run` should queue and materialize snapshots."""
|
||||
url = create_test_url()
|
||||
@@ -311,10 +325,7 @@ def test_archiveresult_list_stdout_pipes_into_run(initialized_archive):
|
||||
_assert_stdout_is_jsonl_only(run_stdout)
|
||||
|
||||
run_records = parse_jsonl_output(run_stdout)
|
||||
assert any(
|
||||
record.get("type") == "ArchiveResult" and record.get("id") == archiveresult["id"]
|
||||
for record in run_records
|
||||
)
|
||||
assert any(record.get("type") == "ArchiveResult" and record.get("id") == archiveresult["id"] for record in run_records)
|
||||
|
||||
|
||||
def test_binary_create_stdout_pipes_into_run(initialized_archive):
|
||||
|
||||
@@ -14,8 +14,8 @@ def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
|
||||
candidates = {snapshot_id}
|
||||
if len(snapshot_id) == 32:
|
||||
candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}")
|
||||
elif len(snapshot_id) == 36 and '-' in snapshot_id:
|
||||
candidates.add(snapshot_id.replace('-', ''))
|
||||
elif len(snapshot_id) == 36 and "-" in snapshot_id:
|
||||
candidates.add(snapshot_id.replace("-", ""))
|
||||
|
||||
for needle in candidates:
|
||||
for path in data_dir.rglob(needle):
|
||||
@@ -30,7 +30,7 @@ def test_remove_deletes_snapshot_from_db(tmp_path, process, disable_extractors_d
|
||||
|
||||
# Add a snapshot
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -44,7 +44,7 @@ def test_remove_deletes_snapshot_from_db(tmp_path, process, disable_extractors_d
|
||||
|
||||
# Remove it
|
||||
subprocess.run(
|
||||
['archivebox', 'remove', 'https://example.com', '--yes'],
|
||||
["archivebox", "remove", "https://example.com", "--yes"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -64,7 +64,7 @@ def test_remove_deletes_archive_directory(tmp_path, process, disable_extractors_
|
||||
|
||||
# Add a snapshot
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -78,7 +78,7 @@ def test_remove_deletes_archive_directory(tmp_path, process, disable_extractors_
|
||||
assert snapshot_dir is not None, f"Snapshot output directory not found for {snapshot_id}"
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'remove', 'https://example.com', '--yes', '--delete'],
|
||||
["archivebox", "remove", "https://example.com", "--yes", "--delete"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -91,14 +91,14 @@ def test_remove_yes_flag_skips_confirmation(tmp_path, process, disable_extractor
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Remove with --yes should complete without interaction
|
||||
result = subprocess.run(
|
||||
['archivebox', 'remove', 'https://example.com', '--yes'],
|
||||
["archivebox", "remove", "https://example.com", "--yes"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=30,
|
||||
@@ -114,9 +114,9 @@ def test_remove_multiple_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
# Add multiple snapshots
|
||||
for url in ['https://example.com', 'https://example.org']:
|
||||
for url in ["https://example.com", "https://example.org"]:
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', url],
|
||||
["archivebox", "add", "--index-only", "--depth=0", url],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -130,7 +130,7 @@ def test_remove_multiple_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
# Remove both
|
||||
subprocess.run(
|
||||
['archivebox', 'remove', 'https://example.com', 'https://example.org', '--yes'],
|
||||
["archivebox", "remove", "https://example.com", "https://example.org", "--yes"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -150,14 +150,14 @@ def test_remove_with_filter(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
# Add snapshots
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Remove using filter
|
||||
result = subprocess.run(
|
||||
['archivebox', 'remove', '--filter-type=search', '--filter=example.com', '--yes'],
|
||||
["archivebox", "remove", "--filter-type=search", "--filter=example.com", "--yes"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=30,
|
||||
@@ -171,16 +171,16 @@ def test_remove_with_regex_filter_deletes_all_matches(tmp_path, process, disable
|
||||
"""Test regex filters remove every matching snapshot."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
for url in ['https://example.com', 'https://iana.org']:
|
||||
for url in ["https://example.com", "https://iana.org"]:
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', url],
|
||||
["archivebox", "add", "--index-only", "--depth=0", url],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'remove', '--filter-type=regex', '.*', '--yes'],
|
||||
["archivebox", "remove", "--filter-type=regex", ".*", "--yes"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
@@ -193,7 +193,7 @@ def test_remove_with_regex_filter_deletes_all_matches(tmp_path, process, disable
|
||||
|
||||
output = result.stdout.decode("utf-8") + result.stderr.decode("utf-8")
|
||||
assert count_after == 0
|
||||
assert 'Removed' in output or 'Found' in output
|
||||
assert "Removed" in output or "Found" in output
|
||||
|
||||
|
||||
def test_remove_nonexistent_url_fails_gracefully(tmp_path, process, disable_extractors_dict):
|
||||
@@ -201,30 +201,30 @@ def test_remove_nonexistent_url_fails_gracefully(tmp_path, process, disable_extr
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'remove', 'https://nonexistent-url-12345.com', '--yes'],
|
||||
["archivebox", "remove", "https://nonexistent-url-12345.com", "--yes"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Should fail or show error
|
||||
stdout_text = result.stdout.decode('utf-8', errors='replace').lower()
|
||||
assert result.returncode != 0 or 'not found' in stdout_text or 'no matches' in stdout_text
|
||||
stdout_text = result.stdout.decode("utf-8", errors="replace").lower()
|
||||
assert result.returncode != 0 or "not found" in stdout_text or "no matches" in stdout_text
|
||||
|
||||
|
||||
def test_remove_reports_remaining_link_count_correctly(tmp_path, process, disable_extractors_dict):
|
||||
"""Test remove reports the remaining snapshot count after deletion."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
for url in ['https://example.com', 'https://example.org']:
|
||||
for url in ["https://example.com", "https://example.org"]:
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', url],
|
||||
["archivebox", "add", "--index-only", "--depth=0", url],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'remove', 'https://example.org', '--yes'],
|
||||
["archivebox", "remove", "https://example.org", "--yes"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
@@ -240,14 +240,14 @@ def test_remove_after_flag(tmp_path, process, disable_extractors_dict):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Try remove with --after flag (should work or show usage)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'remove', '--after=2020-01-01', '--yes'],
|
||||
["archivebox", "remove", "--after=2020-01-01", "--yes"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=30,
|
||||
|
||||
@@ -21,8 +21,8 @@ from archivebox.tests.conftest import (
|
||||
)
|
||||
|
||||
RUN_TEST_ENV = {
|
||||
'PLUGINS': 'favicon',
|
||||
'SAVE_FAVICON': 'True',
|
||||
"PLUGINS": "favicon",
|
||||
"SAVE_FAVICON": "True",
|
||||
}
|
||||
|
||||
|
||||
@@ -34,7 +34,7 @@ class TestRunWithCrawl:
|
||||
crawl_record = create_test_crawl_json()
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=json.dumps(crawl_record),
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
@@ -45,21 +45,21 @@ class TestRunWithCrawl:
|
||||
|
||||
# Should output the created Crawl
|
||||
records = parse_jsonl_output(stdout)
|
||||
crawl_records = [r for r in records if r.get('type') == 'Crawl']
|
||||
crawl_records = [r for r in records if r.get("type") == "Crawl"]
|
||||
assert len(crawl_records) >= 1
|
||||
assert crawl_records[0].get('id') # Should have an id now
|
||||
assert crawl_records[0].get("id") # Should have an id now
|
||||
|
||||
def test_run_with_existing_crawl(self, initialized_archive):
|
||||
"""Run re-queues an existing Crawl (with id)."""
|
||||
url = create_test_url()
|
||||
|
||||
# First create a crawl
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive, env=RUN_TEST_ENV)
|
||||
stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive, env=RUN_TEST_ENV)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
# Run with the existing crawl
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=json.dumps(crawl),
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
@@ -79,7 +79,7 @@ class TestRunWithSnapshot:
|
||||
snapshot_record = create_test_snapshot_json()
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=json.dumps(snapshot_record),
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
@@ -89,21 +89,21 @@ class TestRunWithSnapshot:
|
||||
assert code == 0, f"Command failed: {stderr}"
|
||||
|
||||
records = parse_jsonl_output(stdout)
|
||||
snapshot_records = [r for r in records if r.get('type') == 'Snapshot']
|
||||
snapshot_records = [r for r in records if r.get("type") == "Snapshot"]
|
||||
assert len(snapshot_records) >= 1
|
||||
assert snapshot_records[0].get('id')
|
||||
assert snapshot_records[0].get("id")
|
||||
|
||||
def test_run_with_existing_snapshot(self, initialized_archive):
|
||||
"""Run re-queues an existing Snapshot (with id)."""
|
||||
url = create_test_url()
|
||||
|
||||
# First create a snapshot
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive, env=RUN_TEST_ENV)
|
||||
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive, env=RUN_TEST_ENV)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
# Run with the existing snapshot
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
@@ -117,10 +117,10 @@ class TestRunWithSnapshot:
|
||||
def test_run_with_plain_url(self, initialized_archive):
|
||||
"""Run accepts plain URL records (no type field)."""
|
||||
url = create_test_url()
|
||||
url_record = {'url': url}
|
||||
url_record = {"url": url}
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=json.dumps(url_record),
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
@@ -140,21 +140,21 @@ class TestRunWithArchiveResult:
|
||||
url = create_test_url()
|
||||
|
||||
# Create snapshot and archive result
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive, env=RUN_TEST_ENV)
|
||||
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive, env=RUN_TEST_ENV)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, _, _ = run_archivebox_cmd(
|
||||
['archiveresult', 'create', '--plugin=favicon'],
|
||||
["archiveresult", "create", "--plugin=favicon"],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
env=RUN_TEST_ENV,
|
||||
)
|
||||
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
|
||||
ar = next(r for r in parse_jsonl_output(stdout2) if r.get("type") == "ArchiveResult")
|
||||
|
||||
# Update to failed
|
||||
ar['status'] = 'failed'
|
||||
ar["status"] = "failed"
|
||||
run_archivebox_cmd(
|
||||
['archiveresult', 'update', '--status=failed'],
|
||||
["archiveresult", "update", "--status=failed"],
|
||||
stdin=json.dumps(ar),
|
||||
data_dir=initialized_archive,
|
||||
env=RUN_TEST_ENV,
|
||||
@@ -162,7 +162,7 @@ class TestRunWithArchiveResult:
|
||||
|
||||
# Now run should re-queue it
|
||||
stdout3, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=json.dumps(ar),
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
@@ -171,7 +171,7 @@ class TestRunWithArchiveResult:
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout3)
|
||||
ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
|
||||
ar_records = [r for r in records if r.get("type") == "ArchiveResult"]
|
||||
assert len(ar_records) >= 1
|
||||
|
||||
|
||||
@@ -180,19 +180,19 @@ class TestRunPassThrough:
|
||||
|
||||
def test_run_passes_through_unknown_types(self, initialized_archive):
|
||||
"""Run passes through records with unknown types."""
|
||||
unknown_record = {'type': 'Unknown', 'id': 'fake-id', 'data': 'test'}
|
||||
unknown_record = {"type": "Unknown", "id": "fake-id", "data": "test"}
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=json.dumps(unknown_record),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
unknown_records = [r for r in records if r.get('type') == 'Unknown']
|
||||
unknown_records = [r for r in records if r.get("type") == "Unknown"]
|
||||
assert len(unknown_records) == 1
|
||||
assert unknown_records[0]['data'] == 'test'
|
||||
assert unknown_records[0]["data"] == "test"
|
||||
|
||||
def test_run_outputs_all_processed_records(self, initialized_archive):
|
||||
"""Run outputs all processed records for chaining."""
|
||||
@@ -200,7 +200,7 @@ class TestRunPassThrough:
|
||||
crawl_record = create_test_crawl_json(urls=[url])
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=json.dumps(crawl_record),
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
@@ -220,16 +220,18 @@ class TestRunMixedInput:
|
||||
"""Run handles mixed Crawl/Snapshot/ArchiveResult input."""
|
||||
crawl = create_test_crawl_json()
|
||||
snapshot = create_test_snapshot_json()
|
||||
unknown = {'type': 'Tag', 'id': 'fake', 'name': 'test'}
|
||||
unknown = {"type": "Tag", "id": "fake", "name": "test"}
|
||||
|
||||
stdin = '\n'.join([
|
||||
json.dumps(crawl),
|
||||
json.dumps(snapshot),
|
||||
json.dumps(unknown),
|
||||
])
|
||||
stdin = "\n".join(
|
||||
[
|
||||
json.dumps(crawl),
|
||||
json.dumps(snapshot),
|
||||
json.dumps(unknown),
|
||||
],
|
||||
)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=stdin,
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
@@ -239,9 +241,9 @@ class TestRunMixedInput:
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
|
||||
types = set(r.get('type') for r in records)
|
||||
types = {r.get("type") for r in records}
|
||||
# Should have processed Crawl and Snapshot, passed through Tag
|
||||
assert 'Crawl' in types or 'Snapshot' in types or 'Tag' in types
|
||||
assert "Crawl" in types or "Snapshot" in types or "Tag" in types
|
||||
|
||||
|
||||
class TestRunEmpty:
|
||||
@@ -250,8 +252,8 @@ class TestRunEmpty:
|
||||
def test_run_empty_stdin(self, initialized_archive):
|
||||
"""Run with empty stdin returns success."""
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin='',
|
||||
["run"],
|
||||
stdin="",
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
@@ -259,16 +261,16 @@ class TestRunEmpty:
|
||||
|
||||
def test_run_no_records_to_process(self, initialized_archive):
|
||||
"""Run with only pass-through records shows message."""
|
||||
unknown = {'type': 'Unknown', 'id': 'fake'}
|
||||
unknown = {"type": "Unknown", "id": "fake"}
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
["run"],
|
||||
stdin=json.dumps(unknown),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'No records to process' in stderr
|
||||
assert "No records to process" in stderr
|
||||
|
||||
|
||||
class TestRunDaemonMode:
|
||||
@@ -328,13 +330,13 @@ class TestRecoverOrphanedCrawls:
|
||||
from archivebox.services.runner import recover_orphaned_crawls
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
urls="https://example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
retry_at=None,
|
||||
)
|
||||
Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
url="https://example.com",
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
retry_at=None,
|
||||
@@ -358,13 +360,13 @@ class TestRecoverOrphanedCrawls:
|
||||
from archivebox.services.runner import recover_orphaned_crawls
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
urls="https://example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
retry_at=None,
|
||||
)
|
||||
snapshot = Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
url="https://example.com",
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
retry_at=None,
|
||||
@@ -376,10 +378,10 @@ class TestRecoverOrphanedCrawls:
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
cmd=['/plugins/chrome/on_Crawl__91_chrome_wait.js'],
|
||||
cmd=["/plugins/chrome/on_Crawl__91_chrome_wait.js"],
|
||||
env={
|
||||
'CRAWL_ID': str(crawl.id),
|
||||
'SNAPSHOT_ID': str(snapshot.id),
|
||||
"CRAWL_ID": str(crawl.id),
|
||||
"SNAPSHOT_ID": str(snapshot.id),
|
||||
},
|
||||
started_at=timezone.now(),
|
||||
)
|
||||
@@ -397,13 +399,13 @@ class TestRecoverOrphanedCrawls:
|
||||
from archivebox.services.runner import recover_orphaned_crawls
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
urls="https://example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
retry_at=None,
|
||||
)
|
||||
Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
url="https://example.com",
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
retry_at=None,
|
||||
@@ -426,13 +428,13 @@ class TestRecoverOrphanedSnapshots:
|
||||
from archivebox.services.runner import recover_orphaned_snapshots
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
urls="https://example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.SEALED,
|
||||
retry_at=None,
|
||||
)
|
||||
snapshot = Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
url="https://example.com",
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
retry_at=None,
|
||||
|
||||
@@ -6,26 +6,25 @@ import sqlite3
|
||||
import subprocess
|
||||
|
||||
|
||||
|
||||
def test_schedule_run_all_enqueues_scheduled_crawl(tmp_path, process, disable_extractors_dict):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'schedule', '--every=daily', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "schedule", "--every=daily", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'schedule', '--run-all'],
|
||||
["archivebox", "schedule", "--run-all"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'Enqueued 1 scheduled crawl' in result.stdout
|
||||
assert "Enqueued 1 scheduled crawl" in result.stdout
|
||||
|
||||
conn = sqlite3.connect(tmp_path / "index.sqlite3")
|
||||
try:
|
||||
@@ -42,20 +41,20 @@ def test_schedule_without_import_path_creates_maintenance_schedule(tmp_path, pro
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'schedule', '--every=day'],
|
||||
["archivebox", "schedule", "--every=day"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'Created scheduled maintenance update' in result.stdout
|
||||
assert "Created scheduled maintenance update" in result.stdout
|
||||
|
||||
conn = sqlite3.connect(tmp_path / "index.sqlite3")
|
||||
try:
|
||||
row = conn.execute(
|
||||
"SELECT urls, status FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
|
||||
"SELECT urls, status FROM crawls_crawl ORDER BY created_at DESC LIMIT 1",
|
||||
).fetchone()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
assert row == ('archivebox://update', 'sealed')
|
||||
assert row == ("archivebox://update", "sealed")
|
||||
|
||||
@@ -15,21 +15,21 @@ def test_search_finds_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
# Add snapshots
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Search for it
|
||||
result = subprocess.run(
|
||||
['archivebox', 'search', 'example'],
|
||||
["archivebox", "search", "example"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'example' in result.stdout
|
||||
assert "example" in result.stdout
|
||||
|
||||
|
||||
def test_search_returns_no_results_for_missing_term(tmp_path, process, disable_extractors_dict):
|
||||
@@ -37,13 +37,13 @@ def test_search_returns_no_results_for_missing_term(tmp_path, process, disable_e
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'search', 'nonexistentterm12345'],
|
||||
["archivebox", "search", "nonexistentterm12345"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
@@ -58,7 +58,7 @@ def test_search_on_empty_archive(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'search', 'anything'],
|
||||
["archivebox", "search", "anything"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
@@ -72,14 +72,14 @@ def test_search_json_outputs_matching_snapshots(tmp_path, process, disable_extra
|
||||
"""Test that search --json returns parseable matching snapshot rows."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'search', '--json'],
|
||||
["archivebox", "search", "--json"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
@@ -87,21 +87,21 @@ def test_search_json_outputs_matching_snapshots(tmp_path, process, disable_extra
|
||||
|
||||
assert result.returncode == 0, result.stderr
|
||||
payload = json.loads(result.stdout)
|
||||
assert any('example.com' in row.get('url', '') for row in payload)
|
||||
assert any("example.com" in row.get("url", "") for row in payload)
|
||||
|
||||
|
||||
def test_search_json_with_headers_wraps_links_payload(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that search --json --with-headers returns a headers envelope."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'search', '--json', '--with-headers'],
|
||||
["archivebox", "search", "--json", "--with-headers"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
@@ -109,51 +109,51 @@ def test_search_json_with_headers_wraps_links_payload(tmp_path, process, disable
|
||||
|
||||
assert result.returncode == 0, result.stderr
|
||||
payload = json.loads(result.stdout)
|
||||
links = payload.get('links', payload)
|
||||
assert any('example.com' in row.get('url', '') for row in links)
|
||||
links = payload.get("links", payload)
|
||||
assert any("example.com" in row.get("url", "") for row in links)
|
||||
|
||||
|
||||
def test_search_html_outputs_markup(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that search --html renders an HTML response."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'search', '--html'],
|
||||
["archivebox", "search", "--html"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
assert result.returncode == 0, result.stderr
|
||||
assert '<' in result.stdout
|
||||
assert "<" in result.stdout
|
||||
|
||||
|
||||
def test_search_csv_outputs_requested_column(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that search --csv emits the requested fields."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'search', '--csv', 'url', '--with-headers'],
|
||||
["archivebox", "search", "--csv", "url", "--with-headers"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
assert result.returncode == 0, result.stderr
|
||||
assert 'url' in result.stdout
|
||||
assert 'example.com' in result.stdout
|
||||
assert "url" in result.stdout
|
||||
assert "example.com" in result.stdout
|
||||
|
||||
|
||||
def test_search_with_headers_requires_structured_output_format(tmp_path, process):
|
||||
@@ -161,36 +161,36 @@ def test_search_with_headers_requires_structured_output_format(tmp_path, process
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'search', '--with-headers'],
|
||||
["archivebox", "search", "--with-headers"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
assert result.returncode != 0
|
||||
assert 'requires' in result.stderr.lower() or 'json' in result.stderr.lower()
|
||||
assert "requires" in result.stderr.lower() or "json" in result.stderr.lower()
|
||||
|
||||
|
||||
def test_search_sort_option_runs_successfully(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that search --sort accepts sortable fields."""
|
||||
os.chdir(tmp_path)
|
||||
for url in ['https://iana.org', 'https://example.com']:
|
||||
for url in ["https://iana.org", "https://example.com"]:
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', url],
|
||||
["archivebox", "add", "--index-only", "--depth=0", url],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
check=True,
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'search', '--csv', 'url', '--sort=url'],
|
||||
["archivebox", "search", "--csv", "url", "--sort=url"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
assert result.returncode == 0, result.stderr
|
||||
assert 'example.com' in result.stdout or 'iana.org' in result.stdout
|
||||
assert "example.com" in result.stdout or "iana.org" in result.stdout
|
||||
|
||||
|
||||
def test_search_help_lists_supported_filters(tmp_path, process):
|
||||
@@ -198,13 +198,13 @@ def test_search_help_lists_supported_filters(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'search', '--help'],
|
||||
["archivebox", "search", "--help"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert '--filter-type' in result.stdout or '-f' in result.stdout
|
||||
assert '--status' in result.stdout
|
||||
assert '--sort' in result.stdout
|
||||
assert "--filter-type" in result.stdout or "-f" in result.stdout
|
||||
assert "--status" in result.stdout
|
||||
assert "--sort" in result.stdout
|
||||
|
||||
@@ -24,14 +24,14 @@ def test_server_shows_usage_info(tmp_path, process):
|
||||
# Just check that the command is recognized
|
||||
# We won't actually start a full server in tests
|
||||
result = subprocess.run(
|
||||
['archivebox', 'server', '--help'],
|
||||
["archivebox", "server", "--help"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'server' in result.stdout.lower() or 'http' in result.stdout.lower()
|
||||
assert "server" in result.stdout.lower() or "http" in result.stdout.lower()
|
||||
|
||||
|
||||
def test_server_init_flag(tmp_path, process):
|
||||
@@ -40,14 +40,14 @@ def test_server_init_flag(tmp_path, process):
|
||||
|
||||
# Check init flag is recognized
|
||||
result = subprocess.run(
|
||||
['archivebox', 'server', '--help'],
|
||||
["archivebox", "server", "--help"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert '--init' in result.stdout or 'init' in result.stdout.lower()
|
||||
assert "--init" in result.stdout or "init" in result.stdout.lower()
|
||||
|
||||
|
||||
def test_runner_worker_uses_current_interpreter():
|
||||
@@ -109,3 +109,61 @@ def test_stop_existing_background_runner_cleans_up_and_stops_orchestrators():
|
||||
runner_a.kill_tree.assert_called_once_with(graceful_timeout=2.0)
|
||||
runner_b.terminate.assert_called_once_with(graceful_timeout=2.0)
|
||||
log.assert_called_once()
|
||||
|
||||
|
||||
def test_stop_existing_server_workers_takes_over_same_runserver_port(monkeypatch):
|
||||
from archivebox.cli.archivebox_server import stop_existing_server_workers
|
||||
|
||||
supervisor = Mock()
|
||||
supervisor.getProcessInfo.side_effect = lambda name: {
|
||||
"worker_runserver": {"statename": "RUNNING"},
|
||||
"worker_daphne": {"statename": "STOPPED"},
|
||||
}.get(name, None)
|
||||
stop_worker = Mock()
|
||||
log = Mock()
|
||||
|
||||
monkeypatch.setattr(
|
||||
"archivebox.cli.archivebox_server._read_supervisor_worker_command",
|
||||
lambda worker_name: f"{sys.executable} -m archivebox manage runserver 0.0.0.0:8000" if worker_name == "worker_runserver" else "",
|
||||
)
|
||||
|
||||
stopped = stop_existing_server_workers(
|
||||
supervisor=supervisor,
|
||||
stop_worker_fn=stop_worker,
|
||||
host="0.0.0.0",
|
||||
port="8000",
|
||||
log=log,
|
||||
)
|
||||
|
||||
assert stopped == 1
|
||||
stop_worker.assert_called_once_with(supervisor, "worker_runserver")
|
||||
log.assert_called_once()
|
||||
|
||||
|
||||
def test_stop_existing_server_workers_leaves_different_port_running(monkeypatch):
|
||||
from archivebox.cli.archivebox_server import stop_existing_server_workers
|
||||
|
||||
supervisor = Mock()
|
||||
supervisor.getProcessInfo.side_effect = lambda name: {
|
||||
"worker_runserver": {"statename": "RUNNING"},
|
||||
"worker_daphne": {"statename": "STOPPED"},
|
||||
}.get(name, None)
|
||||
stop_worker = Mock()
|
||||
log = Mock()
|
||||
|
||||
monkeypatch.setattr(
|
||||
"archivebox.cli.archivebox_server._read_supervisor_worker_command",
|
||||
lambda worker_name: f"{sys.executable} -m archivebox manage runserver 127.0.0.1:9000" if worker_name == "worker_runserver" else "",
|
||||
)
|
||||
|
||||
stopped = stop_existing_server_workers(
|
||||
supervisor=supervisor,
|
||||
stop_worker_fn=stop_worker,
|
||||
host="0.0.0.0",
|
||||
port="8000",
|
||||
log=log,
|
||||
)
|
||||
|
||||
assert stopped == 0
|
||||
stop_worker.assert_not_called()
|
||||
log.assert_not_called()
|
||||
|
||||
@@ -14,7 +14,7 @@ def test_shell_command_exists(tmp_path, process):
|
||||
|
||||
# Test that the command exists (will fail without input but should recognize command)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'shell', '--help'],
|
||||
["archivebox", "shell", "--help"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
@@ -29,11 +29,11 @@ def test_shell_c_executes_python(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'shell', '-c', 'print("shell-ok")'],
|
||||
["archivebox", "shell", "-c", 'print("shell-ok")'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
assert result.returncode == 0, result.stderr
|
||||
assert 'shell-ok' in result.stdout
|
||||
assert "shell-ok" in result.stdout
|
||||
|
||||
@@ -25,29 +25,29 @@ class TestSnapshotCreate:
|
||||
url = create_test_url()
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'create', url],
|
||||
["snapshot", "create", url],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0, f"Command failed: {stderr}"
|
||||
assert 'Created' in stderr
|
||||
assert "Created" in stderr
|
||||
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 1
|
||||
assert records[0]['type'] == 'Snapshot'
|
||||
assert records[0]['url'] == url
|
||||
assert records[0]["type"] == "Snapshot"
|
||||
assert records[0]["url"] == url
|
||||
|
||||
def test_create_from_crawl_jsonl(self, initialized_archive):
|
||||
"""Create snapshots from Crawl JSONL input."""
|
||||
url = create_test_url()
|
||||
|
||||
# First create a crawl
|
||||
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
|
||||
crawl = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
# Pipe crawl to snapshot create
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'create'],
|
||||
["snapshot", "create"],
|
||||
stdin=json.dumps(crawl),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
@@ -56,34 +56,34 @@ class TestSnapshotCreate:
|
||||
|
||||
records = parse_jsonl_output(stdout2)
|
||||
# Should have the Crawl passed through and the Snapshot created
|
||||
types = [r.get('type') for r in records]
|
||||
assert 'Crawl' in types
|
||||
assert 'Snapshot' in types
|
||||
types = [r.get("type") for r in records]
|
||||
assert "Crawl" in types
|
||||
assert "Snapshot" in types
|
||||
|
||||
snapshot = next(r for r in records if r['type'] == 'Snapshot')
|
||||
assert snapshot['url'] == url
|
||||
snapshot = next(r for r in records if r["type"] == "Snapshot")
|
||||
assert snapshot["url"] == url
|
||||
|
||||
def test_create_with_tag(self, initialized_archive):
|
||||
"""Create snapshot with --tag flag."""
|
||||
url = create_test_url()
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'create', '--tag=test-tag', url],
|
||||
["snapshot", "create", "--tag=test-tag", url],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert 'test-tag' in records[0].get('tags', '')
|
||||
assert "test-tag" in records[0].get("tags", "")
|
||||
|
||||
def test_create_pass_through_other_types(self, initialized_archive):
|
||||
"""Pass-through records of other types unchanged."""
|
||||
tag_record = {'type': 'Tag', 'id': 'fake-tag-id', 'name': 'test'}
|
||||
tag_record = {"type": "Tag", "id": "fake-tag-id", "name": "test"}
|
||||
url = create_test_url()
|
||||
stdin = json.dumps(tag_record) + '\n' + json.dumps({'url': url})
|
||||
stdin = json.dumps(tag_record) + "\n" + json.dumps({"url": url})
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'create'],
|
||||
["snapshot", "create"],
|
||||
stdin=stdin,
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
@@ -91,16 +91,16 @@ class TestSnapshotCreate:
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
|
||||
types = [r.get('type') for r in records]
|
||||
assert 'Tag' in types
|
||||
assert 'Snapshot' in types
|
||||
types = [r.get("type") for r in records]
|
||||
assert "Tag" in types
|
||||
assert "Snapshot" in types
|
||||
|
||||
def test_create_multiple_urls(self, initialized_archive):
|
||||
"""Create snapshots from multiple URLs."""
|
||||
urls = [create_test_url() for _ in range(3)]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'create'] + urls,
|
||||
["snapshot", "create"] + urls,
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
@@ -108,7 +108,7 @@ class TestSnapshotCreate:
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 3
|
||||
|
||||
created_urls = {r['url'] for r in records}
|
||||
created_urls = {r["url"] for r in records}
|
||||
for url in urls:
|
||||
assert url in created_urls
|
||||
|
||||
@@ -119,65 +119,65 @@ class TestSnapshotList:
|
||||
def test_list_empty(self, initialized_archive):
|
||||
"""List with no snapshots returns empty."""
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'list'],
|
||||
["snapshot", "list"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Listed 0 snapshots' in stderr
|
||||
assert "Listed 0 snapshots" in stderr
|
||||
|
||||
def test_list_returns_created(self, initialized_archive):
|
||||
"""List returns previously created snapshots."""
|
||||
url = create_test_url()
|
||||
run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'list'],
|
||||
["snapshot", "list"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) >= 1
|
||||
assert any(r.get('url') == url for r in records)
|
||||
assert any(r.get("url") == url for r in records)
|
||||
|
||||
def test_list_filter_by_status(self, initialized_archive):
|
||||
"""Filter snapshots by status."""
|
||||
url = create_test_url()
|
||||
run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'list', '--status=queued'],
|
||||
["snapshot", "list", "--status=queued"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
for r in records:
|
||||
assert r['status'] == 'queued'
|
||||
assert r["status"] == "queued"
|
||||
|
||||
def test_list_filter_by_url_contains(self, initialized_archive):
|
||||
"""Filter snapshots by URL contains."""
|
||||
url = create_test_url(domain='unique-domain-12345.com')
|
||||
run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
url = create_test_url(domain="unique-domain-12345.com")
|
||||
run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'list', '--url__icontains=unique-domain-12345'],
|
||||
["snapshot", "list", "--url__icontains=unique-domain-12345"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 1
|
||||
assert 'unique-domain-12345' in records[0]['url']
|
||||
assert "unique-domain-12345" in records[0]["url"]
|
||||
|
||||
def test_list_with_limit(self, initialized_archive):
|
||||
"""Limit number of results."""
|
||||
for _ in range(3):
|
||||
run_archivebox_cmd(['snapshot', 'create', create_test_url()], data_dir=initialized_archive)
|
||||
run_archivebox_cmd(["snapshot", "create", create_test_url()], data_dir=initialized_archive)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'list', '--limit=2'],
|
||||
["snapshot", "list", "--limit=2"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
@@ -185,6 +185,35 @@ class TestSnapshotList:
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 2
|
||||
|
||||
def test_list_with_sort_and_limit(self, initialized_archive):
|
||||
"""Sorting should be applied before limiting."""
|
||||
for _ in range(3):
|
||||
run_archivebox_cmd(["snapshot", "create", create_test_url()], data_dir=initialized_archive)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
["snapshot", "list", "--limit=2", "--sort=-created_at"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0, f"Command failed: {stderr}"
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 2
|
||||
|
||||
def test_list_search_meta(self, initialized_archive):
|
||||
"""snapshot list should support metadata search mode."""
|
||||
url = create_test_url(domain="meta-search-example.com")
|
||||
run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
["snapshot", "list", "--search=meta", "meta-search-example.com"],
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0, f"Command failed: {stderr}"
|
||||
records = parse_jsonl_output(stdout)
|
||||
assert len(records) == 1
|
||||
assert "meta-search-example.com" in records[0]["url"]
|
||||
|
||||
|
||||
class TestSnapshotUpdate:
|
||||
"""Tests for `archivebox snapshot update`."""
|
||||
@@ -192,35 +221,35 @@ class TestSnapshotUpdate:
|
||||
def test_update_status(self, initialized_archive):
|
||||
"""Update snapshot status."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'update', '--status=started'],
|
||||
["snapshot", "update", "--status=started"],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Updated 1 snapshots' in stderr
|
||||
assert "Updated 1 snapshots" in stderr
|
||||
|
||||
records = parse_jsonl_output(stdout2)
|
||||
assert records[0]['status'] == 'started'
|
||||
assert records[0]["status"] == "started"
|
||||
|
||||
def test_update_add_tag(self, initialized_archive):
|
||||
"""Update snapshot by adding tag."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout2, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'update', '--tag=new-tag'],
|
||||
["snapshot", "update", "--tag=new-tag"],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Updated 1 snapshots' in stderr
|
||||
assert "Updated 1 snapshots" in stderr
|
||||
|
||||
|
||||
class TestSnapshotDelete:
|
||||
@@ -229,44 +258,44 @@ class TestSnapshotDelete:
|
||||
def test_delete_requires_yes(self, initialized_archive):
|
||||
"""Delete requires --yes flag."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'delete'],
|
||||
["snapshot", "delete"],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 1
|
||||
assert '--yes' in stderr
|
||||
assert "--yes" in stderr
|
||||
|
||||
def test_delete_with_yes(self, initialized_archive):
|
||||
"""Delete with --yes flag works."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'delete', '--yes'],
|
||||
["snapshot", "delete", "--yes"],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Deleted 1 snapshots' in stderr
|
||||
assert "Deleted 1 snapshots" in stderr
|
||||
|
||||
def test_delete_dry_run(self, initialized_archive):
|
||||
"""Dry run shows what would be deleted."""
|
||||
url = create_test_url()
|
||||
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
|
||||
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
|
||||
snapshot = parse_jsonl_output(stdout1)[0]
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['snapshot', 'delete', '--dry-run'],
|
||||
["snapshot", "delete", "--dry-run"],
|
||||
stdin=json.dumps(snapshot),
|
||||
data_dir=initialized_archive,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
assert 'Would delete' in stderr
|
||||
assert "Would delete" in stderr
|
||||
|
||||
@@ -14,8 +14,8 @@ def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
|
||||
candidates = {snapshot_id}
|
||||
if len(snapshot_id) == 32:
|
||||
candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}")
|
||||
elif len(snapshot_id) == 36 and '-' in snapshot_id:
|
||||
candidates.add(snapshot_id.replace('-', ''))
|
||||
elif len(snapshot_id) == 36 and "-" in snapshot_id:
|
||||
candidates.add(snapshot_id.replace("-", ""))
|
||||
|
||||
for needle in candidates:
|
||||
for path in data_dir.rglob(needle):
|
||||
@@ -27,7 +27,7 @@ def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
|
||||
def test_status_runs_successfully(tmp_path, process):
|
||||
"""Test that status command runs without error."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert len(result.stdout) > 100
|
||||
@@ -36,11 +36,11 @@ def test_status_runs_successfully(tmp_path, process):
|
||||
def test_status_shows_zero_snapshots_in_empty_archive(tmp_path, process):
|
||||
"""Test status shows 0 snapshots in empty archive."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
|
||||
|
||||
output = result.stdout
|
||||
# Should indicate empty/zero state
|
||||
assert '0' in output
|
||||
assert "0" in output
|
||||
|
||||
|
||||
def test_status_shows_correct_snapshot_count(tmp_path, process, disable_extractors_dict):
|
||||
@@ -48,14 +48,14 @@ def test_status_shows_correct_snapshot_count(tmp_path, process, disable_extracto
|
||||
os.chdir(tmp_path)
|
||||
|
||||
# Add 3 snapshots
|
||||
for url in ['https://example.com', 'https://example.org', 'https://example.net']:
|
||||
for url in ["https://example.com", "https://example.org", "https://example.net"]:
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', url],
|
||||
["archivebox", "add", "--index-only", "--depth=0", url],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
|
||||
|
||||
# Verify DB has 3 snapshots
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
@@ -65,7 +65,7 @@ def test_status_shows_correct_snapshot_count(tmp_path, process, disable_extracto
|
||||
|
||||
assert db_count == 3
|
||||
# Status output should show 3
|
||||
assert '3' in result.stdout
|
||||
assert "3" in result.stdout
|
||||
|
||||
|
||||
def test_status_shows_archived_count(tmp_path, process, disable_extractors_dict):
|
||||
@@ -73,25 +73,25 @@ def test_status_shows_archived_count(tmp_path, process, disable_extractors_dict)
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
|
||||
|
||||
# Should show archived/unarchived categories
|
||||
assert 'archived' in result.stdout.lower() or 'queued' in result.stdout.lower()
|
||||
assert "archived" in result.stdout.lower() or "queued" in result.stdout.lower()
|
||||
|
||||
|
||||
def test_status_shows_archive_directory_size(tmp_path, process):
|
||||
"""Test status reports archive directory size."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
|
||||
|
||||
output = result.stdout
|
||||
# Should show size info
|
||||
assert 'Size' in output or 'size' in output
|
||||
assert "Size" in output or "size" in output
|
||||
|
||||
|
||||
def test_status_counts_archive_directories(tmp_path, process, disable_extractors_dict):
|
||||
@@ -99,15 +99,15 @@ def test_status_counts_archive_directories(tmp_path, process, disable_extractors
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
|
||||
|
||||
# Should show directory count
|
||||
assert 'present' in result.stdout.lower() or 'directories' in result.stdout
|
||||
assert "present" in result.stdout.lower() or "directories" in result.stdout
|
||||
|
||||
|
||||
def test_status_detects_orphaned_directories(tmp_path, process, disable_extractors_dict):
|
||||
@@ -116,7 +116,7 @@ def test_status_detects_orphaned_directories(tmp_path, process, disable_extracto
|
||||
|
||||
# Add a snapshot
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -124,10 +124,10 @@ def test_status_detects_orphaned_directories(tmp_path, process, disable_extracto
|
||||
# Create an orphaned directory
|
||||
(tmp_path / "archive" / "fake_orphaned_dir").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
|
||||
|
||||
# Should mention orphaned dirs
|
||||
assert 'orphan' in result.stdout.lower() or '1' in result.stdout
|
||||
assert "orphan" in result.stdout.lower() or "1" in result.stdout
|
||||
|
||||
|
||||
def test_status_counts_new_snapshot_output_dirs_as_archived(tmp_path, process, disable_extractors_dict):
|
||||
@@ -137,7 +137,7 @@ def test_status_counts_new_snapshot_output_dirs_as_archived(tmp_path, process, d
|
||||
env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=env,
|
||||
check=True,
|
||||
@@ -145,7 +145,7 @@ def test_status_counts_new_snapshot_output_dirs_as_archived(tmp_path, process, d
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
snapshot_id = c.execute("SELECT id FROM core_snapshot WHERE url = ?", ('https://example.com',)).fetchone()[0]
|
||||
snapshot_id = c.execute("SELECT id FROM core_snapshot WHERE url = ?", ("https://example.com",)).fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
snapshot_dir = _find_snapshot_dir(tmp_path, str(snapshot_id))
|
||||
@@ -154,21 +154,21 @@ def test_status_counts_new_snapshot_output_dirs_as_archived(tmp_path, process, d
|
||||
title_dir.mkdir(parents=True, exist_ok=True)
|
||||
(title_dir / "title.txt").write_text("Example Domain")
|
||||
|
||||
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True, env=env)
|
||||
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True, env=env)
|
||||
|
||||
assert result.returncode == 0, result.stdout + result.stderr
|
||||
assert 'archived: 1' in result.stdout
|
||||
assert 'present: 1' in result.stdout
|
||||
assert "archived: 1" in result.stdout
|
||||
assert "present: 1" in result.stdout
|
||||
|
||||
|
||||
def test_status_shows_user_info(tmp_path, process):
|
||||
"""Test status shows user/login information."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
|
||||
|
||||
output = result.stdout
|
||||
# Should show user section
|
||||
assert 'user' in output.lower() or 'login' in output.lower()
|
||||
assert "user" in output.lower() or "login" in output.lower()
|
||||
|
||||
|
||||
def test_status_reads_from_db_not_filesystem(tmp_path, process, disable_extractors_dict):
|
||||
@@ -177,7 +177,7 @@ def test_status_reads_from_db_not_filesystem(tmp_path, process, disable_extracto
|
||||
|
||||
# Add snapshot to DB
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
@@ -191,35 +191,35 @@ def test_status_reads_from_db_not_filesystem(tmp_path, process, disable_extracto
|
||||
assert db_count == 1
|
||||
|
||||
# Status should reflect DB count
|
||||
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
|
||||
assert '1' in result.stdout
|
||||
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
|
||||
assert "1" in result.stdout
|
||||
|
||||
|
||||
def test_status_shows_index_file_info(tmp_path, process):
|
||||
"""Test status shows index file information."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
|
||||
|
||||
# Should mention index
|
||||
assert 'index' in result.stdout.lower() or 'Index' in result.stdout
|
||||
assert "index" in result.stdout.lower() or "Index" in result.stdout
|
||||
|
||||
|
||||
def test_status_help_lists_available_options(tmp_path, process):
|
||||
"""Test that status --help works and documents the command."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'status', '--help'],
|
||||
["archivebox", "status", "--help"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'status' in result.stdout.lower() or 'statistic' in result.stdout.lower()
|
||||
assert "status" in result.stdout.lower() or "statistic" in result.stdout.lower()
|
||||
|
||||
|
||||
def test_status_shows_data_directory_path(tmp_path, process):
|
||||
"""Test that status reports which collection directory it is inspecting."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
|
||||
|
||||
assert 'archive' in result.stdout.lower() or str(tmp_path) in result.stdout
|
||||
assert "archive" in result.stdout.lower() or str(tmp_path) in result.stdout
|
||||
|
||||
@@ -13,7 +13,7 @@ def test_update_runs_successfully_on_empty_archive(tmp_path, process):
|
||||
"""Test that update runs without error on empty archive."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'update'],
|
||||
["archivebox", "update"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
@@ -29,14 +29,14 @@ def test_update_reconciles_existing_snapshots(tmp_path, process, disable_extract
|
||||
|
||||
# Add a snapshot (index-only for faster test)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Run update - should reconcile and queue
|
||||
result = subprocess.run(
|
||||
['archivebox', 'update'],
|
||||
["archivebox", "update"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=30,
|
||||
@@ -51,13 +51,13 @@ def test_update_specific_snapshot_by_filter(tmp_path, process, disable_extractor
|
||||
|
||||
# Add multiple snapshots
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=90,
|
||||
)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.org'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.org"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=90,
|
||||
@@ -65,7 +65,7 @@ def test_update_specific_snapshot_by_filter(tmp_path, process, disable_extractor
|
||||
|
||||
# Update with filter pattern (uses filter_patterns argument)
|
||||
result = subprocess.run(
|
||||
['archivebox', 'update', '--filter-type=substring', 'example.com'],
|
||||
["archivebox", "update", "--filter-type=substring", "example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=30,
|
||||
@@ -81,7 +81,7 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d
|
||||
|
||||
# Add snapshots
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=90,
|
||||
@@ -97,7 +97,7 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d
|
||||
|
||||
# Run update (should reconcile + queue, not create new snapshots)
|
||||
subprocess.run(
|
||||
['archivebox', 'update'],
|
||||
["archivebox", "update"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=30,
|
||||
@@ -118,7 +118,7 @@ def test_update_queues_snapshots_for_archiving(tmp_path, process, disable_extrac
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=90,
|
||||
@@ -126,7 +126,7 @@ def test_update_queues_snapshots_for_archiving(tmp_path, process, disable_extrac
|
||||
|
||||
# Run update
|
||||
result = subprocess.run(
|
||||
['archivebox', 'update'],
|
||||
["archivebox", "update"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=30,
|
||||
@@ -140,4 +140,4 @@ def test_update_queues_snapshots_for_archiving(tmp_path, process, disable_extrac
|
||||
status = c.execute("SELECT status FROM core_snapshot").fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
assert status == 'queued'
|
||||
assert status == "queued"
|
||||
|
||||
@@ -67,56 +67,56 @@ def _extract_location_path(output: str, key: str) -> Path:
|
||||
def test_version_quiet_outputs_version_number(tmp_path):
|
||||
"""Test that version --quiet outputs just the version number."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'version', '--quiet'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "version", "--quiet"], capture_output=True, text=True)
|
||||
|
||||
assert result.returncode == 0
|
||||
version = result.stdout.strip()
|
||||
assert version
|
||||
# Version should be semver-ish format (e.g., 0.8.0)
|
||||
parts = version.split('.')
|
||||
parts = version.split(".")
|
||||
assert len(parts) >= 2
|
||||
|
||||
|
||||
def test_version_flag_outputs_version_number(tmp_path):
|
||||
"""Test that top-level --version reports the package version."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', '--version'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "--version"], capture_output=True, text=True)
|
||||
|
||||
assert result.returncode == 0
|
||||
version = result.stdout.strip()
|
||||
assert version
|
||||
assert len(version.split('.')) >= 2
|
||||
assert len(version.split(".")) >= 2
|
||||
|
||||
|
||||
def test_version_shows_system_info_in_initialized_dir(tmp_path, process):
|
||||
"""Test that version shows system metadata in initialized directory."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'version'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "version"], capture_output=True, text=True)
|
||||
|
||||
output = result.stdout
|
||||
assert 'ArchiveBox' in output
|
||||
assert "ArchiveBox" in output
|
||||
# Should show system info
|
||||
assert any(x in output for x in ['ARCH=', 'OS=', 'PYTHON='])
|
||||
assert any(x in output for x in ["ARCH=", "OS=", "PYTHON="])
|
||||
|
||||
|
||||
def test_version_shows_binaries_after_init(tmp_path, process):
|
||||
"""Test that version shows binary dependencies in initialized directory."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'version'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "version"], capture_output=True, text=True)
|
||||
|
||||
output = result.stdout
|
||||
# Should show binary section
|
||||
assert 'Binary' in output or 'Dependencies' in output
|
||||
assert "Binary" in output or "Dependencies" in output
|
||||
|
||||
|
||||
def test_version_shows_data_locations(tmp_path, process):
|
||||
"""Test that version shows data directory locations."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'version'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "version"], capture_output=True, text=True)
|
||||
|
||||
output = result.stdout
|
||||
# Should show paths
|
||||
assert any(x in output for x in ['Data', 'Code', 'location'])
|
||||
assert any(x in output for x in ["Data", "Code", "location"])
|
||||
|
||||
|
||||
def test_version_in_uninitialized_dir_still_works(tmp_path):
|
||||
@@ -125,7 +125,7 @@ def test_version_in_uninitialized_dir_still_works(tmp_path):
|
||||
empty_dir.mkdir()
|
||||
os.chdir(empty_dir)
|
||||
|
||||
result = subprocess.run(['archivebox', 'version', '--quiet'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "version", "--quiet"], capture_output=True, text=True)
|
||||
|
||||
# Should still output version
|
||||
assert result.returncode == 0
|
||||
@@ -164,15 +164,15 @@ def test_version_auto_selects_short_tmp_dir_for_deep_collection_path(tmp_path):
|
||||
def test_version_help_lists_quiet_flag(tmp_path):
|
||||
"""Test that version --help documents the quiet output mode."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'version', '--help'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "version", "--help"], capture_output=True, text=True)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert '--quiet' in result.stdout or '-q' in result.stdout
|
||||
assert "--quiet" in result.stdout or "-q" in result.stdout
|
||||
|
||||
|
||||
def test_version_invalid_option_fails(tmp_path):
|
||||
"""Test that invalid version options fail cleanly."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'version', '--invalid-option'], capture_output=True, text=True)
|
||||
result = subprocess.run(["archivebox", "version", "--invalid-option"], capture_output=True, text=True)
|
||||
|
||||
assert result.returncode != 0
|
||||
|
||||
@@ -7,19 +7,18 @@ import subprocess
|
||||
import pytest
|
||||
|
||||
|
||||
|
||||
def test_config_shows_all_config_values(tmp_path, process):
|
||||
"""Test that config without args shows all config values."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'config'],
|
||||
["archivebox", "config"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
# Should show various config sections
|
||||
assert 'TIMEOUT' in result.stdout or 'timeout' in result.stdout.lower()
|
||||
assert "TIMEOUT" in result.stdout or "timeout" in result.stdout.lower()
|
||||
# Config should show some output
|
||||
assert len(result.stdout) > 100
|
||||
|
||||
@@ -29,13 +28,13 @@ def test_config_get_specific_key(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'config', '--get', 'TIMEOUT'],
|
||||
["archivebox", "config", "--get", "TIMEOUT"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
# Should show the TIMEOUT value
|
||||
assert 'TIMEOUT' in result.stdout or result.returncode == 0
|
||||
assert "TIMEOUT" in result.stdout or result.returncode == 0
|
||||
|
||||
|
||||
def test_config_set_value_writes_to_config_file(tmp_path, process):
|
||||
@@ -44,18 +43,18 @@ def test_config_set_value_writes_to_config_file(tmp_path, process):
|
||||
|
||||
# Set a config value
|
||||
result = subprocess.run(
|
||||
['archivebox', 'config', '--set', 'TIMEOUT=120'],
|
||||
["archivebox", "config", "--set", "TIMEOUT=120"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
assert result.returncode == 0, result.stderr
|
||||
|
||||
# Read the config file directly to verify it was written
|
||||
config_file = tmp_path / 'ArchiveBox.conf'
|
||||
config_file = tmp_path / "ArchiveBox.conf"
|
||||
if config_file.exists():
|
||||
config_content = config_file.read_text()
|
||||
# Config should contain the set value
|
||||
assert 'TIMEOUT' in config_content or 'timeout' in config_content.lower()
|
||||
assert "TIMEOUT" in config_content or "timeout" in config_content.lower()
|
||||
|
||||
|
||||
def test_config_set_and_get_roundtrip(tmp_path, process):
|
||||
@@ -64,19 +63,19 @@ def test_config_set_and_get_roundtrip(tmp_path, process):
|
||||
|
||||
# Set a value
|
||||
set_result = subprocess.run(
|
||||
['archivebox', 'config', '--set', 'TIMEOUT=999'],
|
||||
["archivebox", "config", "--set", "TIMEOUT=999"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
# Verify set was successful
|
||||
assert set_result.returncode == 0 or '999' in set_result.stdout
|
||||
assert set_result.returncode == 0 or "999" in set_result.stdout
|
||||
|
||||
# Read the config file directly to verify
|
||||
config_file = tmp_path / 'ArchiveBox.conf'
|
||||
config_file = tmp_path / "ArchiveBox.conf"
|
||||
if config_file.exists():
|
||||
config_content = config_file.read_text()
|
||||
assert '999' in config_content or 'TIMEOUT' in config_content
|
||||
assert "999" in config_content or "TIMEOUT" in config_content
|
||||
|
||||
|
||||
def test_config_search_finds_matching_keys(tmp_path, process):
|
||||
@@ -84,13 +83,13 @@ def test_config_search_finds_matching_keys(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'config', '--search', 'TIMEOUT'],
|
||||
["archivebox", "config", "--search", "TIMEOUT"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
# Should find TIMEOUT-related config
|
||||
assert 'TIMEOUT' in result.stdout or result.returncode == 0
|
||||
assert "TIMEOUT" in result.stdout or result.returncode == 0
|
||||
|
||||
|
||||
def test_config_invalid_key_fails(tmp_path, process):
|
||||
@@ -98,13 +97,13 @@ def test_config_invalid_key_fails(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'config', '--set', 'INVALID_KEY_THAT_DOES_NOT_EXIST=value'],
|
||||
["archivebox", "config", "--set", "INVALID_KEY_THAT_DOES_NOT_EXIST=value"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
# Should fail
|
||||
assert result.returncode != 0 or 'failed' in result.stdout.lower()
|
||||
assert result.returncode != 0 or "failed" in result.stdout.lower()
|
||||
|
||||
|
||||
def test_config_set_requires_equals_sign(tmp_path, process):
|
||||
@@ -112,7 +111,7 @@ def test_config_set_requires_equals_sign(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'config', '--set', 'TIMEOUT'],
|
||||
["archivebox", "config", "--set", "TIMEOUT"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
@@ -129,15 +128,15 @@ class TestConfigCLI:
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'config', '--help'],
|
||||
["archivebox", "config", "--help"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert '--get' in result.stdout
|
||||
assert '--set' in result.stdout
|
||||
assert "--get" in result.stdout
|
||||
assert "--set" in result.stdout
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
|
||||
@@ -17,310 +17,317 @@ def test_get_db_binaries_by_name_collapses_youtube_dl_aliases(monkeypatch):
|
||||
now = timezone.now()
|
||||
records = [
|
||||
SimpleNamespace(
|
||||
name='youtube-dl',
|
||||
version='',
|
||||
binprovider='',
|
||||
abspath='/usr/bin/youtube-dl',
|
||||
name="youtube-dl",
|
||||
version="",
|
||||
binprovider="",
|
||||
abspath="/usr/bin/youtube-dl",
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
modified_at=now,
|
||||
),
|
||||
SimpleNamespace(
|
||||
name='yt-dlp',
|
||||
version='2026.03.01',
|
||||
binprovider='pip',
|
||||
abspath='/usr/bin/yt-dlp',
|
||||
name="yt-dlp",
|
||||
version="2026.03.01",
|
||||
binprovider="pip",
|
||||
abspath="/usr/bin/yt-dlp",
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
modified_at=now + timedelta(seconds=1),
|
||||
),
|
||||
]
|
||||
|
||||
monkeypatch.setattr(config_views.Binary, 'objects', SimpleNamespace(all=lambda: records))
|
||||
monkeypatch.setattr(config_views.Binary, "objects", SimpleNamespace(all=lambda: records))
|
||||
|
||||
binaries = config_views.get_db_binaries_by_name()
|
||||
|
||||
assert 'yt-dlp' in binaries
|
||||
assert 'youtube-dl' not in binaries
|
||||
assert binaries['yt-dlp'].version == '2026.03.01'
|
||||
assert "yt-dlp" in binaries
|
||||
assert "youtube-dl" not in binaries
|
||||
assert binaries["yt-dlp"].version == "2026.03.01"
|
||||
|
||||
|
||||
def test_binaries_list_view_uses_db_version_and_hides_youtube_dl_alias(monkeypatch):
|
||||
request = RequestFactory().get('/admin/environment/binaries/')
|
||||
request = RequestFactory().get("/admin/environment/binaries/")
|
||||
request.user = SimpleNamespace(is_superuser=True)
|
||||
|
||||
db_binary = SimpleNamespace(
|
||||
name='youtube-dl',
|
||||
version='2026.03.01',
|
||||
binprovider='pip',
|
||||
abspath='/usr/bin/yt-dlp',
|
||||
name="youtube-dl",
|
||||
version="2026.03.01",
|
||||
binprovider="pip",
|
||||
abspath="/usr/bin/yt-dlp",
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
sha256='',
|
||||
sha256="",
|
||||
modified_at=timezone.now(),
|
||||
)
|
||||
|
||||
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {'yt-dlp': db_binary})
|
||||
monkeypatch.setattr(config_views, "get_db_binaries_by_name", lambda: {"yt-dlp": db_binary})
|
||||
|
||||
context = config_views.binaries_list_view.__wrapped__(request)
|
||||
|
||||
assert len(context['table']['Binary Name']) == 1
|
||||
assert str(context['table']['Binary Name'][0].link_item) == 'yt-dlp'
|
||||
assert context['table']['Found Version'][0] == '✅ 2026.03.01'
|
||||
assert context['table']['Provided By'][0] == 'pip'
|
||||
assert context['table']['Found Abspath'][0] == '/usr/bin/yt-dlp'
|
||||
assert len(context["table"]["Binary Name"]) == 1
|
||||
assert str(context["table"]["Binary Name"][0].link_item) == "yt-dlp"
|
||||
assert context["table"]["Found Version"][0] == "✅ 2026.03.01"
|
||||
assert context["table"]["Provided By"][0] == "pip"
|
||||
assert context["table"]["Found Abspath"][0] == "/usr/bin/yt-dlp"
|
||||
|
||||
|
||||
def test_binaries_list_view_only_shows_persisted_records(monkeypatch):
|
||||
request = RequestFactory().get('/admin/environment/binaries/')
|
||||
request = RequestFactory().get("/admin/environment/binaries/")
|
||||
request.user = SimpleNamespace(is_superuser=True)
|
||||
|
||||
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {})
|
||||
monkeypatch.setattr(config_views, "get_db_binaries_by_name", lambda: {})
|
||||
|
||||
context = config_views.binaries_list_view.__wrapped__(request)
|
||||
|
||||
assert context['table']['Binary Name'] == []
|
||||
assert context['table']['Found Version'] == []
|
||||
assert context['table']['Provided By'] == []
|
||||
assert context['table']['Found Abspath'] == []
|
||||
assert context["table"]["Binary Name"] == []
|
||||
assert context["table"]["Found Version"] == []
|
||||
assert context["table"]["Provided By"] == []
|
||||
assert context["table"]["Found Abspath"] == []
|
||||
|
||||
|
||||
def test_binary_detail_view_uses_canonical_db_record(monkeypatch):
|
||||
request = RequestFactory().get('/admin/environment/binaries/youtube-dl/')
|
||||
request = RequestFactory().get("/admin/environment/binaries/youtube-dl/")
|
||||
request.user = SimpleNamespace(is_superuser=True)
|
||||
|
||||
db_binary = SimpleNamespace(
|
||||
id='019d14cc-6c40-7793-8ff1-0f8bb050e8a3',
|
||||
name='yt-dlp',
|
||||
version='2026.03.01',
|
||||
binprovider='pip',
|
||||
abspath='/usr/bin/yt-dlp',
|
||||
sha256='abc123',
|
||||
id="019d14cc-6c40-7793-8ff1-0f8bb050e8a3",
|
||||
name="yt-dlp",
|
||||
version="2026.03.01",
|
||||
binprovider="pip",
|
||||
abspath="/usr/bin/yt-dlp",
|
||||
sha256="abc123",
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
modified_at=timezone.now(),
|
||||
)
|
||||
|
||||
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {'yt-dlp': db_binary})
|
||||
monkeypatch.setattr(config_views, "get_db_binaries_by_name", lambda: {"yt-dlp": db_binary})
|
||||
|
||||
context = config_views.binary_detail_view.__wrapped__(request, key='youtube-dl')
|
||||
section = context['data'][0]
|
||||
context = config_views.binary_detail_view.__wrapped__(request, key="youtube-dl")
|
||||
section = context["data"][0]
|
||||
|
||||
assert context['title'] == 'yt-dlp'
|
||||
assert section['fields']['name'] == 'yt-dlp'
|
||||
assert section['fields']['version'] == '2026.03.01'
|
||||
assert section['fields']['binprovider'] == 'pip'
|
||||
assert section['fields']['abspath'] == '/usr/bin/yt-dlp'
|
||||
assert '/admin/machine/binary/019d14cc-6c40-7793-8ff1-0f8bb050e8a3/change/?_changelist_filters=q%3Dyt-dlp' in section['description']
|
||||
assert context["title"] == "yt-dlp"
|
||||
assert section["fields"]["name"] == "yt-dlp"
|
||||
assert section["fields"]["version"] == "2026.03.01"
|
||||
assert section["fields"]["binprovider"] == "pip"
|
||||
assert section["fields"]["abspath"] == "/usr/bin/yt-dlp"
|
||||
assert "/admin/machine/binary/019d14cc-6c40-7793-8ff1-0f8bb050e8a3/change/?_changelist_filters=q%3Dyt-dlp" in section["description"]
|
||||
|
||||
|
||||
def test_binary_detail_view_marks_unrecorded_binary(monkeypatch):
|
||||
request = RequestFactory().get('/admin/environment/binaries/wget/')
|
||||
request = RequestFactory().get("/admin/environment/binaries/wget/")
|
||||
request.user = SimpleNamespace(is_superuser=True)
|
||||
|
||||
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {})
|
||||
monkeypatch.setattr(config_views, "get_db_binaries_by_name", lambda: {})
|
||||
|
||||
context = config_views.binary_detail_view.__wrapped__(request, key='wget')
|
||||
section = context['data'][0]
|
||||
context = config_views.binary_detail_view.__wrapped__(request, key="wget")
|
||||
section = context["data"][0]
|
||||
|
||||
assert section['description'] == 'No persisted Binary record found'
|
||||
assert section['fields']['status'] == 'unrecorded'
|
||||
assert section['fields']['binprovider'] == 'not recorded'
|
||||
assert section["description"] == "No persisted Binary record found"
|
||||
assert section["fields"]["status"] == "unrecorded"
|
||||
assert section["fields"]["binprovider"] == "not recorded"
|
||||
|
||||
|
||||
def test_plugin_detail_view_renders_config_in_dedicated_sections(monkeypatch):
|
||||
request = RequestFactory().get('/admin/environment/plugins/builtin.example/')
|
||||
request = RequestFactory().get("/admin/environment/plugins/builtin.example/")
|
||||
request.user = SimpleNamespace(is_superuser=True)
|
||||
|
||||
plugin_config = {
|
||||
'title': 'Example Plugin',
|
||||
'description': 'Example config used to verify plugin metadata rendering.',
|
||||
'type': 'object',
|
||||
'required_plugins': ['chrome'],
|
||||
'required_binaries': ['example-cli'],
|
||||
'output_mimetypes': ['text/plain', 'application/json'],
|
||||
'properties': {
|
||||
'EXAMPLE_ENABLED': {
|
||||
'type': 'boolean',
|
||||
'description': 'Enable the example plugin.',
|
||||
'x-fallback': 'CHECK_SSL_VALIDITY',
|
||||
"title": "Example Plugin",
|
||||
"description": "Example config used to verify plugin metadata rendering.",
|
||||
"type": "object",
|
||||
"required_plugins": ["chrome"],
|
||||
"required_binaries": ["example-cli"],
|
||||
"output_mimetypes": ["text/plain", "application/json"],
|
||||
"properties": {
|
||||
"EXAMPLE_ENABLED": {
|
||||
"type": "boolean",
|
||||
"description": "Enable the example plugin.",
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
},
|
||||
'EXAMPLE_BINARY': {
|
||||
'type': 'string',
|
||||
'default': 'gallery-dl',
|
||||
'description': 'Filesystem path for example output.',
|
||||
'x-aliases': ['USE_EXAMPLE_BINARY'],
|
||||
"EXAMPLE_BINARY": {
|
||||
"type": "string",
|
||||
"default": "gallery-dl",
|
||||
"description": "Filesystem path for example output.",
|
||||
"x-aliases": ["USE_EXAMPLE_BINARY"],
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
monkeypatch.setattr(config_views, 'get_filesystem_plugins', lambda: {
|
||||
'builtin.example': {
|
||||
'id': 'builtin.example',
|
||||
'name': 'example',
|
||||
'source': 'builtin',
|
||||
'path': '/plugins/example',
|
||||
'hooks': ['on_Snapshot__01_example.py'],
|
||||
'config': plugin_config,
|
||||
}
|
||||
})
|
||||
monkeypatch.setattr(config_views, 'get_machine_admin_url', lambda: '/admin/machine/machine/test-machine/change/')
|
||||
monkeypatch.setattr(
|
||||
config_views,
|
||||
"get_filesystem_plugins",
|
||||
lambda: {
|
||||
"builtin.example": {
|
||||
"id": "builtin.example",
|
||||
"name": "example",
|
||||
"source": "builtin",
|
||||
"path": "/plugins/example",
|
||||
"hooks": ["on_Snapshot__01_example.py"],
|
||||
"config": plugin_config,
|
||||
},
|
||||
},
|
||||
)
|
||||
monkeypatch.setattr(config_views, "get_machine_admin_url", lambda: "/admin/machine/machine/test-machine/change/")
|
||||
|
||||
context = config_views.plugin_detail_view.__wrapped__(request, key='builtin.example')
|
||||
context = config_views.plugin_detail_view.__wrapped__(request, key="builtin.example")
|
||||
|
||||
assert context['title'] == 'example'
|
||||
assert len(context['data']) == 5
|
||||
assert context["title"] == "example"
|
||||
assert len(context["data"]) == 5
|
||||
|
||||
summary_section, hooks_section, metadata_section, config_section, properties_section = context['data']
|
||||
summary_section, hooks_section, metadata_section, config_section, properties_section = context["data"]
|
||||
|
||||
assert summary_section['fields'] == {
|
||||
'id': 'builtin.example',
|
||||
'name': 'example',
|
||||
'source': 'builtin',
|
||||
assert summary_section["fields"] == {
|
||||
"id": "builtin.example",
|
||||
"name": "example",
|
||||
"source": "builtin",
|
||||
}
|
||||
assert '/plugins/example' in summary_section['description']
|
||||
assert 'https://archivebox.github.io/abx-plugins/#example' in summary_section['description']
|
||||
assert "/plugins/example" in summary_section["description"]
|
||||
assert "https://archivebox.github.io/abx-plugins/#example" in summary_section["description"]
|
||||
|
||||
assert hooks_section['name'] == 'Hooks'
|
||||
assert hooks_section['fields'] == {}
|
||||
assert 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/example/on_Snapshot__01_example.py' in hooks_section['description']
|
||||
assert 'on_Snapshot__01_example.py' in hooks_section['description']
|
||||
assert hooks_section["name"] == "Hooks"
|
||||
assert hooks_section["fields"] == {}
|
||||
assert (
|
||||
"https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/example/on_Snapshot__01_example.py"
|
||||
in hooks_section["description"]
|
||||
)
|
||||
assert "on_Snapshot__01_example.py" in hooks_section["description"]
|
||||
|
||||
assert metadata_section['name'] == 'Plugin Metadata'
|
||||
assert metadata_section['fields'] == {}
|
||||
assert 'Example Plugin' in metadata_section['description']
|
||||
assert 'Example config used to verify plugin metadata rendering.' in metadata_section['description']
|
||||
assert 'https://archivebox.github.io/abx-plugins/#chrome' in metadata_section['description']
|
||||
assert '/admin/environment/binaries/example-cli/' in metadata_section['description']
|
||||
assert 'text/plain' in metadata_section['description']
|
||||
assert 'application/json' in metadata_section['description']
|
||||
assert metadata_section["name"] == "Plugin Metadata"
|
||||
assert metadata_section["fields"] == {}
|
||||
assert "Example Plugin" in metadata_section["description"]
|
||||
assert "Example config used to verify plugin metadata rendering." in metadata_section["description"]
|
||||
assert "https://archivebox.github.io/abx-plugins/#chrome" in metadata_section["description"]
|
||||
assert "/admin/environment/binaries/example-cli/" in metadata_section["description"]
|
||||
assert "text/plain" in metadata_section["description"]
|
||||
assert "application/json" in metadata_section["description"]
|
||||
|
||||
assert config_section['name'] == 'config.json'
|
||||
assert config_section['fields'] == {}
|
||||
assert '<pre style=' in config_section['description']
|
||||
assert 'EXAMPLE_ENABLED' in config_section['description']
|
||||
assert '<span style="color: #0550ae;">"properties"</span>' in config_section['description']
|
||||
assert config_section["name"] == "config.json"
|
||||
assert config_section["fields"] == {}
|
||||
assert "<pre style=" in config_section["description"]
|
||||
assert "EXAMPLE_ENABLED" in config_section["description"]
|
||||
assert '<span style="color: #0550ae;">"properties"</span>' in config_section["description"]
|
||||
|
||||
assert properties_section['name'] == 'Config Properties'
|
||||
assert properties_section['fields'] == {}
|
||||
assert '/admin/machine/machine/test-machine/change/' in properties_section['description']
|
||||
assert '/admin/machine/binary/' in properties_section['description']
|
||||
assert '/admin/environment/binaries/' in properties_section['description']
|
||||
assert 'EXAMPLE_ENABLED' in properties_section['description']
|
||||
assert 'boolean' in properties_section['description']
|
||||
assert 'Enable the example plugin.' in properties_section['description']
|
||||
assert '/admin/environment/config/EXAMPLE_ENABLED/' in properties_section['description']
|
||||
assert '/admin/environment/config/CHECK_SSL_VALIDITY/' in properties_section['description']
|
||||
assert '/admin/environment/config/USE_EXAMPLE_BINARY/' in properties_section['description']
|
||||
assert '/admin/environment/binaries/gallery-dl/' in properties_section['description']
|
||||
assert 'EXAMPLE_BINARY' in properties_section['description']
|
||||
assert properties_section["name"] == "Config Properties"
|
||||
assert properties_section["fields"] == {}
|
||||
assert "/admin/machine/machine/test-machine/change/" in properties_section["description"]
|
||||
assert "/admin/machine/binary/" in properties_section["description"]
|
||||
assert "/admin/environment/binaries/" in properties_section["description"]
|
||||
assert "EXAMPLE_ENABLED" in properties_section["description"]
|
||||
assert "boolean" in properties_section["description"]
|
||||
assert "Enable the example plugin." in properties_section["description"]
|
||||
assert "/admin/environment/config/EXAMPLE_ENABLED/" in properties_section["description"]
|
||||
assert "/admin/environment/config/CHECK_SSL_VALIDITY/" in properties_section["description"]
|
||||
assert "/admin/environment/config/USE_EXAMPLE_BINARY/" in properties_section["description"]
|
||||
assert "/admin/environment/binaries/gallery-dl/" in properties_section["description"]
|
||||
assert "EXAMPLE_BINARY" in properties_section["description"]
|
||||
|
||||
|
||||
def test_get_config_definition_link_keeps_core_config_search_link(monkeypatch):
|
||||
monkeypatch.setattr(core_views, 'find_plugin_for_config_key', lambda key: None)
|
||||
monkeypatch.setattr(core_views, "find_plugin_for_config_key", lambda key: None)
|
||||
|
||||
url, label = core_views.get_config_definition_link('CHECK_SSL_VALIDITY')
|
||||
url, label = core_views.get_config_definition_link("CHECK_SSL_VALIDITY")
|
||||
|
||||
assert 'github.com/search' in url
|
||||
assert 'CHECK_SSL_VALIDITY' in url
|
||||
assert label == 'archivebox/config'
|
||||
assert "github.com/search" in url
|
||||
assert "CHECK_SSL_VALIDITY" in url
|
||||
assert label == "archivebox/config"
|
||||
|
||||
|
||||
def test_get_config_definition_link_uses_plugin_config_json_for_plugin_options(monkeypatch):
|
||||
plugin_dir = core_views.BUILTIN_PLUGINS_DIR / 'parse_dom_outlinks'
|
||||
plugin_dir = core_views.BUILTIN_PLUGINS_DIR / "parse_dom_outlinks"
|
||||
|
||||
monkeypatch.setattr(core_views, 'find_plugin_for_config_key', lambda key: 'parse_dom_outlinks')
|
||||
monkeypatch.setattr(core_views, 'iter_plugin_dirs', lambda: [plugin_dir])
|
||||
monkeypatch.setattr(core_views, "find_plugin_for_config_key", lambda key: "parse_dom_outlinks")
|
||||
monkeypatch.setattr(core_views, "iter_plugin_dirs", lambda: [plugin_dir])
|
||||
|
||||
url, label = core_views.get_config_definition_link('PARSE_DOM_OUTLINKS_ENABLED')
|
||||
url, label = core_views.get_config_definition_link("PARSE_DOM_OUTLINKS_ENABLED")
|
||||
|
||||
assert url == 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json'
|
||||
assert label == 'abx_plugins/plugins/parse_dom_outlinks/config.json'
|
||||
assert url == "https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json"
|
||||
assert label == "abx_plugins/plugins/parse_dom_outlinks/config.json"
|
||||
|
||||
|
||||
def test_live_config_value_view_renames_source_field_and_uses_plugin_definition_link(monkeypatch):
|
||||
request = RequestFactory().get('/admin/environment/config/PARSE_DOM_OUTLINKS_ENABLED/')
|
||||
request = RequestFactory().get("/admin/environment/config/PARSE_DOM_OUTLINKS_ENABLED/")
|
||||
request.user = SimpleNamespace(is_superuser=True)
|
||||
|
||||
monkeypatch.setattr(core_views, 'get_all_configs', lambda: {})
|
||||
monkeypatch.setattr(core_views, 'get_flat_config', lambda: {})
|
||||
monkeypatch.setattr(core_views, 'get_config', lambda: {'PARSE_DOM_OUTLINKS_ENABLED': True})
|
||||
monkeypatch.setattr(core_views, 'find_config_default', lambda key: 'True')
|
||||
monkeypatch.setattr(core_views, 'find_config_type', lambda key: 'bool')
|
||||
monkeypatch.setattr(core_views, 'find_config_source', lambda key, merged: 'Default')
|
||||
monkeypatch.setattr(core_views, 'key_is_safe', lambda key: True)
|
||||
monkeypatch.setattr(core_views.CONSTANTS, 'CONFIG_FILE', SimpleNamespace(exists=lambda: False))
|
||||
monkeypatch.setattr(core_views, "get_all_configs", lambda: {})
|
||||
monkeypatch.setattr(core_views, "get_flat_config", lambda: {})
|
||||
monkeypatch.setattr(core_views, "get_config", lambda: {"PARSE_DOM_OUTLINKS_ENABLED": True})
|
||||
monkeypatch.setattr(core_views, "find_config_default", lambda key: "True")
|
||||
monkeypatch.setattr(core_views, "find_config_type", lambda key: "bool")
|
||||
monkeypatch.setattr(core_views, "find_config_source", lambda key, merged: "Default")
|
||||
monkeypatch.setattr(core_views, "key_is_safe", lambda key: True)
|
||||
monkeypatch.setattr(core_views.CONSTANTS, "CONFIG_FILE", SimpleNamespace(exists=lambda: False))
|
||||
|
||||
from archivebox.machine.models import Machine
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
|
||||
monkeypatch.setattr(Machine, 'current', classmethod(lambda cls: SimpleNamespace(id='machine-id', config={})))
|
||||
monkeypatch.setattr(BaseConfigSet, 'load_from_file', classmethod(lambda cls, path: {}))
|
||||
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: SimpleNamespace(id="machine-id", config={})))
|
||||
monkeypatch.setattr(BaseConfigSet, "load_from_file", classmethod(lambda cls, path: {}))
|
||||
monkeypatch.setattr(
|
||||
core_views,
|
||||
'get_config_definition_link',
|
||||
"get_config_definition_link",
|
||||
lambda key: (
|
||||
'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json',
|
||||
'abx_plugins/plugins/parse_dom_outlinks/config.json',
|
||||
"https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json",
|
||||
"abx_plugins/plugins/parse_dom_outlinks/config.json",
|
||||
),
|
||||
)
|
||||
|
||||
context = core_views.live_config_value_view.__wrapped__(request, key='PARSE_DOM_OUTLINKS_ENABLED')
|
||||
section = context['data'][0]
|
||||
context = core_views.live_config_value_view.__wrapped__(request, key="PARSE_DOM_OUTLINKS_ENABLED")
|
||||
section = context["data"][0]
|
||||
|
||||
assert 'Currently read from' in section['fields']
|
||||
assert 'Source' not in section['fields']
|
||||
assert section['fields']['Currently read from'] == 'Default'
|
||||
assert 'abx_plugins/plugins/parse_dom_outlinks/config.json' in section['help_texts']['Type']
|
||||
assert "Currently read from" in section["fields"]
|
||||
assert "Source" not in section["fields"]
|
||||
assert section["fields"]["Currently read from"] == "Default"
|
||||
assert "abx_plugins/plugins/parse_dom_outlinks/config.json" in section["help_texts"]["Type"]
|
||||
|
||||
|
||||
def test_find_config_source_prefers_environment_over_machine_and_file(monkeypatch):
|
||||
monkeypatch.setenv('CHECK_SSL_VALIDITY', 'false')
|
||||
monkeypatch.setenv("CHECK_SSL_VALIDITY", "false")
|
||||
|
||||
from archivebox.machine.models import Machine
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
|
||||
monkeypatch.setattr(
|
||||
Machine,
|
||||
'current',
|
||||
classmethod(lambda cls: SimpleNamespace(id='machine-id', config={'CHECK_SSL_VALIDITY': 'true'})),
|
||||
"current",
|
||||
classmethod(lambda cls: SimpleNamespace(id="machine-id", config={"CHECK_SSL_VALIDITY": "true"})),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
BaseConfigSet,
|
||||
'load_from_file',
|
||||
classmethod(lambda cls, path: {'CHECK_SSL_VALIDITY': 'true'}),
|
||||
"load_from_file",
|
||||
classmethod(lambda cls, path: {"CHECK_SSL_VALIDITY": "true"}),
|
||||
)
|
||||
|
||||
assert core_views.find_config_source('CHECK_SSL_VALIDITY', {'CHECK_SSL_VALIDITY': False}) == 'Environment'
|
||||
assert core_views.find_config_source("CHECK_SSL_VALIDITY", {"CHECK_SSL_VALIDITY": False}) == "Environment"
|
||||
|
||||
|
||||
def test_live_config_value_view_priority_text_matches_runtime_precedence(monkeypatch):
|
||||
request = RequestFactory().get('/admin/environment/config/CHECK_SSL_VALIDITY/')
|
||||
request = RequestFactory().get("/admin/environment/config/CHECK_SSL_VALIDITY/")
|
||||
request.user = SimpleNamespace(is_superuser=True)
|
||||
|
||||
monkeypatch.setattr(core_views, 'get_all_configs', lambda: {})
|
||||
monkeypatch.setattr(core_views, 'get_flat_config', lambda: {'CHECK_SSL_VALIDITY': True})
|
||||
monkeypatch.setattr(core_views, 'get_config', lambda: {'CHECK_SSL_VALIDITY': False})
|
||||
monkeypatch.setattr(core_views, 'find_config_default', lambda key: 'True')
|
||||
monkeypatch.setattr(core_views, 'find_config_type', lambda key: 'bool')
|
||||
monkeypatch.setattr(core_views, 'key_is_safe', lambda key: True)
|
||||
monkeypatch.setattr(core_views, "get_all_configs", lambda: {})
|
||||
monkeypatch.setattr(core_views, "get_flat_config", lambda: {"CHECK_SSL_VALIDITY": True})
|
||||
monkeypatch.setattr(core_views, "get_config", lambda: {"CHECK_SSL_VALIDITY": False})
|
||||
monkeypatch.setattr(core_views, "find_config_default", lambda key: "True")
|
||||
monkeypatch.setattr(core_views, "find_config_type", lambda key: "bool")
|
||||
monkeypatch.setattr(core_views, "key_is_safe", lambda key: True)
|
||||
|
||||
from archivebox.machine.models import Machine
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
|
||||
monkeypatch.setattr(
|
||||
Machine,
|
||||
'current',
|
||||
classmethod(lambda cls: SimpleNamespace(id='machine-id', config={'CHECK_SSL_VALIDITY': 'true'})),
|
||||
"current",
|
||||
classmethod(lambda cls: SimpleNamespace(id="machine-id", config={"CHECK_SSL_VALIDITY": "true"})),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
BaseConfigSet,
|
||||
'load_from_file',
|
||||
classmethod(lambda cls, path: {'CHECK_SSL_VALIDITY': 'true'}),
|
||||
"load_from_file",
|
||||
classmethod(lambda cls, path: {"CHECK_SSL_VALIDITY": "true"}),
|
||||
)
|
||||
monkeypatch.setattr(core_views.CONSTANTS, 'CONFIG_FILE', SimpleNamespace(exists=lambda: True))
|
||||
monkeypatch.setenv('CHECK_SSL_VALIDITY', 'false')
|
||||
monkeypatch.setattr(core_views.CONSTANTS, "CONFIG_FILE", SimpleNamespace(exists=lambda: True))
|
||||
monkeypatch.setenv("CHECK_SSL_VALIDITY", "false")
|
||||
|
||||
context = core_views.live_config_value_view.__wrapped__(request, key='CHECK_SSL_VALIDITY')
|
||||
section = context['data'][0]
|
||||
context = core_views.live_config_value_view.__wrapped__(request, key="CHECK_SSL_VALIDITY")
|
||||
section = context["data"][0]
|
||||
|
||||
assert section['fields']['Currently read from'] == 'Environment'
|
||||
help_text = section['help_texts']['Currently read from']
|
||||
assert help_text.index('Environment') < help_text.index('Machine') < help_text.index('Config File') < help_text.index('Default')
|
||||
assert 'Configuration Sources (highest priority first):' in section['help_texts']['Value']
|
||||
assert section["fields"]["Currently read from"] == "Environment"
|
||||
help_text = section["help_texts"]["Currently read from"]
|
||||
assert help_text.index("Environment") < help_text.index("Machine") < help_text.index("Config File") < help_text.index("Default")
|
||||
assert "Configuration Sources (highest priority first):" in section["help_texts"]["Value"]
|
||||
|
||||
@@ -8,19 +8,18 @@ import sqlite3
|
||||
import pytest
|
||||
|
||||
|
||||
|
||||
def test_crawl_creates_crawl_object(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that crawl command creates a Crawl object."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'crawl', '--no-wait', 'https://example.com'],
|
||||
["archivebox", "crawl", "--no-wait", "https://example.com"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
crawl = c.execute("SELECT id, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
|
||||
conn.close()
|
||||
@@ -33,13 +32,13 @@ def test_crawl_depth_sets_max_depth_in_crawl(tmp_path, process, disable_extracto
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'crawl', '--depth=2', '--no-wait', 'https://example.com'],
|
||||
["archivebox", "crawl", "--depth=2", "--no-wait", "https://example.com"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
crawl = c.execute("SELECT max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
|
||||
conn.close()
|
||||
@@ -53,16 +52,18 @@ def test_crawl_creates_snapshot_for_url(tmp_path, process, disable_extractors_di
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'crawl', '--no-wait', 'https://example.com'],
|
||||
["archivebox", "crawl", "--no-wait", "https://example.com"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
snapshot = c.execute("SELECT url FROM core_snapshot WHERE url = ?",
|
||||
('https://example.com',)).fetchone()
|
||||
snapshot = c.execute(
|
||||
"SELECT url FROM core_snapshot WHERE url = ?",
|
||||
("https://example.com",),
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
assert snapshot is not None, "Snapshot should be created for input URL"
|
||||
@@ -73,13 +74,13 @@ def test_crawl_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dic
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'crawl', '--no-wait', 'https://example.com'],
|
||||
["archivebox", "crawl", "--no-wait", "https://example.com"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
# Get the crawl ID
|
||||
@@ -88,8 +89,10 @@ def test_crawl_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dic
|
||||
crawl_id = crawl[0]
|
||||
|
||||
# Check snapshot has correct crawl_id
|
||||
snapshot = c.execute("SELECT crawl_id FROM core_snapshot WHERE url = ?",
|
||||
('https://example.com',)).fetchone()
|
||||
snapshot = c.execute(
|
||||
"SELECT crawl_id FROM core_snapshot WHERE url = ?",
|
||||
("https://example.com",),
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
assert snapshot is not None
|
||||
@@ -101,22 +104,26 @@ def test_crawl_multiple_urls_creates_multiple_snapshots(tmp_path, process, disab
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'crawl', '--no-wait',
|
||||
'https://example.com',
|
||||
'https://iana.org'],
|
||||
[
|
||||
"archivebox",
|
||||
"crawl",
|
||||
"--no-wait",
|
||||
"https://example.com",
|
||||
"https://iana.org",
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
urls = c.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall()
|
||||
conn.close()
|
||||
|
||||
urls = [u[0] for u in urls]
|
||||
assert 'https://example.com' in urls
|
||||
assert 'https://iana.org' in urls
|
||||
assert "https://example.com" in urls
|
||||
assert "https://iana.org" in urls
|
||||
|
||||
|
||||
def test_crawl_from_file_creates_snapshot(tmp_path, process, disable_extractors_dict):
|
||||
@@ -124,17 +131,17 @@ def test_crawl_from_file_creates_snapshot(tmp_path, process, disable_extractors_
|
||||
os.chdir(tmp_path)
|
||||
|
||||
# Write URLs to a file
|
||||
urls_file = tmp_path / 'urls.txt'
|
||||
urls_file.write_text('https://example.com\n')
|
||||
urls_file = tmp_path / "urls.txt"
|
||||
urls_file.write_text("https://example.com\n")
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'crawl', '--no-wait', str(urls_file)],
|
||||
["archivebox", "crawl", "--no-wait", str(urls_file)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
snapshot = c.execute("SELECT url FROM core_snapshot").fetchone()
|
||||
conn.close()
|
||||
@@ -148,19 +155,19 @@ def test_crawl_persists_input_urls_on_crawl(tmp_path, process, disable_extractor
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'crawl', '--no-wait', 'https://example.com'],
|
||||
["archivebox", "crawl", "--no-wait", "https://example.com"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
crawl_urls = c.execute("SELECT urls FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
|
||||
conn.close()
|
||||
|
||||
assert crawl_urls is not None, "Crawl should be created for crawl input"
|
||||
assert 'https://example.com' in crawl_urls[0], "Crawl should persist input URLs"
|
||||
assert "https://example.com" in crawl_urls[0], "Crawl should persist input URLs"
|
||||
|
||||
|
||||
class TestCrawlCLI:
|
||||
@@ -171,14 +178,14 @@ class TestCrawlCLI:
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'crawl', '--help'],
|
||||
["archivebox", "crawl", "--help"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'create' in result.stdout
|
||||
assert "create" in result.stdout
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
|
||||
@@ -14,75 +14,77 @@ pytestmark = pytest.mark.django_db
|
||||
|
||||
|
||||
User = get_user_model()
|
||||
ADMIN_HOST = 'admin.archivebox.localhost:8000'
|
||||
ADMIN_HOST = "admin.archivebox.localhost:8000"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def admin_user(db):
|
||||
return cast(UserManager, User.objects).create_superuser(
|
||||
username='crawladmin',
|
||||
email='crawladmin@test.com',
|
||||
password='testpassword',
|
||||
username="crawladmin",
|
||||
email="crawladmin@test.com",
|
||||
password="testpassword",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def crawl(admin_user):
|
||||
return Crawl.objects.create(
|
||||
urls='https://example.com\nhttps://example.org',
|
||||
tags_str='alpha,beta',
|
||||
urls="https://example.com\nhttps://example.org",
|
||||
tags_str="alpha,beta",
|
||||
created_by=admin_user,
|
||||
)
|
||||
|
||||
|
||||
def test_crawl_admin_change_view_renders_tag_editor_widget(client, admin_user, crawl):
|
||||
client.login(username='crawladmin', password='testpassword')
|
||||
client.login(username="crawladmin", password="testpassword")
|
||||
|
||||
response = client.get(
|
||||
reverse('admin:crawls_crawl_change', args=[crawl.pk]),
|
||||
reverse("admin:crawls_crawl_change", args=[crawl.pk]),
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert b'name="tags_editor"' in response.content
|
||||
assert b'tag-editor-container' in response.content
|
||||
assert b'alpha' in response.content
|
||||
assert b'beta' in response.content
|
||||
assert b"tag-editor-container" in response.content
|
||||
assert b"alpha" in response.content
|
||||
assert b"beta" in response.content
|
||||
|
||||
|
||||
def test_crawl_admin_add_view_renders_url_filter_alias_fields(client, admin_user):
|
||||
client.login(username='crawladmin', password='testpassword')
|
||||
client.login(username="crawladmin", password="testpassword")
|
||||
|
||||
response = client.get(
|
||||
reverse('admin:crawls_crawl_add'),
|
||||
reverse("admin:crawls_crawl_add"),
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert b'name="url_filters_allowlist"' in response.content
|
||||
assert b'name="url_filters_denylist"' in response.content
|
||||
assert b'Same domain only' in response.content
|
||||
assert b"Same domain only" in response.content
|
||||
|
||||
|
||||
def test_crawl_admin_form_saves_tags_editor_to_tags_str(crawl, admin_user):
|
||||
form = CrawlAdminForm(
|
||||
data={
|
||||
'created_at': crawl.created_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'urls': crawl.urls,
|
||||
'config': '{}',
|
||||
'max_depth': '0',
|
||||
'tags_editor': 'alpha, beta, Alpha, gamma',
|
||||
'url_filters_allowlist': 'example.com\n*.example.com',
|
||||
'url_filters_denylist': 'static.example.com',
|
||||
'persona_id': '',
|
||||
'label': '',
|
||||
'notes': '',
|
||||
'schedule': '',
|
||||
'status': crawl.status,
|
||||
'retry_at': crawl.retry_at.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
'created_by': str(admin_user.pk),
|
||||
'num_uses_failed': '0',
|
||||
'num_uses_succeeded': '0',
|
||||
"created_at": crawl.created_at.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"urls": crawl.urls,
|
||||
"config": "{}",
|
||||
"max_depth": "0",
|
||||
"max_urls": "3",
|
||||
"max_size": str(45 * 1024 * 1024),
|
||||
"tags_editor": "alpha, beta, Alpha, gamma",
|
||||
"url_filters_allowlist": "example.com\n*.example.com",
|
||||
"url_filters_denylist": "static.example.com",
|
||||
"persona_id": "",
|
||||
"label": "",
|
||||
"notes": "",
|
||||
"schedule": "",
|
||||
"status": crawl.status,
|
||||
"retry_at": crawl.retry_at.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"created_by": str(admin_user.pk),
|
||||
"num_uses_failed": "0",
|
||||
"num_uses_succeeded": "0",
|
||||
},
|
||||
instance=crawl,
|
||||
)
|
||||
@@ -91,130 +93,140 @@ def test_crawl_admin_form_saves_tags_editor_to_tags_str(crawl, admin_user):
|
||||
|
||||
updated = form.save()
|
||||
updated.refresh_from_db()
|
||||
assert updated.tags_str == 'alpha,beta,gamma'
|
||||
assert updated.config['URL_ALLOWLIST'] == 'example.com\n*.example.com'
|
||||
assert updated.config['URL_DENYLIST'] == 'static.example.com'
|
||||
assert updated.tags_str == "alpha,beta,gamma"
|
||||
assert updated.max_urls == 3
|
||||
assert updated.max_size == 45 * 1024 * 1024
|
||||
assert updated.config["MAX_URLS"] == 3
|
||||
assert updated.config["MAX_SIZE"] == 45 * 1024 * 1024
|
||||
assert updated.config["URL_ALLOWLIST"] == "example.com\n*.example.com"
|
||||
assert updated.config["URL_DENYLIST"] == "static.example.com"
|
||||
|
||||
|
||||
def test_crawl_admin_delete_snapshot_action_removes_snapshot_and_url(client, admin_user):
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com/remove-me',
|
||||
urls="https://example.com/remove-me",
|
||||
created_by=admin_user,
|
||||
)
|
||||
snapshot = Snapshot.objects.create(
|
||||
crawl=crawl,
|
||||
url='https://example.com/remove-me',
|
||||
url="https://example.com/remove-me",
|
||||
)
|
||||
|
||||
client.login(username='crawladmin', password='testpassword')
|
||||
client.login(username="crawladmin", password="testpassword")
|
||||
response = client.post(
|
||||
reverse('admin:crawls_crawl_snapshot_delete', args=[crawl.pk, snapshot.pk]),
|
||||
reverse("admin:crawls_crawl_snapshot_delete", args=[crawl.pk, snapshot.pk]),
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert response.json()['ok'] is True
|
||||
assert response.json()["ok"] is True
|
||||
assert not Snapshot.objects.filter(pk=snapshot.pk).exists()
|
||||
|
||||
crawl.refresh_from_db()
|
||||
assert 'https://example.com/remove-me' not in crawl.urls
|
||||
assert "https://example.com/remove-me" not in crawl.urls
|
||||
|
||||
|
||||
def test_crawl_admin_exclude_domain_action_prunes_urls_and_pending_snapshots(client, admin_user):
|
||||
crawl = Crawl.objects.create(
|
||||
urls='\n'.join([
|
||||
'https://cdn.example.com/asset.js',
|
||||
'https://cdn.example.com/second.js',
|
||||
'https://example.com/root',
|
||||
]),
|
||||
urls="\n".join(
|
||||
[
|
||||
"https://cdn.example.com/asset.js",
|
||||
"https://cdn.example.com/second.js",
|
||||
"https://example.com/root",
|
||||
],
|
||||
),
|
||||
created_by=admin_user,
|
||||
)
|
||||
queued_snapshot = Snapshot.objects.create(
|
||||
crawl=crawl,
|
||||
url='https://cdn.example.com/asset.js',
|
||||
url="https://cdn.example.com/asset.js",
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
)
|
||||
preserved_snapshot = Snapshot.objects.create(
|
||||
crawl=crawl,
|
||||
url='https://example.com/root',
|
||||
url="https://example.com/root",
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
)
|
||||
|
||||
client.login(username='crawladmin', password='testpassword')
|
||||
client.login(username="crawladmin", password="testpassword")
|
||||
response = client.post(
|
||||
reverse('admin:crawls_crawl_snapshot_exclude_domain', args=[crawl.pk, queued_snapshot.pk]),
|
||||
reverse("admin:crawls_crawl_snapshot_exclude_domain", args=[crawl.pk, queued_snapshot.pk]),
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
payload = response.json()
|
||||
assert payload['ok'] is True
|
||||
assert payload['domain'] == 'cdn.example.com'
|
||||
assert payload["ok"] is True
|
||||
assert payload["domain"] == "cdn.example.com"
|
||||
|
||||
crawl.refresh_from_db()
|
||||
assert crawl.get_url_denylist(use_effective_config=False) == ['cdn.example.com']
|
||||
assert 'https://cdn.example.com/asset.js' not in crawl.urls
|
||||
assert 'https://cdn.example.com/second.js' not in crawl.urls
|
||||
assert 'https://example.com/root' in crawl.urls
|
||||
assert crawl.get_url_denylist(use_effective_config=False) == ["cdn.example.com"]
|
||||
assert "https://cdn.example.com/asset.js" not in crawl.urls
|
||||
assert "https://cdn.example.com/second.js" not in crawl.urls
|
||||
assert "https://example.com/root" in crawl.urls
|
||||
assert not Snapshot.objects.filter(pk=queued_snapshot.pk).exists()
|
||||
assert Snapshot.objects.filter(pk=preserved_snapshot.pk).exists()
|
||||
|
||||
|
||||
def test_snapshot_from_json_trims_markdown_suffixes_on_discovered_urls(crawl):
|
||||
snapshot = Snapshot.from_json(
|
||||
{'url': 'https://docs.sweeting.me/s/youtube-favorites)**'},
|
||||
overrides={'crawl': crawl},
|
||||
{"url": "https://docs.sweeting.me/s/youtube-favorites)**"},
|
||||
overrides={"crawl": crawl},
|
||||
queue_for_extraction=False,
|
||||
)
|
||||
|
||||
assert snapshot is not None
|
||||
assert snapshot.url == 'https://docs.sweeting.me/s/youtube-favorites'
|
||||
assert snapshot.url == "https://docs.sweeting.me/s/youtube-favorites"
|
||||
|
||||
|
||||
def test_create_snapshots_from_urls_respects_url_allowlist_and_denylist(admin_user):
|
||||
crawl = Crawl.objects.create(
|
||||
urls='\n'.join([
|
||||
'https://example.com/root',
|
||||
'https://static.example.com/app.js',
|
||||
'https://other.test/page',
|
||||
]),
|
||||
urls="\n".join(
|
||||
[
|
||||
"https://example.com/root",
|
||||
"https://static.example.com/app.js",
|
||||
"https://other.test/page",
|
||||
],
|
||||
),
|
||||
created_by=admin_user,
|
||||
config={
|
||||
'URL_ALLOWLIST': 'example.com',
|
||||
'URL_DENYLIST': 'static.example.com',
|
||||
"URL_ALLOWLIST": "example.com",
|
||||
"URL_DENYLIST": "static.example.com",
|
||||
},
|
||||
)
|
||||
|
||||
created = crawl.create_snapshots_from_urls()
|
||||
|
||||
assert [snapshot.url for snapshot in created] == ['https://example.com/root']
|
||||
assert [snapshot.url for snapshot in created] == ["https://example.com/root"]
|
||||
|
||||
|
||||
def test_url_filter_regex_lists_preserve_commas_and_split_on_newlines_only(admin_user):
|
||||
crawl = Crawl.objects.create(
|
||||
urls='\n'.join([
|
||||
'https://example.com/root',
|
||||
'https://example.com/path,with,commas',
|
||||
'https://other.test/page',
|
||||
]),
|
||||
urls="\n".join(
|
||||
[
|
||||
"https://example.com/root",
|
||||
"https://example.com/path,with,commas",
|
||||
"https://other.test/page",
|
||||
],
|
||||
),
|
||||
created_by=admin_user,
|
||||
config={
|
||||
'URL_ALLOWLIST': r'^https://example\.com/(root|path,with,commas)$' + '\n' + r'^https://other\.test/page$',
|
||||
'URL_DENYLIST': r'^https://example\.com/path,with,commas$',
|
||||
"URL_ALLOWLIST": r"^https://example\.com/(root|path,with,commas)$" + "\n" + r"^https://other\.test/page$",
|
||||
"URL_DENYLIST": r"^https://example\.com/path,with,commas$",
|
||||
},
|
||||
)
|
||||
|
||||
assert crawl.get_url_allowlist(use_effective_config=False) == [
|
||||
r'^https://example\.com/(root|path,with,commas)$',
|
||||
r'^https://other\.test/page$',
|
||||
r"^https://example\.com/(root|path,with,commas)$",
|
||||
r"^https://other\.test/page$",
|
||||
]
|
||||
assert crawl.get_url_denylist(use_effective_config=False) == [
|
||||
r'^https://example\.com/path,with,commas$',
|
||||
r"^https://example\.com/path,with,commas$",
|
||||
]
|
||||
|
||||
created = crawl.create_snapshots_from_urls()
|
||||
|
||||
assert [snapshot.url for snapshot in created] == [
|
||||
'https://example.com/root',
|
||||
'https://other.test/page',
|
||||
"https://example.com/root",
|
||||
"https://other.test/page",
|
||||
]
|
||||
|
||||
@@ -19,7 +19,7 @@ from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
# Set up Django before importing any Django-dependent modules
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
|
||||
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "archivebox.settings")
|
||||
|
||||
|
||||
class TestBackgroundHookDetection(unittest.TestCase):
|
||||
@@ -28,32 +28,38 @@ class TestBackgroundHookDetection(unittest.TestCase):
|
||||
def test_bg_js_suffix_detected(self):
|
||||
"""Hooks with .bg.js suffix should be detected as background."""
|
||||
from archivebox.hooks import is_background_hook
|
||||
self.assertTrue(is_background_hook('on_Snapshot__21_consolelog.daemon.bg.js'))
|
||||
|
||||
self.assertTrue(is_background_hook("on_Snapshot__21_consolelog.daemon.bg.js"))
|
||||
|
||||
def test_bg_py_suffix_detected(self):
|
||||
"""Hooks with .bg.py suffix should be detected as background."""
|
||||
from archivebox.hooks import is_background_hook
|
||||
self.assertTrue(is_background_hook('on_Snapshot__24_responses.finite.bg.py'))
|
||||
|
||||
self.assertTrue(is_background_hook("on_Snapshot__24_responses.finite.bg.py"))
|
||||
|
||||
def test_bg_sh_suffix_detected(self):
|
||||
"""Hooks with .bg.sh suffix should be detected as background."""
|
||||
from archivebox.hooks import is_background_hook
|
||||
self.assertTrue(is_background_hook('on_Snapshot__23_ssl.daemon.bg.sh'))
|
||||
|
||||
self.assertTrue(is_background_hook("on_Snapshot__23_ssl.daemon.bg.sh"))
|
||||
|
||||
def test_legacy_background_suffix_detected(self):
|
||||
"""Hooks with __background in stem should be detected (backwards compat)."""
|
||||
from archivebox.hooks import is_background_hook
|
||||
self.assertTrue(is_background_hook('on_Snapshot__21_consolelog__background.js'))
|
||||
|
||||
self.assertTrue(is_background_hook("on_Snapshot__21_consolelog__background.js"))
|
||||
|
||||
def test_foreground_hook_not_detected(self):
|
||||
"""Hooks without .bg. or __background should NOT be detected as background."""
|
||||
from archivebox.hooks import is_background_hook
|
||||
self.assertFalse(is_background_hook('on_Snapshot__11_favicon.js'))
|
||||
|
||||
self.assertFalse(is_background_hook("on_Snapshot__11_favicon.js"))
|
||||
|
||||
def test_foreground_py_hook_not_detected(self):
|
||||
"""Python hooks without .bg. should NOT be detected as background."""
|
||||
from archivebox.hooks import is_background_hook
|
||||
self.assertFalse(is_background_hook('on_Snapshot__50_wget.py'))
|
||||
|
||||
self.assertFalse(is_background_hook("on_Snapshot__50_wget.py"))
|
||||
|
||||
|
||||
class TestJSONLParsing(unittest.TestCase):
|
||||
@@ -63,56 +69,61 @@ class TestJSONLParsing(unittest.TestCase):
|
||||
"""Clean JSONL format should be parsed correctly."""
|
||||
stdout = '{"type": "ArchiveResult", "status": "succeeded", "output_str": "Done"}'
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
records = Process.parse_records_from_text(stdout)
|
||||
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['type'], 'ArchiveResult')
|
||||
self.assertEqual(records[0]['status'], 'succeeded')
|
||||
self.assertEqual(records[0]['output_str'], 'Done')
|
||||
self.assertEqual(records[0]["type"], "ArchiveResult")
|
||||
self.assertEqual(records[0]["status"], "succeeded")
|
||||
self.assertEqual(records[0]["output_str"], "Done")
|
||||
|
||||
def test_parse_multiple_jsonl_records(self):
|
||||
"""Multiple JSONL records should all be parsed."""
|
||||
stdout = '''{"type": "ArchiveResult", "status": "succeeded", "output_str": "Done"}
|
||||
{"type": "Binary", "name": "wget", "abspath": "/usr/bin/wget"}'''
|
||||
stdout = """{"type": "ArchiveResult", "status": "succeeded", "output_str": "Done"}
|
||||
{"type": "Binary", "name": "wget", "abspath": "/usr/bin/wget"}"""
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
records = Process.parse_records_from_text(stdout)
|
||||
|
||||
self.assertEqual(len(records), 2)
|
||||
self.assertEqual(records[0]['type'], 'ArchiveResult')
|
||||
self.assertEqual(records[1]['type'], 'Binary')
|
||||
self.assertEqual(records[0]["type"], "ArchiveResult")
|
||||
self.assertEqual(records[1]["type"], "Binary")
|
||||
|
||||
def test_parse_jsonl_with_log_output(self):
|
||||
"""JSONL should be extracted from mixed stdout with log lines."""
|
||||
stdout = '''Starting hook execution...
|
||||
stdout = """Starting hook execution...
|
||||
Processing URL: https://example.com
|
||||
{"type": "ArchiveResult", "status": "succeeded", "output_str": "Downloaded"}
|
||||
Hook completed successfully'''
|
||||
Hook completed successfully"""
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
records = Process.parse_records_from_text(stdout)
|
||||
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['status'], 'succeeded')
|
||||
self.assertEqual(records[0]["status"], "succeeded")
|
||||
|
||||
def test_ignore_invalid_json(self):
|
||||
"""Invalid JSON should be silently ignored."""
|
||||
stdout = '''{"type": "ArchiveResult", "status": "succeeded"}
|
||||
stdout = """{"type": "ArchiveResult", "status": "succeeded"}
|
||||
{invalid json here}
|
||||
not json at all
|
||||
{"type": "Binary", "name": "wget"}'''
|
||||
{"type": "Binary", "name": "wget"}"""
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
records = Process.parse_records_from_text(stdout)
|
||||
|
||||
self.assertEqual(len(records), 2)
|
||||
|
||||
def test_json_without_type_ignored(self):
|
||||
"""JSON objects without 'type' field should be ignored."""
|
||||
stdout = '''{"status": "succeeded", "output_str": "Done"}
|
||||
{"type": "ArchiveResult", "status": "succeeded"}'''
|
||||
stdout = """{"status": "succeeded", "output_str": "Done"}
|
||||
{"type": "ArchiveResult", "status": "succeeded"}"""
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
records = Process.parse_records_from_text(stdout)
|
||||
|
||||
self.assertEqual(len(records), 1)
|
||||
self.assertEqual(records[0]['type'], 'ArchiveResult')
|
||||
self.assertEqual(records[0]["type"], "ArchiveResult")
|
||||
|
||||
|
||||
class TestInstallHookEnvVarHandling(unittest.TestCase):
|
||||
@@ -121,7 +132,7 @@ class TestInstallHookEnvVarHandling(unittest.TestCase):
|
||||
def setUp(self):
|
||||
"""Set up test environment."""
|
||||
self.work_dir = Path(tempfile.mkdtemp())
|
||||
self.test_hook = self.work_dir / 'test_hook.py'
|
||||
self.test_hook = self.work_dir / "test_hook.py"
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up test environment."""
|
||||
@@ -130,37 +141,37 @@ class TestInstallHookEnvVarHandling(unittest.TestCase):
|
||||
def test_binary_env_var_absolute_path_handling(self):
|
||||
"""Install hooks should handle absolute paths in XYZ_BINARY."""
|
||||
# Test the logic that install hooks use
|
||||
configured_binary = '/custom/path/to/wget2'
|
||||
if '/' in configured_binary:
|
||||
configured_binary = "/custom/path/to/wget2"
|
||||
if "/" in configured_binary:
|
||||
bin_name = Path(configured_binary).name
|
||||
else:
|
||||
bin_name = configured_binary
|
||||
|
||||
self.assertEqual(bin_name, 'wget2')
|
||||
self.assertEqual(bin_name, "wget2")
|
||||
|
||||
def test_binary_env_var_name_only_handling(self):
|
||||
"""Install hooks should handle binary names in XYZ_BINARY."""
|
||||
# Test the logic that install hooks use
|
||||
configured_binary = 'wget2'
|
||||
if '/' in configured_binary:
|
||||
configured_binary = "wget2"
|
||||
if "/" in configured_binary:
|
||||
bin_name = Path(configured_binary).name
|
||||
else:
|
||||
bin_name = configured_binary
|
||||
|
||||
self.assertEqual(bin_name, 'wget2')
|
||||
self.assertEqual(bin_name, "wget2")
|
||||
|
||||
def test_binary_env_var_empty_default(self):
|
||||
"""Install hooks should use default when XYZ_BINARY is empty."""
|
||||
configured_binary = ''
|
||||
configured_binary = ""
|
||||
if configured_binary:
|
||||
if '/' in configured_binary:
|
||||
if "/" in configured_binary:
|
||||
bin_name = Path(configured_binary).name
|
||||
else:
|
||||
bin_name = configured_binary
|
||||
else:
|
||||
bin_name = 'wget' # default
|
||||
bin_name = "wget" # default
|
||||
|
||||
self.assertEqual(bin_name, 'wget')
|
||||
self.assertEqual(bin_name, "wget")
|
||||
|
||||
|
||||
class TestHookDiscovery(unittest.TestCase):
|
||||
@@ -169,22 +180,22 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
def setUp(self):
|
||||
"""Set up test plugin directory."""
|
||||
self.test_dir = Path(tempfile.mkdtemp())
|
||||
self.plugins_dir = self.test_dir / 'plugins'
|
||||
self.plugins_dir = self.test_dir / "plugins"
|
||||
self.plugins_dir.mkdir()
|
||||
|
||||
# Create test plugin structure
|
||||
wget_dir = self.plugins_dir / 'wget'
|
||||
wget_dir = self.plugins_dir / "wget"
|
||||
wget_dir.mkdir()
|
||||
(wget_dir / 'on_Snapshot__50_wget.py').write_text('# test hook')
|
||||
(wget_dir / 'on_Crawl__10_wget_install.finite.bg.py').write_text('# install hook')
|
||||
(wget_dir / "on_Snapshot__50_wget.py").write_text("# test hook")
|
||||
(wget_dir / "on_Crawl__10_wget_install.finite.bg.py").write_text("# install hook")
|
||||
|
||||
chrome_dir = self.plugins_dir / 'chrome'
|
||||
chrome_dir = self.plugins_dir / "chrome"
|
||||
chrome_dir.mkdir(exist_ok=True)
|
||||
(chrome_dir / 'on_Snapshot__20_chrome_tab.daemon.bg.js').write_text('// background hook')
|
||||
(chrome_dir / "on_Snapshot__20_chrome_tab.daemon.bg.js").write_text("// background hook")
|
||||
|
||||
consolelog_dir = self.plugins_dir / 'consolelog'
|
||||
consolelog_dir = self.plugins_dir / "consolelog"
|
||||
consolelog_dir.mkdir()
|
||||
(consolelog_dir / 'on_Snapshot__21_consolelog.daemon.bg.js').write_text('// background hook')
|
||||
(consolelog_dir / "on_Snapshot__21_consolelog.daemon.bg.js").write_text("// background hook")
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up test directory."""
|
||||
@@ -194,109 +205,118 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
"""discover_hooks() should find all hooks for an event."""
|
||||
# Use the local implementation since we can't easily mock BUILTIN_PLUGINS_DIR
|
||||
hooks = []
|
||||
for ext in ('sh', 'py', 'js'):
|
||||
pattern = f'*/on_Snapshot__*.{ext}'
|
||||
for ext in ("sh", "py", "js"):
|
||||
pattern = f"*/on_Snapshot__*.{ext}"
|
||||
hooks.extend(self.plugins_dir.glob(pattern))
|
||||
|
||||
hooks = sorted(set(hooks), key=lambda p: p.name)
|
||||
|
||||
self.assertEqual(len(hooks), 3)
|
||||
hook_names = [h.name for h in hooks]
|
||||
self.assertIn('on_Snapshot__20_chrome_tab.daemon.bg.js', hook_names)
|
||||
self.assertIn('on_Snapshot__21_consolelog.daemon.bg.js', hook_names)
|
||||
self.assertIn('on_Snapshot__50_wget.py', hook_names)
|
||||
self.assertIn("on_Snapshot__20_chrome_tab.daemon.bg.js", hook_names)
|
||||
self.assertIn("on_Snapshot__21_consolelog.daemon.bg.js", hook_names)
|
||||
self.assertIn("on_Snapshot__50_wget.py", hook_names)
|
||||
|
||||
def test_discover_hooks_sorted_by_name(self):
|
||||
"""Hooks should be sorted by filename (numeric prefix ordering)."""
|
||||
hooks = []
|
||||
for ext in ('sh', 'py', 'js'):
|
||||
pattern = f'*/on_Snapshot__*.{ext}'
|
||||
for ext in ("sh", "py", "js"):
|
||||
pattern = f"*/on_Snapshot__*.{ext}"
|
||||
hooks.extend(self.plugins_dir.glob(pattern))
|
||||
|
||||
hooks = sorted(set(hooks), key=lambda p: p.name)
|
||||
|
||||
# Check numeric ordering
|
||||
self.assertEqual(hooks[0].name, 'on_Snapshot__20_chrome_tab.daemon.bg.js')
|
||||
self.assertEqual(hooks[1].name, 'on_Snapshot__21_consolelog.daemon.bg.js')
|
||||
self.assertEqual(hooks[2].name, 'on_Snapshot__50_wget.py')
|
||||
self.assertEqual(hooks[0].name, "on_Snapshot__20_chrome_tab.daemon.bg.js")
|
||||
self.assertEqual(hooks[1].name, "on_Snapshot__21_consolelog.daemon.bg.js")
|
||||
self.assertEqual(hooks[2].name, "on_Snapshot__50_wget.py")
|
||||
|
||||
def test_get_plugins_includes_non_snapshot_plugin_dirs(self):
|
||||
"""get_plugins() should include binary-only plugins with standardized metadata."""
|
||||
env_dir = self.plugins_dir / 'env'
|
||||
env_dir = self.plugins_dir / "env"
|
||||
env_dir.mkdir()
|
||||
(env_dir / 'on_Binary__15_env_discover.py').write_text('# binary hook')
|
||||
(env_dir / 'config.json').write_text('{"type": "object", "properties": {}}')
|
||||
(env_dir / "on_Binary__15_env_discover.py").write_text("# binary hook")
|
||||
(env_dir / "config.json").write_text('{"type": "object", "properties": {}}')
|
||||
|
||||
from archivebox import hooks as hooks_module
|
||||
|
||||
hooks_module.get_plugins.cache_clear()
|
||||
with patch.object(hooks_module, 'BUILTIN_PLUGINS_DIR', self.plugins_dir), patch.object(hooks_module, 'USER_PLUGINS_DIR', self.test_dir / 'user_plugins'):
|
||||
with (
|
||||
patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
|
||||
patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
|
||||
):
|
||||
plugins = hooks_module.get_plugins()
|
||||
|
||||
self.assertIn('env', plugins)
|
||||
self.assertIn("env", plugins)
|
||||
|
||||
def test_discover_binary_hooks_ignores_plugins_whitelist(self):
|
||||
"""Binary provider hooks should remain discoverable under --plugins filtering."""
|
||||
singlefile_dir = self.plugins_dir / 'singlefile'
|
||||
singlefile_dir = self.plugins_dir / "singlefile"
|
||||
singlefile_dir.mkdir()
|
||||
(singlefile_dir / 'config.json').write_text(
|
||||
(singlefile_dir / "config.json").write_text(
|
||||
json.dumps(
|
||||
{
|
||||
"type": "object",
|
||||
"required_plugins": ["chrome"],
|
||||
"properties": {},
|
||||
}
|
||||
)
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
npm_dir = self.plugins_dir / 'npm'
|
||||
npm_dir = self.plugins_dir / "npm"
|
||||
npm_dir.mkdir()
|
||||
(npm_dir / 'on_Binary__10_npm_install.py').write_text('# npm binary hook')
|
||||
(npm_dir / 'config.json').write_text('{"type": "object", "properties": {}}')
|
||||
(npm_dir / "on_Binary__10_npm_install.py").write_text("# npm binary hook")
|
||||
(npm_dir / "config.json").write_text('{"type": "object", "properties": {}}')
|
||||
|
||||
from archivebox import hooks as hooks_module
|
||||
|
||||
hooks_module.get_plugins.cache_clear()
|
||||
with patch.object(hooks_module, 'BUILTIN_PLUGINS_DIR', self.plugins_dir), patch.object(hooks_module, 'USER_PLUGINS_DIR', self.test_dir / 'user_plugins'):
|
||||
hooks = hooks_module.discover_hooks('Binary', config={'PLUGINS': 'singlefile'})
|
||||
with (
|
||||
patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
|
||||
patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
|
||||
):
|
||||
hooks = hooks_module.discover_hooks("Binary", config={"PLUGINS": "singlefile"})
|
||||
|
||||
hook_names = [hook.name for hook in hooks]
|
||||
self.assertIn('on_Binary__10_npm_install.py', hook_names)
|
||||
self.assertIn("on_Binary__10_npm_install.py", hook_names)
|
||||
|
||||
def test_discover_crawl_hooks_only_include_declared_plugin_dependencies(self):
|
||||
"""Crawl hook discovery should include required_plugins without broadening to provider plugins."""
|
||||
responses_dir = self.plugins_dir / 'responses'
|
||||
responses_dir = self.plugins_dir / "responses"
|
||||
responses_dir.mkdir()
|
||||
(responses_dir / 'config.json').write_text(
|
||||
(responses_dir / "config.json").write_text(
|
||||
json.dumps(
|
||||
{
|
||||
"type": "object",
|
||||
"required_plugins": ["chrome"],
|
||||
"properties": {},
|
||||
}
|
||||
)
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
chrome_dir = self.plugins_dir / 'chrome'
|
||||
chrome_dir = self.plugins_dir / "chrome"
|
||||
chrome_dir.mkdir(exist_ok=True)
|
||||
(chrome_dir / 'config.json').write_text('{"type": "object", "properties": {}}')
|
||||
(chrome_dir / 'on_Crawl__70_chrome_install.finite.bg.py').write_text('# chrome crawl hook')
|
||||
(chrome_dir / "config.json").write_text('{"type": "object", "properties": {}}')
|
||||
(chrome_dir / "on_Crawl__70_chrome_install.finite.bg.py").write_text("# chrome crawl hook")
|
||||
|
||||
npm_dir = self.plugins_dir / 'npm'
|
||||
npm_dir = self.plugins_dir / "npm"
|
||||
npm_dir.mkdir()
|
||||
(npm_dir / 'on_Binary__10_npm_install.py').write_text('# npm binary hook')
|
||||
(npm_dir / 'on_Crawl__00_npm_install.py').write_text('# npm crawl hook')
|
||||
(npm_dir / 'config.json').write_text('{"type": "object", "properties": {}}')
|
||||
(npm_dir / "on_Binary__10_npm_install.py").write_text("# npm binary hook")
|
||||
(npm_dir / "on_Crawl__00_npm_install.py").write_text("# npm crawl hook")
|
||||
(npm_dir / "config.json").write_text('{"type": "object", "properties": {}}')
|
||||
|
||||
from archivebox import hooks as hooks_module
|
||||
|
||||
hooks_module.get_plugins.cache_clear()
|
||||
with patch.object(hooks_module, 'BUILTIN_PLUGINS_DIR', self.plugins_dir), patch.object(hooks_module, 'USER_PLUGINS_DIR', self.test_dir / 'user_plugins'):
|
||||
hooks = hooks_module.discover_hooks('Crawl', config={'PLUGINS': 'responses'})
|
||||
with (
|
||||
patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
|
||||
patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
|
||||
):
|
||||
hooks = hooks_module.discover_hooks("Crawl", config={"PLUGINS": "responses"})
|
||||
|
||||
hook_names = [hook.name for hook in hooks]
|
||||
self.assertIn('on_Crawl__70_chrome_install.finite.bg.py', hook_names)
|
||||
self.assertNotIn('on_Crawl__00_npm_install.py', hook_names)
|
||||
self.assertIn("on_Crawl__70_chrome_install.finite.bg.py", hook_names)
|
||||
self.assertNotIn("on_Crawl__00_npm_install.py", hook_names)
|
||||
|
||||
|
||||
class TestGetExtractorName(unittest.TestCase):
|
||||
@@ -304,27 +324,29 @@ class TestGetExtractorName(unittest.TestCase):
|
||||
|
||||
def test_strip_numeric_prefix(self):
|
||||
"""Numeric prefix should be stripped from extractor name."""
|
||||
|
||||
# Inline implementation of get_extractor_name
|
||||
def get_extractor_name(extractor: str) -> str:
|
||||
parts = extractor.split('_', 1)
|
||||
parts = extractor.split("_", 1)
|
||||
if len(parts) == 2 and parts[0].isdigit():
|
||||
return parts[1]
|
||||
return extractor
|
||||
|
||||
self.assertEqual(get_extractor_name('10_title'), 'title')
|
||||
self.assertEqual(get_extractor_name('26_readability'), 'readability')
|
||||
self.assertEqual(get_extractor_name('50_parse_html_urls'), 'parse_html_urls')
|
||||
self.assertEqual(get_extractor_name("10_title"), "title")
|
||||
self.assertEqual(get_extractor_name("26_readability"), "readability")
|
||||
self.assertEqual(get_extractor_name("50_parse_html_urls"), "parse_html_urls")
|
||||
|
||||
def test_no_prefix_unchanged(self):
|
||||
"""Extractor without numeric prefix should be unchanged."""
|
||||
|
||||
def get_extractor_name(extractor: str) -> str:
|
||||
parts = extractor.split('_', 1)
|
||||
parts = extractor.split("_", 1)
|
||||
if len(parts) == 2 and parts[0].isdigit():
|
||||
return parts[1]
|
||||
return extractor
|
||||
|
||||
self.assertEqual(get_extractor_name('title'), 'title')
|
||||
self.assertEqual(get_extractor_name('readability'), 'readability')
|
||||
self.assertEqual(get_extractor_name("title"), "title")
|
||||
self.assertEqual(get_extractor_name("readability"), "readability")
|
||||
|
||||
|
||||
class TestHookExecution(unittest.TestCase):
|
||||
@@ -340,14 +362,14 @@ class TestHookExecution(unittest.TestCase):
|
||||
|
||||
def test_python_hook_execution(self):
|
||||
"""Python hook should execute and output JSONL."""
|
||||
hook_path = self.work_dir / 'test_hook.py'
|
||||
hook_path.write_text('''#!/usr/bin/env python3
|
||||
hook_path = self.work_dir / "test_hook.py"
|
||||
hook_path.write_text("""#!/usr/bin/env python3
|
||||
import json
|
||||
print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "output_str": "Test passed"}))
|
||||
''')
|
||||
""")
|
||||
|
||||
result = subprocess.run(
|
||||
['python3', str(hook_path)],
|
||||
["python3", str(hook_path)],
|
||||
cwd=str(self.work_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -355,24 +377,25 @@ print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "output_str":
|
||||
|
||||
self.assertEqual(result.returncode, 0)
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
records = Process.parse_records_from_text(result.stdout)
|
||||
self.assertTrue(records)
|
||||
self.assertEqual(records[0]['type'], 'ArchiveResult')
|
||||
self.assertEqual(records[0]['status'], 'succeeded')
|
||||
self.assertEqual(records[0]["type"], "ArchiveResult")
|
||||
self.assertEqual(records[0]["status"], "succeeded")
|
||||
|
||||
def test_js_hook_execution(self):
|
||||
"""JavaScript hook should execute and output JSONL."""
|
||||
# Skip if node not available
|
||||
if shutil.which('node') is None:
|
||||
self.skipTest('Node.js not available')
|
||||
if shutil.which("node") is None:
|
||||
self.skipTest("Node.js not available")
|
||||
|
||||
hook_path = self.work_dir / 'test_hook.js'
|
||||
hook_path.write_text('''#!/usr/bin/env node
|
||||
hook_path = self.work_dir / "test_hook.js"
|
||||
hook_path.write_text("""#!/usr/bin/env node
|
||||
console.log(JSON.stringify({type: 'ArchiveResult', status: 'succeeded', output_str: 'JS test'}));
|
||||
''')
|
||||
""")
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(hook_path)],
|
||||
["node", str(hook_path)],
|
||||
cwd=str(self.work_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -380,15 +403,16 @@ console.log(JSON.stringify({type: 'ArchiveResult', status: 'succeeded', output_s
|
||||
|
||||
self.assertEqual(result.returncode, 0)
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
records = Process.parse_records_from_text(result.stdout)
|
||||
self.assertTrue(records)
|
||||
self.assertEqual(records[0]['type'], 'ArchiveResult')
|
||||
self.assertEqual(records[0]['status'], 'succeeded')
|
||||
self.assertEqual(records[0]["type"], "ArchiveResult")
|
||||
self.assertEqual(records[0]["status"], "succeeded")
|
||||
|
||||
def test_hook_receives_cli_args(self):
|
||||
"""Hook should receive CLI arguments."""
|
||||
hook_path = self.work_dir / 'test_hook.py'
|
||||
hook_path.write_text('''#!/usr/bin/env python3
|
||||
hook_path = self.work_dir / "test_hook.py"
|
||||
hook_path.write_text("""#!/usr/bin/env python3
|
||||
import sys
|
||||
import json
|
||||
# Simple arg parsing
|
||||
@@ -398,10 +422,10 @@ for arg in sys.argv[1:]:
|
||||
key, val = arg[2:].split('=', 1)
|
||||
args[key.replace('-', '_')] = val
|
||||
print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "url": args.get("url", "")}))
|
||||
''')
|
||||
""")
|
||||
|
||||
result = subprocess.run(
|
||||
['python3', str(hook_path), '--url=https://example.com'],
|
||||
["python3", str(hook_path), "--url=https://example.com"],
|
||||
cwd=str(self.work_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -409,9 +433,10 @@ print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "url": args.ge
|
||||
|
||||
self.assertEqual(result.returncode, 0)
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
records = Process.parse_records_from_text(result.stdout)
|
||||
self.assertTrue(records)
|
||||
self.assertEqual(records[0]['url'], 'https://example.com')
|
||||
self.assertEqual(records[0]["url"], "https://example.com")
|
||||
|
||||
|
||||
class TestInstallHookOutput(unittest.TestCase):
|
||||
@@ -427,35 +452,41 @@ class TestInstallHookOutput(unittest.TestCase):
|
||||
|
||||
def test_install_hook_outputs_binary(self):
|
||||
"""Install hook should output Binary JSONL when binary found."""
|
||||
hook_output = json.dumps({
|
||||
'type': 'Binary',
|
||||
'name': 'wget',
|
||||
'abspath': '/usr/bin/wget',
|
||||
'version': '1.21.3',
|
||||
'sha256': None,
|
||||
'binprovider': 'apt',
|
||||
})
|
||||
hook_output = json.dumps(
|
||||
{
|
||||
"type": "Binary",
|
||||
"name": "wget",
|
||||
"abspath": "/usr/bin/wget",
|
||||
"version": "1.21.3",
|
||||
"sha256": None,
|
||||
"binprovider": "apt",
|
||||
},
|
||||
)
|
||||
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
data = Process.parse_records_from_text(hook_output)[0]
|
||||
self.assertEqual(data['type'], 'Binary')
|
||||
self.assertEqual(data['name'], 'wget')
|
||||
self.assertTrue(data['abspath'].startswith('/'))
|
||||
self.assertEqual(data["type"], "Binary")
|
||||
self.assertEqual(data["name"], "wget")
|
||||
self.assertTrue(data["abspath"].startswith("/"))
|
||||
|
||||
def test_install_hook_outputs_machine_config(self):
|
||||
"""Install hook should output Machine config update JSONL."""
|
||||
hook_output = json.dumps({
|
||||
'type': 'Machine',
|
||||
'config': {
|
||||
'WGET_BINARY': '/usr/bin/wget',
|
||||
hook_output = json.dumps(
|
||||
{
|
||||
"type": "Machine",
|
||||
"config": {
|
||||
"WGET_BINARY": "/usr/bin/wget",
|
||||
},
|
||||
},
|
||||
})
|
||||
)
|
||||
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
data = Process.parse_records_from_text(hook_output)[0]
|
||||
self.assertEqual(data['type'], 'Machine')
|
||||
self.assertIn('config', data)
|
||||
self.assertEqual(data['config']['WGET_BINARY'], '/usr/bin/wget')
|
||||
self.assertEqual(data["type"], "Machine")
|
||||
self.assertIn("config", data)
|
||||
self.assertEqual(data["config"]["WGET_BINARY"], "/usr/bin/wget")
|
||||
|
||||
|
||||
class TestSnapshotHookOutput(unittest.TestCase):
|
||||
@@ -463,75 +494,90 @@ class TestSnapshotHookOutput(unittest.TestCase):
|
||||
|
||||
def test_snapshot_hook_basic_output(self):
|
||||
"""Snapshot hook should output clean ArchiveResult JSONL."""
|
||||
hook_output = json.dumps({
|
||||
'type': 'ArchiveResult',
|
||||
'status': 'succeeded',
|
||||
'output_str': 'Downloaded 5 files',
|
||||
})
|
||||
hook_output = json.dumps(
|
||||
{
|
||||
"type": "ArchiveResult",
|
||||
"status": "succeeded",
|
||||
"output_str": "Downloaded 5 files",
|
||||
},
|
||||
)
|
||||
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
data = Process.parse_records_from_text(hook_output)[0]
|
||||
self.assertEqual(data['type'], 'ArchiveResult')
|
||||
self.assertEqual(data['status'], 'succeeded')
|
||||
self.assertIn('output_str', data)
|
||||
self.assertEqual(data["type"], "ArchiveResult")
|
||||
self.assertEqual(data["status"], "succeeded")
|
||||
self.assertIn("output_str", data)
|
||||
|
||||
def test_snapshot_hook_with_cmd(self):
|
||||
"""Snapshot hook should include cmd for binary FK lookup."""
|
||||
hook_output = json.dumps({
|
||||
'type': 'ArchiveResult',
|
||||
'status': 'succeeded',
|
||||
'output_str': 'Archived with wget',
|
||||
'cmd': ['/usr/bin/wget', '-p', '-k', 'https://example.com'],
|
||||
})
|
||||
hook_output = json.dumps(
|
||||
{
|
||||
"type": "ArchiveResult",
|
||||
"status": "succeeded",
|
||||
"output_str": "Archived with wget",
|
||||
"cmd": ["/usr/bin/wget", "-p", "-k", "https://example.com"],
|
||||
},
|
||||
)
|
||||
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
data = Process.parse_records_from_text(hook_output)[0]
|
||||
self.assertEqual(data['type'], 'ArchiveResult')
|
||||
self.assertIsInstance(data['cmd'], list)
|
||||
self.assertEqual(data['cmd'][0], '/usr/bin/wget')
|
||||
self.assertEqual(data["type"], "ArchiveResult")
|
||||
self.assertIsInstance(data["cmd"], list)
|
||||
self.assertEqual(data["cmd"][0], "/usr/bin/wget")
|
||||
|
||||
def test_snapshot_hook_with_output_json(self):
|
||||
"""Snapshot hook can include structured metadata in output_json."""
|
||||
hook_output = json.dumps({
|
||||
'type': 'ArchiveResult',
|
||||
'status': 'succeeded',
|
||||
'output_str': 'Got headers',
|
||||
'output_json': {
|
||||
'content-type': 'text/html',
|
||||
'server': 'nginx',
|
||||
'status-code': 200,
|
||||
hook_output = json.dumps(
|
||||
{
|
||||
"type": "ArchiveResult",
|
||||
"status": "succeeded",
|
||||
"output_str": "Got headers",
|
||||
"output_json": {
|
||||
"content-type": "text/html",
|
||||
"server": "nginx",
|
||||
"status-code": 200,
|
||||
},
|
||||
},
|
||||
})
|
||||
)
|
||||
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
data = Process.parse_records_from_text(hook_output)[0]
|
||||
self.assertEqual(data['type'], 'ArchiveResult')
|
||||
self.assertIsInstance(data['output_json'], dict)
|
||||
self.assertEqual(data['output_json']['status-code'], 200)
|
||||
self.assertEqual(data["type"], "ArchiveResult")
|
||||
self.assertIsInstance(data["output_json"], dict)
|
||||
self.assertEqual(data["output_json"]["status-code"], 200)
|
||||
|
||||
def test_snapshot_hook_skipped_status(self):
|
||||
"""Snapshot hook should support skipped status."""
|
||||
hook_output = json.dumps({
|
||||
'type': 'ArchiveResult',
|
||||
'status': 'skipped',
|
||||
'output_str': 'SAVE_WGET=False',
|
||||
})
|
||||
hook_output = json.dumps(
|
||||
{
|
||||
"type": "ArchiveResult",
|
||||
"status": "skipped",
|
||||
"output_str": "SAVE_WGET=False",
|
||||
},
|
||||
)
|
||||
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
data = Process.parse_records_from_text(hook_output)[0]
|
||||
self.assertEqual(data['status'], 'skipped')
|
||||
self.assertEqual(data["status"], "skipped")
|
||||
|
||||
def test_snapshot_hook_failed_status(self):
|
||||
"""Snapshot hook should support failed status."""
|
||||
hook_output = json.dumps({
|
||||
'type': 'ArchiveResult',
|
||||
'status': 'failed',
|
||||
'output_str': '404 Not Found',
|
||||
})
|
||||
hook_output = json.dumps(
|
||||
{
|
||||
"type": "ArchiveResult",
|
||||
"status": "failed",
|
||||
"output_str": "404 Not Found",
|
||||
},
|
||||
)
|
||||
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
data = Process.parse_records_from_text(hook_output)[0]
|
||||
self.assertEqual(data['status'], 'failed')
|
||||
self.assertEqual(data["status"], "failed")
|
||||
|
||||
|
||||
class TestPluginMetadata(unittest.TestCase):
|
||||
@@ -540,16 +586,16 @@ class TestPluginMetadata(unittest.TestCase):
|
||||
def test_plugin_name_added(self):
|
||||
"""run_hook() should add plugin name to records."""
|
||||
# Simulate what run_hook() does
|
||||
script = Path('/abx_plugins/plugins/wget/on_Snapshot__50_wget.py')
|
||||
script = Path("/abx_plugins/plugins/wget/on_Snapshot__50_wget.py")
|
||||
plugin_name = script.parent.name
|
||||
|
||||
record = {'type': 'ArchiveResult', 'status': 'succeeded'}
|
||||
record['plugin'] = plugin_name
|
||||
record['plugin_hook'] = str(script)
|
||||
record = {"type": "ArchiveResult", "status": "succeeded"}
|
||||
record["plugin"] = plugin_name
|
||||
record["plugin_hook"] = str(script)
|
||||
|
||||
self.assertEqual(record['plugin'], 'wget')
|
||||
self.assertIn('on_Snapshot__50_wget.py', record['plugin_hook'])
|
||||
self.assertEqual(record["plugin"], "wget")
|
||||
self.assertIn("on_Snapshot__50_wget.py", record["plugin_hook"])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -30,6 +30,7 @@ from archivebox.machine.models import (
|
||||
ProcessMachine,
|
||||
MACHINE_RECHECK_INTERVAL,
|
||||
PID_REUSE_WINDOW,
|
||||
PROCESS_TIMEOUT_GRACE,
|
||||
)
|
||||
|
||||
|
||||
@@ -39,6 +40,7 @@ class TestMachineModel(TestCase):
|
||||
def setUp(self):
|
||||
"""Reset cached machine between tests."""
|
||||
import archivebox.machine.models as models
|
||||
|
||||
models._CURRENT_MACHINE = None
|
||||
|
||||
def test_machine_current_creates_machine(self):
|
||||
@@ -49,7 +51,7 @@ class TestMachineModel(TestCase):
|
||||
self.assertIsNotNone(machine.id)
|
||||
self.assertIsNotNone(machine.guid)
|
||||
self.assertEqual(machine.hostname, os.uname().nodename)
|
||||
self.assertIn(machine.os_family, ['linux', 'darwin', 'windows', 'freebsd'])
|
||||
self.assertIn(machine.os_family, ["linux", "darwin", "windows", "freebsd"])
|
||||
|
||||
def test_machine_current_returns_cached(self):
|
||||
"""Machine.current() should return cached machine within recheck interval."""
|
||||
@@ -78,8 +80,8 @@ class TestMachineModel(TestCase):
|
||||
"""Machine.from_json() should update machine config."""
|
||||
Machine.current() # Ensure machine exists
|
||||
record = {
|
||||
'config': {
|
||||
'WGET_BINARY': '/usr/bin/wget',
|
||||
"config": {
|
||||
"WGET_BINARY": "/usr/bin/wget",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -87,15 +89,15 @@ class TestMachineModel(TestCase):
|
||||
|
||||
self.assertIsNotNone(result)
|
||||
assert result is not None
|
||||
self.assertEqual(result.config.get('WGET_BINARY'), '/usr/bin/wget')
|
||||
self.assertEqual(result.config.get("WGET_BINARY"), "/usr/bin/wget")
|
||||
|
||||
def test_machine_from_jsonl_strips_legacy_chromium_version(self):
|
||||
"""Machine.from_json() should ignore legacy browser version keys."""
|
||||
Machine.current() # Ensure machine exists
|
||||
record = {
|
||||
'config': {
|
||||
'WGET_BINARY': '/usr/bin/wget',
|
||||
'CHROMIUM_VERSION': '123.4.5',
|
||||
"config": {
|
||||
"WGET_BINARY": "/usr/bin/wget",
|
||||
"CHROMIUM_VERSION": "123.4.5",
|
||||
},
|
||||
}
|
||||
|
||||
@@ -103,12 +105,12 @@ class TestMachineModel(TestCase):
|
||||
|
||||
self.assertIsNotNone(result)
|
||||
assert result is not None
|
||||
self.assertEqual(result.config.get('WGET_BINARY'), '/usr/bin/wget')
|
||||
self.assertNotIn('CHROMIUM_VERSION', result.config)
|
||||
self.assertEqual(result.config.get("WGET_BINARY"), "/usr/bin/wget")
|
||||
self.assertNotIn("CHROMIUM_VERSION", result.config)
|
||||
|
||||
def test_machine_from_jsonl_invalid(self):
|
||||
"""Machine.from_json() should return None for invalid records."""
|
||||
result = Machine.from_json({'invalid': 'record'})
|
||||
result = Machine.from_json({"invalid": "record"})
|
||||
self.assertIsNone(result)
|
||||
|
||||
def test_machine_current_strips_legacy_chromium_version(self):
|
||||
@@ -117,16 +119,16 @@ class TestMachineModel(TestCase):
|
||||
|
||||
machine = Machine.current()
|
||||
machine.config = {
|
||||
'CHROME_BINARY': '/tmp/chromium',
|
||||
'CHROMIUM_VERSION': '123.4.5',
|
||||
"CHROME_BINARY": "/tmp/chromium",
|
||||
"CHROMIUM_VERSION": "123.4.5",
|
||||
}
|
||||
machine.save(update_fields=['config'])
|
||||
machine.save(update_fields=["config"])
|
||||
models._CURRENT_MACHINE = machine
|
||||
|
||||
refreshed = Machine.current()
|
||||
|
||||
self.assertEqual(refreshed.config.get('CHROME_BINARY'), '/tmp/chromium')
|
||||
self.assertNotIn('CHROMIUM_VERSION', refreshed.config)
|
||||
self.assertEqual(refreshed.config.get("CHROME_BINARY"), "/tmp/chromium")
|
||||
self.assertNotIn("CHROMIUM_VERSION", refreshed.config)
|
||||
|
||||
def test_machine_manager_current(self):
|
||||
"""Machine.objects.current() should return current machine."""
|
||||
@@ -141,6 +143,7 @@ class TestNetworkInterfaceModel(TestCase):
|
||||
def setUp(self):
|
||||
"""Reset cached interface between tests."""
|
||||
import archivebox.machine.models as models
|
||||
|
||||
models._CURRENT_MACHINE = None
|
||||
models._CURRENT_INTERFACE = None
|
||||
|
||||
@@ -170,24 +173,24 @@ class TestNetworkInterfaceModel(TestCase):
|
||||
import archivebox.machine.models as models
|
||||
|
||||
first = {
|
||||
'mac_address': 'aa:bb:cc:dd:ee:01',
|
||||
'ip_public': '1.1.1.1',
|
||||
'ip_local': '192.168.1.10',
|
||||
'dns_server': '8.8.8.8',
|
||||
'hostname': 'host-a',
|
||||
'iface': 'en0',
|
||||
'isp': 'ISP A',
|
||||
'city': 'City',
|
||||
'region': 'Region',
|
||||
'country': 'Country',
|
||||
"mac_address": "aa:bb:cc:dd:ee:01",
|
||||
"ip_public": "1.1.1.1",
|
||||
"ip_local": "192.168.1.10",
|
||||
"dns_server": "8.8.8.8",
|
||||
"hostname": "host-a",
|
||||
"iface": "en0",
|
||||
"isp": "ISP A",
|
||||
"city": "City",
|
||||
"region": "Region",
|
||||
"country": "Country",
|
||||
}
|
||||
second = {
|
||||
**first,
|
||||
'ip_public': '2.2.2.2',
|
||||
'ip_local': '10.0.0.5',
|
||||
"ip_public": "2.2.2.2",
|
||||
"ip_local": "10.0.0.5",
|
||||
}
|
||||
|
||||
with patch.object(models, 'get_host_network', side_effect=[first, second]):
|
||||
with patch.object(models, "get_host_network", side_effect=[first, second]):
|
||||
interface1 = NetworkInterface.current(refresh=True)
|
||||
interface2 = NetworkInterface.current(refresh=True)
|
||||
|
||||
@@ -202,6 +205,7 @@ class TestBinaryModel(TestCase):
|
||||
def setUp(self):
|
||||
"""Reset cached binaries and create a machine."""
|
||||
import archivebox.machine.models as models
|
||||
|
||||
models._CURRENT_MACHINE = None
|
||||
models._CURRENT_BINARIES = {}
|
||||
self.machine = Machine.current()
|
||||
@@ -210,22 +214,23 @@ class TestBinaryModel(TestCase):
|
||||
"""Binary should be created with default values."""
|
||||
binary = Binary.objects.create(
|
||||
machine=self.machine,
|
||||
name='wget',
|
||||
binproviders='apt,brew,env',
|
||||
name="wget",
|
||||
binproviders="apt,brew,env",
|
||||
)
|
||||
|
||||
self.assertIsNotNone(binary.id)
|
||||
self.assertEqual(binary.name, 'wget')
|
||||
self.assertEqual(binary.name, "wget")
|
||||
self.assertEqual(binary.status, Binary.StatusChoices.QUEUED)
|
||||
self.assertFalse(binary.is_valid)
|
||||
|
||||
def test_binary_is_valid(self):
|
||||
"""Binary.is_valid should be True when abspath and version are set."""
|
||||
"""Binary.is_valid should be True for installed binaries with a resolved path."""
|
||||
binary = Binary.objects.create(
|
||||
machine=self.machine,
|
||||
name='wget',
|
||||
abspath='/usr/bin/wget',
|
||||
version='1.21',
|
||||
name="wget",
|
||||
abspath="/usr/bin/wget",
|
||||
version="1.21",
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
|
||||
self.assertTrue(binary.is_valid)
|
||||
@@ -233,25 +238,26 @@ class TestBinaryModel(TestCase):
|
||||
def test_binary_manager_get_valid_binary(self):
|
||||
"""BinaryManager.get_valid_binary() should find valid binaries."""
|
||||
# Create invalid binary (no abspath)
|
||||
Binary.objects.create(machine=self.machine, name='wget')
|
||||
Binary.objects.create(machine=self.machine, name="wget")
|
||||
|
||||
# Create valid binary
|
||||
Binary.objects.create(
|
||||
machine=self.machine,
|
||||
name='wget',
|
||||
abspath='/usr/bin/wget',
|
||||
version='1.21',
|
||||
name="wget",
|
||||
abspath="/usr/bin/wget",
|
||||
version="1.21",
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
|
||||
result = cast(BinaryManager, Binary.objects).get_valid_binary('wget')
|
||||
result = cast(BinaryManager, Binary.objects).get_valid_binary("wget")
|
||||
|
||||
self.assertIsNotNone(result)
|
||||
assert result is not None
|
||||
self.assertEqual(result.abspath, '/usr/bin/wget')
|
||||
self.assertEqual(result.abspath, "/usr/bin/wget")
|
||||
|
||||
def test_binary_update_and_requeue(self):
|
||||
"""Binary.update_and_requeue() should update fields and save."""
|
||||
binary = Binary.objects.create(machine=self.machine, name='test')
|
||||
binary = Binary.objects.create(machine=self.machine, name="test")
|
||||
old_modified = binary.modified_at
|
||||
|
||||
binary.update_and_requeue(
|
||||
@@ -266,16 +272,18 @@ class TestBinaryModel(TestCase):
|
||||
def test_binary_from_json_preserves_install_args_overrides(self):
|
||||
"""Binary.from_json() should persist canonical install_args overrides unchanged."""
|
||||
overrides = {
|
||||
'apt': {'install_args': ['chromium']},
|
||||
'npm': {'install_args': 'puppeteer'},
|
||||
'custom': {'install_args': ['bash', '-lc', 'echo ok']},
|
||||
"apt": {"install_args": ["chromium"]},
|
||||
"npm": {"install_args": "puppeteer"},
|
||||
"custom": {"install_args": ["bash", "-lc", "echo ok"]},
|
||||
}
|
||||
|
||||
binary = Binary.from_json({
|
||||
'name': 'chrome',
|
||||
'binproviders': 'apt,npm,custom',
|
||||
'overrides': overrides,
|
||||
})
|
||||
binary = Binary.from_json(
|
||||
{
|
||||
"name": "chrome",
|
||||
"binproviders": "apt,npm,custom",
|
||||
"overrides": overrides,
|
||||
},
|
||||
)
|
||||
|
||||
self.assertIsNotNone(binary)
|
||||
assert binary is not None
|
||||
@@ -284,15 +292,17 @@ class TestBinaryModel(TestCase):
|
||||
def test_binary_from_json_does_not_coerce_legacy_override_shapes(self):
|
||||
"""Binary.from_json() should no longer translate legacy non-dict provider overrides."""
|
||||
overrides = {
|
||||
'apt': ['chromium'],
|
||||
'npm': 'puppeteer',
|
||||
"apt": ["chromium"],
|
||||
"npm": "puppeteer",
|
||||
}
|
||||
|
||||
binary = Binary.from_json({
|
||||
'name': 'chrome',
|
||||
'binproviders': 'apt,npm',
|
||||
'overrides': overrides,
|
||||
})
|
||||
binary = Binary.from_json(
|
||||
{
|
||||
"name": "chrome",
|
||||
"binproviders": "apt,npm",
|
||||
"overrides": overrides,
|
||||
},
|
||||
)
|
||||
|
||||
self.assertIsNotNone(binary)
|
||||
assert binary is not None
|
||||
@@ -300,23 +310,25 @@ class TestBinaryModel(TestCase):
|
||||
|
||||
def test_binary_from_json_prefers_published_readability_package(self):
|
||||
"""Binary.from_json() should rewrite readability's npm git URL to the published package."""
|
||||
binary = Binary.from_json({
|
||||
'name': 'readability-extractor',
|
||||
'binproviders': 'env,npm',
|
||||
'overrides': {
|
||||
'npm': {
|
||||
'install_args': ['https://github.com/ArchiveBox/readability-extractor'],
|
||||
binary = Binary.from_json(
|
||||
{
|
||||
"name": "readability-extractor",
|
||||
"binproviders": "env,npm",
|
||||
"overrides": {
|
||||
"npm": {
|
||||
"install_args": ["https://github.com/ArchiveBox/readability-extractor"],
|
||||
},
|
||||
},
|
||||
},
|
||||
})
|
||||
)
|
||||
|
||||
self.assertIsNotNone(binary)
|
||||
assert binary is not None
|
||||
self.assertEqual(
|
||||
binary.overrides,
|
||||
{
|
||||
'npm': {
|
||||
'install_args': ['readability-extractor'],
|
||||
"npm": {
|
||||
"install_args": ["readability-extractor"],
|
||||
},
|
||||
},
|
||||
)
|
||||
@@ -328,12 +340,13 @@ class TestBinaryStateMachine(TestCase):
|
||||
def setUp(self):
|
||||
"""Create a machine and binary for state machine tests."""
|
||||
import archivebox.machine.models as models
|
||||
|
||||
models._CURRENT_MACHINE = None
|
||||
self.machine = Machine.current()
|
||||
self.binary = Binary.objects.create(
|
||||
machine=self.machine,
|
||||
name='test-binary',
|
||||
binproviders='env',
|
||||
name="test-binary",
|
||||
binproviders="env",
|
||||
)
|
||||
|
||||
def test_binary_state_machine_initial_state(self):
|
||||
@@ -346,7 +359,7 @@ class TestBinaryStateMachine(TestCase):
|
||||
sm = BinaryMachine(self.binary)
|
||||
self.assertTrue(sm.can_install())
|
||||
|
||||
self.binary.binproviders = ''
|
||||
self.binary.binproviders = ""
|
||||
self.binary.save()
|
||||
sm = BinaryMachine(self.binary)
|
||||
self.assertFalse(sm.can_install())
|
||||
@@ -358,6 +371,7 @@ class TestProcessModel(TestCase):
|
||||
def setUp(self):
|
||||
"""Create a machine for process tests."""
|
||||
import archivebox.machine.models as models
|
||||
|
||||
models._CURRENT_MACHINE = None
|
||||
models._CURRENT_PROCESS = None
|
||||
self.machine = Machine.current()
|
||||
@@ -366,12 +380,12 @@ class TestProcessModel(TestCase):
|
||||
"""Process should be created with default values."""
|
||||
process = Process.objects.create(
|
||||
machine=self.machine,
|
||||
cmd=['echo', 'hello'],
|
||||
pwd='/tmp',
|
||||
cmd=["echo", "hello"],
|
||||
pwd="/tmp",
|
||||
)
|
||||
|
||||
self.assertIsNotNone(process.id)
|
||||
self.assertEqual(process.cmd, ['echo', 'hello'])
|
||||
self.assertEqual(process.cmd, ["echo", "hello"])
|
||||
self.assertEqual(process.status, Process.StatusChoices.QUEUED)
|
||||
self.assertIsNone(process.pid)
|
||||
self.assertIsNone(process.exit_code)
|
||||
@@ -380,20 +394,20 @@ class TestProcessModel(TestCase):
|
||||
"""Process.to_json() should serialize correctly."""
|
||||
process = Process.objects.create(
|
||||
machine=self.machine,
|
||||
cmd=['echo', 'hello'],
|
||||
pwd='/tmp',
|
||||
cmd=["echo", "hello"],
|
||||
pwd="/tmp",
|
||||
timeout=60,
|
||||
)
|
||||
json_data = process.to_json()
|
||||
|
||||
self.assertEqual(json_data['type'], 'Process')
|
||||
self.assertEqual(json_data['cmd'], ['echo', 'hello'])
|
||||
self.assertEqual(json_data['pwd'], '/tmp')
|
||||
self.assertEqual(json_data['timeout'], 60)
|
||||
self.assertEqual(json_data["type"], "Process")
|
||||
self.assertEqual(json_data["cmd"], ["echo", "hello"])
|
||||
self.assertEqual(json_data["pwd"], "/tmp")
|
||||
self.assertEqual(json_data["timeout"], 60)
|
||||
|
||||
def test_process_update_and_requeue(self):
|
||||
"""Process.update_and_requeue() should update fields and save."""
|
||||
process = Process.objects.create(machine=self.machine, cmd=['test'])
|
||||
process = Process.objects.create(machine=self.machine, cmd=["test"])
|
||||
|
||||
process.update_and_requeue(
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
@@ -413,6 +427,7 @@ class TestProcessCurrent(TestCase):
|
||||
def setUp(self):
|
||||
"""Reset caches."""
|
||||
import archivebox.machine.models as models
|
||||
|
||||
models._CURRENT_MACHINE = None
|
||||
models._CURRENT_PROCESS = None
|
||||
|
||||
@@ -437,25 +452,25 @@ class TestProcessCurrent(TestCase):
|
||||
|
||||
def test_process_detect_type_runner(self):
|
||||
"""_detect_process_type should detect the background runner command."""
|
||||
with patch('sys.argv', ['archivebox', 'run', '--daemon']):
|
||||
with patch("sys.argv", ["archivebox", "run", "--daemon"]):
|
||||
result = Process._detect_process_type()
|
||||
self.assertEqual(result, Process.TypeChoices.ORCHESTRATOR)
|
||||
|
||||
def test_process_detect_type_runner_watch(self):
|
||||
"""runner_watch should be classified as a worker, not the orchestrator itself."""
|
||||
with patch('sys.argv', ['archivebox', 'manage', 'runner_watch', '--pidfile=/tmp/runserver.pid']):
|
||||
with patch("sys.argv", ["archivebox", "manage", "runner_watch", "--pidfile=/tmp/runserver.pid"]):
|
||||
result = Process._detect_process_type()
|
||||
self.assertEqual(result, Process.TypeChoices.WORKER)
|
||||
|
||||
def test_process_detect_type_cli(self):
|
||||
"""_detect_process_type should detect CLI commands."""
|
||||
with patch('sys.argv', ['archivebox', 'add', 'http://example.com']):
|
||||
with patch("sys.argv", ["archivebox", "add", "http://example.com"]):
|
||||
result = Process._detect_process_type()
|
||||
self.assertEqual(result, Process.TypeChoices.CLI)
|
||||
|
||||
def test_process_detect_type_binary(self):
|
||||
"""_detect_process_type should detect non-ArchiveBox subprocesses as binary processes."""
|
||||
with patch('sys.argv', ['/usr/bin/wget', 'https://example.com']):
|
||||
with patch("sys.argv", ["/usr/bin/wget", "https://example.com"]):
|
||||
result = Process._detect_process_type()
|
||||
self.assertEqual(result, Process.TypeChoices.BINARY)
|
||||
|
||||
@@ -463,7 +478,7 @@ class TestProcessCurrent(TestCase):
|
||||
"""Process.proc should accept a script recorded in DB when wrapped by an interpreter in psutil."""
|
||||
proc = Process.objects.create(
|
||||
machine=Machine.current(),
|
||||
cmd=['/tmp/on_Crawl__90_chrome_launch.daemon.bg.js', '--url=https://example.com/'],
|
||||
cmd=["/tmp/on_Crawl__90_chrome_launch.daemon.bg.js", "--url=https://example.com/"],
|
||||
pid=12345,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
started_at=timezone.now(),
|
||||
@@ -472,12 +487,12 @@ class TestProcessCurrent(TestCase):
|
||||
os_proc = Mock()
|
||||
os_proc.create_time.return_value = proc.started_at.timestamp()
|
||||
os_proc.cmdline.return_value = [
|
||||
'node',
|
||||
'/tmp/on_Crawl__90_chrome_launch.daemon.bg.js',
|
||||
'--url=https://example.com/',
|
||||
"node",
|
||||
"/tmp/on_Crawl__90_chrome_launch.daemon.bg.js",
|
||||
"--url=https://example.com/",
|
||||
]
|
||||
|
||||
with patch('archivebox.machine.models.psutil.Process', return_value=os_proc):
|
||||
with patch("archivebox.machine.models.psutil.Process", return_value=os_proc):
|
||||
self.assertIs(proc.proc, os_proc)
|
||||
|
||||
|
||||
@@ -487,6 +502,7 @@ class TestProcessHierarchy(TestCase):
|
||||
def setUp(self):
|
||||
"""Create machine."""
|
||||
import archivebox.machine.models as models
|
||||
|
||||
models._CURRENT_MACHINE = None
|
||||
self.machine = Machine.current()
|
||||
|
||||
@@ -561,6 +577,7 @@ class TestProcessLifecycle(TestCase):
|
||||
def setUp(self):
|
||||
"""Create machine."""
|
||||
import archivebox.machine.models as models
|
||||
|
||||
models._CURRENT_MACHINE = None
|
||||
self.machine = Machine.current()
|
||||
|
||||
@@ -643,6 +660,7 @@ class TestProcessClassMethods(TestCase):
|
||||
def setUp(self):
|
||||
"""Create machine."""
|
||||
import archivebox.machine.models as models
|
||||
|
||||
models._CURRENT_MACHINE = None
|
||||
self.machine = Machine.current()
|
||||
|
||||
@@ -689,6 +707,77 @@ class TestProcessClassMethods(TestCase):
|
||||
stale.refresh_from_db()
|
||||
self.assertEqual(stale.status, Process.StatusChoices.EXITED)
|
||||
|
||||
def test_cleanup_stale_running_marks_timed_out_rows_exited(self):
|
||||
"""cleanup_stale_running should retire RUNNING rows that exceed timeout + grace."""
|
||||
stale = Process.objects.create(
|
||||
machine=self.machine,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=999998,
|
||||
timeout=5,
|
||||
started_at=timezone.now() - PROCESS_TIMEOUT_GRACE - timedelta(seconds=10),
|
||||
)
|
||||
|
||||
cleaned = Process.cleanup_stale_running()
|
||||
|
||||
self.assertGreaterEqual(cleaned, 1)
|
||||
stale.refresh_from_db()
|
||||
self.assertEqual(stale.status, Process.StatusChoices.EXITED)
|
||||
|
||||
def test_cleanup_stale_running_marks_timed_out_live_hooks_exited(self):
|
||||
"""Timed-out live hook rows should be retired in the DB without trying to kill the process."""
|
||||
stale = Process.objects.create(
|
||||
machine=self.machine,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=os.getpid(),
|
||||
timeout=5,
|
||||
started_at=timezone.now() - PROCESS_TIMEOUT_GRACE - timedelta(seconds=10),
|
||||
)
|
||||
|
||||
with (
|
||||
patch.object(Process, "poll", return_value=None),
|
||||
patch.object(Process, "kill_tree") as kill_tree,
|
||||
patch.object(Process, "terminate") as terminate,
|
||||
):
|
||||
cleaned = Process.cleanup_stale_running()
|
||||
|
||||
self.assertGreaterEqual(cleaned, 1)
|
||||
stale.refresh_from_db()
|
||||
self.assertEqual(stale.status, Process.StatusChoices.EXITED)
|
||||
kill_tree.assert_not_called()
|
||||
terminate.assert_not_called()
|
||||
|
||||
def test_cleanup_orphaned_workers_marks_dead_root_children_exited(self):
|
||||
"""cleanup_orphaned_workers should retire rows whose CLI/orchestrator root is gone."""
|
||||
import psutil
|
||||
from datetime import datetime
|
||||
|
||||
started_at = datetime.fromtimestamp(psutil.Process(os.getpid()).create_time(), tz=timezone.get_current_timezone())
|
||||
parent = Process.objects.create(
|
||||
machine=self.machine,
|
||||
process_type=Process.TypeChoices.CLI,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=999997,
|
||||
started_at=timezone.now() - timedelta(minutes=5),
|
||||
)
|
||||
child = Process.objects.create(
|
||||
machine=self.machine,
|
||||
parent=parent,
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=os.getpid(),
|
||||
started_at=started_at,
|
||||
)
|
||||
|
||||
with patch.object(Process, "kill_tree") as kill_tree, patch.object(Process, "terminate") as terminate:
|
||||
cleaned = Process.cleanup_orphaned_workers()
|
||||
|
||||
self.assertEqual(cleaned, 1)
|
||||
child.refresh_from_db()
|
||||
self.assertEqual(child.status, Process.StatusChoices.EXITED)
|
||||
kill_tree.assert_not_called()
|
||||
terminate.assert_not_called()
|
||||
|
||||
|
||||
class TestProcessStateMachine(TestCase):
|
||||
"""Test the ProcessMachine state machine."""
|
||||
@@ -696,12 +785,13 @@ class TestProcessStateMachine(TestCase):
|
||||
def setUp(self):
|
||||
"""Create a machine and process for state machine tests."""
|
||||
import archivebox.machine.models as models
|
||||
|
||||
models._CURRENT_MACHINE = None
|
||||
self.machine = Machine.current()
|
||||
self.process = Process.objects.create(
|
||||
machine=self.machine,
|
||||
cmd=['echo', 'test'],
|
||||
pwd='/tmp',
|
||||
cmd=["echo", "test"],
|
||||
pwd="/tmp",
|
||||
)
|
||||
|
||||
def test_process_state_machine_initial_state(self):
|
||||
@@ -730,5 +820,5 @@ class TestProcessStateMachine(TestCase):
|
||||
self.assertTrue(sm.is_exited())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
|
||||
@@ -31,7 +31,7 @@ class TestMigrationFrom04x(unittest.TestCase):
|
||||
def setUp(self):
|
||||
"""Create a temporary directory with 0.4.x schema and data."""
|
||||
self.work_dir = Path(tempfile.mkdtemp())
|
||||
self.db_path = self.work_dir / 'index.sqlite3'
|
||||
self.db_path = self.work_dir / "index.sqlite3"
|
||||
|
||||
# Create directory structure
|
||||
create_data_dir_structure(self.work_dir)
|
||||
@@ -50,9 +50,9 @@ class TestMigrationFrom04x(unittest.TestCase):
|
||||
|
||||
def test_migration_preserves_snapshot_count(self):
|
||||
"""Migration should preserve all snapshots from 0.4.x."""
|
||||
expected_count = len(self.original_data['snapshots'])
|
||||
expected_count = len(self.original_data["snapshots"])
|
||||
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_snapshot_count(self.db_path, expected_count)
|
||||
@@ -60,9 +60,9 @@ class TestMigrationFrom04x(unittest.TestCase):
|
||||
|
||||
def test_migration_preserves_snapshot_urls(self):
|
||||
"""Migration should preserve all snapshot URLs from 0.4.x."""
|
||||
expected_urls = [s['url'] for s in self.original_data['snapshots']]
|
||||
expected_urls = [s["url"] for s in self.original_data["snapshots"]]
|
||||
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_snapshot_urls(self.db_path, expected_urls)
|
||||
@@ -70,14 +70,14 @@ class TestMigrationFrom04x(unittest.TestCase):
|
||||
|
||||
def test_migration_converts_string_tags_to_model(self):
|
||||
"""Migration should convert comma-separated tags to Tag model instances."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Collect unique tags from original data
|
||||
original_tags = set()
|
||||
for tags_str in cast(list[str], self.original_data['tags_str']):
|
||||
for tags_str in cast(list[str], self.original_data["tags_str"]):
|
||||
if tags_str:
|
||||
for tag in tags_str.split(','):
|
||||
for tag in tags_str.split(","):
|
||||
original_tags.add(tag.strip())
|
||||
|
||||
# Tags should have been created
|
||||
@@ -86,7 +86,7 @@ class TestMigrationFrom04x(unittest.TestCase):
|
||||
|
||||
def test_migration_preserves_snapshot_titles(self):
|
||||
"""Migration should preserve all snapshot titles."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
@@ -95,43 +95,46 @@ class TestMigrationFrom04x(unittest.TestCase):
|
||||
actual = {row[0]: row[1] for row in cursor.fetchall()}
|
||||
conn.close()
|
||||
|
||||
for snapshot in self.original_data['snapshots']:
|
||||
for snapshot in self.original_data["snapshots"]:
|
||||
self.assertEqual(
|
||||
actual.get(snapshot['url']),
|
||||
snapshot['title'],
|
||||
f"Title mismatch for {snapshot['url']}"
|
||||
actual.get(snapshot["url"]),
|
||||
snapshot["title"],
|
||||
f"Title mismatch for {snapshot['url']}",
|
||||
)
|
||||
|
||||
def test_status_works_after_migration(self):
|
||||
"""Status command should work after migration."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(self.work_dir, ['status'])
|
||||
result = run_archivebox(self.work_dir, ["status"])
|
||||
self.assertEqual(result.returncode, 0, f"Status failed after migration: {result.stderr}")
|
||||
|
||||
def test_list_works_after_migration(self):
|
||||
"""List command should work and show ALL migrated snapshots."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(self.work_dir, ['list'])
|
||||
result = run_archivebox(self.work_dir, ["list"])
|
||||
self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
|
||||
|
||||
# Verify ALL snapshots appear in output
|
||||
output = result.stdout + result.stderr
|
||||
for snapshot in self.original_data['snapshots']:
|
||||
url_fragment = snapshot['url'][:30]
|
||||
self.assertIn(url_fragment, output,
|
||||
f"Snapshot {snapshot['url']} not found in list output")
|
||||
for snapshot in self.original_data["snapshots"]:
|
||||
url_fragment = snapshot["url"][:30]
|
||||
self.assertIn(
|
||||
url_fragment,
|
||||
output,
|
||||
f"Snapshot {snapshot['url']} not found in list output",
|
||||
)
|
||||
|
||||
def test_add_works_after_migration(self):
|
||||
"""Adding new URLs should work after migration from 0.4.x."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Try to add a new URL after migration
|
||||
result = run_archivebox(self.work_dir, ['add', '--index-only', 'https://example.com/new-page'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["add", "--index-only", "https://example.com/new-page"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Add failed after migration: {result.stderr}")
|
||||
|
||||
# Verify snapshot was added
|
||||
@@ -145,7 +148,7 @@ class TestMigrationFrom04x(unittest.TestCase):
|
||||
|
||||
def test_new_schema_elements_created(self):
|
||||
"""Migration should create new 0.9.x schema elements."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
@@ -155,25 +158,25 @@ class TestMigrationFrom04x(unittest.TestCase):
|
||||
conn.close()
|
||||
|
||||
# New tables should exist
|
||||
self.assertIn('crawls_crawl', tables, "crawls_crawl table not created")
|
||||
self.assertIn('core_tag', tables, "core_tag table not created")
|
||||
self.assertIn('core_archiveresult', tables, "core_archiveresult table not created")
|
||||
self.assertIn("crawls_crawl", tables, "crawls_crawl table not created")
|
||||
self.assertIn("core_tag", tables, "core_tag table not created")
|
||||
self.assertIn("core_archiveresult", tables, "core_archiveresult table not created")
|
||||
|
||||
def test_snapshots_have_new_fields(self):
|
||||
"""Migrated snapshots should have new 0.9.x fields."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('PRAGMA table_info(core_snapshot)')
|
||||
cursor.execute("PRAGMA table_info(core_snapshot)")
|
||||
columns = {row[1] for row in cursor.fetchall()}
|
||||
conn.close()
|
||||
|
||||
required_columns = {'status', 'depth', 'created_at', 'modified_at'}
|
||||
required_columns = {"status", "depth", "created_at", "modified_at"}
|
||||
for col in required_columns:
|
||||
self.assertIn(col, columns, f"Snapshot missing new column: {col}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -35,7 +35,7 @@ class TestMigrationFrom07x(unittest.TestCase):
|
||||
def setUp(self):
|
||||
"""Create a temporary directory with 0.7.x schema and data."""
|
||||
self.work_dir = Path(tempfile.mkdtemp())
|
||||
self.db_path = self.work_dir / 'index.sqlite3'
|
||||
self.db_path = self.work_dir / "index.sqlite3"
|
||||
|
||||
# Create directory structure
|
||||
create_data_dir_structure(self.work_dir)
|
||||
@@ -54,9 +54,9 @@ class TestMigrationFrom07x(unittest.TestCase):
|
||||
|
||||
def test_migration_preserves_snapshot_count(self):
|
||||
"""Migration should preserve all snapshots."""
|
||||
expected_count = len(self.original_data['snapshots'])
|
||||
expected_count = len(self.original_data["snapshots"])
|
||||
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_snapshot_count(self.db_path, expected_count)
|
||||
@@ -64,9 +64,9 @@ class TestMigrationFrom07x(unittest.TestCase):
|
||||
|
||||
def test_migration_preserves_snapshot_urls(self):
|
||||
"""Migration should preserve all snapshot URLs."""
|
||||
expected_urls = [s['url'] for s in self.original_data['snapshots']]
|
||||
expected_urls = [s["url"] for s in self.original_data["snapshots"]]
|
||||
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_snapshot_urls(self.db_path, expected_urls)
|
||||
@@ -74,9 +74,9 @@ class TestMigrationFrom07x(unittest.TestCase):
|
||||
|
||||
def test_migration_preserves_snapshot_titles(self):
|
||||
"""Migration should preserve all snapshot titles."""
|
||||
expected_titles = {s['url']: s['title'] for s in self.original_data['snapshots']}
|
||||
expected_titles = {s["url"]: s["title"] for s in self.original_data["snapshots"]}
|
||||
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_snapshot_titles(self.db_path, expected_titles)
|
||||
@@ -84,9 +84,9 @@ class TestMigrationFrom07x(unittest.TestCase):
|
||||
|
||||
def test_migration_preserves_tags(self):
|
||||
"""Migration should preserve all tags."""
|
||||
expected_count = len(self.original_data['tags'])
|
||||
expected_count = len(self.original_data["tags"])
|
||||
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_tag_count(self.db_path, expected_count)
|
||||
@@ -94,9 +94,9 @@ class TestMigrationFrom07x(unittest.TestCase):
|
||||
|
||||
def test_migration_preserves_archiveresults(self):
|
||||
"""Migration should preserve all archive results."""
|
||||
expected_count = len(self.original_data['archiveresults'])
|
||||
expected_count = len(self.original_data["archiveresults"])
|
||||
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_archiveresult_count(self.db_path, expected_count)
|
||||
@@ -104,7 +104,7 @@ class TestMigrationFrom07x(unittest.TestCase):
|
||||
|
||||
def test_migration_preserves_foreign_keys(self):
|
||||
"""Migration should maintain foreign key relationships."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_foreign_keys(self.db_path)
|
||||
@@ -112,41 +112,41 @@ class TestMigrationFrom07x(unittest.TestCase):
|
||||
|
||||
def test_status_works_after_migration(self):
|
||||
"""Status command should work after migration."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(self.work_dir, ['status'])
|
||||
result = run_archivebox(self.work_dir, ["status"])
|
||||
self.assertEqual(result.returncode, 0, f"Status failed after migration: {result.stderr}")
|
||||
|
||||
def test_search_works_after_migration(self):
|
||||
"""Search command should find ALL migrated snapshots."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(self.work_dir, ['search'])
|
||||
result = run_archivebox(self.work_dir, ["search"])
|
||||
self.assertEqual(result.returncode, 0, f"Search failed after migration: {result.stderr}")
|
||||
|
||||
# Verify ALL snapshots appear in output
|
||||
output = result.stdout + result.stderr
|
||||
ok, msg = verify_all_snapshots_in_output(output, self.original_data['snapshots'])
|
||||
ok, msg = verify_all_snapshots_in_output(output, self.original_data["snapshots"])
|
||||
self.assertTrue(ok, msg)
|
||||
|
||||
def test_list_works_after_migration(self):
|
||||
"""List command should work and show ALL migrated data."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(self.work_dir, ['snapshot', 'list'])
|
||||
result = run_archivebox(self.work_dir, ["snapshot", "list"])
|
||||
self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
|
||||
|
||||
# Verify ALL snapshots appear in output
|
||||
output = result.stdout + result.stderr
|
||||
ok, msg = verify_all_snapshots_in_output(output, self.original_data['snapshots'])
|
||||
ok, msg = verify_all_snapshots_in_output(output, self.original_data["snapshots"])
|
||||
self.assertTrue(ok, msg)
|
||||
|
||||
def test_new_schema_elements_created_after_migration(self):
|
||||
"""Migration should create new 0.9.x schema elements (crawls_crawl, etc.)."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
@@ -158,29 +158,29 @@ class TestMigrationFrom07x(unittest.TestCase):
|
||||
conn.close()
|
||||
|
||||
# 0.9.x should have crawls_crawl table
|
||||
self.assertIn('crawls_crawl', tables, "crawls_crawl table not created during migration")
|
||||
self.assertIn("crawls_crawl", tables, "crawls_crawl table not created during migration")
|
||||
|
||||
def test_snapshots_have_new_fields_after_migration(self):
|
||||
"""Migrated snapshots should have new 0.9.x fields (status, depth, etc.)."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Check snapshot table has new columns
|
||||
cursor.execute('PRAGMA table_info(core_snapshot)')
|
||||
cursor.execute("PRAGMA table_info(core_snapshot)")
|
||||
columns = {row[1] for row in cursor.fetchall()}
|
||||
conn.close()
|
||||
|
||||
# 0.9.x snapshots should have status, depth, created_at, modified_at
|
||||
required_new_columns = {'status', 'depth', 'created_at', 'modified_at'}
|
||||
required_new_columns = {"status", "depth", "created_at", "modified_at"}
|
||||
for col in required_new_columns:
|
||||
self.assertIn(col, columns, f"Snapshot missing new column: {col}")
|
||||
|
||||
def test_add_works_after_migration(self):
|
||||
"""Adding new URLs should work after migration from 0.7.x."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Verify that init created the crawls_crawl table before proceeding
|
||||
@@ -192,7 +192,7 @@ class TestMigrationFrom07x(unittest.TestCase):
|
||||
self.assertTrue(table_exists, f"Init failed to create crawls_crawl table. Init stderr: {result.stderr[-500:]}")
|
||||
|
||||
# Try to add a new URL after migration (use --index-only for speed)
|
||||
result = run_archivebox(self.work_dir, ['add', '--index-only', 'https://example.com/new-page'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["add", "--index-only", "https://example.com/new-page"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Add failed after migration: {result.stderr}")
|
||||
|
||||
# Verify a Crawl was created for the new URL
|
||||
@@ -206,7 +206,7 @@ class TestMigrationFrom07x(unittest.TestCase):
|
||||
|
||||
def test_archiveresult_status_preserved_after_migration(self):
|
||||
"""Migration should preserve archive result status values."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
@@ -218,35 +218,39 @@ class TestMigrationFrom07x(unittest.TestCase):
|
||||
conn.close()
|
||||
|
||||
# Original data has known status distribution: succeeded, failed, skipped
|
||||
self.assertIn('succeeded', status_counts, "Should have succeeded results")
|
||||
self.assertIn('failed', status_counts, "Should have failed results")
|
||||
self.assertIn('skipped', status_counts, "Should have skipped results")
|
||||
self.assertIn("succeeded", status_counts, "Should have succeeded results")
|
||||
self.assertIn("failed", status_counts, "Should have failed results")
|
||||
self.assertIn("skipped", status_counts, "Should have skipped results")
|
||||
|
||||
def test_version_works_after_migration(self):
|
||||
"""Version command should work after migration."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(self.work_dir, ['version'])
|
||||
result = run_archivebox(self.work_dir, ["version"])
|
||||
self.assertEqual(result.returncode, 0, f"Version failed after migration: {result.stderr}")
|
||||
|
||||
# Should show version info
|
||||
output = result.stdout + result.stderr
|
||||
self.assertTrue('ArchiveBox' in output or 'version' in output.lower(),
|
||||
f"Version output missing expected content: {output[:500]}")
|
||||
self.assertTrue(
|
||||
"ArchiveBox" in output or "version" in output.lower(),
|
||||
f"Version output missing expected content: {output[:500]}",
|
||||
)
|
||||
|
||||
def test_help_works_after_migration(self):
|
||||
"""Help command should work after migration."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(self.work_dir, ['help'])
|
||||
result = run_archivebox(self.work_dir, ["help"])
|
||||
self.assertEqual(result.returncode, 0, f"Help failed after migration: {result.stderr}")
|
||||
|
||||
# Should show available commands
|
||||
output = result.stdout + result.stderr
|
||||
self.assertTrue('add' in output.lower() and 'status' in output.lower(),
|
||||
f"Help output missing expected commands: {output[:500]}")
|
||||
self.assertTrue(
|
||||
"add" in output.lower() and "status" in output.lower(),
|
||||
f"Help output missing expected commands: {output[:500]}",
|
||||
)
|
||||
|
||||
|
||||
class TestMigrationDataIntegrity07x(unittest.TestCase):
|
||||
@@ -255,7 +259,7 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
|
||||
def test_no_duplicate_snapshots_after_migration(self):
|
||||
"""Migration should not create duplicate snapshots."""
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
db_path = work_dir / 'index.sqlite3'
|
||||
db_path = work_dir / "index.sqlite3"
|
||||
|
||||
try:
|
||||
create_data_dir_structure(work_dir)
|
||||
@@ -264,7 +268,7 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
|
||||
conn.close()
|
||||
seed_0_7_data(db_path)
|
||||
|
||||
result = run_archivebox(work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Check for duplicate URLs
|
||||
@@ -285,7 +289,7 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
|
||||
def test_no_orphaned_archiveresults_after_migration(self):
|
||||
"""Migration should not leave orphaned ArchiveResults."""
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
db_path = work_dir / 'index.sqlite3'
|
||||
db_path = work_dir / "index.sqlite3"
|
||||
|
||||
try:
|
||||
create_data_dir_structure(work_dir)
|
||||
@@ -294,7 +298,7 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
|
||||
conn.close()
|
||||
seed_0_7_data(db_path)
|
||||
|
||||
result = run_archivebox(work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_foreign_keys(db_path)
|
||||
@@ -306,7 +310,7 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
|
||||
def test_timestamps_preserved_after_migration(self):
|
||||
"""Migration should preserve original timestamps."""
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
db_path = work_dir / 'index.sqlite3'
|
||||
db_path = work_dir / "index.sqlite3"
|
||||
|
||||
try:
|
||||
create_data_dir_structure(work_dir)
|
||||
@@ -315,9 +319,9 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
|
||||
conn.close()
|
||||
original_data = seed_0_7_data(db_path)
|
||||
|
||||
original_timestamps = {s['url']: s['timestamp'] for s in original_data['snapshots']}
|
||||
original_timestamps = {s["url"]: s["timestamp"] for s in original_data["snapshots"]}
|
||||
|
||||
result = run_archivebox(work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
@@ -328,8 +332,9 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
|
||||
|
||||
for url, original_ts in original_timestamps.items():
|
||||
self.assertEqual(
|
||||
migrated_timestamps.get(url), original_ts,
|
||||
f"Timestamp changed for {url}: {original_ts} -> {migrated_timestamps.get(url)}"
|
||||
migrated_timestamps.get(url),
|
||||
original_ts,
|
||||
f"Timestamp changed for {url}: {original_ts} -> {migrated_timestamps.get(url)}",
|
||||
)
|
||||
|
||||
finally:
|
||||
@@ -338,7 +343,7 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
|
||||
def test_tag_associations_preserved_after_migration(self):
|
||||
"""Migration should preserve snapshot-tag associations."""
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
db_path = work_dir / 'index.sqlite3'
|
||||
db_path = work_dir / "index.sqlite3"
|
||||
|
||||
try:
|
||||
create_data_dir_structure(work_dir)
|
||||
@@ -354,7 +359,7 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
|
||||
original_count = cursor.fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
result = run_archivebox(work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Count tag associations after migration
|
||||
@@ -364,12 +369,15 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
|
||||
migrated_count = cursor.fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
self.assertEqual(migrated_count, original_count,
|
||||
f"Tag associations changed: {original_count} -> {migrated_count}")
|
||||
self.assertEqual(
|
||||
migrated_count,
|
||||
original_count,
|
||||
f"Tag associations changed: {original_count} -> {migrated_count}",
|
||||
)
|
||||
|
||||
finally:
|
||||
shutil.rmtree(work_dir, ignore_errors=True)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -39,7 +39,7 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
def setUp(self):
|
||||
"""Create a temporary directory with 0.8.x schema and data."""
|
||||
self.work_dir = Path(tempfile.mkdtemp())
|
||||
self.db_path = self.work_dir / 'index.sqlite3'
|
||||
self.db_path = self.work_dir / "index.sqlite3"
|
||||
|
||||
# Create directory structure
|
||||
create_data_dir_structure(self.work_dir)
|
||||
@@ -58,9 +58,9 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
|
||||
def test_migration_preserves_snapshot_count(self):
|
||||
"""Migration should preserve all snapshots from 0.8.x."""
|
||||
expected_count = len(self.original_data['snapshots'])
|
||||
expected_count = len(self.original_data["snapshots"])
|
||||
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_snapshot_count(self.db_path, expected_count)
|
||||
@@ -68,9 +68,9 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
|
||||
def test_migration_preserves_snapshot_urls(self):
|
||||
"""Migration should preserve all snapshot URLs from 0.8.x."""
|
||||
expected_urls = [s['url'] for s in self.original_data['snapshots']]
|
||||
expected_urls = [s["url"] for s in self.original_data["snapshots"]]
|
||||
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_snapshot_urls(self.db_path, expected_urls)
|
||||
@@ -78,14 +78,14 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
|
||||
def test_migration_preserves_crawls(self):
|
||||
"""Migration should preserve all Crawl records and create default crawl if needed."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Count snapshots with NULL crawl_id in original data
|
||||
snapshots_without_crawl = sum(1 for s in self.original_data['snapshots'] if s['crawl_id'] is None)
|
||||
snapshots_without_crawl = sum(1 for s in self.original_data["snapshots"] if s["crawl_id"] is None)
|
||||
|
||||
# Expected count: original crawls + 1 default crawl if any snapshots had NULL crawl_id
|
||||
expected_count = len(self.original_data['crawls'])
|
||||
expected_count = len(self.original_data["crawls"])
|
||||
if snapshots_without_crawl > 0:
|
||||
expected_count += 1 # Migration 0024 creates a default crawl
|
||||
|
||||
@@ -94,42 +94,47 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
|
||||
def test_migration_preserves_snapshot_crawl_links(self):
|
||||
"""Migration should preserve snapshot-to-crawl relationships and assign default crawl to orphans."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Check EVERY snapshot has a crawl_id after migration
|
||||
for snapshot in self.original_data['snapshots']:
|
||||
cursor.execute("SELECT crawl_id FROM core_snapshot WHERE url = ?", (snapshot['url'],))
|
||||
for snapshot in self.original_data["snapshots"]:
|
||||
cursor.execute("SELECT crawl_id FROM core_snapshot WHERE url = ?", (snapshot["url"],))
|
||||
row = cursor.fetchone()
|
||||
self.assertIsNotNone(row, f"Snapshot {snapshot['url']} not found after migration")
|
||||
|
||||
if snapshot['crawl_id'] is not None:
|
||||
if snapshot["crawl_id"] is not None:
|
||||
# Snapshots that had a crawl should keep it
|
||||
self.assertEqual(row[0], snapshot['crawl_id'],
|
||||
f"Crawl ID changed for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}")
|
||||
self.assertEqual(
|
||||
row[0],
|
||||
snapshot["crawl_id"],
|
||||
f"Crawl ID changed for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}",
|
||||
)
|
||||
else:
|
||||
# Snapshots without a crawl should now have one (the default crawl)
|
||||
self.assertIsNotNone(row[0],
|
||||
f"Snapshot {snapshot['url']} should have been assigned to default crawl but has NULL")
|
||||
self.assertIsNotNone(
|
||||
row[0],
|
||||
f"Snapshot {snapshot['url']} should have been assigned to default crawl but has NULL",
|
||||
)
|
||||
|
||||
conn.close()
|
||||
|
||||
def test_migration_preserves_tags(self):
|
||||
"""Migration should preserve all tags."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_tag_count(self.db_path, len(self.original_data['tags']))
|
||||
ok, msg = verify_tag_count(self.db_path, len(self.original_data["tags"]))
|
||||
self.assertTrue(ok, msg)
|
||||
|
||||
def test_migration_preserves_archiveresults(self):
|
||||
"""Migration should preserve all archive results."""
|
||||
expected_count = len(self.original_data['archiveresults'])
|
||||
expected_count = len(self.original_data["archiveresults"])
|
||||
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_archiveresult_count(self.db_path, expected_count)
|
||||
@@ -137,7 +142,7 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
|
||||
def test_migration_preserves_archiveresult_status(self):
|
||||
"""Migration should preserve archive result status values."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
@@ -149,49 +154,49 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
conn.close()
|
||||
|
||||
# Original data has known status distribution: succeeded, failed, skipped
|
||||
self.assertIn('succeeded', status_counts, "Should have succeeded results")
|
||||
self.assertIn('failed', status_counts, "Should have failed results")
|
||||
self.assertIn('skipped', status_counts, "Should have skipped results")
|
||||
self.assertIn("succeeded", status_counts, "Should have succeeded results")
|
||||
self.assertIn("failed", status_counts, "Should have failed results")
|
||||
self.assertIn("skipped", status_counts, "Should have skipped results")
|
||||
|
||||
def test_status_works_after_migration(self):
|
||||
"""Status command should work after migration."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(self.work_dir, ['status'])
|
||||
result = run_archivebox(self.work_dir, ["status"])
|
||||
self.assertEqual(result.returncode, 0, f"Status failed after migration: {result.stderr}")
|
||||
|
||||
def test_list_works_after_migration(self):
|
||||
"""List command should work and show ALL migrated data."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(self.work_dir, ['snapshot', 'list'])
|
||||
result = run_archivebox(self.work_dir, ["snapshot", "list"])
|
||||
self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
|
||||
|
||||
# Verify ALL snapshots appear in output
|
||||
output = result.stdout + result.stderr
|
||||
ok, msg = verify_all_snapshots_in_output(output, self.original_data['snapshots'])
|
||||
ok, msg = verify_all_snapshots_in_output(output, self.original_data["snapshots"])
|
||||
self.assertTrue(ok, msg)
|
||||
|
||||
def test_search_works_after_migration(self):
|
||||
"""Search command should find ALL migrated snapshots."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(self.work_dir, ['search'])
|
||||
result = run_archivebox(self.work_dir, ["search"])
|
||||
self.assertEqual(result.returncode, 0, f"Search failed after migration: {result.stderr}")
|
||||
|
||||
# Verify ALL snapshots appear in output
|
||||
output = result.stdout + result.stderr
|
||||
ok, msg = verify_all_snapshots_in_output(output, self.original_data['snapshots'])
|
||||
ok, msg = verify_all_snapshots_in_output(output, self.original_data["snapshots"])
|
||||
self.assertTrue(ok, msg)
|
||||
|
||||
def test_migration_preserves_snapshot_titles(self):
|
||||
"""Migration should preserve all snapshot titles."""
|
||||
expected_titles = {s['url']: s['title'] for s in self.original_data['snapshots']}
|
||||
expected_titles = {s["url"]: s["title"] for s in self.original_data["snapshots"]}
|
||||
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_snapshot_titles(self.db_path, expected_titles)
|
||||
@@ -199,7 +204,7 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
|
||||
def test_migration_preserves_foreign_keys(self):
|
||||
"""Migration should maintain foreign key relationships."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_foreign_keys(self.db_path)
|
||||
@@ -207,7 +212,7 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
|
||||
def test_migration_removes_seed_id_column(self):
|
||||
"""Migration should remove seed_id column from archivebox.crawls.crawl."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
@@ -216,12 +221,15 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
columns = [row[1] for row in cursor.fetchall()]
|
||||
conn.close()
|
||||
|
||||
self.assertNotIn('seed_id', columns,
|
||||
f"seed_id column should have been removed by migration. Columns: {columns}")
|
||||
self.assertNotIn(
|
||||
"seed_id",
|
||||
columns,
|
||||
f"seed_id column should have been removed by migration. Columns: {columns}",
|
||||
)
|
||||
|
||||
def test_migration_removes_seed_table(self):
|
||||
"""Migration should remove crawls_seed table."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
@@ -234,10 +242,13 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
|
||||
def test_add_works_after_migration(self):
|
||||
"""Adding new URLs should work after migration from 0.8.x."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
# Check that init actually ran and applied migrations
|
||||
self.assertIn('Applying', result.stdout + result.stderr,
|
||||
f"Init did not apply migrations. stdout: {result.stdout[:500]}, stderr: {result.stderr[:500]}")
|
||||
self.assertIn(
|
||||
"Applying",
|
||||
result.stdout + result.stderr,
|
||||
f"Init did not apply migrations. stdout: {result.stdout[:500]}, stderr: {result.stderr[:500]}",
|
||||
)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Count existing crawls
|
||||
@@ -248,7 +259,7 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
conn.close()
|
||||
|
||||
# Try to add a new URL after migration (use --index-only for speed)
|
||||
result = run_archivebox(self.work_dir, ['add', '--index-only', 'https://example.com/new-page'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["add", "--index-only", "https://example.com/new-page"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Add failed after migration: {result.stderr}")
|
||||
|
||||
# Verify a new Crawl was created
|
||||
@@ -258,35 +269,40 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
new_crawl_count = cursor.fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
self.assertGreater(new_crawl_count, initial_crawl_count,
|
||||
f"No new Crawl created when adding URL. Add stderr: {result.stderr[-500:]}")
|
||||
self.assertGreater(
|
||||
new_crawl_count,
|
||||
initial_crawl_count,
|
||||
f"No new Crawl created when adding URL. Add stderr: {result.stderr[-500:]}",
|
||||
)
|
||||
|
||||
def test_version_works_after_migration(self):
|
||||
"""Version command should work after migration."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(self.work_dir, ['version'])
|
||||
result = run_archivebox(self.work_dir, ["version"])
|
||||
self.assertEqual(result.returncode, 0, f"Version failed after migration: {result.stderr}")
|
||||
|
||||
# Should show version info
|
||||
output = result.stdout + result.stderr
|
||||
self.assertTrue('ArchiveBox' in output or 'version' in output.lower(),
|
||||
f"Version output missing expected content: {output[:500]}")
|
||||
self.assertTrue(
|
||||
"ArchiveBox" in output or "version" in output.lower(),
|
||||
f"Version output missing expected content: {output[:500]}",
|
||||
)
|
||||
|
||||
def test_migration_creates_process_records(self):
|
||||
"""Migration should create Process records for all ArchiveResults."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Verify Process records created
|
||||
expected_count = len(self.original_data['archiveresults'])
|
||||
expected_count = len(self.original_data["archiveresults"])
|
||||
ok, msg = verify_process_migration(self.db_path, expected_count)
|
||||
self.assertTrue(ok, msg)
|
||||
|
||||
def test_migration_creates_binary_records(self):
|
||||
"""Migration should create Binary records from cmd_version data."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
@@ -297,15 +313,18 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
binary_count = cursor.fetchone()[0]
|
||||
|
||||
# Should have at least one binary per unique extractor
|
||||
extractors = set(ar['extractor'] for ar in self.original_data['archiveresults'])
|
||||
self.assertGreaterEqual(binary_count, len(extractors),
|
||||
f"Expected at least {len(extractors)} Binaries, got {binary_count}")
|
||||
extractors = {ar["extractor"] for ar in self.original_data["archiveresults"]}
|
||||
self.assertGreaterEqual(
|
||||
binary_count,
|
||||
len(extractors),
|
||||
f"Expected at least {len(extractors)} Binaries, got {binary_count}",
|
||||
)
|
||||
|
||||
conn.close()
|
||||
|
||||
def test_migration_preserves_cmd_data(self):
|
||||
"""Migration should preserve cmd data in Process.cmd field."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
@@ -316,9 +335,12 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
cmd_records = cursor.fetchall()
|
||||
|
||||
# All Processes should have non-empty cmd (test data has json.dumps([extractor, '--version']))
|
||||
expected_count = len(self.original_data['archiveresults'])
|
||||
self.assertEqual(len(cmd_records), expected_count,
|
||||
f"Expected {expected_count} Processes with cmd, got {len(cmd_records)}")
|
||||
expected_count = len(self.original_data["archiveresults"])
|
||||
self.assertEqual(
|
||||
len(cmd_records),
|
||||
expected_count,
|
||||
f"Expected {expected_count} Processes with cmd, got {len(cmd_records)}",
|
||||
)
|
||||
|
||||
conn.close()
|
||||
|
||||
@@ -329,7 +351,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
|
||||
def test_no_duplicate_snapshots_after_migration(self):
|
||||
"""Migration should not create duplicate snapshots."""
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
db_path = work_dir / 'index.sqlite3'
|
||||
db_path = work_dir / "index.sqlite3"
|
||||
|
||||
try:
|
||||
create_data_dir_structure(work_dir)
|
||||
@@ -338,7 +360,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
|
||||
conn.close()
|
||||
seed_0_8_data(db_path)
|
||||
|
||||
result = run_archivebox(work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Check for duplicate URLs
|
||||
@@ -359,7 +381,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
|
||||
def test_no_orphaned_archiveresults_after_migration(self):
|
||||
"""Migration should not leave orphaned ArchiveResults."""
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
db_path = work_dir / 'index.sqlite3'
|
||||
db_path = work_dir / "index.sqlite3"
|
||||
|
||||
try:
|
||||
create_data_dir_structure(work_dir)
|
||||
@@ -368,7 +390,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
|
||||
conn.close()
|
||||
seed_0_8_data(db_path)
|
||||
|
||||
result = run_archivebox(work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
ok, msg = verify_foreign_keys(db_path)
|
||||
@@ -380,7 +402,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
|
||||
def test_timestamps_preserved_after_migration(self):
|
||||
"""Migration should preserve original timestamps."""
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
db_path = work_dir / 'index.sqlite3'
|
||||
db_path = work_dir / "index.sqlite3"
|
||||
|
||||
try:
|
||||
create_data_dir_structure(work_dir)
|
||||
@@ -389,9 +411,9 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
|
||||
conn.close()
|
||||
original_data = seed_0_8_data(db_path)
|
||||
|
||||
original_timestamps = {s['url']: s['timestamp'] for s in original_data['snapshots']}
|
||||
original_timestamps = {s["url"]: s["timestamp"] for s in original_data["snapshots"]}
|
||||
|
||||
result = run_archivebox(work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
@@ -402,8 +424,9 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
|
||||
|
||||
for url, original_ts in original_timestamps.items():
|
||||
self.assertEqual(
|
||||
migrated_timestamps.get(url), original_ts,
|
||||
f"Timestamp changed for {url}: {original_ts} -> {migrated_timestamps.get(url)}"
|
||||
migrated_timestamps.get(url),
|
||||
original_ts,
|
||||
f"Timestamp changed for {url}: {original_ts} -> {migrated_timestamps.get(url)}",
|
||||
)
|
||||
|
||||
finally:
|
||||
@@ -412,7 +435,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
|
||||
def test_crawl_data_preserved_after_migration(self):
|
||||
"""Migration should preserve crawl metadata (urls, label, status)."""
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
db_path = work_dir / 'index.sqlite3'
|
||||
db_path = work_dir / "index.sqlite3"
|
||||
|
||||
try:
|
||||
create_data_dir_structure(work_dir)
|
||||
@@ -421,19 +444,19 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
|
||||
conn.close()
|
||||
original_data = seed_0_8_data(db_path)
|
||||
|
||||
result = run_archivebox(work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Check each crawl's data is preserved
|
||||
for crawl in original_data['crawls']:
|
||||
cursor.execute("SELECT urls, label FROM crawls_crawl WHERE id = ?", (crawl['id'],))
|
||||
for crawl in original_data["crawls"]:
|
||||
cursor.execute("SELECT urls, label FROM crawls_crawl WHERE id = ?", (crawl["id"],))
|
||||
row = cursor.fetchone()
|
||||
self.assertIsNotNone(row, f"Crawl {crawl['id']} not found after migration")
|
||||
self.assertEqual(row[0], crawl['urls'], f"URLs mismatch for crawl {crawl['id']}")
|
||||
self.assertEqual(row[1], crawl['label'], f"Label mismatch for crawl {crawl['id']}")
|
||||
self.assertEqual(row[0], crawl["urls"], f"URLs mismatch for crawl {crawl['id']}")
|
||||
self.assertEqual(row[1], crawl["label"], f"Label mismatch for crawl {crawl['id']}")
|
||||
|
||||
conn.close()
|
||||
|
||||
@@ -443,7 +466,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
|
||||
def test_tag_associations_preserved_after_migration(self):
|
||||
"""Migration should preserve snapshot-tag associations."""
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
db_path = work_dir / 'index.sqlite3'
|
||||
db_path = work_dir / "index.sqlite3"
|
||||
|
||||
try:
|
||||
create_data_dir_structure(work_dir)
|
||||
@@ -459,7 +482,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
|
||||
original_count = cursor.fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
result = run_archivebox(work_dir, ['init'], timeout=45)
|
||||
result = run_archivebox(work_dir, ["init"], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Count tag associations after migration
|
||||
@@ -469,8 +492,11 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
|
||||
migrated_count = cursor.fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
self.assertEqual(migrated_count, original_count,
|
||||
f"Tag associations changed: {original_count} -> {migrated_count}")
|
||||
self.assertEqual(
|
||||
migrated_count,
|
||||
original_count,
|
||||
f"Tag associations changed: {original_count} -> {migrated_count}",
|
||||
)
|
||||
|
||||
finally:
|
||||
shutil.rmtree(work_dir, ignore_errors=True)
|
||||
@@ -482,7 +508,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
def setUp(self):
|
||||
"""Create a temporary directory for testing."""
|
||||
self.work_dir = Path(tempfile.mkdtemp())
|
||||
self.db_path = self.work_dir / 'index.sqlite3'
|
||||
self.db_path = self.work_dir / "index.sqlite3"
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up temporary directory."""
|
||||
@@ -500,12 +526,13 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
5. Old archive/timestamp/ directories are cleaned up
|
||||
"""
|
||||
# Use the real 0.7.2 database which has actual ArchiveResults with files
|
||||
gold_db = Path('/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data')
|
||||
gold_db = Path("/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data")
|
||||
if not gold_db.exists():
|
||||
self.skipTest(f"Gold standard database not found at {gold_db}")
|
||||
|
||||
# Copy gold database to test directory
|
||||
import shutil
|
||||
|
||||
for item in gold_db.iterdir():
|
||||
if item.is_dir():
|
||||
shutil.copytree(item, self.work_dir / item.name, dirs_exist_ok=True)
|
||||
@@ -513,23 +540,23 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
shutil.copy2(item, self.work_dir / item.name)
|
||||
|
||||
# Count archive directories and files BEFORE migration
|
||||
archive_dir = self.work_dir / 'archive'
|
||||
dirs_before = list(archive_dir.glob('*')) if archive_dir.exists() else []
|
||||
archive_dir = self.work_dir / "archive"
|
||||
dirs_before = list(archive_dir.glob("*")) if archive_dir.exists() else []
|
||||
dirs_before_count = len([d for d in dirs_before if d.is_dir()])
|
||||
|
||||
# Count total files in all archive directories
|
||||
files_before = []
|
||||
for d in dirs_before:
|
||||
if d.is_dir():
|
||||
files_before.extend([f for f in d.rglob('*') if f.is_file()])
|
||||
files_before.extend([f for f in d.rglob("*") if f.is_file()])
|
||||
files_before_count = len(files_before)
|
||||
|
||||
# Sample some specific files to check they're preserved
|
||||
sample_files = [
|
||||
'favicon.ico',
|
||||
'screenshot.png',
|
||||
'singlefile.html',
|
||||
'headers.json',
|
||||
"favicon.ico",
|
||||
"screenshot.png",
|
||||
"singlefile.html",
|
||||
"headers.json",
|
||||
]
|
||||
sample_paths_before = {}
|
||||
for d in dirs_before:
|
||||
@@ -544,17 +571,17 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
print(f"[*] Sample files found: {len(sample_paths_before)}")
|
||||
|
||||
# Run init to trigger migration
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=60)
|
||||
result = run_archivebox(self.work_dir, ["init"], timeout=60)
|
||||
self.assertEqual(result.returncode, 0, f"Init (migration) failed: {result.stderr}")
|
||||
|
||||
# Count archive directories and files AFTER migration
|
||||
dirs_after = list(archive_dir.glob('*')) if archive_dir.exists() else []
|
||||
dirs_after = list(archive_dir.glob("*")) if archive_dir.exists() else []
|
||||
dirs_after_count = len([d for d in dirs_after if d.is_dir()])
|
||||
|
||||
files_after = []
|
||||
for d in dirs_after:
|
||||
if d.is_dir():
|
||||
files_after.extend([f for f in d.rglob('*') if f.is_file()])
|
||||
files_after.extend([f for f in d.rglob("*") if f.is_file()])
|
||||
files_after_count = len(files_after)
|
||||
|
||||
# Verify sample files still exist
|
||||
@@ -571,26 +598,32 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
print(f"[*] Sample files found: {len(sample_paths_after)}")
|
||||
|
||||
# Verify files still in old structure after migration (not moved yet)
|
||||
self.assertEqual(dirs_before_count, dirs_after_count,
|
||||
f"Archive directories lost during migration: {dirs_before_count} -> {dirs_after_count}")
|
||||
self.assertEqual(files_before_count, files_after_count,
|
||||
f"Files lost during migration: {files_before_count} -> {files_after_count}")
|
||||
self.assertEqual(
|
||||
dirs_before_count,
|
||||
dirs_after_count,
|
||||
f"Archive directories lost during migration: {dirs_before_count} -> {dirs_after_count}",
|
||||
)
|
||||
self.assertEqual(
|
||||
files_before_count,
|
||||
files_after_count,
|
||||
f"Files lost during migration: {files_before_count} -> {files_after_count}",
|
||||
)
|
||||
|
||||
# Run update to trigger filesystem reorganization
|
||||
print("\n[*] Running archivebox update to reorganize filesystem...")
|
||||
result = run_archivebox(self.work_dir, ['update'], timeout=120)
|
||||
result = run_archivebox(self.work_dir, ["update"], timeout=120)
|
||||
self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}")
|
||||
|
||||
# Check new filesystem structure
|
||||
# New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext
|
||||
users_dir = self.work_dir / 'users'
|
||||
users_dir = self.work_dir / "users"
|
||||
snapshots_base = None
|
||||
|
||||
if users_dir.exists():
|
||||
# Find the snapshots directory
|
||||
for user_dir in users_dir.iterdir():
|
||||
if user_dir.is_dir():
|
||||
user_snapshots = user_dir / 'snapshots'
|
||||
user_snapshots = user_dir / "snapshots"
|
||||
if user_snapshots.exists():
|
||||
snapshots_base = user_snapshots
|
||||
break
|
||||
@@ -610,7 +643,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
for snap_dir in domain_dir.iterdir():
|
||||
if snap_dir.is_dir():
|
||||
# Files are directly in snap-uuid/ directory (no plugin subdirs)
|
||||
for f in snap_dir.rglob('*'):
|
||||
for f in snap_dir.rglob("*"):
|
||||
if f.is_file():
|
||||
files_new_structure.append(f)
|
||||
# Track sample files
|
||||
@@ -622,15 +655,15 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
print(f"[*] Sample files in new structure: {len(new_sample_files)}")
|
||||
|
||||
# Check old structure (should be gone or empty)
|
||||
old_archive_dir = self.work_dir / 'archive'
|
||||
old_archive_dir = self.work_dir / "archive"
|
||||
old_files_remaining = []
|
||||
unmigrated_dirs = []
|
||||
if old_archive_dir.exists():
|
||||
for d in old_archive_dir.glob('*'):
|
||||
for d in old_archive_dir.glob("*"):
|
||||
# Only count REAL directories, not symlinks (symlinks are the migrated ones)
|
||||
if d.is_dir(follow_symlinks=False) and d.name.replace('.', '').isdigit():
|
||||
if d.is_dir(follow_symlinks=False) and d.name.replace(".", "").isdigit():
|
||||
# This is a timestamp directory (old structure)
|
||||
files_in_dir = [f for f in d.rglob('*') if f.is_file()]
|
||||
files_in_dir = [f for f in d.rglob("*") if f.is_file()]
|
||||
if files_in_dir:
|
||||
unmigrated_dirs.append((d.name, len(files_in_dir)))
|
||||
old_files_remaining.extend(files_in_dir)
|
||||
@@ -641,30 +674,48 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
print(f"[*] Unmigrated directories: {unmigrated_dirs}")
|
||||
|
||||
# CRITICAL: Verify files were moved to new structure
|
||||
self.assertGreater(files_new_count, 0,
|
||||
"No files found in new structure after update")
|
||||
self.assertGreater(
|
||||
files_new_count,
|
||||
0,
|
||||
"No files found in new structure after update",
|
||||
)
|
||||
|
||||
# CRITICAL: Verify old structure is cleaned up
|
||||
self.assertEqual(old_files_count, 0,
|
||||
f"Old structure not cleaned up: {old_files_count} files still in archive/timestamp/ directories")
|
||||
self.assertEqual(
|
||||
old_files_count,
|
||||
0,
|
||||
f"Old structure not cleaned up: {old_files_count} files still in archive/timestamp/ directories",
|
||||
)
|
||||
|
||||
# CRITICAL: Verify all files were moved (total count should match)
|
||||
total_after_update = files_new_count + old_files_count
|
||||
self.assertEqual(files_before_count, total_after_update,
|
||||
f"Files lost during reorganization: {files_before_count} before → {total_after_update} after")
|
||||
self.assertEqual(
|
||||
files_before_count,
|
||||
total_after_update,
|
||||
f"Files lost during reorganization: {files_before_count} before → {total_after_update} after",
|
||||
)
|
||||
|
||||
# CRITICAL: Verify sample files exist in new structure
|
||||
self.assertGreater(len(new_sample_files), 0,
|
||||
"Sample files not found in new structure")
|
||||
self.assertGreater(
|
||||
len(new_sample_files),
|
||||
0,
|
||||
"Sample files not found in new structure",
|
||||
)
|
||||
|
||||
# Verify new path format
|
||||
for path_key, file_path in new_sample_files.items():
|
||||
# Path should contain: snapshots/YYYYMMDD/domain/snap-uuid/plugin/file
|
||||
path_parts = file_path.parts
|
||||
self.assertIn('snapshots', path_parts,
|
||||
f"New path should contain 'snapshots': {file_path}")
|
||||
self.assertIn('users', path_parts,
|
||||
f"New path should contain 'users': {file_path}")
|
||||
self.assertIn(
|
||||
"snapshots",
|
||||
path_parts,
|
||||
f"New path should contain 'snapshots': {file_path}",
|
||||
)
|
||||
self.assertIn(
|
||||
"users",
|
||||
path_parts,
|
||||
f"New path should contain 'users': {file_path}",
|
||||
)
|
||||
print(f" ✓ {path_key} → {file_path.relative_to(self.work_dir)}")
|
||||
|
||||
# Verify Process and Binary records were created
|
||||
@@ -692,24 +743,33 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
|
||||
# Verify data migration happened correctly
|
||||
# The 0.7.2 gold database has 44 ArchiveResults
|
||||
self.assertEqual(archiveresult_count, 44,
|
||||
f"Expected 44 ArchiveResults from 0.7.2 database, got {archiveresult_count}")
|
||||
self.assertEqual(
|
||||
archiveresult_count,
|
||||
44,
|
||||
f"Expected 44 ArchiveResults from 0.7.2 database, got {archiveresult_count}",
|
||||
)
|
||||
|
||||
# Each ArchiveResult should create one Process record
|
||||
self.assertEqual(process_count, 44,
|
||||
f"Expected 44 Process records (1 per ArchiveResult), got {process_count}")
|
||||
self.assertEqual(
|
||||
process_count,
|
||||
44,
|
||||
f"Expected 44 Process records (1 per ArchiveResult), got {process_count}",
|
||||
)
|
||||
|
||||
# The 44 ArchiveResults use 7 unique binaries (curl, wget, etc.)
|
||||
self.assertEqual(binary_count, 7,
|
||||
f"Expected 7 unique Binary records, got {binary_count}")
|
||||
self.assertEqual(
|
||||
binary_count,
|
||||
7,
|
||||
f"Expected 7 unique Binary records, got {binary_count}",
|
||||
)
|
||||
|
||||
# ALL ArchiveResults should be linked to Process records
|
||||
self.assertEqual(linked_count, 44,
|
||||
f"Expected all 44 ArchiveResults linked to Process, got {linked_count}")
|
||||
self.assertEqual(
|
||||
linked_count,
|
||||
44,
|
||||
f"Expected all 44 ArchiveResults linked to Process, got {linked_count}",
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -22,13 +22,13 @@ class TestFreshInstall(unittest.TestCase):
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
|
||||
try:
|
||||
result = run_archivebox(work_dir, ['init'])
|
||||
result = run_archivebox(work_dir, ["init"])
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Verify database was created
|
||||
self.assertTrue((work_dir / 'index.sqlite3').exists(), "Database not created")
|
||||
self.assertTrue((work_dir / "index.sqlite3").exists(), "Database not created")
|
||||
# Verify archive directory exists
|
||||
self.assertTrue((work_dir / 'archive').is_dir(), "Archive dir not created")
|
||||
self.assertTrue((work_dir / "archive").is_dir(), "Archive dir not created")
|
||||
|
||||
finally:
|
||||
shutil.rmtree(work_dir, ignore_errors=True)
|
||||
@@ -38,10 +38,10 @@ class TestFreshInstall(unittest.TestCase):
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
|
||||
try:
|
||||
result = run_archivebox(work_dir, ['init'])
|
||||
result = run_archivebox(work_dir, ["init"])
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(work_dir, ['status'])
|
||||
result = run_archivebox(work_dir, ["status"])
|
||||
self.assertEqual(result.returncode, 0, f"Status failed: {result.stderr}")
|
||||
|
||||
finally:
|
||||
@@ -52,14 +52,14 @@ class TestFreshInstall(unittest.TestCase):
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
|
||||
try:
|
||||
result = run_archivebox(work_dir, ['init'])
|
||||
result = run_archivebox(work_dir, ["init"])
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Add a URL with --index-only for speed
|
||||
result = run_archivebox(work_dir, ['add', '--index-only', 'https://example.com'])
|
||||
result = run_archivebox(work_dir, ["add", "--index-only", "https://example.com"])
|
||||
self.assertEqual(result.returncode, 0, f"Add command failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
|
||||
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Verify a Crawl was created
|
||||
@@ -82,18 +82,18 @@ class TestFreshInstall(unittest.TestCase):
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
|
||||
try:
|
||||
result = run_archivebox(work_dir, ['init'])
|
||||
result = run_archivebox(work_dir, ["init"])
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(work_dir, ['add', '--index-only', 'https://example.com'])
|
||||
result = run_archivebox(work_dir, ["add", "--index-only", "https://example.com"])
|
||||
self.assertEqual(result.returncode, 0, f"Add failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(work_dir, ['list'])
|
||||
result = run_archivebox(work_dir, ["list"])
|
||||
self.assertEqual(result.returncode, 0, f"List failed: {result.stderr}")
|
||||
|
||||
# Verify the URL appears in output
|
||||
output = result.stdout + result.stderr
|
||||
self.assertIn('example.com', output, f"Added URL not in list output: {output[:500]}")
|
||||
self.assertIn("example.com", output, f"Added URL not in list output: {output[:500]}")
|
||||
|
||||
finally:
|
||||
shutil.rmtree(work_dir, ignore_errors=True)
|
||||
@@ -103,10 +103,10 @@ class TestFreshInstall(unittest.TestCase):
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
|
||||
try:
|
||||
result = run_archivebox(work_dir, ['init'])
|
||||
result = run_archivebox(work_dir, ["init"])
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
|
||||
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT COUNT(*) FROM django_migrations")
|
||||
count = cursor.fetchone()[0]
|
||||
@@ -123,16 +123,16 @@ class TestFreshInstall(unittest.TestCase):
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
|
||||
try:
|
||||
result = run_archivebox(work_dir, ['init'])
|
||||
result = run_archivebox(work_dir, ["init"])
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
|
||||
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT name FROM django_migrations WHERE app='core' ORDER BY name")
|
||||
migrations = [row[0] for row in cursor.fetchall()]
|
||||
conn.close()
|
||||
|
||||
self.assertIn('0001_initial', migrations)
|
||||
self.assertIn("0001_initial", migrations)
|
||||
|
||||
finally:
|
||||
shutil.rmtree(work_dir, ignore_errors=True)
|
||||
@@ -146,16 +146,16 @@ class TestSchemaIntegrity(unittest.TestCase):
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
|
||||
try:
|
||||
result = run_archivebox(work_dir, ['init'])
|
||||
result = run_archivebox(work_dir, ["init"])
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
|
||||
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('PRAGMA table_info(core_snapshot)')
|
||||
cursor.execute("PRAGMA table_info(core_snapshot)")
|
||||
columns = {row[1] for row in cursor.fetchall()}
|
||||
conn.close()
|
||||
|
||||
required = {'id', 'url', 'timestamp', 'title', 'status', 'created_at', 'modified_at'}
|
||||
required = {"id", "url", "timestamp", "title", "status", "created_at", "modified_at"}
|
||||
for col in required:
|
||||
self.assertIn(col, columns, f"Missing column: {col}")
|
||||
|
||||
@@ -167,16 +167,16 @@ class TestSchemaIntegrity(unittest.TestCase):
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
|
||||
try:
|
||||
result = run_archivebox(work_dir, ['init'])
|
||||
result = run_archivebox(work_dir, ["init"])
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
|
||||
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('PRAGMA table_info(core_archiveresult)')
|
||||
cursor.execute("PRAGMA table_info(core_archiveresult)")
|
||||
columns = {row[1] for row in cursor.fetchall()}
|
||||
conn.close()
|
||||
|
||||
required = {'id', 'snapshot_id', 'plugin', 'status', 'created_at', 'modified_at'}
|
||||
required = {"id", "snapshot_id", "plugin", "status", "created_at", "modified_at"}
|
||||
for col in required:
|
||||
self.assertIn(col, columns, f"Missing column: {col}")
|
||||
|
||||
@@ -188,16 +188,16 @@ class TestSchemaIntegrity(unittest.TestCase):
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
|
||||
try:
|
||||
result = run_archivebox(work_dir, ['init'])
|
||||
result = run_archivebox(work_dir, ["init"])
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
|
||||
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('PRAGMA table_info(core_tag)')
|
||||
cursor.execute("PRAGMA table_info(core_tag)")
|
||||
columns = {row[1] for row in cursor.fetchall()}
|
||||
conn.close()
|
||||
|
||||
required = {'id', 'name', 'slug'}
|
||||
required = {"id", "name", "slug"}
|
||||
for col in required:
|
||||
self.assertIn(col, columns, f"Missing column: {col}")
|
||||
|
||||
@@ -209,21 +209,21 @@ class TestSchemaIntegrity(unittest.TestCase):
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
|
||||
try:
|
||||
result = run_archivebox(work_dir, ['init'])
|
||||
result = run_archivebox(work_dir, ["init"])
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
|
||||
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute('PRAGMA table_info(crawls_crawl)')
|
||||
cursor.execute("PRAGMA table_info(crawls_crawl)")
|
||||
columns = {row[1] for row in cursor.fetchall()}
|
||||
conn.close()
|
||||
|
||||
required = {'id', 'urls', 'status', 'created_at', 'created_by_id'}
|
||||
required = {"id", "urls", "status", "created_at", "created_by_id"}
|
||||
for col in required:
|
||||
self.assertIn(col, columns, f"Missing column: {col}")
|
||||
|
||||
# seed_id should NOT exist (removed in 0.9.x)
|
||||
self.assertNotIn('seed_id', columns, "seed_id column should not exist in 0.9.x")
|
||||
self.assertNotIn("seed_id", columns, "seed_id column should not exist in 0.9.x")
|
||||
|
||||
finally:
|
||||
shutil.rmtree(work_dir, ignore_errors=True)
|
||||
@@ -237,17 +237,17 @@ class TestMultipleSnapshots(unittest.TestCase):
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
|
||||
try:
|
||||
result = run_archivebox(work_dir, ['init'])
|
||||
result = run_archivebox(work_dir, ["init"])
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Add URLs one at a time
|
||||
result = run_archivebox(work_dir, ['add', '--index-only', 'https://example.com'])
|
||||
result = run_archivebox(work_dir, ["add", "--index-only", "https://example.com"])
|
||||
self.assertEqual(result.returncode, 0, f"Add 1 failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(work_dir, ['add', '--index-only', 'https://example.org'])
|
||||
result = run_archivebox(work_dir, ["add", "--index-only", "https://example.org"])
|
||||
self.assertEqual(result.returncode, 0, f"Add 2 failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
|
||||
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Verify snapshots were created
|
||||
@@ -270,13 +270,13 @@ class TestMultipleSnapshots(unittest.TestCase):
|
||||
work_dir = Path(tempfile.mkdtemp())
|
||||
|
||||
try:
|
||||
result = run_archivebox(work_dir, ['init'])
|
||||
result = run_archivebox(work_dir, ["init"])
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(work_dir, ['add', '--index-only', 'https://example.com'])
|
||||
result = run_archivebox(work_dir, ["add", "--index-only", "https://example.com"])
|
||||
self.assertEqual(result.returncode, 0, f"Add failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
|
||||
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Check that snapshot has a crawl_id
|
||||
@@ -291,5 +291,5 @@ class TestMultipleSnapshots(unittest.TestCase):
|
||||
shutil.rmtree(work_dir, ignore_errors=True)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -53,23 +53,23 @@ def test_persona_prepare_runtime_for_crawl_clones_and_cleans_profile(initialized
|
||||
'template_dir_recorded': (runtime_root / 'template_dir.txt').read_text().strip(),
|
||||
'chrome_binary_recorded': (runtime_root / 'chrome_binary.txt').read_text().strip(),
|
||||
}))
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60)
|
||||
assert code == 0, stderr
|
||||
|
||||
payload = json.loads(stdout.strip().splitlines()[-1])
|
||||
assert payload['runtime_root_exists'] is True
|
||||
assert payload['runtime_profile_exists'] is True
|
||||
assert payload['runtime_downloads_exists'] is True
|
||||
assert payload['preferences_copied'] is True
|
||||
assert payload['singleton_removed'] is True
|
||||
assert payload['cache_removed'] is True
|
||||
assert payload['log_removed'] is True
|
||||
assert payload['persona_name_recorded'] == 'Default'
|
||||
assert payload['template_dir_recorded'].endswith('/personas/Default/chrome_user_data')
|
||||
assert payload['chrome_binary_recorded'] == '/Applications/Chromium.app/Contents/MacOS/Chromium'
|
||||
assert payload["runtime_root_exists"] is True
|
||||
assert payload["runtime_profile_exists"] is True
|
||||
assert payload["runtime_downloads_exists"] is True
|
||||
assert payload["preferences_copied"] is True
|
||||
assert payload["singleton_removed"] is True
|
||||
assert payload["cache_removed"] is True
|
||||
assert payload["log_removed"] is True
|
||||
assert payload["persona_name_recorded"] == "Default"
|
||||
assert payload["template_dir_recorded"].endswith("/personas/Default/chrome_user_data")
|
||||
assert payload["chrome_binary_recorded"] == "/Applications/Chromium.app/Contents/MacOS/Chromium"
|
||||
|
||||
|
||||
def test_persona_cleanup_runtime_for_crawl_removes_only_runtime_copy(initialized_archive):
|
||||
@@ -102,15 +102,15 @@ def test_persona_cleanup_runtime_for_crawl_removes_only_runtime_copy(initialized
|
||||
'runtime_removed': not runtime_root.exists(),
|
||||
'template_still_exists': (template_dir / 'Default' / 'Preferences').exists(),
|
||||
}))
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60)
|
||||
assert code == 0, stderr
|
||||
|
||||
payload = json.loads(stdout.strip().splitlines()[-1])
|
||||
assert payload['runtime_removed'] is True
|
||||
assert payload['template_still_exists'] is True
|
||||
assert payload["runtime_removed"] is True
|
||||
assert payload["template_still_exists"] is True
|
||||
|
||||
|
||||
def test_crawl_resolve_persona_raises_for_missing_persona_id(initialized_archive):
|
||||
@@ -135,15 +135,15 @@ def test_crawl_resolve_persona_raises_for_missing_persona_id(initialized_archive
|
||||
print(json.dumps({'raised': True, 'message': str(err)}))
|
||||
else:
|
||||
raise SystemExit('resolve_persona unexpectedly succeeded')
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60)
|
||||
assert code == 0, stderr
|
||||
|
||||
payload = json.loads(stdout.strip().splitlines()[-1])
|
||||
assert payload['raised'] is True
|
||||
assert 'references missing Persona' in payload['message']
|
||||
assert payload["raised"] is True
|
||||
assert "references missing Persona" in payload["message"]
|
||||
|
||||
|
||||
def test_get_config_raises_for_missing_persona_id(initialized_archive):
|
||||
@@ -169,12 +169,12 @@ def test_get_config_raises_for_missing_persona_id(initialized_archive):
|
||||
print(json.dumps({'raised': True, 'message': str(err)}))
|
||||
else:
|
||||
raise SystemExit('get_config unexpectedly succeeded')
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60)
|
||||
assert code == 0, stderr
|
||||
|
||||
payload = json.loads(stdout.strip().splitlines()[-1])
|
||||
assert payload['raised'] is True
|
||||
assert 'references missing Persona' in payload['message']
|
||||
assert payload["raised"] is True
|
||||
assert "references missing Persona" in payload["message"]
|
||||
|
||||
@@ -3,7 +3,7 @@ import unittest
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
|
||||
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "archivebox.settings")
|
||||
|
||||
|
||||
from archivebox.machine.models import Process
|
||||
@@ -13,26 +13,25 @@ class TestProcessRuntimePaths(unittest.TestCase):
|
||||
def test_hook_processes_use_isolated_runtime_dir(self):
|
||||
process = Process(
|
||||
process_type=Process.TypeChoices.HOOK,
|
||||
pwd='/tmp/archive/example/chrome',
|
||||
cmd=['node', '/plugins/chrome/on_Snapshot__11_chrome_wait.js', '--url=https://example.com'],
|
||||
pwd="/tmp/archive/example/chrome",
|
||||
cmd=["node", "/plugins/chrome/on_Snapshot__11_chrome_wait.js", "--url=https://example.com"],
|
||||
)
|
||||
|
||||
expected_dir = Path('/tmp/archive/example/chrome/.hooks/on_Snapshot__11_chrome_wait.js')
|
||||
expected_dir = Path("/tmp/archive/example/chrome/.hooks/on_Snapshot__11_chrome_wait.js")
|
||||
self.assertEqual(process.runtime_dir, expected_dir)
|
||||
self.assertEqual(process.stdout_file, expected_dir / 'stdout.log')
|
||||
self.assertEqual(process.stderr_file, expected_dir / 'stderr.log')
|
||||
self.assertEqual(process.pid_file, expected_dir / 'process.pid')
|
||||
self.assertEqual(process.stdout_file, expected_dir / "stdout.log")
|
||||
self.assertEqual(process.stderr_file, expected_dir / "stderr.log")
|
||||
self.assertEqual(process.pid_file, expected_dir / "process.pid")
|
||||
|
||||
def test_non_hook_processes_keep_runtime_files_in_pwd(self):
|
||||
process = Process(
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
pwd='/tmp/archive/example',
|
||||
cmd=['archivebox', 'run', '--snapshot-id', '123'],
|
||||
pwd="/tmp/archive/example",
|
||||
cmd=["archivebox", "run", "--snapshot-id", "123"],
|
||||
)
|
||||
|
||||
expected_dir = Path('/tmp/archive/example')
|
||||
expected_dir = Path("/tmp/archive/example")
|
||||
self.assertEqual(process.runtime_dir, expected_dir)
|
||||
self.assertEqual(process.stdout_file, expected_dir / 'stdout.log')
|
||||
self.assertEqual(process.stderr_file, expected_dir / 'stderr.log')
|
||||
self.assertEqual(process.pid_file, expected_dir / 'process.pid')
|
||||
|
||||
self.assertEqual(process.stdout_file, expected_dir / "stdout.log")
|
||||
self.assertEqual(process.stderr_file, expected_dir / "stderr.log")
|
||||
self.assertEqual(process.pid_file, expected_dir / "process.pid")
|
||||
|
||||
@@ -11,7 +11,6 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
|
||||
|
||||
def wait_for_db_condition(timeout, condition, interval=0.5):
|
||||
deadline = time.time() + timeout
|
||||
while time.time() < deadline:
|
||||
@@ -45,9 +44,7 @@ def run_add_until(args, env, condition, timeout=120):
|
||||
env=env,
|
||||
)
|
||||
|
||||
assert wait_for_db_condition(timeout=timeout, condition=condition), (
|
||||
f"Timed out waiting for condition while running: {' '.join(args)}"
|
||||
)
|
||||
assert wait_for_db_condition(timeout=timeout, condition=condition), f"Timed out waiting for condition while running: {' '.join(args)}"
|
||||
return stop_process(proc)
|
||||
|
||||
|
||||
@@ -60,26 +57,28 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs
|
||||
|
||||
# Enable only parser extractors and background hooks for this test
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
# Disable most extractors
|
||||
"SAVE_WGET": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_HTMLTOTEXT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_DOM": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
"SAVE_FAVICON": "true",
|
||||
})
|
||||
env.update(
|
||||
{
|
||||
# Disable most extractors
|
||||
"SAVE_WGET": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_HTMLTOTEXT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_DOM": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
"SAVE_FAVICON": "true",
|
||||
},
|
||||
)
|
||||
|
||||
proc = subprocess.Popen(
|
||||
['archivebox', 'add', '--depth=1', '--plugins=favicon,parse_html_urls', recursive_test_site['root_url']],
|
||||
["archivebox", "add", "--depth=1", "--plugins=favicon,parse_html_urls", recursive_test_site["root_url"]],
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
@@ -88,9 +87,12 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs
|
||||
|
||||
assert wait_for_db_condition(
|
||||
timeout=120,
|
||||
condition=lambda c: c.execute(
|
||||
"SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')"
|
||||
).fetchone()[0] > 0,
|
||||
condition=lambda c: (
|
||||
c.execute(
|
||||
"SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')",
|
||||
).fetchone()[0]
|
||||
> 0
|
||||
),
|
||||
), "Parser extractors never progressed beyond queued status"
|
||||
stdout, stderr = stop_process(proc)
|
||||
|
||||
@@ -99,18 +101,18 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs
|
||||
if stdout:
|
||||
print(f"\n=== STDOUT (last 2000 chars) ===\n{stdout[-2000:]}\n=== END STDOUT ===\n")
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
snapshots = c.execute("SELECT url, depth, status FROM core_snapshot").fetchall()
|
||||
bg_hooks = c.execute(
|
||||
"SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('favicon', 'consolelog', 'ssl', 'responses', 'redirects', 'staticfile') ORDER BY plugin"
|
||||
"SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('favicon', 'consolelog', 'ssl', 'responses', 'redirects', 'staticfile') ORDER BY plugin",
|
||||
).fetchall()
|
||||
parser_extractors = c.execute(
|
||||
"SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' ORDER BY plugin"
|
||||
"SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' ORDER BY plugin",
|
||||
).fetchall()
|
||||
all_extractors = c.execute(
|
||||
"SELECT plugin, status FROM core_archiveresult ORDER BY plugin"
|
||||
"SELECT plugin, status FROM core_archiveresult ORDER BY plugin",
|
||||
).fetchall()
|
||||
|
||||
conn.close()
|
||||
@@ -122,14 +124,13 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs
|
||||
)
|
||||
|
||||
assert len(all_extractors) > 0, (
|
||||
f"Should have extractors created for snapshot. "
|
||||
f"If this fails, Snapshot.run() may not have started. "
|
||||
f"Got: {all_extractors}"
|
||||
f"Should have extractors created for snapshot. If this fails, Snapshot.run() may not have started. Got: {all_extractors}"
|
||||
)
|
||||
|
||||
parser_statuses = [status for _, status in parser_extractors]
|
||||
assert 'started' in parser_statuses or 'succeeded' in parser_statuses or 'failed' in parser_statuses, \
|
||||
assert "started" in parser_statuses or "succeeded" in parser_statuses or "failed" in parser_statuses, (
|
||||
f"Parser extractors should have run, got statuses: {parser_statuses}. Background hooks: {bg_hooks}"
|
||||
)
|
||||
|
||||
|
||||
def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process, recursive_test_site):
|
||||
@@ -137,26 +138,28 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process, recursive_test
|
||||
os.chdir(tmp_path)
|
||||
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
"SAVE_WGET": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_HTMLTOTEXT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_DOM": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
"SAVE_FAVICON": "false",
|
||||
"USE_CHROME": "false",
|
||||
})
|
||||
env.update(
|
||||
{
|
||||
"SAVE_WGET": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_HTMLTOTEXT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_DOM": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
"SAVE_FAVICON": "false",
|
||||
"USE_CHROME": "false",
|
||||
},
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'add', '--depth=0', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
|
||||
["archivebox", "add", "--depth=0", "--plugins=wget,parse_html_urls", recursive_test_site["root_url"]],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
@@ -164,11 +167,11 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process, recursive_test
|
||||
)
|
||||
assert result.returncode == 0, result.stderr
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
parse_html = c.execute(
|
||||
"SELECT id, status, output_str FROM core_archiveresult WHERE plugin LIKE '%parse_html_urls' ORDER BY id LIMIT 1"
|
||||
"SELECT id, status, output_str FROM core_archiveresult WHERE plugin LIKE '%parse_html_urls' ORDER BY id LIMIT 1",
|
||||
).fetchone()
|
||||
|
||||
conn.close()
|
||||
@@ -177,11 +180,10 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process, recursive_test
|
||||
status = parse_html[1]
|
||||
output = parse_html[2] or ""
|
||||
|
||||
assert status in ['started', 'succeeded', 'failed'], \
|
||||
f"60_parse_html_urls should have run, got status: {status}"
|
||||
assert status in ["started", "succeeded", "failed"], f"60_parse_html_urls should have run, got status: {status}"
|
||||
|
||||
if status == 'succeeded' and output:
|
||||
assert 'parsed' in output.lower(), "Parser summary should report parsed URLs"
|
||||
if status == "succeeded" and output:
|
||||
assert "parsed" in output.lower(), "Parser summary should report parsed URLs"
|
||||
|
||||
urls_jsonl_files = list(Path("users/system/snapshots").rglob("parse_html_urls/**/urls.jsonl"))
|
||||
assert urls_jsonl_files, "parse_html_urls should write urls.jsonl output"
|
||||
@@ -192,8 +194,7 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process, recursive_test
|
||||
records.append(json.loads(line))
|
||||
|
||||
assert records, "urls.jsonl should contain parsed Snapshot records"
|
||||
assert all(record.get("type") == "Snapshot" for record in records), \
|
||||
f"Expected Snapshot JSONL records, got: {records}"
|
||||
assert all(record.get("type") == "Snapshot" for record in records), f"Expected Snapshot JSONL records, got: {records}"
|
||||
|
||||
|
||||
def test_recursive_crawl_creates_child_snapshots(tmp_path, process, recursive_test_site):
|
||||
@@ -201,27 +202,29 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process, recursive_te
|
||||
os.chdir(tmp_path)
|
||||
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
"URL_ALLOWLIST": r"127\.0\.0\.1[:/].*",
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
})
|
||||
env.update(
|
||||
{
|
||||
"URL_ALLOWLIST": r"127\.0\.0\.1[:/].*",
|
||||
"SAVE_READABILITY": "false",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_MERCURY": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_HEADERS": "false",
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_GIT": "false",
|
||||
"SAVE_YTDLP": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
},
|
||||
)
|
||||
|
||||
stdout, stderr = run_add_until(
|
||||
['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
|
||||
["archivebox", "add", "--depth=1", "--plugins=wget,parse_html_urls", recursive_test_site["root_url"]],
|
||||
env=env,
|
||||
timeout=120,
|
||||
condition=lambda c: (
|
||||
c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 0").fetchone()[0] >= 1
|
||||
and c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 1").fetchone()[0] >= len(recursive_test_site['child_urls'])
|
||||
and c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 1").fetchone()[0] >= len(recursive_test_site["child_urls"])
|
||||
),
|
||||
)
|
||||
|
||||
@@ -230,26 +233,26 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process, recursive_te
|
||||
if stdout:
|
||||
print(f"\n=== STDOUT (last 2000 chars) ===\n{stdout[-2000:]}\n=== END STDOUT ===\n")
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
all_snapshots = c.execute("SELECT url, depth FROM core_snapshot").fetchall()
|
||||
root_snapshot = c.execute(
|
||||
"SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 0 ORDER BY created_at LIMIT 1"
|
||||
"SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 0 ORDER BY created_at LIMIT 1",
|
||||
).fetchone()
|
||||
child_snapshots = c.execute(
|
||||
"SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 1"
|
||||
"SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 1",
|
||||
).fetchall()
|
||||
crawl = c.execute(
|
||||
"SELECT id, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
|
||||
"SELECT id, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1",
|
||||
).fetchone()
|
||||
parser_status = c.execute(
|
||||
"SELECT plugin, status FROM core_archiveresult WHERE snapshot_id = ? AND plugin LIKE 'parse_%_urls'",
|
||||
(root_snapshot[0] if root_snapshot else '',)
|
||||
(root_snapshot[0] if root_snapshot else "",),
|
||||
).fetchall()
|
||||
started_extractors = c.execute(
|
||||
"SELECT plugin, status FROM core_archiveresult WHERE snapshot_id = ? AND status = 'started'",
|
||||
(root_snapshot[0] if root_snapshot else '',)
|
||||
(root_snapshot[0] if root_snapshot else "",),
|
||||
).fetchall()
|
||||
|
||||
conn.close()
|
||||
@@ -260,13 +263,13 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process, recursive_te
|
||||
assert crawl is not None, "Crawl should be created"
|
||||
assert crawl[1] == 1, f"Crawl max_depth should be 1, got {crawl[1]}"
|
||||
|
||||
assert len(child_snapshots) > 0, \
|
||||
assert len(child_snapshots) > 0, (
|
||||
f"Child snapshots should be created from monadical.com links. Parser status: {parser_status}. Started extractors blocking: {started_extractors}"
|
||||
)
|
||||
|
||||
for child_id, child_url, child_depth, parent_id in child_snapshots:
|
||||
assert child_depth == 1, f"Child snapshot should have depth=1, got {child_depth}"
|
||||
assert parent_id == root_id, \
|
||||
f"Child snapshot {child_url} should have parent_snapshot_id={root_id}, got {parent_id}"
|
||||
assert parent_id == root_id, f"Child snapshot {child_url} should have parent_snapshot_id={root_id}, got {parent_id}"
|
||||
|
||||
|
||||
def test_recursive_crawl_respects_depth_limit(tmp_path, process, disable_extractors_dict, recursive_test_site):
|
||||
@@ -277,45 +280,45 @@ def test_recursive_crawl_respects_depth_limit(tmp_path, process, disable_extract
|
||||
env["URL_ALLOWLIST"] = r"127\.0\.0\.1[:/].*"
|
||||
|
||||
stdout, stderr = run_add_until(
|
||||
['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
|
||||
["archivebox", "add", "--depth=1", "--plugins=wget,parse_html_urls", recursive_test_site["root_url"]],
|
||||
env=env,
|
||||
timeout=120,
|
||||
condition=lambda c: (
|
||||
c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 0").fetchone()[0] >= 1
|
||||
and c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 1").fetchone()[0] >= len(recursive_test_site['child_urls'])
|
||||
and c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 1").fetchone()[0] >= len(recursive_test_site["child_urls"])
|
||||
and c.execute(
|
||||
"SELECT COUNT(DISTINCT ar.snapshot_id) "
|
||||
"FROM core_archiveresult ar "
|
||||
"JOIN core_snapshot s ON s.id = ar.snapshot_id "
|
||||
"WHERE s.depth = 1 "
|
||||
"AND ar.plugin LIKE 'parse_%_urls' "
|
||||
"AND ar.status IN ('started', 'succeeded', 'failed')"
|
||||
).fetchone()[0] >= len(recursive_test_site['child_urls'])
|
||||
"AND ar.status IN ('started', 'succeeded', 'failed')",
|
||||
).fetchone()[0]
|
||||
>= len(recursive_test_site["child_urls"])
|
||||
),
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
max_depth_found = c.execute(
|
||||
"SELECT MAX(depth) FROM core_snapshot"
|
||||
"SELECT MAX(depth) FROM core_snapshot",
|
||||
).fetchone()[0]
|
||||
depth_counts = c.execute(
|
||||
"SELECT depth, COUNT(*) FROM core_snapshot GROUP BY depth ORDER BY depth"
|
||||
"SELECT depth, COUNT(*) FROM core_snapshot GROUP BY depth ORDER BY depth",
|
||||
).fetchall()
|
||||
|
||||
conn.close()
|
||||
|
||||
assert max_depth_found is not None, "Should have at least one snapshot"
|
||||
assert max_depth_found <= 1, \
|
||||
f"Max depth should not exceed 1, got {max_depth_found}. Depth distribution: {depth_counts}"
|
||||
assert max_depth_found <= 1, f"Max depth should not exceed 1, got {max_depth_found}. Depth distribution: {depth_counts}"
|
||||
|
||||
|
||||
def test_crawl_snapshot_has_parent_snapshot_field(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that Snapshot model has parent_snapshot field."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
# Check schema for parent_snapshot_id column
|
||||
@@ -324,15 +327,14 @@ def test_crawl_snapshot_has_parent_snapshot_field(tmp_path, process, disable_ext
|
||||
|
||||
column_names = [col[1] for col in schema]
|
||||
|
||||
assert 'parent_snapshot_id' in column_names, \
|
||||
f"Snapshot table should have parent_snapshot_id column. Columns: {column_names}"
|
||||
assert "parent_snapshot_id" in column_names, f"Snapshot table should have parent_snapshot_id column. Columns: {column_names}"
|
||||
|
||||
|
||||
def test_snapshot_depth_field_exists(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that Snapshot model has depth field."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
# Check schema for depth column
|
||||
@@ -341,8 +343,7 @@ def test_snapshot_depth_field_exists(tmp_path, process, disable_extractors_dict)
|
||||
|
||||
column_names = [col[1] for col in schema]
|
||||
|
||||
assert 'depth' in column_names, \
|
||||
f"Snapshot table should have depth column. Columns: {column_names}"
|
||||
assert "depth" in column_names, f"Snapshot table should have depth column. Columns: {column_names}"
|
||||
|
||||
|
||||
def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict, recursive_test_site):
|
||||
@@ -353,21 +354,24 @@ def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict
|
||||
env["URL_ALLOWLIST"] = r"127\.0\.0\.1[:/].*"
|
||||
|
||||
stdout, stderr = run_add_until(
|
||||
['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
|
||||
["archivebox", "add", "--depth=1", "--plugins=wget,parse_html_urls", recursive_test_site["root_url"]],
|
||||
env=env,
|
||||
timeout=120,
|
||||
condition=lambda c: c.execute(
|
||||
"SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
|
||||
(recursive_test_site['root_url'],),
|
||||
).fetchone()[0] >= 1,
|
||||
condition=lambda c: (
|
||||
c.execute(
|
||||
"SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
|
||||
(recursive_test_site["root_url"],),
|
||||
).fetchone()[0]
|
||||
>= 1
|
||||
),
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
snapshot = c.execute(
|
||||
"SELECT id, depth FROM core_snapshot WHERE url = ? ORDER BY created_at LIMIT 1",
|
||||
(recursive_test_site['root_url'],)
|
||||
(recursive_test_site["root_url"],),
|
||||
).fetchone()
|
||||
|
||||
conn.close()
|
||||
@@ -381,42 +385,47 @@ def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, p
|
||||
os.chdir(tmp_path)
|
||||
|
||||
env = os.environ.copy()
|
||||
env.update({
|
||||
"SAVE_WGET": "true",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_FAVICON": "true",
|
||||
})
|
||||
|
||||
stdout, stderr = run_add_until(
|
||||
['archivebox', 'add', '--plugins=favicon,wget,parse_html_urls', recursive_test_site['root_url']],
|
||||
env=env,
|
||||
timeout=120,
|
||||
condition=lambda c: c.execute(
|
||||
"SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')"
|
||||
).fetchone()[0] > 0,
|
||||
env.update(
|
||||
{
|
||||
"SAVE_WGET": "true",
|
||||
"SAVE_SINGLEFILE": "false",
|
||||
"SAVE_PDF": "false",
|
||||
"SAVE_SCREENSHOT": "false",
|
||||
"SAVE_FAVICON": "true",
|
||||
},
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
stdout, stderr = run_add_until(
|
||||
["archivebox", "add", "--plugins=favicon,wget,parse_html_urls", recursive_test_site["root_url"]],
|
||||
env=env,
|
||||
timeout=120,
|
||||
condition=lambda c: (
|
||||
c.execute(
|
||||
"SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')",
|
||||
).fetchone()[0]
|
||||
> 0
|
||||
),
|
||||
)
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
bg_results = c.execute(
|
||||
"SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('favicon', 'consolelog', 'ssl', 'responses', 'redirects', 'staticfile') AND status IN ('started', 'succeeded', 'failed')"
|
||||
"SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('favicon', 'consolelog', 'ssl', 'responses', 'redirects', 'staticfile') AND status IN ('started', 'succeeded', 'failed')",
|
||||
).fetchall()
|
||||
parser_status = c.execute(
|
||||
"SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls'"
|
||||
"SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls'",
|
||||
).fetchall()
|
||||
|
||||
conn.close()
|
||||
|
||||
if len(bg_results) > 0:
|
||||
parser_statuses = [status for _, status in parser_status]
|
||||
non_queued = [s for s in parser_statuses if s != 'queued']
|
||||
assert len(non_queued) > 0 or len(parser_status) == 0, \
|
||||
f"With {len(bg_results)} background hooks started, parser extractors should still run. " \
|
||||
f"Got statuses: {parser_statuses}"
|
||||
non_queued = [s for s in parser_statuses if s != "queued"]
|
||||
assert len(non_queued) > 0 or len(parser_status) == 0, (
|
||||
f"With {len(bg_results)} background hooks started, parser extractors should still run. Got statuses: {parser_statuses}"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
import asyncio
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
@@ -12,6 +15,15 @@ pytestmark = pytest.mark.django_db
|
||||
class _DummyBus:
|
||||
def __init__(self, name: str):
|
||||
self.name = name
|
||||
self.registrations = []
|
||||
|
||||
def on(self, event_pattern, handler):
|
||||
registration = SimpleNamespace(event_pattern=event_pattern, handler=handler)
|
||||
self.registrations.append(registration)
|
||||
return registration
|
||||
|
||||
def off(self, event_pattern, registration):
|
||||
self.registrations = [existing for existing in self.registrations if existing is not registration]
|
||||
|
||||
async def stop(self):
|
||||
return None
|
||||
@@ -41,16 +53,16 @@ def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://blog.sweeting.me\nhttps://sweeting.me',
|
||||
urls="https://blog.sweeting.me\nhttps://sweeting.me",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
)
|
||||
snapshot_a = Snapshot.objects.create(
|
||||
url='https://blog.sweeting.me',
|
||||
url="https://blog.sweeting.me",
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
)
|
||||
snapshot_b = Snapshot.objects.create(
|
||||
url='https://sweeting.me',
|
||||
url="https://sweeting.me",
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
)
|
||||
@@ -62,64 +74,66 @@ def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
|
||||
created_buses.append(bus)
|
||||
return bus
|
||||
|
||||
monkeypatch.setattr(runner_module, 'create_bus', fake_create_bus)
|
||||
monkeypatch.setattr(runner_module, 'discover_plugins', lambda: {})
|
||||
monkeypatch.setattr(runner_module, 'ProcessService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'MachineService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'BinaryService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'TagService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'CrawlService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'SnapshotService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'ArchiveResultService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
|
||||
monkeypatch.setattr(runner_module, "create_bus", fake_create_bus)
|
||||
monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
|
||||
monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "MachineService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "TagService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
|
||||
|
||||
download_calls = []
|
||||
|
||||
async def fake_download(*, url, bus, config_overrides, snapshot, **kwargs):
|
||||
download_calls.append(
|
||||
{
|
||||
'url': url,
|
||||
'bus': bus,
|
||||
'snapshot_id': config_overrides['SNAPSHOT_ID'],
|
||||
'source_url': config_overrides['SOURCE_URL'],
|
||||
'abx_snapshot_id': snapshot.id,
|
||||
}
|
||||
"url": url,
|
||||
"bus": bus,
|
||||
"snapshot_id": config_overrides["SNAPSHOT_ID"],
|
||||
"source_url": config_overrides["SOURCE_URL"],
|
||||
"abx_snapshot_id": snapshot.id,
|
||||
},
|
||||
)
|
||||
await asyncio.sleep(0)
|
||||
return []
|
||||
|
||||
monkeypatch.setattr(runner_module, 'download', fake_download)
|
||||
monkeypatch.setattr(runner_module, "download", fake_download)
|
||||
|
||||
crawl_runner = runner_module.CrawlRunner(crawl)
|
||||
snapshot_data = {
|
||||
str(snapshot_a.id): {
|
||||
'id': str(snapshot_a.id),
|
||||
'url': snapshot_a.url,
|
||||
'title': snapshot_a.title,
|
||||
'timestamp': snapshot_a.timestamp,
|
||||
'bookmarked_at': snapshot_a.bookmarked_at.isoformat() if snapshot_a.bookmarked_at else "",
|
||||
'created_at': snapshot_a.created_at.isoformat() if snapshot_a.created_at else "",
|
||||
'tags': snapshot_a.tags_str(),
|
||||
'depth': snapshot_a.depth,
|
||||
'parent_snapshot_id': str(snapshot_a.parent_snapshot_id) if snapshot_a.parent_snapshot_id else None,
|
||||
'output_dir': str(snapshot_a.output_dir),
|
||||
'config': crawl_runner._snapshot_config(snapshot_a),
|
||||
"id": str(snapshot_a.id),
|
||||
"url": snapshot_a.url,
|
||||
"status": snapshot_a.status,
|
||||
"title": snapshot_a.title,
|
||||
"timestamp": snapshot_a.timestamp,
|
||||
"bookmarked_at": snapshot_a.bookmarked_at.isoformat() if snapshot_a.bookmarked_at else "",
|
||||
"created_at": snapshot_a.created_at.isoformat() if snapshot_a.created_at else "",
|
||||
"tags": snapshot_a.tags_str(),
|
||||
"depth": snapshot_a.depth,
|
||||
"parent_snapshot_id": str(snapshot_a.parent_snapshot_id) if snapshot_a.parent_snapshot_id else None,
|
||||
"output_dir": str(snapshot_a.output_dir),
|
||||
"config": crawl_runner._snapshot_config(snapshot_a),
|
||||
},
|
||||
str(snapshot_b.id): {
|
||||
'id': str(snapshot_b.id),
|
||||
'url': snapshot_b.url,
|
||||
'title': snapshot_b.title,
|
||||
'timestamp': snapshot_b.timestamp,
|
||||
'bookmarked_at': snapshot_b.bookmarked_at.isoformat() if snapshot_b.bookmarked_at else "",
|
||||
'created_at': snapshot_b.created_at.isoformat() if snapshot_b.created_at else "",
|
||||
'tags': snapshot_b.tags_str(),
|
||||
'depth': snapshot_b.depth,
|
||||
'parent_snapshot_id': str(snapshot_b.parent_snapshot_id) if snapshot_b.parent_snapshot_id else None,
|
||||
'output_dir': str(snapshot_b.output_dir),
|
||||
'config': crawl_runner._snapshot_config(snapshot_b),
|
||||
"id": str(snapshot_b.id),
|
||||
"url": snapshot_b.url,
|
||||
"status": snapshot_b.status,
|
||||
"title": snapshot_b.title,
|
||||
"timestamp": snapshot_b.timestamp,
|
||||
"bookmarked_at": snapshot_b.bookmarked_at.isoformat() if snapshot_b.bookmarked_at else "",
|
||||
"created_at": snapshot_b.created_at.isoformat() if snapshot_b.created_at else "",
|
||||
"tags": snapshot_b.tags_str(),
|
||||
"depth": snapshot_b.depth,
|
||||
"parent_snapshot_id": str(snapshot_b.parent_snapshot_id) if snapshot_b.parent_snapshot_id else None,
|
||||
"output_dir": str(snapshot_b.output_dir),
|
||||
"config": crawl_runner._snapshot_config(snapshot_b),
|
||||
},
|
||||
}
|
||||
monkeypatch.setattr(crawl_runner, '_load_snapshot_run_data', lambda snapshot_id: snapshot_data[snapshot_id])
|
||||
monkeypatch.setattr(crawl_runner, "_load_snapshot_run_data", lambda snapshot_id: snapshot_data[snapshot_id])
|
||||
|
||||
async def run_both():
|
||||
await asyncio.gather(
|
||||
@@ -130,9 +144,9 @@ def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
|
||||
asyncio.run(run_both())
|
||||
|
||||
assert len(download_calls) == 2
|
||||
assert {call['snapshot_id'] for call in download_calls} == {str(snapshot_a.id), str(snapshot_b.id)}
|
||||
assert {call['source_url'] for call in download_calls} == {snapshot_a.url, snapshot_b.url}
|
||||
assert len({id(call['bus']) for call in download_calls}) == 2
|
||||
assert {call["snapshot_id"] for call in download_calls} == {str(snapshot_a.id), str(snapshot_b.id)}
|
||||
assert {call["source_url"] for call in download_calls} == {snapshot_a.url, snapshot_b.url}
|
||||
assert len({id(call["bus"]) for call in download_calls}) == 2
|
||||
assert len(created_buses) == 3 # 1 crawl bus + 2 isolated snapshot buses
|
||||
|
||||
|
||||
@@ -146,38 +160,40 @@ def test_ensure_background_runner_starts_when_none_running(monkeypatch):
|
||||
def __init__(self, args, **kwargs):
|
||||
popen_calls.append((args, kwargs))
|
||||
|
||||
monkeypatch.setattr(machine_models.Process, 'cleanup_stale_running', classmethod(lambda cls, machine=None: 0))
|
||||
monkeypatch.setattr(machine_models.Machine, 'current', classmethod(lambda cls: SimpleNamespace(id='machine-1')))
|
||||
monkeypatch.setattr(machine_models.Process, "cleanup_stale_running", classmethod(lambda cls, machine=None: 0))
|
||||
monkeypatch.setattr(machine_models.Process, "cleanup_orphaned_workers", classmethod(lambda cls: 0))
|
||||
monkeypatch.setattr(machine_models.Machine, "current", classmethod(lambda cls: SimpleNamespace(id="machine-1")))
|
||||
monkeypatch.setattr(
|
||||
machine_models.Process.objects,
|
||||
'filter',
|
||||
"filter",
|
||||
lambda **kwargs: SimpleNamespace(exists=lambda: False),
|
||||
)
|
||||
monkeypatch.setattr(runner_module.subprocess, 'Popen', DummyPopen)
|
||||
monkeypatch.setattr(runner_module.subprocess, "Popen", DummyPopen)
|
||||
|
||||
started = runner_module.ensure_background_runner(allow_under_pytest=True)
|
||||
|
||||
assert started is True
|
||||
assert len(popen_calls) == 1
|
||||
assert popen_calls[0][0] == [runner_module.sys.executable, '-m', 'archivebox', 'run', '--daemon']
|
||||
assert popen_calls[0][1]['stdin'] is subprocess.DEVNULL
|
||||
assert popen_calls[0][0] == [runner_module.sys.executable, "-m", "archivebox", "run", "--daemon"]
|
||||
assert popen_calls[0][1]["stdin"] is subprocess.DEVNULL
|
||||
|
||||
|
||||
def test_ensure_background_runner_skips_when_orchestrator_running(monkeypatch):
|
||||
import archivebox.machine.models as machine_models
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
monkeypatch.setattr(machine_models.Process, 'cleanup_stale_running', classmethod(lambda cls, machine=None: 0))
|
||||
monkeypatch.setattr(machine_models.Machine, 'current', classmethod(lambda cls: SimpleNamespace(id='machine-1')))
|
||||
monkeypatch.setattr(machine_models.Process, "cleanup_stale_running", classmethod(lambda cls, machine=None: 0))
|
||||
monkeypatch.setattr(machine_models.Process, "cleanup_orphaned_workers", classmethod(lambda cls: 0))
|
||||
monkeypatch.setattr(machine_models.Machine, "current", classmethod(lambda cls: SimpleNamespace(id="machine-1")))
|
||||
monkeypatch.setattr(
|
||||
machine_models.Process.objects,
|
||||
'filter',
|
||||
"filter",
|
||||
lambda **kwargs: SimpleNamespace(exists=lambda: True),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
runner_module.subprocess,
|
||||
'Popen',
|
||||
lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError('runner should not be spawned')),
|
||||
"Popen",
|
||||
lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError("runner should not be spawned")),
|
||||
)
|
||||
|
||||
started = runner_module.ensure_background_runner(allow_under_pytest=True)
|
||||
@@ -191,20 +207,20 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
urls="https://example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
)
|
||||
|
||||
class _Iface:
|
||||
id = 'iface-1'
|
||||
machine = SimpleNamespace(id='machine-1')
|
||||
machine_id = 'machine-1'
|
||||
id = "iface-1"
|
||||
machine = SimpleNamespace(id="machine-1")
|
||||
machine_id = "machine-1"
|
||||
|
||||
saved_updates = []
|
||||
|
||||
class _Proc:
|
||||
iface_id = None
|
||||
machine_id = 'machine-1'
|
||||
machine_id = "machine-1"
|
||||
iface = None
|
||||
machine = None
|
||||
|
||||
@@ -213,23 +229,23 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
|
||||
|
||||
proc = _Proc()
|
||||
|
||||
monkeypatch.setattr(runner_module, 'discover_plugins', lambda: {})
|
||||
monkeypatch.setattr(runner_module, 'create_bus', lambda **kwargs: _DummyBus(kwargs['name']))
|
||||
monkeypatch.setattr(runner_module, 'ProcessService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'MachineService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'BinaryService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'TagService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'CrawlService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'SnapshotService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'ArchiveResultService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
|
||||
monkeypatch.setattr(runner_module, "create_bus", lambda **kwargs: _DummyBus(kwargs["name"]))
|
||||
monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "MachineService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "TagService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
|
||||
|
||||
from archivebox.machine.models import NetworkInterface, Process
|
||||
from archivebox.config import configset as configset_module
|
||||
|
||||
refresh_calls = []
|
||||
monkeypatch.setattr(NetworkInterface, 'current', classmethod(lambda cls, refresh=False: refresh_calls.append(refresh) or _Iface()))
|
||||
monkeypatch.setattr(Process, 'current', classmethod(lambda cls: proc))
|
||||
monkeypatch.setattr(configset_module, 'get_config', lambda **kwargs: {})
|
||||
monkeypatch.setattr(NetworkInterface, "current", classmethod(lambda cls, refresh=False: refresh_calls.append(refresh) or _Iface()))
|
||||
monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
|
||||
monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {})
|
||||
|
||||
crawl_runner = runner_module.CrawlRunner(crawl)
|
||||
crawl_runner._prepare()
|
||||
@@ -237,7 +253,182 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
|
||||
assert refresh_calls == [True]
|
||||
assert proc.iface is not None
|
||||
assert proc.machine == proc.iface.machine
|
||||
assert saved_updates == [('iface', 'machine', 'modified_at')]
|
||||
assert saved_updates == [("iface", "machine", "modified_at")]
|
||||
|
||||
|
||||
def test_installed_binary_config_overrides_include_valid_installed_binaries(monkeypatch):
    """Only installed binaries that pass the patched executable check become overrides.

    Two ``Binary`` rows are created for the same machine: ``postlight-parser``
    pointing at ``sys.executable`` and ``wget`` pointing at a bogus path.
    ``Path.is_file`` is patched to accept both paths, while ``os.access`` (as
    seen through the runner module) accepts only ``sys.executable``.  The
    resulting overrides must expose the first binary under both
    ``MERCURY_BINARY`` and ``POSTLIGHT_PARSER_BINARY`` and must not contain
    ``WGET_BINARY``.
    """
    from archivebox.machine.models import Binary, Machine
    from archivebox.services import runner as runner_module
    from abx_dl.models import Plugin

    machine_fields = {
        "guid": "test-guid-runner-overrides",
        "hostname": "runner-host",
        "hw_in_docker": False,
        "hw_in_vm": False,
        "hw_manufacturer": "Test",
        "hw_product": "Test Product",
        "hw_uuid": "test-hw-runner-overrides",
        "os_arch": "arm64",
        "os_family": "darwin",
        "os_platform": "macOS",
        "os_release": "14.0",
        "os_kernel": "Darwin",
        "stats": {},
        "config": {},
    }
    machine = Machine.objects.create(**machine_fields)

    def create_installed_binary(**fields):
        # Both fixtures share the machine and the INSTALLED status.
        return Binary.objects.create(
            machine=machine,
            status=Binary.StatusChoices.INSTALLED,
            **fields,
        )

    mercury_binary = create_installed_binary(
        name="postlight-parser",
        abspath=sys.executable,
        version="2.0.0",
        binprovider="pip",
        binproviders="env,pip",
    )
    wget_binary = create_installed_binary(
        name="wget",
        abspath="/tmp/not-an-executable",
        version="1.0.0",
        binprovider="env",
        binproviders="env",
    )

    def current_machine(cls):
        return machine

    def fake_is_file(self):
        # Evaluated lazily on every call so the abspaths are read at check time.
        return str(self) in {sys.executable, mercury_binary.abspath, wget_binary.abspath}

    def fake_access(path, mode):
        # Only the real interpreter path counts as accessible.
        return str(path) == sys.executable

    monkeypatch.setattr(Machine, "current", classmethod(current_machine))
    monkeypatch.setattr(Path, "is_file", fake_is_file)
    monkeypatch.setattr(runner_module.os, "access", fake_access)

    mercury_plugin = Plugin(
        name="mercury",
        path=Path("."),
        hooks=[],
        config_schema={"MERCURY_BINARY": {"type": "string", "default": "postlight-parser"}},
    )
    overrides = runner_module._installed_binary_config_overrides({"mercury": mercury_plugin})

    assert overrides["MERCURY_BINARY"] == sys.executable
    assert overrides["POSTLIGHT_PARSER_BINARY"] == sys.executable
    assert "WGET_BINARY" not in overrides
|
||||
|
||||
|
||||
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch):
    """A depth-1 snapshot is cancelled, not downloaded, once the size limit is hit.

    ``_limit_stop_reason`` is patched to always report ``"max_size"`` and
    ``download`` is patched to fail the test if it is ever invoked.  Running
    ``_run_snapshot`` for the child must route the snapshot id to
    ``_cancel_snapshot_due_to_limit`` exactly once.
    """
    import asgiref.sync

    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.services import runner as runner_module

    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
        max_size=16,
    )

    def fake_create_bus(**kwargs):
        return _DummyBus(kwargs["name"])

    def fake_limit_stop_reason(config):
        return "max_size"

    def fake_sync_to_async(func, thread_sensitive=True):
        # Run the wrapped callable through the file's synchronous shim.
        def call(*args, **kwargs):
            return _call_sync(func, *args, **kwargs)

        return call

    def forbidden_download(*args, **kwargs):
        raise AssertionError("snapshot download should have been skipped")

    monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
    monkeypatch.setattr(runner_module, "create_bus", fake_create_bus)
    service_names = (
        "ProcessService",
        "MachineService",
        "BinaryService",
        "TagService",
        "CrawlService",
        "SnapshotService",
        "ArchiveResultService",
    )
    for service_name in service_names:
        monkeypatch.setattr(runner_module, service_name, _DummyService)
    monkeypatch.setattr(runner_module, "_limit_stop_reason", fake_limit_stop_reason)
    monkeypatch.setattr(asgiref.sync, "sync_to_async", fake_sync_to_async)
    monkeypatch.setattr(runner_module, "download", forbidden_download)

    crawl_runner = runner_module.CrawlRunner(crawl)
    cancelled: list[str] = []

    def fake_load_snapshot_run_data(snapshot_id):
        # A fresh dict per call, describing a queued child snapshot.
        return {
            "id": snapshot_id,
            "url": "https://example.com/child",
            "title": "",
            "timestamp": "",
            "bookmarked_at": "",
            "created_at": "",
            "tags": "",
            "depth": 1,
            "status": "queued",
            "parent_snapshot_id": None,
            "output_dir": "/tmp/child",
            "config": {"CRAWL_DIR": "/tmp/crawl", "MAX_SIZE": 16},
        }

    def record_cancel(snapshot_id):
        cancelled.append(snapshot_id)

    # Instance-level overrides: only this runner object is affected.
    crawl_runner._load_snapshot_run_data = fake_load_snapshot_run_data
    crawl_runner._cancel_snapshot_due_to_limit = record_cancel

    asyncio.run(crawl_runner._run_snapshot("child-1"))

    assert cancelled == ["child-1"]
|
||||
|
||||
|
||||
def test_seal_snapshot_cancels_queued_descendants_after_max_size():
    """Sealing the root after a ``max_size`` stop also seals its queued child.

    A ``limits.json`` state file recording ``stop_reason == "max_size"`` is
    written under ``<crawl.output_dir>/.abx-dl`` before ``_seal_snapshot`` is
    called on the root.  Afterwards both the root and the child must be
    ``SEALED`` and the child's ``retry_at`` must be cleared.
    """
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services.snapshot_service import SnapshotService
    from abx_dl.orchestrator import create_bus

    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
        max_size=16,
    )
    root = Snapshot.objects.create(
        url="https://example.com",
        crawl=crawl,
        status=Snapshot.StatusChoices.STARTED,
    )
    child = Snapshot.objects.create(
        url="https://example.com/child",
        crawl=crawl,
        depth=1,
        parent_snapshot_id=root.id,
        status=Snapshot.StatusChoices.QUEUED,
    )

    # Persist limit state that already exceeds the crawl's max_size of 16.
    state_dir = Path(crawl.output_dir) / ".abx-dl"
    state_dir.mkdir(parents=True, exist_ok=True)
    limits_state = {
        "admitted_snapshot_ids": [str(root.id), str(child.id)],
        "counted_process_ids": ["proc-1"],
        "total_size": 32,
        "stop_reason": "max_size",
    }
    limits_path = state_dir / "limits.json"
    limits_path.write_text(json.dumps(limits_state), encoding="utf-8")

    def ignore_schedule(snapshot_id):
        return None

    bus = create_bus(name="test_snapshot_limit_cancel")
    service = SnapshotService(bus, crawl_id=str(crawl.id), schedule_snapshot=ignore_schedule)
    try:
        sealed_id = service._seal_snapshot(str(root.id))
    finally:
        # Always stop the bus, even when sealing raises.
        asyncio.run(bus.stop())

    root.refresh_from_db()
    child.refresh_from_db()

    assert sealed_id == str(root.id)
    assert root.status == Snapshot.StatusChoices.SEALED
    assert child.status == Snapshot.StatusChoices.SEALED
    assert child.retry_at is None
|
||||
|
||||
|
||||
def test_create_crawl_api_queues_crawl_without_spawning_runner(monkeypatch):
|
||||
@@ -245,28 +436,28 @@ def test_create_crawl_api_queues_crawl_without_spawning_runner(monkeypatch):
|
||||
from archivebox.api.v1_crawls import CrawlCreateSchema, create_crawl
|
||||
|
||||
user = get_user_model().objects.create_superuser(
|
||||
username='runner-api-admin',
|
||||
email='runner-api-admin@example.com',
|
||||
password='testpassword',
|
||||
username="runner-api-admin",
|
||||
email="runner-api-admin@example.com",
|
||||
password="testpassword",
|
||||
)
|
||||
request = RequestFactory().post('/api/v1/crawls')
|
||||
request = RequestFactory().post("/api/v1/crawls")
|
||||
request.user = user
|
||||
|
||||
crawl = create_crawl(
|
||||
request,
|
||||
CrawlCreateSchema(
|
||||
urls=['https://example.com'],
|
||||
urls=["https://example.com"],
|
||||
max_depth=0,
|
||||
tags=[],
|
||||
tags_str='',
|
||||
label='',
|
||||
notes='',
|
||||
tags_str="",
|
||||
label="",
|
||||
notes="",
|
||||
config={},
|
||||
),
|
||||
)
|
||||
|
||||
assert str(crawl.id)
|
||||
assert crawl.status == 'queued'
|
||||
assert crawl.status == "queued"
|
||||
assert crawl.retry_at is not None
|
||||
|
||||
|
||||
@@ -278,36 +469,36 @@ def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
urls="https://example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
snapshot = Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
url="https://example.com",
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
|
||||
monkeypatch.setattr(runner_module, '_attach_bus_trace', lambda bus: None)
|
||||
monkeypatch.setattr(runner_module, '_stop_bus_trace', lambda bus: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
|
||||
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None)
|
||||
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
|
||||
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
|
||||
monkeypatch.setattr(
|
||||
asgiref.sync,
|
||||
'sync_to_async',
|
||||
lambda func, thread_sensitive=True: (lambda *args, **kwargs: _call_sync(func, *args, **kwargs)),
|
||||
"sync_to_async",
|
||||
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
|
||||
)
|
||||
monkeypatch.setattr(Crawl.objects, 'get', lambda id: crawl)
|
||||
monkeypatch.setattr(crawl, 'is_finished', lambda: False)
|
||||
monkeypatch.setattr(crawl, 'save', lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_prepare', lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_create_live_ui', lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_initial_snapshot_ids', lambda self: [str(snapshot.id)])
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_setup', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, 'enqueue_snapshot', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_wait_for_snapshot_tasks', lambda self: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_cleanup', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_cleanup_persona', lambda self: None)
|
||||
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
|
||||
monkeypatch.setattr(crawl, "is_finished", lambda: False)
|
||||
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)])
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None)
|
||||
|
||||
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
||||
|
||||
@@ -323,39 +514,39 @@ def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
urls="https://example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
snapshot = Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
url="https://example.com",
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
|
||||
monkeypatch.setattr(runner_module, 'create_bus', lambda *args, **kwargs: _DummyBus('runner'))
|
||||
monkeypatch.setattr(runner_module, 'discover_plugins', lambda: {})
|
||||
monkeypatch.setattr(runner_module, 'ProcessService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'MachineService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'BinaryService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'TagService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'CrawlService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'SnapshotService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, 'ArchiveResultService', _DummyService)
|
||||
monkeypatch.setattr(runner_module, '_attach_bus_trace', lambda bus: None)
|
||||
monkeypatch.setattr(runner_module, '_stop_bus_trace', lambda bus: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
|
||||
monkeypatch.setattr(Crawl.objects, 'get', lambda id: crawl)
|
||||
monkeypatch.setattr(crawl, 'save', lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(crawl, 'cleanup', lambda: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_prepare', lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_create_live_ui', lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_initial_snapshot_ids', lambda self: [str(snapshot.id)])
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_setup', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, 'enqueue_snapshot', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_wait_for_snapshot_tasks', lambda self: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_cleanup', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_cleanup_persona', lambda self: None)
|
||||
monkeypatch.setattr(runner_module, "create_bus", lambda *args, **kwargs: _DummyBus("runner"))
|
||||
monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
|
||||
monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "MachineService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "TagService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
|
||||
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None)
|
||||
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
|
||||
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
|
||||
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(crawl, "cleanup", lambda: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)])
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None)
|
||||
|
||||
sync_to_async_wrapped: list[str] = []
|
||||
sync_to_async_active = False
|
||||
@@ -363,28 +554,29 @@ def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
|
||||
def fake_sync_to_async(func, thread_sensitive=True):
|
||||
async def wrapper(*args, **kwargs):
|
||||
nonlocal sync_to_async_active
|
||||
sync_to_async_wrapped.append(getattr(func, '__name__', repr(func)))
|
||||
sync_to_async_wrapped.append(getattr(func, "__name__", repr(func)))
|
||||
previous = sync_to_async_active
|
||||
sync_to_async_active = True
|
||||
try:
|
||||
return func(*args, **kwargs)
|
||||
finally:
|
||||
sync_to_async_active = previous
|
||||
|
||||
return wrapper
|
||||
|
||||
def guarded_is_finished():
|
||||
assert sync_to_async_active is True
|
||||
return False
|
||||
|
||||
monkeypatch.setattr(asgiref.sync, 'sync_to_async', fake_sync_to_async)
|
||||
monkeypatch.setattr(crawl, 'is_finished', guarded_is_finished)
|
||||
monkeypatch.setattr(asgiref.sync, "sync_to_async", fake_sync_to_async)
|
||||
monkeypatch.setattr(crawl, "is_finished", guarded_is_finished)
|
||||
|
||||
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
||||
|
||||
crawl.refresh_from_db()
|
||||
assert crawl.status == Crawl.StatusChoices.STARTED
|
||||
assert crawl.retry_at is not None
|
||||
assert 'guarded_is_finished' in sync_to_async_wrapped
|
||||
assert "guarded_is_finished" in sync_to_async_wrapped
|
||||
|
||||
|
||||
def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
|
||||
@@ -393,16 +585,16 @@ def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
urls="https://example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
)
|
||||
crawl_runner = runner_module.CrawlRunner(crawl)
|
||||
|
||||
async def run_test():
|
||||
task = asyncio.get_running_loop().create_future()
|
||||
task.set_exception(RuntimeError('snapshot failed'))
|
||||
crawl_runner.snapshot_tasks['snap-1'] = task
|
||||
with pytest.raises(RuntimeError, match='snapshot failed'):
|
||||
task.set_exception(RuntimeError("snapshot failed"))
|
||||
crawl_runner.snapshot_tasks["snap-1"] = task
|
||||
with pytest.raises(RuntimeError, match="snapshot failed"):
|
||||
await crawl_runner._wait_for_snapshot_tasks()
|
||||
|
||||
asyncio.run(run_test())
|
||||
@@ -414,7 +606,7 @@ def test_wait_for_snapshot_tasks_returns_after_completed_tasks_are_pruned():
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
urls="https://example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
)
|
||||
crawl_runner = runner_module.CrawlRunner(crawl)
|
||||
@@ -424,7 +616,7 @@ def test_wait_for_snapshot_tasks_returns_after_completed_tasks_are_pruned():
|
||||
|
||||
async def run_test():
|
||||
task = asyncio.create_task(finish_snapshot())
|
||||
crawl_runner.snapshot_tasks['snap-1'] = task
|
||||
crawl_runner.snapshot_tasks["snap-1"] = task
|
||||
await asyncio.wait_for(crawl_runner._wait_for_snapshot_tasks(), timeout=0.5)
|
||||
assert crawl_runner.snapshot_tasks == {}
|
||||
|
||||
@@ -439,43 +631,47 @@ def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
urls="https://example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
snapshot = Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
url="https://example.com",
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
|
||||
monkeypatch.setattr(runner_module, '_attach_bus_trace', lambda bus: None)
|
||||
monkeypatch.setattr(runner_module, '_stop_bus_trace', lambda bus: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
|
||||
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None)
|
||||
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
|
||||
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
|
||||
monkeypatch.setattr(
|
||||
asgiref.sync,
|
||||
'sync_to_async',
|
||||
lambda func, thread_sensitive=True: (lambda *args, **kwargs: _call_sync(func, *args, **kwargs)),
|
||||
"sync_to_async",
|
||||
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
|
||||
)
|
||||
monkeypatch.setattr(Crawl.objects, 'get', lambda id: crawl)
|
||||
monkeypatch.setattr(crawl, 'is_finished', lambda: False)
|
||||
monkeypatch.setattr(crawl, 'save', lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_prepare', lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_create_live_ui', lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_initial_snapshot_ids', lambda self: [str(snapshot.id)])
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_setup', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, 'enqueue_snapshot', lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_wait_for_snapshot_tasks', lambda self: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_cleanup_persona', lambda self: None)
|
||||
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
|
||||
monkeypatch.setattr(crawl, "is_finished", lambda: False)
|
||||
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)])
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None)
|
||||
|
||||
cleanup_calls = []
|
||||
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_cleanup', lambda self, snapshot_id: cleanup_calls.append('abx_cleanup') or asyncio.sleep(0))
|
||||
monkeypatch.setattr(crawl, 'cleanup', lambda: cleanup_calls.append('crawl_cleanup'))
|
||||
monkeypatch.setattr(
|
||||
runner_module.CrawlRunner,
|
||||
"_run_crawl_cleanup",
|
||||
lambda self, snapshot_id: cleanup_calls.append("abx_cleanup") or asyncio.sleep(0),
|
||||
)
|
||||
monkeypatch.setattr(crawl, "cleanup", lambda: cleanup_calls.append("crawl_cleanup"))
|
||||
|
||||
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
||||
|
||||
assert cleanup_calls == ['crawl_cleanup', 'abx_cleanup']
|
||||
assert cleanup_calls == ["crawl_cleanup", "abx_cleanup"]
|
||||
|
||||
|
||||
def test_abx_process_service_background_monitor_finishes_after_process_exit(monkeypatch, tmp_path):
|
||||
@@ -497,7 +693,7 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
|
||||
return ["daemon output\n"]
|
||||
|
||||
service._emit_event = fake_emit_event
|
||||
monkeypatch.setattr(service, '_stream_stdout', fake_stream_stdout)
|
||||
monkeypatch.setattr(service, "_stream_stdout", fake_stream_stdout)
|
||||
|
||||
class FakeAsyncProcess:
|
||||
def __init__(self):
|
||||
@@ -509,32 +705,32 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
|
||||
self.returncode = 0
|
||||
return 0
|
||||
|
||||
plugin_output_dir = tmp_path / 'chrome'
|
||||
plugin_output_dir = tmp_path / "chrome"
|
||||
plugin_output_dir.mkdir()
|
||||
stdout_file = plugin_output_dir / 'on_Crawl__90_chrome_launch.daemon.bg.stdout.log'
|
||||
stderr_file = plugin_output_dir / 'on_Crawl__90_chrome_launch.daemon.bg.stderr.log'
|
||||
stderr_file.write_text('')
|
||||
pid_file = plugin_output_dir / 'on_Crawl__90_chrome_launch.daemon.bg.pid'
|
||||
pid_file.write_text('12345')
|
||||
stdout_file = plugin_output_dir / "on_Crawl__90_chrome_launch.daemon.bg.stdout.log"
|
||||
stderr_file = plugin_output_dir / "on_Crawl__90_chrome_launch.daemon.bg.stderr.log"
|
||||
stderr_file.write_text("")
|
||||
pid_file = plugin_output_dir / "on_Crawl__90_chrome_launch.daemon.bg.pid"
|
||||
pid_file.write_text("12345")
|
||||
|
||||
proc = AbxProcess(
|
||||
cmd=['hook'],
|
||||
cmd=["hook"],
|
||||
pwd=str(plugin_output_dir),
|
||||
timeout=60,
|
||||
started_at=now_iso(),
|
||||
plugin='chrome',
|
||||
hook_name='on_Crawl__90_chrome_launch.daemon.bg',
|
||||
plugin="chrome",
|
||||
hook_name="on_Crawl__90_chrome_launch.daemon.bg",
|
||||
)
|
||||
process = FakeAsyncProcess()
|
||||
event = SimpleNamespace(
|
||||
plugin_name='chrome',
|
||||
hook_name='on_Crawl__90_chrome_launch.daemon.bg',
|
||||
hook_path='hook',
|
||||
hook_args=['--url=https://example.org/'],
|
||||
plugin_name="chrome",
|
||||
hook_name="on_Crawl__90_chrome_launch.daemon.bg",
|
||||
hook_path="hook",
|
||||
hook_args=["--url=https://example.org/"],
|
||||
env={},
|
||||
output_dir=str(plugin_output_dir),
|
||||
timeout=60,
|
||||
snapshot_id='snap-1',
|
||||
snapshot_id="snap-1",
|
||||
is_background=True,
|
||||
)
|
||||
|
||||
@@ -566,28 +762,29 @@ def test_run_pending_crawls_runs_due_snapshot_in_place(monkeypatch):
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
urls="https://example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.SEALED,
|
||||
)
|
||||
snapshot = Snapshot.objects.create(
|
||||
url='https://example.com',
|
||||
url="https://example.com",
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
retry_at=runner_module.timezone.now(),
|
||||
)
|
||||
|
||||
monkeypatch.setattr(type(snapshot), 'claim_processing_lock', lambda self, lock_seconds=60: True)
|
||||
monkeypatch.setattr(type(crawl), 'claim_processing_lock', lambda self, lock_seconds=60: True)
|
||||
monkeypatch.setattr(type(snapshot), "claim_processing_lock", lambda self, lock_seconds=60: True)
|
||||
monkeypatch.setattr(type(crawl), "claim_processing_lock", lambda self, lock_seconds=60: True)
|
||||
|
||||
run_calls: list[tuple[str, list[str] | None, bool]] = []
|
||||
|
||||
def fake_run_crawl(crawl_id, snapshot_ids=None, selected_plugins=None, process_discovered_snapshots_inline=True):
|
||||
run_calls.append((crawl_id, snapshot_ids, process_discovered_snapshots_inline))
|
||||
snapshot.status = Snapshot.StatusChoices.SEALED
|
||||
snapshot.retry_at = None
|
||||
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
||||
|
||||
monkeypatch.setattr(runner_module, 'run_crawl', fake_run_crawl)
|
||||
monkeypatch.setattr(runner_module, "run_crawl", fake_run_crawl)
|
||||
|
||||
result = runner_module.run_pending_crawls(daemon=False)
|
||||
|
||||
@@ -602,26 +799,26 @@ def test_run_pending_crawls_prioritizes_new_queued_crawl_before_snapshot_backlog
|
||||
from archivebox.services import runner as runner_module
|
||||
|
||||
older_crawl = Crawl.objects.create(
|
||||
urls='https://older.example.com',
|
||||
urls="https://older.example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
older_snapshot = Snapshot.objects.create(
|
||||
url='https://older.example.com',
|
||||
url="https://older.example.com",
|
||||
crawl=older_crawl,
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
retry_at=runner_module.timezone.now(),
|
||||
)
|
||||
newer_crawl = Crawl.objects.create(
|
||||
urls='https://newer.example.com',
|
||||
urls="https://newer.example.com",
|
||||
created_by_id=get_or_create_system_user_pk(),
|
||||
status=Crawl.StatusChoices.QUEUED,
|
||||
retry_at=runner_module.timezone.now(),
|
||||
)
|
||||
|
||||
monkeypatch.setattr(type(older_snapshot), 'claim_processing_lock', lambda self, lock_seconds=60: True)
|
||||
monkeypatch.setattr(type(older_crawl), 'claim_processing_lock', lambda self, lock_seconds=60: True)
|
||||
monkeypatch.setattr(type(newer_crawl), 'claim_processing_lock', lambda self, lock_seconds=60: True)
|
||||
monkeypatch.setattr(type(older_snapshot), "claim_processing_lock", lambda self, lock_seconds=60: True)
|
||||
monkeypatch.setattr(type(older_crawl), "claim_processing_lock", lambda self, lock_seconds=60: True)
|
||||
monkeypatch.setattr(type(newer_crawl), "claim_processing_lock", lambda self, lock_seconds=60: True)
|
||||
|
||||
run_calls: list[tuple[str, list[str] | None, bool]] = []
|
||||
|
||||
@@ -632,7 +829,7 @@ def test_run_pending_crawls_prioritizes_new_queued_crawl_before_snapshot_backlog
|
||||
run_calls.append((crawl_id, snapshot_ids, process_discovered_snapshots_inline))
|
||||
raise _StopScheduling
|
||||
|
||||
monkeypatch.setattr(runner_module, 'run_crawl', fake_run_crawl)
|
||||
monkeypatch.setattr(runner_module, "run_crawl", fake_run_crawl)
|
||||
|
||||
with pytest.raises(_StopScheduling):
|
||||
runner_module.run_pending_crawls(daemon=False)
|
||||
|
||||
@@ -9,10 +9,18 @@ from pathlib import Path
|
||||
from archivebox.tests.conftest import create_test_url
|
||||
|
||||
|
||||
ADMIN_HOST = 'admin.archivebox.localhost:8000'
|
||||
ADMIN_HOST = "admin.archivebox.localhost:8000"
|
||||
|
||||
|
||||
def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool, host: str):
|
||||
def _run_savepagenow_script(
|
||||
initialized_archive: Path,
|
||||
request_url: str,
|
||||
expected_url: str,
|
||||
*,
|
||||
login: bool,
|
||||
public_add_view: bool,
|
||||
host: str,
|
||||
):
|
||||
script = textwrap.dedent(
|
||||
f"""
|
||||
import os
|
||||
@@ -52,34 +60,34 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
|
||||
assert resp2.status_code == 302, resp2.status_code
|
||||
assert Snapshot.objects.filter(url={expected_url!r}).count() == 1
|
||||
assert resp2['Location'] == f"/{{snapshot.url_path}}"
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
env = {
|
||||
**os.environ,
|
||||
'DATA_DIR': str(initialized_archive),
|
||||
'USE_COLOR': 'False',
|
||||
'SHOW_PROGRESS': 'False',
|
||||
'PUBLIC_ADD_VIEW': 'True' if public_add_view else 'False',
|
||||
'SAVE_ARCHIVEDOTORG': 'False',
|
||||
'SAVE_TITLE': 'False',
|
||||
'SAVE_FAVICON': 'False',
|
||||
'SAVE_WGET': 'False',
|
||||
'SAVE_WARC': 'False',
|
||||
'SAVE_PDF': 'False',
|
||||
'SAVE_SCREENSHOT': 'False',
|
||||
'SAVE_DOM': 'False',
|
||||
'SAVE_SINGLEFILE': 'False',
|
||||
'SAVE_READABILITY': 'False',
|
||||
'SAVE_MERCURY': 'False',
|
||||
'SAVE_GIT': 'False',
|
||||
'SAVE_YTDLP': 'False',
|
||||
'SAVE_HEADERS': 'False',
|
||||
'SAVE_HTMLTOTEXT': 'False',
|
||||
"DATA_DIR": str(initialized_archive),
|
||||
"USE_COLOR": "False",
|
||||
"SHOW_PROGRESS": "False",
|
||||
"PUBLIC_ADD_VIEW": "True" if public_add_view else "False",
|
||||
"SAVE_ARCHIVEDOTORG": "False",
|
||||
"SAVE_TITLE": "False",
|
||||
"SAVE_FAVICON": "False",
|
||||
"SAVE_WGET": "False",
|
||||
"SAVE_WARC": "False",
|
||||
"SAVE_PDF": "False",
|
||||
"SAVE_SCREENSHOT": "False",
|
||||
"SAVE_DOM": "False",
|
||||
"SAVE_SINGLEFILE": "False",
|
||||
"SAVE_READABILITY": "False",
|
||||
"SAVE_MERCURY": "False",
|
||||
"SAVE_GIT": "False",
|
||||
"SAVE_YTDLP": "False",
|
||||
"SAVE_HEADERS": "False",
|
||||
"SAVE_HTMLTOTEXT": "False",
|
||||
}
|
||||
|
||||
return subprocess.run(
|
||||
[sys.executable, '-c', script],
|
||||
[sys.executable, "-c", script],
|
||||
cwd=initialized_archive,
|
||||
env=env,
|
||||
text=True,
|
||||
@@ -105,36 +113,104 @@ def _run_savepagenow_not_found_script(initialized_archive: Path, request_url: st
|
||||
target_url = {request_url!r}
|
||||
|
||||
resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
|
||||
assert resp.status_code == 404, resp.status_code
|
||||
assert resp.status_code == 302, resp.status_code
|
||||
assert resp['Location'] == f'http://{ADMIN_HOST}/web/' + target_url
|
||||
assert Snapshot.objects.count() == 0
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
env = {
|
||||
**os.environ,
|
||||
'DATA_DIR': str(initialized_archive),
|
||||
'USE_COLOR': 'False',
|
||||
'SHOW_PROGRESS': 'False',
|
||||
'PUBLIC_ADD_VIEW': 'False',
|
||||
'SAVE_ARCHIVEDOTORG': 'False',
|
||||
'SAVE_TITLE': 'False',
|
||||
'SAVE_FAVICON': 'False',
|
||||
'SAVE_WGET': 'False',
|
||||
'SAVE_WARC': 'False',
|
||||
'SAVE_PDF': 'False',
|
||||
'SAVE_SCREENSHOT': 'False',
|
||||
'SAVE_DOM': 'False',
|
||||
'SAVE_SINGLEFILE': 'False',
|
||||
'SAVE_READABILITY': 'False',
|
||||
'SAVE_MERCURY': 'False',
|
||||
'SAVE_GIT': 'False',
|
||||
'SAVE_YTDLP': 'False',
|
||||
'SAVE_HEADERS': 'False',
|
||||
'SAVE_HTMLTOTEXT': 'False',
|
||||
"DATA_DIR": str(initialized_archive),
|
||||
"USE_COLOR": "False",
|
||||
"SHOW_PROGRESS": "False",
|
||||
"PUBLIC_ADD_VIEW": "False",
|
||||
"SAVE_ARCHIVEDOTORG": "False",
|
||||
"SAVE_TITLE": "False",
|
||||
"SAVE_FAVICON": "False",
|
||||
"SAVE_WGET": "False",
|
||||
"SAVE_WARC": "False",
|
||||
"SAVE_PDF": "False",
|
||||
"SAVE_SCREENSHOT": "False",
|
||||
"SAVE_DOM": "False",
|
||||
"SAVE_SINGLEFILE": "False",
|
||||
"SAVE_READABILITY": "False",
|
||||
"SAVE_MERCURY": "False",
|
||||
"SAVE_GIT": "False",
|
||||
"SAVE_YTDLP": "False",
|
||||
"SAVE_HEADERS": "False",
|
||||
"SAVE_HTMLTOTEXT": "False",
|
||||
}
|
||||
|
||||
return subprocess.run(
|
||||
[sys.executable, '-c', script],
|
||||
[sys.executable, "-c", script],
|
||||
cwd=initialized_archive,
|
||||
env=env,
|
||||
text=True,
|
||||
capture_output=True,
|
||||
timeout=60,
|
||||
)
|
||||
|
||||
|
||||
def _run_savepagenow_via_web_host_redirect_script(initialized_archive: Path, request_url: str, expected_url: str):
|
||||
script = textwrap.dedent(
|
||||
f"""
|
||||
import os
|
||||
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from django.test import Client
|
||||
from django.contrib.auth import get_user_model
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
client = Client()
|
||||
user = get_user_model().objects.create_user(username='tester', password='pw')
|
||||
client.force_login(user)
|
||||
|
||||
target_url = {request_url!r}
|
||||
|
||||
resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
|
||||
assert resp.status_code == 302, resp.status_code
|
||||
assert resp['Location'] == f'http://{ADMIN_HOST}/web/' + target_url
|
||||
|
||||
resp2 = client.get('/web/' + target_url, HTTP_HOST={ADMIN_HOST!r})
|
||||
assert resp2.status_code == 302, resp2.status_code
|
||||
|
||||
snapshot = Snapshot.objects.filter(url={expected_url!r}).order_by('-created_at').first()
|
||||
assert snapshot is not None
|
||||
assert resp2['Location'] == f"/{{snapshot.url_path}}"
|
||||
assert Snapshot.objects.filter(url={expected_url!r}).count() == 1
|
||||
""",
|
||||
)
|
||||
|
||||
env = {
|
||||
**os.environ,
|
||||
"DATA_DIR": str(initialized_archive),
|
||||
"USE_COLOR": "False",
|
||||
"SHOW_PROGRESS": "False",
|
||||
"PUBLIC_ADD_VIEW": "False",
|
||||
"SAVE_ARCHIVEDOTORG": "False",
|
||||
"SAVE_TITLE": "False",
|
||||
"SAVE_FAVICON": "False",
|
||||
"SAVE_WGET": "False",
|
||||
"SAVE_WARC": "False",
|
||||
"SAVE_PDF": "False",
|
||||
"SAVE_SCREENSHOT": "False",
|
||||
"SAVE_DOM": "False",
|
||||
"SAVE_SINGLEFILE": "False",
|
||||
"SAVE_READABILITY": "False",
|
||||
"SAVE_MERCURY": "False",
|
||||
"SAVE_GIT": "False",
|
||||
"SAVE_YTDLP": "False",
|
||||
"SAVE_HEADERS": "False",
|
||||
"SAVE_HTMLTOTEXT": "False",
|
||||
}
|
||||
|
||||
return subprocess.run(
|
||||
[sys.executable, "-c", script],
|
||||
cwd=initialized_archive,
|
||||
env=env,
|
||||
text=True,
|
||||
@@ -168,34 +244,34 @@ def _run_savepagenow_existing_snapshot_script(initialized_archive: Path, request
|
||||
resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
|
||||
assert resp.status_code == 302, resp.status_code
|
||||
assert resp['Location'] == f"/{{snapshot.url_path}}"
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
env = {
|
||||
**os.environ,
|
||||
'DATA_DIR': str(initialized_archive),
|
||||
'USE_COLOR': 'False',
|
||||
'SHOW_PROGRESS': 'False',
|
||||
'PUBLIC_ADD_VIEW': 'False',
|
||||
'SAVE_ARCHIVEDOTORG': 'False',
|
||||
'SAVE_TITLE': 'False',
|
||||
'SAVE_FAVICON': 'False',
|
||||
'SAVE_WGET': 'False',
|
||||
'SAVE_WARC': 'False',
|
||||
'SAVE_PDF': 'False',
|
||||
'SAVE_SCREENSHOT': 'False',
|
||||
'SAVE_DOM': 'False',
|
||||
'SAVE_SINGLEFILE': 'False',
|
||||
'SAVE_READABILITY': 'False',
|
||||
'SAVE_MERCURY': 'False',
|
||||
'SAVE_GIT': 'False',
|
||||
'SAVE_YTDLP': 'False',
|
||||
'SAVE_HEADERS': 'False',
|
||||
'SAVE_HTMLTOTEXT': 'False',
|
||||
"DATA_DIR": str(initialized_archive),
|
||||
"USE_COLOR": "False",
|
||||
"SHOW_PROGRESS": "False",
|
||||
"PUBLIC_ADD_VIEW": "False",
|
||||
"SAVE_ARCHIVEDOTORG": "False",
|
||||
"SAVE_TITLE": "False",
|
||||
"SAVE_FAVICON": "False",
|
||||
"SAVE_WGET": "False",
|
||||
"SAVE_WARC": "False",
|
||||
"SAVE_PDF": "False",
|
||||
"SAVE_SCREENSHOT": "False",
|
||||
"SAVE_DOM": "False",
|
||||
"SAVE_SINGLEFILE": "False",
|
||||
"SAVE_READABILITY": "False",
|
||||
"SAVE_MERCURY": "False",
|
||||
"SAVE_GIT": "False",
|
||||
"SAVE_YTDLP": "False",
|
||||
"SAVE_HEADERS": "False",
|
||||
"SAVE_HTMLTOTEXT": "False",
|
||||
}
|
||||
|
||||
return subprocess.run(
|
||||
[sys.executable, '-c', script],
|
||||
[sys.executable, "-c", script],
|
||||
cwd=initialized_archive,
|
||||
env=env,
|
||||
text=True,
|
||||
@@ -206,47 +282,49 @@ def _run_savepagenow_existing_snapshot_script(initialized_archive: Path, request
|
||||
|
||||
def test_web_add_creates_and_reuses_snapshot_logged_in(initialized_archive):
|
||||
"""/web/https://... should work for authenticated users even when public add is off."""
|
||||
url = create_test_url(domain='example.com', path='savepagenow-auth')
|
||||
request_url = url.replace('https://', '')
|
||||
url = create_test_url(domain="example.com", path="savepagenow-auth")
|
||||
request_url = url.replace("https://", "")
|
||||
result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False, host=ADMIN_HOST)
|
||||
assert result.returncode == 0, (
|
||||
"SavePageNow shortcut (logged-in) test failed.\n"
|
||||
f"stdout:\n{result.stdout}\n"
|
||||
f"stderr:\n{result.stderr}"
|
||||
)
|
||||
assert result.returncode == 0, f"SavePageNow shortcut (logged-in) test failed.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
|
||||
|
||||
|
||||
def test_web_add_creates_and_reuses_snapshot_public(initialized_archive):
|
||||
"""/web/https://... should work when PUBLIC_ADD_VIEW is enabled without login."""
|
||||
url = create_test_url(domain='example.com', path='savepagenow-public')
|
||||
request_url = url.replace('https://', '')
|
||||
result = _run_savepagenow_script(initialized_archive, request_url, url, login=False, public_add_view=True, host='web.archivebox.localhost:8000')
|
||||
assert result.returncode == 0, (
|
||||
"SavePageNow shortcut (public add) test failed.\n"
|
||||
f"stdout:\n{result.stdout}\n"
|
||||
f"stderr:\n{result.stderr}"
|
||||
url = create_test_url(domain="example.com", path="savepagenow-public")
|
||||
request_url = url
|
||||
result = _run_savepagenow_script(
|
||||
initialized_archive,
|
||||
request_url,
|
||||
url,
|
||||
login=False,
|
||||
public_add_view=True,
|
||||
host="web.archivebox.localhost:8000",
|
||||
)
|
||||
assert result.returncode == 0, f"SavePageNow shortcut (public add) test failed.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
|
||||
|
||||
|
||||
def test_web_add_requires_login_when_public_off(initialized_archive):
|
||||
"""/web/https://... should 404 for new URLs when PUBLIC_ADD_VIEW is false and not logged in."""
|
||||
url = create_test_url(domain='example.com', path='savepagenow-404')
|
||||
request_url = url.replace('https://', '')
|
||||
"""/web/https://... should bounce to admin when PUBLIC_ADD_VIEW is false and not logged in."""
|
||||
url = create_test_url(domain="example.com", path="savepagenow-404")
|
||||
request_url = url
|
||||
result = _run_savepagenow_not_found_script(initialized_archive, request_url)
|
||||
assert result.returncode == 0, f"SavePageNow shortcut (no public add) test failed.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
|
||||
|
||||
|
||||
def test_web_add_redirects_to_admin_and_creates_when_logged_in(initialized_archive):
|
||||
"""/web/https://... on web host should redirect to admin host and create when the user is logged in there."""
|
||||
url = create_test_url(domain="example.com", path="savepagenow-web-admin")
|
||||
result = _run_savepagenow_via_web_host_redirect_script(initialized_archive, url, url)
|
||||
assert result.returncode == 0, (
|
||||
"SavePageNow shortcut (no public add) test failed.\n"
|
||||
f"stdout:\n{result.stdout}\n"
|
||||
f"stderr:\n{result.stderr}"
|
||||
f"SavePageNow shortcut (web->admin redirect) test failed.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
|
||||
)
|
||||
|
||||
|
||||
def test_web_add_redirects_existing_snapshot_when_public_off(initialized_archive):
|
||||
"""/web/https://... should redirect to existing snapshot even when public add is off and not logged in."""
|
||||
url = create_test_url(domain='example.com', path='savepagenow-existing')
|
||||
request_url = url.replace('https://', '')
|
||||
url = create_test_url(domain="example.com", path="savepagenow-existing")
|
||||
request_url = url.replace("https://", "")
|
||||
result = _run_savepagenow_existing_snapshot_script(initialized_archive, request_url, url)
|
||||
assert result.returncode == 0, (
|
||||
"SavePageNow shortcut (existing snapshot) test failed.\n"
|
||||
f"stdout:\n{result.stdout}\n"
|
||||
f"stderr:\n{result.stderr}"
|
||||
f"SavePageNow shortcut (existing snapshot) test failed.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
|
||||
)
|
||||
|
||||
@@ -8,7 +8,6 @@ import subprocess
|
||||
import pytest
|
||||
|
||||
|
||||
|
||||
def _fetchone(tmp_path, query):
|
||||
conn = sqlite3.connect(tmp_path / "index.sqlite3")
|
||||
try:
|
||||
@@ -21,7 +20,7 @@ def test_schedule_creates_enabled_db_schedule(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'schedule', '--every=daily', '--depth=1', 'https://example.com/feed.xml'],
|
||||
["archivebox", "schedule", "--every=daily", "--depth=1", "https://example.com/feed.xml"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
@@ -37,50 +36,50 @@ def test_schedule_creates_enabled_db_schedule(tmp_path, process):
|
||||
"SELECT urls, status, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1",
|
||||
)
|
||||
|
||||
assert schedule_row == ('daily', 1, 'Scheduled import: https://example.com/feed.xml')
|
||||
assert crawl_row == ('https://example.com/feed.xml', 'sealed', 1)
|
||||
assert schedule_row == ("daily", 1, "Scheduled import: https://example.com/feed.xml")
|
||||
assert crawl_row == ("https://example.com/feed.xml", "sealed", 1)
|
||||
|
||||
|
||||
def test_schedule_show_lists_enabled_schedules(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'schedule', '--every=weekly', 'https://example.com/feed.xml'],
|
||||
["archivebox", "schedule", "--every=weekly", "https://example.com/feed.xml"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'schedule', '--show'],
|
||||
["archivebox", "schedule", "--show"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'Active scheduled crawls' in result.stdout
|
||||
assert 'https://example.com/feed.xml' in result.stdout
|
||||
assert 'weekly' in result.stdout
|
||||
assert "Active scheduled crawls" in result.stdout
|
||||
assert "https://example.com/feed.xml" in result.stdout
|
||||
assert "weekly" in result.stdout
|
||||
|
||||
|
||||
def test_schedule_clear_disables_existing_schedules(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'schedule', '--every=daily', 'https://example.com/feed.xml'],
|
||||
["archivebox", "schedule", "--every=daily", "https://example.com/feed.xml"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=True,
|
||||
)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'schedule', '--clear'],
|
||||
["archivebox", "schedule", "--clear"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'Disabled 1 scheduled crawl' in result.stdout
|
||||
assert "Disabled 1 scheduled crawl" in result.stdout
|
||||
|
||||
disabled_count = _fetchone(
|
||||
tmp_path,
|
||||
@@ -99,13 +98,13 @@ def test_schedule_every_requires_valid_period(tmp_path, process):
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'schedule', '--every=invalid_period', 'https://example.com/feed.xml'],
|
||||
["archivebox", "schedule", "--every=invalid_period", "https://example.com/feed.xml"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode != 0
|
||||
assert 'Invalid schedule' in result.stderr or 'Invalid schedule' in result.stdout
|
||||
assert "Invalid schedule" in result.stderr or "Invalid schedule" in result.stdout
|
||||
|
||||
|
||||
class TestScheduleCLI:
|
||||
@@ -113,17 +112,17 @@ class TestScheduleCLI:
|
||||
os.chdir(tmp_path)
|
||||
|
||||
result = subprocess.run(
|
||||
['archivebox', 'schedule', '--help'],
|
||||
["archivebox", "schedule", "--help"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert '--every' in result.stdout
|
||||
assert '--show' in result.stdout
|
||||
assert '--clear' in result.stdout
|
||||
assert '--run-all' in result.stdout
|
||||
assert "--every" in result.stdout
|
||||
assert "--show" in result.stdout
|
||||
assert "--clear" in result.stdout
|
||||
assert "--run-all" in result.stdout
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
|
||||
@@ -21,7 +21,7 @@ REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
|
||||
def init_archive(cwd: Path) -> None:
|
||||
result = subprocess.run(
|
||||
[sys.executable, '-m', 'archivebox', 'init', '--quick'],
|
||||
[sys.executable, "-m", "archivebox", "init", "--quick"],
|
||||
cwd=cwd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -32,46 +32,48 @@ def init_archive(cwd: Path) -> None:
|
||||
|
||||
def build_test_env(port: int, **extra: str) -> dict[str, str]:
|
||||
env = os.environ.copy()
|
||||
env.pop('DATA_DIR', None)
|
||||
env.update({
|
||||
'LISTEN_HOST': f'archivebox.localhost:{port}',
|
||||
'ALLOWED_HOSTS': '*',
|
||||
'CSRF_TRUSTED_ORIGINS': f'http://admin.archivebox.localhost:{port}',
|
||||
'PUBLIC_ADD_VIEW': 'True',
|
||||
'USE_COLOR': 'False',
|
||||
'SHOW_PROGRESS': 'False',
|
||||
'TIMEOUT': '20',
|
||||
'URL_ALLOWLIST': r'127\.0\.0\.1[:/].*',
|
||||
'SAVE_ARCHIVEDOTORG': 'False',
|
||||
'SAVE_TITLE': 'False',
|
||||
'SAVE_FAVICON': 'False',
|
||||
'SAVE_WARC': 'False',
|
||||
'SAVE_PDF': 'False',
|
||||
'SAVE_SCREENSHOT': 'False',
|
||||
'SAVE_DOM': 'False',
|
||||
'SAVE_SINGLEFILE': 'False',
|
||||
'SAVE_READABILITY': 'False',
|
||||
'SAVE_MERCURY': 'False',
|
||||
'SAVE_GIT': 'False',
|
||||
'SAVE_YTDLP': 'False',
|
||||
'SAVE_HEADERS': 'False',
|
||||
'SAVE_HTMLTOTEXT': 'False',
|
||||
'SAVE_WGET': 'True',
|
||||
'USE_CHROME': 'False',
|
||||
})
|
||||
env.pop("DATA_DIR", None)
|
||||
env.update(
|
||||
{
|
||||
"LISTEN_HOST": f"archivebox.localhost:{port}",
|
||||
"ALLOWED_HOSTS": "*",
|
||||
"CSRF_TRUSTED_ORIGINS": f"http://admin.archivebox.localhost:{port}",
|
||||
"PUBLIC_ADD_VIEW": "True",
|
||||
"USE_COLOR": "False",
|
||||
"SHOW_PROGRESS": "False",
|
||||
"TIMEOUT": "20",
|
||||
"URL_ALLOWLIST": r"127\.0\.0\.1[:/].*",
|
||||
"SAVE_ARCHIVEDOTORG": "False",
|
||||
"SAVE_TITLE": "False",
|
||||
"SAVE_FAVICON": "False",
|
||||
"SAVE_WARC": "False",
|
||||
"SAVE_PDF": "False",
|
||||
"SAVE_SCREENSHOT": "False",
|
||||
"SAVE_DOM": "False",
|
||||
"SAVE_SINGLEFILE": "False",
|
||||
"SAVE_READABILITY": "False",
|
||||
"SAVE_MERCURY": "False",
|
||||
"SAVE_GIT": "False",
|
||||
"SAVE_YTDLP": "False",
|
||||
"SAVE_HEADERS": "False",
|
||||
"SAVE_HTMLTOTEXT": "False",
|
||||
"SAVE_WGET": "True",
|
||||
"USE_CHROME": "False",
|
||||
},
|
||||
)
|
||||
env.update(extra)
|
||||
return env
|
||||
|
||||
|
||||
def get_free_port() -> int:
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
|
||||
sock.bind(('127.0.0.1', 0))
|
||||
sock.bind(("127.0.0.1", 0))
|
||||
return sock.getsockname()[1]
|
||||
|
||||
|
||||
def start_server(cwd: Path, env: dict[str, str], port: int) -> None:
|
||||
result = subprocess.run(
|
||||
[sys.executable, '-m', 'archivebox', 'server', '--daemonize', f'127.0.0.1:{port}'],
|
||||
[sys.executable, "-m", "archivebox", "server", "--daemonize", f"127.0.0.1:{port}"],
|
||||
cwd=cwd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -91,19 +93,19 @@ def stop_server(cwd: Path) -> None:
|
||||
from archivebox.workers.supervisord_util import stop_existing_supervisord_process
|
||||
stop_existing_supervisord_process()
|
||||
print('stopped')
|
||||
"""
|
||||
""",
|
||||
)
|
||||
run_python_cwd(script, cwd=cwd, timeout=30)
|
||||
|
||||
|
||||
def wait_for_http(port: int, host: str, path: str = '/', timeout: int = 30) -> requests.Response:
|
||||
def wait_for_http(port: int, host: str, path: str = "/", timeout: int = 30) -> requests.Response:
|
||||
deadline = time.time() + timeout
|
||||
last_exc = None
|
||||
while time.time() < deadline:
|
||||
try:
|
||||
response = requests.get(
|
||||
f'http://127.0.0.1:{port}{path}',
|
||||
headers={'Host': host},
|
||||
f"http://127.0.0.1:{port}{path}",
|
||||
headers={"Host": host},
|
||||
timeout=2,
|
||||
allow_redirects=False,
|
||||
)
|
||||
@@ -112,11 +114,11 @@ def wait_for_http(port: int, host: str, path: str = '/', timeout: int = 30) -> r
|
||||
except requests.RequestException as exc:
|
||||
last_exc = exc
|
||||
time.sleep(0.5)
|
||||
raise AssertionError(f'Timed out waiting for HTTP on {host}: {last_exc}')
|
||||
raise AssertionError(f"Timed out waiting for HTTP on {host}: {last_exc}")
|
||||
|
||||
|
||||
def make_latest_schedule_due(cwd: Path) -> None:
|
||||
conn = sqlite3.connect(cwd / 'index.sqlite3')
|
||||
conn = sqlite3.connect(cwd / "index.sqlite3")
|
||||
try:
|
||||
conn.execute(
|
||||
"""
|
||||
@@ -129,7 +131,7 @@ def make_latest_schedule_due(cwd: Path) -> None:
|
||||
ORDER BY created_at DESC
|
||||
LIMIT 1
|
||||
)
|
||||
"""
|
||||
""",
|
||||
)
|
||||
conn.commit()
|
||||
finally:
|
||||
@@ -182,7 +184,7 @@ def get_snapshot_file_text(cwd: Path, url: str) -> str:
|
||||
|
||||
assert candidates, f'no captured html/txt files found in {{snapshot_dir}}'
|
||||
print(candidates[0].read_text(errors='ignore'))
|
||||
"""
|
||||
""",
|
||||
)
|
||||
stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60)
|
||||
assert code == 0, stderr
|
||||
@@ -198,11 +200,11 @@ def wait_for_snapshot_capture(cwd: Path, url: str, timeout: int = 180) -> str:
|
||||
except AssertionError as err:
|
||||
last_error = err
|
||||
time.sleep(2)
|
||||
raise AssertionError(f'timed out waiting for captured content for {url}: {last_error}')
|
||||
raise AssertionError(f"timed out waiting for captured content for {url}: {last_error}")
|
||||
|
||||
|
||||
def get_counts(cwd: Path, scheduled_url: str, one_shot_url: str) -> tuple[int, int, int]:
|
||||
conn = sqlite3.connect(cwd / 'index.sqlite3')
|
||||
conn = sqlite3.connect(cwd / "index.sqlite3")
|
||||
try:
|
||||
scheduled_snapshots = conn.execute(
|
||||
"SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
|
||||
@@ -259,7 +261,7 @@ def create_admin_and_token(cwd: Path) -> str:
|
||||
expires=timezone.now() + timedelta(days=1),
|
||||
)
|
||||
print(token.token)
|
||||
"""
|
||||
""",
|
||||
)
|
||||
stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60)
|
||||
assert code == 0, stderr
|
||||
@@ -275,7 +277,7 @@ def test_server_processes_due_cli_schedule_and_saves_real_content(tmp_path, recu
|
||||
env = build_test_env(port)
|
||||
|
||||
schedule_result = subprocess.run(
|
||||
[sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', recursive_test_site['root_url']],
|
||||
[sys.executable, "-m", "archivebox", "schedule", "--every=daily", "--depth=0", recursive_test_site["root_url"]],
|
||||
cwd=tmp_path,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -283,16 +285,16 @@ def test_server_processes_due_cli_schedule_and_saves_real_content(tmp_path, recu
|
||||
timeout=60,
|
||||
)
|
||||
assert schedule_result.returncode == 0, schedule_result.stderr
|
||||
assert 'Created scheduled crawl' in schedule_result.stdout
|
||||
assert "Created scheduled crawl" in schedule_result.stdout
|
||||
|
||||
make_latest_schedule_due(tmp_path)
|
||||
|
||||
try:
|
||||
start_server(tmp_path, env=env, port=port)
|
||||
wait_for_http(port, host=f'web.archivebox.localhost:{port}')
|
||||
captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site['root_url'], timeout=180)
|
||||
assert 'Root' in captured_text
|
||||
assert 'About' in captured_text
|
||||
wait_for_http(port, host=f"web.archivebox.localhost:{port}")
|
||||
captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site["root_url"], timeout=180)
|
||||
assert "Root" in captured_text
|
||||
assert "About" in captured_text
|
||||
finally:
|
||||
stop_server(tmp_path)
|
||||
|
||||
@@ -304,11 +306,11 @@ def test_archivebox_add_remains_one_shot_even_when_schedule_is_due(tmp_path, rec
|
||||
|
||||
port = get_free_port()
|
||||
env = build_test_env(port)
|
||||
scheduled_url = recursive_test_site['root_url']
|
||||
one_shot_url = recursive_test_site['child_urls'][0]
|
||||
scheduled_url = recursive_test_site["root_url"]
|
||||
one_shot_url = recursive_test_site["child_urls"][0]
|
||||
|
||||
schedule_result = subprocess.run(
|
||||
[sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', scheduled_url],
|
||||
[sys.executable, "-m", "archivebox", "schedule", "--every=daily", "--depth=0", scheduled_url],
|
||||
cwd=tmp_path,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -320,7 +322,7 @@ def test_archivebox_add_remains_one_shot_even_when_schedule_is_due(tmp_path, rec
|
||||
make_latest_schedule_due(tmp_path)
|
||||
|
||||
add_result = subprocess.run(
|
||||
[sys.executable, '-m', 'archivebox', 'add', '--depth=0', '--plugins=wget', one_shot_url],
|
||||
[sys.executable, "-m", "archivebox", "add", "--depth=0", "--plugins=wget", one_shot_url],
|
||||
cwd=tmp_path,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
@@ -329,7 +331,7 @@ def test_archivebox_add_remains_one_shot_even_when_schedule_is_due(tmp_path, rec
|
||||
)
|
||||
assert add_result.returncode == 0, add_result.stderr
|
||||
captured_text = wait_for_snapshot_capture(tmp_path, one_shot_url, timeout=120)
|
||||
assert 'Deep About' in captured_text or 'About' in captured_text
|
||||
assert "Deep About" in captured_text or "About" in captured_text
|
||||
|
||||
scheduled_snapshots, one_shot_snapshots, scheduled_crawls = get_counts(tmp_path, scheduled_url, one_shot_url)
|
||||
assert one_shot_snapshots >= 1
|
||||
@@ -348,27 +350,27 @@ def test_schedule_rest_api_works_over_running_server(tmp_path, recursive_test_si
|
||||
|
||||
try:
|
||||
start_server(tmp_path, env=env, port=port)
|
||||
wait_for_http(port, host=f'api.archivebox.localhost:{port}', path='/api/v1/docs')
|
||||
wait_for_http(port, host=f"api.archivebox.localhost:{port}", path="/api/v1/docs")
|
||||
|
||||
response = requests.post(
|
||||
f'http://127.0.0.1:{port}/api/v1/cli/schedule',
|
||||
f"http://127.0.0.1:{port}/api/v1/cli/schedule",
|
||||
headers={
|
||||
'Host': f'api.archivebox.localhost:{port}',
|
||||
'X-ArchiveBox-API-Key': api_token,
|
||||
"Host": f"api.archivebox.localhost:{port}",
|
||||
"X-ArchiveBox-API-Key": api_token,
|
||||
},
|
||||
json={
|
||||
'every': 'daily',
|
||||
'import_path': recursive_test_site['root_url'],
|
||||
'quiet': True,
|
||||
"every": "daily",
|
||||
"import_path": recursive_test_site["root_url"],
|
||||
"quiet": True,
|
||||
},
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
assert response.status_code == 200, response.text
|
||||
payload = response.json()
|
||||
assert payload['success'] is True
|
||||
assert payload['result_format'] == 'json'
|
||||
assert len(payload['result']['created_schedule_ids']) == 1
|
||||
assert payload["success"] is True
|
||||
assert payload["result_format"] == "json"
|
||||
assert len(payload["result"]["created_schedule_ids"]) == 1
|
||||
finally:
|
||||
stop_server(tmp_path)
|
||||
|
||||
@@ -379,21 +381,21 @@ def test_schedule_web_ui_post_works_over_running_server(tmp_path, recursive_test
|
||||
init_archive(tmp_path)
|
||||
|
||||
port = get_free_port()
|
||||
env = build_test_env(port, PUBLIC_ADD_VIEW='True')
|
||||
env = build_test_env(port, PUBLIC_ADD_VIEW="True")
|
||||
|
||||
try:
|
||||
start_server(tmp_path, env=env, port=port)
|
||||
wait_for_http(port, host=f'web.archivebox.localhost:{port}', path='/add/')
|
||||
wait_for_http(port, host=f"web.archivebox.localhost:{port}", path="/add/")
|
||||
|
||||
response = requests.post(
|
||||
f'http://127.0.0.1:{port}/add/',
|
||||
headers={'Host': f'web.archivebox.localhost:{port}'},
|
||||
f"http://127.0.0.1:{port}/add/",
|
||||
headers={"Host": f"web.archivebox.localhost:{port}"},
|
||||
data={
|
||||
'url': recursive_test_site['root_url'],
|
||||
'depth': '0',
|
||||
'schedule': 'daily',
|
||||
'tag': 'web-ui',
|
||||
'notes': 'created from web ui',
|
||||
"url": recursive_test_site["root_url"],
|
||||
"depth": "0",
|
||||
"schedule": "daily",
|
||||
"tag": "web-ui",
|
||||
"notes": "created from web ui",
|
||||
},
|
||||
timeout=10,
|
||||
allow_redirects=False,
|
||||
@@ -401,7 +403,7 @@ def test_schedule_web_ui_post_works_over_running_server(tmp_path, recursive_test
|
||||
|
||||
assert response.status_code in (302, 303), response.text
|
||||
|
||||
conn = sqlite3.connect(tmp_path / 'index.sqlite3')
|
||||
conn = sqlite3.connect(tmp_path / "index.sqlite3")
|
||||
try:
|
||||
row = conn.execute(
|
||||
"""
|
||||
@@ -410,11 +412,11 @@ def test_schedule_web_ui_post_works_over_running_server(tmp_path, recursive_test
|
||||
JOIN crawls_crawl c ON c.schedule_id = cs.id
|
||||
ORDER BY cs.created_at DESC
|
||||
LIMIT 1
|
||||
"""
|
||||
""",
|
||||
).fetchone()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
assert row == ('daily', recursive_test_site['root_url'], 'web-ui')
|
||||
assert row == ("daily", recursive_test_site["root_url"], "web-ui")
|
||||
finally:
|
||||
stop_server(tmp_path)
|
||||
|
||||
@@ -103,7 +103,10 @@ async function main() {
|
||||
timeout: 15000,
|
||||
});
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 1500));
|
||||
await page.waitForFunction(
|
||||
() => window.__dangerousScriptRan !== true || window.__probeResults !== undefined,
|
||||
{timeout: 15000},
|
||||
);
|
||||
|
||||
const pageState = await page.evaluate(() => ({
|
||||
href: location.href,
|
||||
@@ -297,7 +300,7 @@ def _seed_archive(data_dir: Path) -> dict[str, object]:
|
||||
"password": "testpassword",
|
||||
"snapshots": snapshots,
|
||||
}))
|
||||
"""
|
||||
""",
|
||||
)
|
||||
stdout, stderr, returncode = run_python_cwd(script, cwd=data_dir, timeout=120)
|
||||
assert returncode == 0, stderr
|
||||
@@ -310,10 +313,17 @@ def _get_free_port() -> int:
|
||||
return sock.getsockname()[1]
|
||||
|
||||
|
||||
def _wait_for_http(port: int, host: str, timeout: float = 30.0) -> None:
|
||||
def _wait_for_http(
|
||||
port: int,
|
||||
host: str,
|
||||
timeout: float = 30.0,
|
||||
process: subprocess.Popen[str] | None = None,
|
||||
) -> None:
|
||||
deadline = time.time() + timeout
|
||||
last_error = "server did not answer"
|
||||
while time.time() < deadline:
|
||||
if process is not None and process.poll() is not None:
|
||||
raise AssertionError(f"Server exited before becoming ready with code {process.returncode}")
|
||||
try:
|
||||
response = requests.get(
|
||||
f"http://127.0.0.1:{port}/",
|
||||
@@ -358,7 +368,7 @@ def _start_server(data_dir: Path, *, mode: str, port: int) -> subprocess.Popen[s
|
||||
"SAVE_HEADERS": "False",
|
||||
"SAVE_HTMLTOTEXT": "False",
|
||||
"USE_CHROME": "False",
|
||||
}
|
||||
},
|
||||
)
|
||||
process = subprocess.Popen(
|
||||
[sys.executable, "-m", "archivebox", "server", "--debug", "--nothreading", f"127.0.0.1:{port}"],
|
||||
@@ -369,7 +379,11 @@ def _start_server(data_dir: Path, *, mode: str, port: int) -> subprocess.Popen[s
|
||||
text=True,
|
||||
start_new_session=True,
|
||||
)
|
||||
_wait_for_http(port, f"archivebox.localhost:{port}")
|
||||
try:
|
||||
_wait_for_http(port, f"archivebox.localhost:{port}", process=process)
|
||||
except AssertionError as exc:
|
||||
server_log = _stop_server(process)
|
||||
raise AssertionError(f"{exc}\n\nSERVER LOG:\n{server_log}") from exc
|
||||
return process
|
||||
|
||||
|
||||
@@ -414,7 +428,7 @@ def _build_probe_config(mode: str, port: int, fixture: dict[str, object], runtim
|
||||
"victim": victim_url,
|
||||
"admin": f"{admin_origin}/admin/",
|
||||
"api": f"{admin_origin}/api/v1/docs",
|
||||
}
|
||||
},
|
||||
)
|
||||
|
||||
return {
|
||||
@@ -427,7 +441,13 @@ def _build_probe_config(mode: str, port: int, fixture: dict[str, object], runtim
|
||||
}
|
||||
|
||||
|
||||
def _run_browser_probe(data_dir: Path, runtime: dict[str, Path], mode: str, fixture: dict[str, object], tmp_path: Path) -> dict[str, object]:
|
||||
def _run_browser_probe(
|
||||
data_dir: Path,
|
||||
runtime: dict[str, Path],
|
||||
mode: str,
|
||||
fixture: dict[str, object],
|
||||
tmp_path: Path,
|
||||
) -> dict[str, object]:
|
||||
port = _get_free_port()
|
||||
process = _start_server(data_dir, mode=mode, port=port)
|
||||
probe_path = tmp_path / "server_security_probe.js"
|
||||
@@ -517,7 +537,13 @@ def _run_browser_probe(data_dir: Path, runtime: dict[str, Path], mode: str, fixt
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_server_security_modes_in_chrome(initialized_archive: Path, browser_runtime, tmp_path: Path, mode: str, expected: dict[str, object]) -> None:
|
||||
def test_server_security_modes_in_chrome(
|
||||
initialized_archive: Path,
|
||||
browser_runtime,
|
||||
tmp_path: Path,
|
||||
mode: str,
|
||||
expected: dict[str, object],
|
||||
) -> None:
|
||||
fixture = _seed_archive(initialized_archive)
|
||||
result = _run_browser_probe(initialized_archive, browser_runtime, mode, fixture, tmp_path)
|
||||
|
||||
|
||||
@@ -12,32 +12,31 @@ import uuid
|
||||
import pytest
|
||||
|
||||
|
||||
|
||||
def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that snapshot stores the exact URL in the database."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'snapshot', 'create', 'https://example.com'],
|
||||
["archivebox", "snapshot", "create", "https://example.com"],
|
||||
capture_output=True,
|
||||
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
|
||||
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
snapshot_row = c.execute(
|
||||
"SELECT id, created_at, url, crawl_id FROM core_snapshot WHERE url = ?",
|
||||
('https://example.com',)
|
||||
("https://example.com",),
|
||||
).fetchone()
|
||||
assert snapshot_row is not None
|
||||
crawl_row = c.execute(
|
||||
"SELECT id, created_at, urls, created_by_id FROM crawls_crawl WHERE id = ?",
|
||||
(snapshot_row[3],)
|
||||
(snapshot_row[3],),
|
||||
).fetchone()
|
||||
assert crawl_row is not None
|
||||
user_row = c.execute(
|
||||
"SELECT username FROM auth_user WHERE id = ?",
|
||||
(crawl_row[3],)
|
||||
(crawl_row[3],),
|
||||
).fetchone()
|
||||
assert user_row is not None
|
||||
conn.close()
|
||||
@@ -45,15 +44,12 @@ def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_e
|
||||
snapshot_id_raw, snapshot_created_at, snapshot_url, crawl_id = snapshot_row
|
||||
snapshot_id = str(uuid.UUID(snapshot_id_raw))
|
||||
username = user_row[0]
|
||||
snapshot_date_str = datetime.fromisoformat(snapshot_created_at).strftime('%Y%m%d')
|
||||
domain = urlparse(snapshot_url).hostname or 'unknown'
|
||||
snapshot_date_str = datetime.fromisoformat(snapshot_created_at).strftime("%Y%m%d")
|
||||
domain = urlparse(snapshot_url).hostname or "unknown"
|
||||
|
||||
# Verify crawl symlink exists and is relative
|
||||
target_path = tmp_path / 'users' / username / 'snapshots' / snapshot_date_str / domain / snapshot_id
|
||||
symlinks = [
|
||||
p for p in tmp_path.rglob(str(snapshot_id))
|
||||
if p.is_symlink()
|
||||
]
|
||||
target_path = tmp_path / "users" / username / "snapshots" / snapshot_date_str / domain / snapshot_id
|
||||
symlinks = [p for p in tmp_path.rglob(str(snapshot_id)) if p.is_symlink()]
|
||||
assert symlinks, "Snapshot symlink should exist under crawl dir"
|
||||
link_path = symlinks[0]
|
||||
|
||||
@@ -68,21 +64,25 @@ def test_snapshot_multiple_urls_creates_multiple_records(tmp_path, process, disa
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'snapshot', 'create',
|
||||
'https://example.com',
|
||||
'https://iana.org'],
|
||||
[
|
||||
"archivebox",
|
||||
"snapshot",
|
||||
"create",
|
||||
"https://example.com",
|
||||
"https://iana.org",
|
||||
],
|
||||
capture_output=True,
|
||||
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
|
||||
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
urls = c.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall()
|
||||
conn.close()
|
||||
|
||||
urls = [u[0] for u in urls]
|
||||
assert 'https://example.com' in urls
|
||||
assert 'https://iana.org' in urls
|
||||
assert "https://example.com" in urls
|
||||
assert "https://iana.org" in urls
|
||||
assert len(urls) >= 2
|
||||
|
||||
|
||||
@@ -91,31 +91,41 @@ def test_snapshot_tag_creates_tag_and_links_to_snapshot(tmp_path, process, disab
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'snapshot', 'create', '--tag=mytesttag',
|
||||
'https://example.com'],
|
||||
[
|
||||
"archivebox",
|
||||
"snapshot",
|
||||
"create",
|
||||
"--tag=mytesttag",
|
||||
"https://example.com",
|
||||
],
|
||||
capture_output=True,
|
||||
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
|
||||
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
# Verify tag was created
|
||||
tag = c.execute("SELECT id, name FROM core_tag WHERE name = ?", ('mytesttag',)).fetchone()
|
||||
tag = c.execute("SELECT id, name FROM core_tag WHERE name = ?", ("mytesttag",)).fetchone()
|
||||
assert tag is not None, "Tag 'mytesttag' should exist in core_tag"
|
||||
tag_id = tag[0]
|
||||
|
||||
# Verify snapshot exists
|
||||
snapshot = c.execute("SELECT id FROM core_snapshot WHERE url = ?",
|
||||
('https://example.com',)).fetchone()
|
||||
snapshot = c.execute(
|
||||
"SELECT id FROM core_snapshot WHERE url = ?",
|
||||
("https://example.com",),
|
||||
).fetchone()
|
||||
assert snapshot is not None
|
||||
snapshot_id = snapshot[0]
|
||||
|
||||
# Verify tag is linked to snapshot via join table
|
||||
link = c.execute("""
|
||||
link = c.execute(
|
||||
"""
|
||||
SELECT * FROM core_snapshot_tags
|
||||
WHERE snapshot_id = ? AND tag_id = ?
|
||||
""", (snapshot_id, tag_id)).fetchone()
|
||||
""",
|
||||
(snapshot_id, tag_id),
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
assert link is not None, "Tag should be linked to snapshot via core_snapshot_tags"
|
||||
@@ -127,23 +137,23 @@ def test_snapshot_jsonl_output_has_correct_structure(tmp_path, process, disable_
|
||||
|
||||
# Pass URL as argument instead of stdin for more reliable behavior
|
||||
result = subprocess.run(
|
||||
['archivebox', 'snapshot', 'create', 'https://example.com'],
|
||||
["archivebox", "snapshot", "create", "https://example.com"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
|
||||
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
|
||||
)
|
||||
|
||||
# Parse JSONL output lines
|
||||
records = Process.parse_records_from_text(result.stdout)
|
||||
snapshot_records = [r for r in records if r.get('type') == 'Snapshot']
|
||||
snapshot_records = [r for r in records if r.get("type") == "Snapshot"]
|
||||
|
||||
assert len(snapshot_records) >= 1, "Should output at least one Snapshot JSONL record"
|
||||
|
||||
record = snapshot_records[0]
|
||||
assert record.get('type') == 'Snapshot'
|
||||
assert 'id' in record, "Snapshot record should have 'id' field"
|
||||
assert 'url' in record, "Snapshot record should have 'url' field"
|
||||
assert record['url'] == 'https://example.com'
|
||||
assert record.get("type") == "Snapshot"
|
||||
assert "id" in record, "Snapshot record should have 'id' field"
|
||||
assert "url" in record, "Snapshot record should have 'url' field"
|
||||
assert record["url"] == "https://example.com"
|
||||
|
||||
|
||||
def test_snapshot_with_tag_stores_tag_name(tmp_path, process, disable_extractors_dict):
|
||||
@@ -152,22 +162,24 @@ def test_snapshot_with_tag_stores_tag_name(tmp_path, process, disable_extractors
|
||||
|
||||
# Use command line args instead of stdin
|
||||
subprocess.run(
|
||||
['archivebox', 'snapshot', 'create', '--tag=customtag', 'https://example.com'],
|
||||
["archivebox", "snapshot", "create", "--tag=customtag", "https://example.com"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
|
||||
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
# Verify tag was created with correct name
|
||||
tag = c.execute("SELECT name FROM core_tag WHERE name = ?",
|
||||
('customtag',)).fetchone()
|
||||
tag = c.execute(
|
||||
"SELECT name FROM core_tag WHERE name = ?",
|
||||
("customtag",),
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
assert tag is not None
|
||||
assert tag[0] == 'customtag'
|
||||
assert tag[0] == "customtag"
|
||||
|
||||
|
||||
def test_snapshot_with_depth_sets_snapshot_depth(tmp_path, process, disable_extractors_dict):
|
||||
@@ -175,13 +187,18 @@ def test_snapshot_with_depth_sets_snapshot_depth(tmp_path, process, disable_extr
|
||||
os.chdir(tmp_path)
|
||||
|
||||
subprocess.run(
|
||||
['archivebox', 'snapshot', 'create', '--depth=1',
|
||||
'https://example.com'],
|
||||
[
|
||||
"archivebox",
|
||||
"snapshot",
|
||||
"create",
|
||||
"--depth=1",
|
||||
"https://example.com",
|
||||
],
|
||||
capture_output=True,
|
||||
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
|
||||
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
snapshot = c.execute("SELECT depth FROM core_snapshot ORDER BY created_at DESC LIMIT 1").fetchone()
|
||||
conn.close()
|
||||
@@ -196,24 +213,26 @@ def test_snapshot_allows_duplicate_urls_across_crawls(tmp_path, process, disable
|
||||
|
||||
# Add same URL twice
|
||||
subprocess.run(
|
||||
['archivebox', 'snapshot', 'create', 'https://example.com'],
|
||||
["archivebox", "snapshot", "create", "https://example.com"],
|
||||
capture_output=True,
|
||||
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
|
||||
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
|
||||
)
|
||||
subprocess.run(
|
||||
['archivebox', 'snapshot', 'create', 'https://example.com'],
|
||||
["archivebox", "snapshot", "create", "https://example.com"],
|
||||
capture_output=True,
|
||||
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
|
||||
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
|
||||
)
|
||||
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
count = c.execute("SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
|
||||
('https://example.com',)).fetchone()[0]
|
||||
count = c.execute(
|
||||
"SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
|
||||
("https://example.com",),
|
||||
).fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
assert count == 2, "Same URL should create separate snapshots across different crawls"
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
if __name__ == "__main__":
|
||||
pytest.main([__file__, "-v"])
|
||||
|
||||
@@ -13,15 +13,15 @@ pytestmark = pytest.mark.django_db
|
||||
|
||||
|
||||
User = get_user_model()
|
||||
ADMIN_HOST = 'admin.archivebox.localhost:8000'
|
||||
ADMIN_HOST = "admin.archivebox.localhost:8000"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def admin_user(db):
|
||||
return cast(UserManager, User.objects).create_superuser(
|
||||
username='tagadmin',
|
||||
email='tagadmin@test.com',
|
||||
password='testpassword',
|
||||
username="tagadmin",
|
||||
email="tagadmin@test.com",
|
||||
password="testpassword",
|
||||
)
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ def crawl(admin_user):
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
return Crawl.objects.create(
|
||||
urls='https://example.com',
|
||||
urls="https://example.com",
|
||||
created_by=admin_user,
|
||||
)
|
||||
|
||||
@@ -48,15 +48,15 @@ def crawl(admin_user):
|
||||
def tagged_data(crawl, admin_user):
|
||||
from archivebox.core.models import Snapshot, Tag
|
||||
|
||||
tag = Tag.objects.create(name='Alpha Research', created_by=admin_user)
|
||||
tag = Tag.objects.create(name="Alpha Research", created_by=admin_user)
|
||||
first = Snapshot.objects.create(
|
||||
url='https://example.com/one',
|
||||
title='Example One',
|
||||
url="https://example.com/one",
|
||||
title="Example One",
|
||||
crawl=crawl,
|
||||
)
|
||||
second = Snapshot.objects.create(
|
||||
url='https://example.com/two',
|
||||
title='Example Two',
|
||||
url="https://example.com/two",
|
||||
title="Example Two",
|
||||
crawl=crawl,
|
||||
)
|
||||
first.tags.add(tag)
|
||||
@@ -65,27 +65,26 @@ def tagged_data(crawl, admin_user):
|
||||
|
||||
|
||||
def test_tag_admin_changelist_renders_custom_ui(client, admin_user, tagged_data):
|
||||
client.login(username='tagadmin', password='testpassword')
|
||||
client.login(username="tagadmin", password="testpassword")
|
||||
|
||||
response = client.get(reverse('admin:core_tag_changelist'), HTTP_HOST=ADMIN_HOST)
|
||||
response = client.get(reverse("admin:core_tag_changelist"), HTTP_HOST=ADMIN_HOST)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert b'id="tag-live-search"' in response.content
|
||||
assert b'id="tag-sort-select"' in response.content
|
||||
assert b'id="tag-created-by-select"' in response.content
|
||||
assert b'id="tag-year-select"' in response.content
|
||||
assert b'id="tag-has-snapshots-select"' in response.content
|
||||
assert b'Alpha Research' in response.content
|
||||
assert b"Alpha Research" in response.content
|
||||
assert b'class="tag-card"' in response.content
|
||||
|
||||
|
||||
def test_tag_admin_add_view_renders_similar_tag_reference(client, admin_user):
|
||||
client.login(username='tagadmin', password='testpassword')
|
||||
client.login(username="tagadmin", password="testpassword")
|
||||
|
||||
response = client.get(reverse('admin:core_tag_add'), HTTP_HOST=ADMIN_HOST)
|
||||
response = client.get(reverse("admin:core_tag_add"), HTTP_HOST=ADMIN_HOST)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert b'Similar Tags' in response.content
|
||||
assert b"Similar Tags" in response.content
|
||||
assert b'data-tag-name-input="1"' in response.content
|
||||
|
||||
|
||||
@@ -93,40 +92,40 @@ def test_tag_search_api_returns_card_payload(client, api_token, tagged_data):
|
||||
tag, snapshots = tagged_data
|
||||
|
||||
response = client.get(
|
||||
reverse('api-1:search_tags'),
|
||||
{'q': 'Alpha', 'api_key': api_token},
|
||||
reverse("api-1:search_tags"),
|
||||
{"q": "Alpha", "api_key": api_token},
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
payload = response.json()
|
||||
assert payload['sort'] == 'created_desc'
|
||||
assert payload['created_by'] == ''
|
||||
assert payload['year'] == ''
|
||||
assert payload['has_snapshots'] == 'all'
|
||||
assert payload['tags'][0]['id'] == tag.id
|
||||
assert payload['tags'][0]['name'] == 'Alpha Research'
|
||||
assert payload['tags'][0]['num_snapshots'] == 2
|
||||
assert payload['tags'][0]['snapshots'][0]['title'] in {'Example One', 'Example Two'}
|
||||
assert payload['tags'][0]['export_jsonl_url'].endswith(f'/api/v1/core/tag/{tag.id}/snapshots.jsonl')
|
||||
assert payload['tags'][0]['filter_url'].endswith(f'/admin/core/snapshot/?tags__id__exact={tag.id}')
|
||||
assert {snapshot['url'] for snapshot in payload['tags'][0]['snapshots']} == {snap.url for snap in snapshots}
|
||||
assert payload["sort"] == "created_desc"
|
||||
assert payload["created_by"] == ""
|
||||
assert payload["year"] == ""
|
||||
assert payload["has_snapshots"] == "all"
|
||||
assert payload["tags"][0]["id"] == tag.id
|
||||
assert payload["tags"][0]["name"] == "Alpha Research"
|
||||
assert payload["tags"][0]["num_snapshots"] == 2
|
||||
assert payload["tags"][0]["snapshots"][0]["title"] in {"Example One", "Example Two"}
|
||||
assert payload["tags"][0]["export_jsonl_url"].endswith(f"/api/v1/core/tag/{tag.id}/snapshots.jsonl")
|
||||
assert payload["tags"][0]["filter_url"].endswith(f"/admin/core/snapshot/?tags__id__exact={tag.id}")
|
||||
assert {snapshot["url"] for snapshot in payload["tags"][0]["snapshots"]} == {snap.url for snap in snapshots}
|
||||
|
||||
|
||||
def test_tag_search_api_respects_sort_and_filters(client, api_token, admin_user, crawl, tagged_data):
|
||||
from archivebox.core.models import Snapshot, Tag
|
||||
|
||||
other_user = cast(UserManager, User.objects).create_user(
|
||||
username='tagother',
|
||||
email='tagother@test.com',
|
||||
password='unused',
|
||||
username="tagother",
|
||||
email="tagother@test.com",
|
||||
password="unused",
|
||||
)
|
||||
tag_with_snapshots = tagged_data[0]
|
||||
empty_tag = Tag.objects.create(name='Zulu Empty', created_by=other_user)
|
||||
alpha_tag = Tag.objects.create(name='Alpha Empty', created_by=other_user)
|
||||
empty_tag = Tag.objects.create(name="Zulu Empty", created_by=other_user)
|
||||
alpha_tag = Tag.objects.create(name="Alpha Empty", created_by=other_user)
|
||||
Snapshot.objects.create(
|
||||
url='https://example.com/three',
|
||||
title='Example Three',
|
||||
url="https://example.com/three",
|
||||
title="Example Three",
|
||||
crawl=crawl,
|
||||
).tags.add(alpha_tag)
|
||||
|
||||
@@ -135,24 +134,24 @@ def test_tag_search_api_respects_sort_and_filters(client, api_token, admin_user,
|
||||
Tag.objects.filter(pk=tag_with_snapshots.pk).update(created_at=timezone.make_aware(datetime(2026, 1, 1, 12, 0, 0)))
|
||||
|
||||
response = client.get(
|
||||
reverse('api-1:search_tags'),
|
||||
reverse("api-1:search_tags"),
|
||||
{
|
||||
'sort': 'name_desc',
|
||||
'created_by': str(other_user.pk),
|
||||
'year': '2024',
|
||||
'has_snapshots': 'no',
|
||||
'api_key': api_token,
|
||||
"sort": "name_desc",
|
||||
"created_by": str(other_user.pk),
|
||||
"year": "2024",
|
||||
"has_snapshots": "no",
|
||||
"api_key": api_token,
|
||||
},
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
payload = response.json()
|
||||
assert payload['sort'] == 'name_desc'
|
||||
assert payload['created_by'] == str(other_user.pk)
|
||||
assert payload['year'] == '2024'
|
||||
assert payload['has_snapshots'] == 'no'
|
||||
assert [tag['name'] for tag in payload['tags']] == ['Zulu Empty']
|
||||
assert payload["sort"] == "name_desc"
|
||||
assert payload["created_by"] == str(other_user.pk)
|
||||
assert payload["year"] == "2024"
|
||||
assert payload["has_snapshots"] == "no"
|
||||
assert [tag["name"] for tag in payload["tags"]] == ["Zulu Empty"]
|
||||
|
||||
|
||||
def test_tag_rename_api_updates_slug(client, api_token, tagged_data):
|
||||
@@ -160,30 +159,30 @@ def test_tag_rename_api_updates_slug(client, api_token, tagged_data):
|
||||
|
||||
response = client.post(
|
||||
f"{reverse('api-1:rename_tag', args=[tag.id])}?api_key={api_token}",
|
||||
data=json.dumps({'name': 'Alpha Archive'}),
|
||||
content_type='application/json',
|
||||
data=json.dumps({"name": "Alpha Archive"}),
|
||||
content_type="application/json",
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
|
||||
tag.refresh_from_db()
|
||||
assert tag.name == 'Alpha Archive'
|
||||
assert tag.slug == 'alpha-archive'
|
||||
assert tag.name == "Alpha Archive"
|
||||
assert tag.slug == "alpha-archive"
|
||||
|
||||
|
||||
def test_tag_snapshots_export_returns_jsonl(client, api_token, tagged_data):
|
||||
tag, _ = tagged_data
|
||||
|
||||
response = client.get(
|
||||
reverse('api-1:tag_snapshots_export', args=[tag.id]),
|
||||
{'api_key': api_token},
|
||||
reverse("api-1:tag_snapshots_export", args=[tag.id]),
|
||||
{"api_key": api_token},
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert response['Content-Type'].startswith('application/x-ndjson')
|
||||
assert f'tag-{tag.slug}-snapshots.jsonl' in response['Content-Disposition']
|
||||
assert response["Content-Type"].startswith("application/x-ndjson")
|
||||
assert f"tag-{tag.slug}-snapshots.jsonl" in response["Content-Disposition"]
|
||||
body = response.content.decode()
|
||||
assert '"type": "Snapshot"' in body
|
||||
assert '"tags": "Alpha Research"' in body
|
||||
@@ -193,13 +192,13 @@ def test_tag_urls_export_returns_plain_text_urls(client, api_token, tagged_data)
|
||||
tag, snapshots = tagged_data
|
||||
|
||||
response = client.get(
|
||||
reverse('api-1:tag_urls_export', args=[tag.id]),
|
||||
{'api_key': api_token},
|
||||
reverse("api-1:tag_urls_export", args=[tag.id]),
|
||||
{"api_key": api_token},
|
||||
HTTP_HOST=ADMIN_HOST,
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
assert response['Content-Type'].startswith('text/plain')
|
||||
assert f'tag-{tag.slug}-urls.txt' in response['Content-Disposition']
|
||||
assert response["Content-Type"].startswith("text/plain")
|
||||
assert f"tag-{tag.slug}-urls.txt" in response["Content-Disposition"]
|
||||
exported_urls = set(filter(None, response.content.decode().splitlines()))
|
||||
assert exported_urls == {snapshot.url for snapshot in snapshots}
|
||||
|
||||
@@ -6,11 +6,12 @@ from .fixtures import disable_extractors_dict, process
|
||||
|
||||
FIXTURES = (disable_extractors_dict, process)
|
||||
|
||||
|
||||
def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that title is extracted from the page."""
|
||||
disable_extractors_dict.update({"SAVE_TITLE": "true"})
|
||||
add_process = subprocess.run(
|
||||
['archivebox', 'add', '--plugins=title', 'https://example.com'],
|
||||
["archivebox", "add", "--plugins=title", "https://example.com"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
@@ -28,6 +29,7 @@ def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
|
||||
assert snapshot[0] is not None
|
||||
assert "Example" in snapshot[0]
|
||||
|
||||
|
||||
def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict):
|
||||
"""
|
||||
https://github.com/ArchiveBox/ArchiveBox/issues/330
|
||||
@@ -36,7 +38,7 @@ def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractor
|
||||
"""
|
||||
disable_extractors_dict.update({"SAVE_TITLE": "true"})
|
||||
add_process = subprocess.run(
|
||||
['archivebox', 'add', '--plugins=title', 'https://example.com'],
|
||||
["archivebox", "add", "--plugins=title", "https://example.com"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
|
||||
@@ -1,28 +1,37 @@
|
||||
import json
|
||||
import sqlite3
|
||||
import subprocess
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
import pytest
|
||||
from django.utils import timezone
|
||||
|
||||
from .fixtures import disable_extractors_dict, process
|
||||
|
||||
FIXTURES = (disable_extractors_dict, process)
|
||||
|
||||
|
||||
def test_update_imports_orphaned_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that archivebox update imports real legacy archive directories."""
|
||||
legacy_timestamp = '1710000000'
|
||||
legacy_dir = tmp_path / 'archive' / legacy_timestamp
|
||||
legacy_timestamp = "1710000000"
|
||||
legacy_dir = tmp_path / "archive" / legacy_timestamp
|
||||
legacy_dir.mkdir(parents=True, exist_ok=True)
|
||||
(legacy_dir / 'singlefile.html').write_text('<html>example</html>')
|
||||
(legacy_dir / 'index.json').write_text(json.dumps({
|
||||
'url': 'https://example.com',
|
||||
'timestamp': legacy_timestamp,
|
||||
'title': 'Example Domain',
|
||||
'fs_version': '0.8.0',
|
||||
'archive_results': [],
|
||||
}))
|
||||
(legacy_dir / "singlefile.html").write_text("<html>example</html>")
|
||||
(legacy_dir / "index.json").write_text(
|
||||
json.dumps(
|
||||
{
|
||||
"url": "https://example.com",
|
||||
"timestamp": legacy_timestamp,
|
||||
"title": "Example Domain",
|
||||
"fs_version": "0.8.0",
|
||||
"archive_results": [],
|
||||
},
|
||||
),
|
||||
)
|
||||
|
||||
# Run update without filters - should import and migrate the legacy directory.
|
||||
update_process = subprocess.run(
|
||||
['archivebox', 'update'],
|
||||
["archivebox", "update"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
@@ -36,10 +45,151 @@ def test_update_imports_orphaned_snapshots(tmp_path, process, disable_extractors
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
assert row == ('https://example.com', '0.9.0')
|
||||
assert row == ("https://example.com", "0.9.0")
|
||||
assert legacy_dir.is_symlink()
|
||||
|
||||
migrated_dir = legacy_dir.resolve()
|
||||
assert migrated_dir.exists()
|
||||
assert (migrated_dir / 'index.jsonl').exists()
|
||||
assert (migrated_dir / 'singlefile.html').exists()
|
||||
assert (migrated_dir / "index.jsonl").exists()
|
||||
assert (migrated_dir / "singlefile.html").exists()
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_reindex_snapshots_resets_existing_search_results_and_reruns_requested_plugins(monkeypatch):
    """Check that reindexing resets a finished search result and re-requests its plugin.

    A SEALED snapshot is given a SUCCEEDED ``search_backend_sqlite`` result with
    populated output fields.  ``run_plugins`` is swapped for a recorder so no
    plugin actually runs; the test then asserts on the returned stats, on the
    reloaded result row, and on the exact keyword arguments the recorder saw.
    """
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.cli.archivebox_update import reindex_snapshots
    from archivebox.core.models import ArchiveResult, Snapshot
    from archivebox.crawls.models import Crawl
    import archivebox.cli.archivebox_extract as extract_mod

    owner_pk = get_or_create_system_user_pk()
    crawl = Crawl.objects.create(urls="https://example.com", created_by_id=owner_pk)
    snapshot = Snapshot.objects.create(
        url="https://example.com",
        crawl=crawl,
        status=Snapshot.StatusChoices.SEALED,
    )
    # Pre-existing, already-successful search index result that should get reset.
    stale_result = ArchiveResult.objects.create(
        snapshot=snapshot,
        plugin="search_backend_sqlite",
        hook_name="on_Snapshot__90_index_sqlite.py",
        status=ArchiveResult.StatusChoices.SUCCEEDED,
        output_str="old index hit",
        output_json={"indexed": True},
        output_files={"search.sqlite3": {"size": 123}},
        output_size=123,
    )

    recorded_call: dict[str, object] = {}

    def fake_run_plugins(*, args, records, wait, emit_results, plugins=""):
        # Record the keyword arguments instead of running any plugin.
        recorded_call.update(
            args=args,
            records=records,
            wait=wait,
            emit_results=emit_results,
            plugins=plugins,
        )
        return 0

    monkeypatch.setattr(extract_mod, "run_plugins", fake_run_plugins)

    stats = reindex_snapshots(
        Snapshot.objects.filter(id=snapshot.id),
        search_plugins=["search_backend_sqlite"],
        batch_size=10,
    )

    stale_result.refresh_from_db()

    for counter in ("processed", "queued", "reindexed"):
        assert stats[counter] == 1

    assert stale_result.status == ArchiveResult.StatusChoices.QUEUED
    assert stale_result.output_str == ""
    assert stale_result.output_json is None
    assert stale_result.output_files == {}

    expected_call = {
        "args": (),
        "records": [{"type": "ArchiveResult", "snapshot_id": str(snapshot.id), "plugin": "search_backend_sqlite"}],
        "wait": True,
        "emit_results": False,
        "plugins": "",
    }
    assert recorded_call == expected_call
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_build_filtered_snapshots_queryset_respects_resume_cutoff():
    """Check which snapshots survive when ``resume`` is set to a middle snapshot.

    Three snapshots are created with ``bookmarked_at`` values one hour apart.
    With ``resume`` set to the middle snapshot's timestamp, the queryset is
    expected to contain the middle and older snapshots but not the newer one.
    """
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.cli.archivebox_update import _build_filtered_snapshots_queryset
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl

    crawl = Crawl.objects.create(
        urls="https://example.com\nhttps://example.org\nhttps://example.net",
        created_by_id=get_or_create_system_user_pk(),
    )

    newest_stamp = timezone.make_aware(datetime(2026, 3, 23, 12, 0, 0))
    one_hour = timedelta(hours=1)

    # Created oldest-first, matching the original creation order.
    older = Snapshot.objects.create(
        url="https://example.net",
        crawl=crawl,
        bookmarked_at=newest_stamp - 2 * one_hour,
    )
    middle = Snapshot.objects.create(
        url="https://example.org",
        crawl=crawl,
        bookmarked_at=newest_stamp - one_hour,
    )
    newer = Snapshot.objects.create(
        url="https://example.com",
        crawl=crawl,
        bookmarked_at=newest_stamp,
    )

    resumed_queryset = _build_filtered_snapshots_queryset(
        filter_patterns=(),
        filter_type="exact",
        before=None,
        after=None,
        resume=middle.timestamp,
    )
    selected_ids = {str(pk) for pk in resumed_queryset.values_list("id", flat=True)}

    assert str(newer.id) not in selected_ids
    assert selected_ids == {str(middle.id), str(older.id)}
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_reconcile_with_index_json_tolerates_null_title(tmp_path):
    """Check that a ``null`` title in ``index.json`` does not overwrite the stored title.

    A snapshot titled "Example Domain" gets an ``index.json`` whose ``title`` is
    ``None``; after ``reconcile_with_index_json()`` and a reload from the
    database the title must still be "Example Domain".
    """
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl

    owner_pk = get_or_create_system_user_pk()
    crawl = Crawl.objects.create(urls="https://example.com", created_by_id=owner_pk)
    snapshot = Snapshot.objects.create(
        url="https://example.com",
        crawl=crawl,
        title="Example Domain",
        status=Snapshot.StatusChoices.SEALED,
    )

    snapshot_dir = snapshot.output_dir
    snapshot_dir.mkdir(parents=True, exist_ok=True)

    # On-disk index that deliberately carries a null title.
    index_payload = {
        "url": snapshot.url,
        "timestamp": snapshot.timestamp,
        "title": None,
        "archive_results": [],
    }
    index_path = snapshot_dir / "index.json"
    index_path.write_text(json.dumps(index_payload))

    snapshot.reconcile_with_index_json()
    snapshot.refresh_from_db()

    assert snapshot.title == "Example Domain"
|
||||
|
||||
@@ -49,19 +49,22 @@ def _build_script(body: str) -> str:
|
||||
from django.contrib.auth import get_user_model
|
||||
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.core.host_utils import (
|
||||
get_admin_host,
|
||||
get_admin_base_url,
|
||||
get_api_host,
|
||||
get_web_host,
|
||||
get_web_base_url,
|
||||
get_public_host,
|
||||
get_snapshot_subdomain,
|
||||
get_snapshot_host,
|
||||
get_original_host,
|
||||
get_listen_subdomain,
|
||||
split_host_port,
|
||||
host_matches,
|
||||
is_snapshot_subdomain,
|
||||
build_admin_url,
|
||||
build_snapshot_url,
|
||||
)
|
||||
|
||||
@@ -82,45 +85,12 @@ def _build_script(body: str) -> str:
|
||||
|
||||
def get_snapshot():
|
||||
snapshot = Snapshot.objects.order_by("-created_at").first()
|
||||
if snapshot is None:
|
||||
admin = ensure_admin_user()
|
||||
crawl = Crawl.objects.create(
|
||||
urls="https://example.com",
|
||||
created_by=admin,
|
||||
)
|
||||
snapshot = Snapshot.objects.create(
|
||||
url="https://example.com",
|
||||
title="Example Domain",
|
||||
crawl=crawl,
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
)
|
||||
snapshot_dir = Path(snapshot.output_dir)
|
||||
snapshot_dir.mkdir(parents=True, exist_ok=True)
|
||||
(snapshot_dir / "index.json").write_text('{"url": "https://example.com"}', encoding="utf-8")
|
||||
(snapshot_dir / "favicon.ico").write_bytes(b"ico")
|
||||
screenshot_dir = snapshot_dir / "screenshot"
|
||||
screenshot_dir.mkdir(parents=True, exist_ok=True)
|
||||
(screenshot_dir / "screenshot.png").write_bytes(b"png")
|
||||
responses_root = snapshot_dir / "responses" / snapshot.domain
|
||||
responses_root.mkdir(parents=True, exist_ok=True)
|
||||
(responses_root / "index.html").write_text(
|
||||
"<!doctype html><html><body><h1>Example Domain</h1></body></html>",
|
||||
encoding="utf-8",
|
||||
)
|
||||
ArchiveResult.objects.get_or_create(
|
||||
snapshot=snapshot,
|
||||
plugin="screenshot",
|
||||
defaults={"status": "succeeded", "output_size": 1, "output_str": "."},
|
||||
)
|
||||
ArchiveResult.objects.get_or_create(
|
||||
snapshot=snapshot,
|
||||
plugin="responses",
|
||||
defaults={"status": "succeeded", "output_size": 1, "output_str": "."},
|
||||
)
|
||||
assert snapshot is not None, "Expected real_archive_with_example to seed a snapshot"
|
||||
return snapshot
|
||||
|
||||
def get_snapshot_files(snapshot):
|
||||
output_rel = None
|
||||
reserved_snapshot_paths = {"index.html"}
|
||||
for output in snapshot.discover_outputs():
|
||||
candidate = output.get("path")
|
||||
if not candidate:
|
||||
@@ -144,10 +114,22 @@ def _build_script(body: str) -> str:
|
||||
if not candidate.is_file():
|
||||
continue
|
||||
rel = candidate.relative_to(responses_root)
|
||||
if str(rel) in reserved_snapshot_paths:
|
||||
continue
|
||||
if not (Path(snapshot.output_dir) / rel).exists():
|
||||
response_file = candidate
|
||||
response_rel = str(rel)
|
||||
break
|
||||
if response_file is None:
|
||||
for candidate in responses_root.rglob("*"):
|
||||
if not candidate.is_file():
|
||||
continue
|
||||
rel = candidate.relative_to(responses_root)
|
||||
if str(rel) in reserved_snapshot_paths:
|
||||
continue
|
||||
response_file = candidate
|
||||
response_rel = str(rel)
|
||||
break
|
||||
if response_file is None:
|
||||
response_file = next(p for p in responses_root.rglob("*") if p.is_file())
|
||||
response_rel = str(response_file.relative_to(responses_root))
|
||||
@@ -170,7 +152,7 @@ def _build_script(body: str) -> str:
|
||||
encoding="utf-8",
|
||||
)
|
||||
return "dangerous.html", "safe.json", "dangerous-response"
|
||||
"""
|
||||
""",
|
||||
)
|
||||
return prelude + "\n" + textwrap.dedent(body)
|
||||
|
||||
@@ -179,13 +161,26 @@ class TestUrlRouting:
|
||||
data_dir: Path
|
||||
|
||||
@pytest.fixture(autouse=True)
def _setup_data_dir(self, real_archive_with_example: Path) -> None:
    """Store the ``real_archive_with_example`` fixture value as ``self.data_dir``.

    Declared ``autouse`` so every test in the class gets ``data_dir`` set
    without requesting this fixture explicitly.
    """
    self.data_dir = real_archive_with_example
|
||||
|
||||
def _run(
    self,
    body: str,
    timeout: int = 120,
    mode: str | None = None,
    env_overrides: dict[str, str] | None = None,
) -> None:
    """Build a script from ``body``, run it, and assert that it succeeded.

    Args:
        body: Script text handed to ``_build_script``.
        timeout: Forwarded unchanged to ``_run_python``.
        mode: When truthy, exported to the script as ``SERVER_SECURITY_MODE``;
            it takes precedence over an entry of the same name in
            ``env_overrides``.
        env_overrides: Extra environment variables for the script.  The
            mapping is copied, never mutated.

    Raises:
        AssertionError: If the process exits non-zero (stderr is used as the
            message) or its stdout does not contain ``"OK"``.
    """
    script = _build_script(body)

    # Work on a copy so the caller's mapping is left untouched.
    merged_env = dict(env_overrides or {})
    if mode:
        merged_env["SERVER_SECURITY_MODE"] = mode

    result = _run_python(
        script,
        cwd=self.data_dir,
        timeout=timeout,
        # An empty mapping is passed on as None, i.e. "no overrides".
        env_overrides=merged_env or None,
    )
    assert result.returncode == 0, result.stderr
    assert "OK" in result.stdout
|
||||
|
||||
@@ -200,6 +195,7 @@ class TestUrlRouting:
|
||||
admin_host = get_admin_host()
|
||||
api_host = get_api_host()
|
||||
public_host = get_public_host()
|
||||
snapshot_subdomain = get_snapshot_subdomain(snapshot_id)
|
||||
snapshot_host = get_snapshot_host(snapshot_id)
|
||||
original_host = get_original_host(domain)
|
||||
base_host = SERVER_CONFIG.LISTEN_HOST
|
||||
@@ -211,15 +207,17 @@ class TestUrlRouting:
|
||||
assert admin_host == "admin.archivebox.localhost:8000"
|
||||
assert api_host == "api.archivebox.localhost:8000"
|
||||
assert public_host == "public.archivebox.localhost:8000"
|
||||
assert snapshot_host == f"{snapshot_id}.archivebox.localhost:8000"
|
||||
assert snapshot_subdomain == f"snap-{snapshot_id[-12:].lower()}"
|
||||
assert snapshot_host == f"{snapshot_subdomain}.archivebox.localhost:8000"
|
||||
assert original_host == f"{domain}.archivebox.localhost:8000"
|
||||
assert get_listen_subdomain(web_host) == "web"
|
||||
assert get_listen_subdomain(admin_host) == "admin"
|
||||
assert get_listen_subdomain(api_host) == "api"
|
||||
assert get_listen_subdomain(snapshot_host) == snapshot_id
|
||||
assert get_listen_subdomain(snapshot_host) == snapshot_subdomain
|
||||
assert get_listen_subdomain(original_host) == domain
|
||||
assert get_listen_subdomain(base_host) == ""
|
||||
assert host_matches(web_host, get_web_host())
|
||||
assert is_snapshot_subdomain(snapshot_subdomain)
|
||||
assert is_snapshot_subdomain(snapshot_id)
|
||||
|
||||
client = Client()
|
||||
@@ -236,37 +234,77 @@ class TestUrlRouting:
|
||||
assert resp["Location"].startswith("/api/")
|
||||
|
||||
print("OK")
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
def test_web_admin_routing(self) -> None:
    """Exercise admin-login redirects and static file serving across hosts.

    The embedded script asserts that ``/admin/login/`` redirects to the admin
    host when requested on the web, public, snapshot and original-domain
    hosts, that it returns 200 on the admin host, that snapshot paths
    requested on the admin host redirect to the snapshot host, and that
    ``/static/jquery.min.js`` is served on snapshot and original-domain hosts.
    """
    script = """
        ensure_admin_user()
        snapshot = get_snapshot()
        client = Client()
        web_host = get_web_host()
        public_host = get_public_host()
        admin_host = get_admin_host()
        snapshot_host = get_snapshot_host(str(snapshot.id))
        original_host = get_original_host(snapshot.domain)

        resp = client.get("/admin/login/", HTTP_HOST=web_host)
        assert resp.status_code in (301, 302)
        assert admin_host in resp["Location"]

        resp = client.get("/admin/login/?next=/admin/", HTTP_HOST=public_host)
        assert resp.status_code in (301, 302)
        assert resp["Location"] == f"http://{admin_host}/admin/login/?next=/admin/"

        resp = client.get("/admin/login/?next=/admin/", HTTP_HOST=snapshot_host)
        assert resp.status_code in (301, 302)
        assert resp["Location"] == f"http://{admin_host}/admin/login/?next=/admin/"

        resp = client.get("/admin/login/?next=/admin/", HTTP_HOST=original_host)
        assert resp.status_code in (301, 302)
        assert resp["Location"] == f"http://{admin_host}/admin/login/?next=/admin/"

        resp = client.get("/admin/login/", HTTP_HOST=admin_host)
        assert resp.status_code == 200

        resp = client.get(f"/{snapshot.url_path}", HTTP_HOST=admin_host)
        assert resp.status_code in (301, 302)
        assert resp["Location"] == f"http://{snapshot_host}"

        resp = client.get(f"/{snapshot.url_path}/index.html", HTTP_HOST=admin_host)
        assert resp.status_code in (301, 302)
        assert resp["Location"] == f"http://{snapshot_host}"

        resp = client.get("/static/jquery.min.js", HTTP_HOST=snapshot_host)
        assert resp.status_code == 200
        assert "javascript" in (resp.headers.get("Content-Type") or "")

        resp = client.get("/static/jquery.min.js", HTTP_HOST=original_host)
        assert resp.status_code == 200
        assert "javascript" in (resp.headers.get("Content-Type") or "")

        print("OK")
        """
    self._run(script)
|
||||
|
||||
def test_snapshot_routing_and_hosts(self) -> None:
|
||||
self._run(
|
||||
"""
|
||||
import io
|
||||
import zipfile
|
||||
|
||||
snapshot = get_snapshot()
|
||||
output_rel, response_file, response_rel, response_output_path = get_snapshot_files(snapshot)
|
||||
snapshot_id = str(snapshot.id)
|
||||
snapshot_subdomain = get_snapshot_subdomain(snapshot_id)
|
||||
snapshot_host = get_snapshot_host(snapshot_id)
|
||||
original_host = get_original_host(snapshot.domain)
|
||||
web_host = get_web_host()
|
||||
host_only, port = split_host_port(SERVER_CONFIG.LISTEN_HOST)
|
||||
legacy_snapshot_host = f"{snapshot_id}.{host_only}"
|
||||
if port:
|
||||
legacy_snapshot_host = f"{legacy_snapshot_host}:{port}"
|
||||
|
||||
client = Client()
|
||||
|
||||
@@ -289,6 +327,11 @@ class TestUrlRouting:
|
||||
assert resp.status_code in (301, 302)
|
||||
assert snapshot_host in resp["Location"]
|
||||
|
||||
resp = client.get("/", HTTP_HOST=legacy_snapshot_host)
|
||||
assert resp.status_code in (301, 302)
|
||||
assert resp["Location"].startswith(f"http://{snapshot_host}")
|
||||
assert snapshot_subdomain in resp["Location"]
|
||||
|
||||
resp = client.get(f"/{output_rel}", HTTP_HOST=snapshot_host)
|
||||
assert resp.status_code == 200
|
||||
assert response_body(resp) == Path(snapshot.output_dir, output_rel).read_bytes()
|
||||
@@ -296,7 +339,10 @@ class TestUrlRouting:
|
||||
resp = client.get(f"/{response_rel}", HTTP_HOST=snapshot_host)
|
||||
assert resp.status_code == 200
|
||||
snapshot_body = response_body(resp)
|
||||
if response_output_path.exists():
|
||||
if response_rel == "index.html":
|
||||
assert f"http://{snapshot_host}/".encode() in snapshot_body
|
||||
assert b"See all files..." in snapshot_body
|
||||
elif response_output_path.exists():
|
||||
assert snapshot_body == response_output_path.read_bytes()
|
||||
else:
|
||||
assert snapshot_body == response_file.read_bytes()
|
||||
@@ -319,8 +365,149 @@ class TestUrlRouting:
|
||||
files_html = response_body(resp).decode("utf-8", "ignore")
|
||||
assert output_rel.split("/", 1)[0] in files_html
|
||||
|
||||
resp = client.get("/?files=1&download=zip", HTTP_HOST=snapshot_host)
|
||||
assert resp.status_code == 200
|
||||
assert resp["Content-Type"] == "application/zip"
|
||||
assert ".zip" in resp["Content-Disposition"]
|
||||
assert resp.streaming
|
||||
with zipfile.ZipFile(io.BytesIO(response_body(resp))) as zip_file:
|
||||
assert any(name.endswith(f"/{output_rel}") for name in zip_file.namelist())
|
||||
|
||||
output_dir = next((output.get("path", "").split("/", 1)[0] for output in snapshot.discover_outputs() if "/" in (output.get("path") or "")), None)
|
||||
assert output_dir is not None
|
||||
resp = client.get(f"/{output_dir}/", HTTP_HOST=snapshot_host)
|
||||
assert resp.status_code == 200
|
||||
dir_html = response_body(resp).decode("utf-8", "ignore")
|
||||
assert f"Index of {output_dir}/" in dir_html
|
||||
|
||||
print("OK")
|
||||
""",
|
||||
)
|
||||
|
||||
def test_safe_subdomains_original_domain_host_uses_latest_matching_response(self) -> None:
    """Check that the original-domain host serves the most recent stored response.

    The embedded script creates two snapshots each for the root URL and
    ``/about.html`` with increasing timestamps and distinct response bodies,
    then asserts that requests on the original-domain host return the "new"
    body and not the "old" one.  Created output directories and crawls are
    removed in a ``finally`` block.
    """
    script = """
        from datetime import timedelta
        import shutil
        from django.utils import timezone
        from archivebox.crawls.models import Crawl

        snapshot = get_snapshot()
        original_host = get_original_host(snapshot.domain)
        client = Client()

        assert SERVER_CONFIG.SERVER_SECURITY_MODE == "safe-subdomains-fullreplay"

        now = timezone.now()
        created_by_id = snapshot.crawl.created_by_id
        created_snapshots = []
        created_crawls = []

        def make_snapshot(url):
            crawl = Crawl.objects.create(urls=url, created_by_id=created_by_id)
            created_crawls.append(crawl)
            snap = Snapshot.objects.create(url=url, crawl=crawl, status=Snapshot.StatusChoices.STARTED)
            created_snapshots.append(snap)
            return snap

        try:
            fixtures = (
                (make_snapshot("https://example.com"), now + timedelta(minutes=1), "old root"),
                (make_snapshot("https://example.com"), now + timedelta(minutes=2), "new root"),
                (make_snapshot("https://example.com/about.html"), now + timedelta(minutes=3), "old about"),
                (make_snapshot("https://example.com/about.html"), now + timedelta(minutes=4), "new about"),
            )

            for snap, stamp, content in fixtures:
                snap.created_at = stamp
                snap.bookmarked_at = stamp
                snap.downloaded_at = stamp
                snap.save(update_fields=["created_at", "bookmarked_at", "downloaded_at", "modified_at"])
                responses_root = Path(snap.output_dir) / "responses" / snap.domain
                responses_root.mkdir(parents=True, exist_ok=True)
                rel_path = "about.html" if snap.url.endswith("/about.html") else "index.html"
                (responses_root / rel_path).write_text(content, encoding="utf-8")

            resp = client.get("/", HTTP_HOST=original_host)
            assert resp.status_code == 200
            root_html = response_body(resp).decode("utf-8", "ignore")
            assert "new root" in root_html
            assert "old root" not in root_html

            resp = client.get("/about.html", HTTP_HOST=original_host)
            assert resp.status_code == 200
            about_html = response_body(resp).decode("utf-8", "ignore")
            assert "new about" in about_html
            assert "old about" not in about_html
        finally:
            for snap in created_snapshots:
                shutil.rmtree(snap.output_dir, ignore_errors=True)
            for crawl in created_crawls:
                crawl.delete()

        print("OK")
        """
    self._run(script)
|
||||
|
||||
def test_safe_subdomains_original_domain_host_falls_back_to_latest_snapshot_live_page(self) -> None:
    """Check the original-domain host response when no stored responses exist.

    The embedded script creates a snapshot for a dedicated fallback domain,
    removes its ``responses`` directory, and asserts that a request on the
    original-domain host returns 200 with HTML containing the snapshot URL and
    a link to the snapshot host.  The snapshot directory and crawl are removed
    in a ``finally`` block.
    """
    script = """
        import shutil
        from django.utils import timezone
        from archivebox.crawls.models import Crawl

        snapshot = get_snapshot()
        fallback_domain = "fallback-original-host.example"
        original_host = get_original_host(fallback_domain)
        client = Client()

        assert SERVER_CONFIG.SERVER_SECURITY_MODE == "safe-subdomains-fullreplay"

        crawl = Crawl.objects.create(urls=f"https://{fallback_domain}", created_by_id=snapshot.crawl.created_by_id)
        latest_snapshot = Snapshot.objects.create(
            url=f"https://{fallback_domain}",
            crawl=crawl,
            status=Snapshot.StatusChoices.STARTED,
        )

        stamp = timezone.now()
        latest_snapshot.created_at = stamp
        latest_snapshot.bookmarked_at = stamp
        latest_snapshot.downloaded_at = stamp
        latest_snapshot.save(update_fields=["created_at", "bookmarked_at", "downloaded_at", "modified_at"])

        try:
            shutil.rmtree(Path(latest_snapshot.output_dir) / "responses", ignore_errors=True)

            resp = client.get("/", HTTP_HOST=original_host)
            assert resp.status_code == 200
            html = response_body(resp).decode("utf-8", "ignore")
            assert latest_snapshot.url in html
            assert f"http://{get_snapshot_host(str(latest_snapshot.id))}/" in html
        finally:
            shutil.rmtree(latest_snapshot.output_dir, ignore_errors=True)
            crawl.delete()

        print("OK")
        """
    self._run(script)
|
||||
|
||||
def test_safe_subdomains_original_domain_host_redirects_to_save_page_now_when_missing_and_authenticated(self) -> None:
    """Check the redirect for a logged-in user hitting an unarchived original-domain host.

    The embedded script logs in as the test admin, requests ``/`` on the
    original-domain host of a domain with no snapshot, and asserts a 301/302
    to ``/web/https://<domain>`` on the web host.
    """
    script = """
        ensure_admin_user()
        client = Client()
        client.login(username="testadmin", password="testpassword")

        missing_domain = "missing-original-host.example"
        original_host = get_original_host(missing_domain)
        resp = client.get("/", HTTP_HOST=original_host)

        assert resp.status_code in (301, 302)
        assert resp["Location"] == f"http://{get_web_host()}/web/https://{missing_domain}"

        print("OK")
        """
    self._run(script)
|
||||
|
||||
def test_safe_subdomains_fullreplay_leaves_risky_replay_unrestricted(self) -> None:
|
||||
@@ -346,7 +533,7 @@ class TestUrlRouting:
|
||||
assert resp.headers.get("Content-Security-Policy") is None
|
||||
|
||||
print("OK")
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
def test_safe_onedomain_nojsreplay_routes_and_neuters_risky_documents(self) -> None:
|
||||
@@ -396,6 +583,9 @@ class TestUrlRouting:
|
||||
assert resp.headers.get("Content-Security-Policy") is None
|
||||
assert resp.headers.get("X-Content-Type-Options") == "nosniff"
|
||||
|
||||
resp = client.get("/snapshot/{}/singlefile/".format(snapshot_id), HTTP_HOST=base_host)
|
||||
assert resp.status_code == 404
|
||||
|
||||
resp = client.get(f"/snapshot/{snapshot_id}/{sniffed_rel}", HTTP_HOST=base_host)
|
||||
assert resp.status_code == 200
|
||||
csp = resp.headers.get("Content-Security-Policy") or ""
|
||||
@@ -486,6 +676,33 @@ class TestUrlRouting:
|
||||
mode="danger-onedomain-fullreplay",
|
||||
)
|
||||
|
||||
def test_onedomain_base_url_overrides_are_preserved_for_external_links(self) -> None:
    """Check that base-URL overrides drive generated links in one-domain mode.

    Runs under ``safe-onedomain-nojsreplay`` with ``ADMIN_BASE_URL`` and
    ``ARCHIVE_BASE_URL`` set.  The embedded script asserts that the admin and
    web hosts both equal ``LISTEN_HOST`` while the base-URL getters and the
    admin/snapshot URL builders return the overridden ``https://`` URLs.
    """
    script = """
        snapshot = get_snapshot()
        snapshot_id = str(snapshot.id)
        base_host = SERVER_CONFIG.LISTEN_HOST

        assert SERVER_CONFIG.SERVER_SECURITY_MODE == "safe-onedomain-nojsreplay"
        assert get_admin_host() == base_host
        assert get_web_host() == base_host

        assert get_admin_base_url() == "https://admin.archivebox.example"
        assert get_web_base_url() == "https://archivebox.example"
        assert build_admin_url("/admin/login/") == "https://admin.archivebox.example/admin/login/"
        assert build_snapshot_url(snapshot_id, "index.jsonl") == (
            f"https://archivebox.example/snapshot/{snapshot_id}/index.jsonl"
        )

        print("OK")
        """
    base_url_overrides = {
        "ADMIN_BASE_URL": "https://admin.archivebox.example",
        "ARCHIVE_BASE_URL": "https://archivebox.example",
    }
    self._run(
        script,
        mode="safe-onedomain-nojsreplay",
        env_overrides=base_url_overrides,
    )
|
||||
|
||||
def test_template_and_admin_links(self) -> None:
|
||||
self._run(
|
||||
"""
|
||||
@@ -510,6 +727,25 @@ class TestUrlRouting:
|
||||
live_html = response_body(resp).decode("utf-8", "ignore")
|
||||
assert f"http://{snapshot_host}/" in live_html
|
||||
assert f"http://{public_host}/static/archive.png" in live_html
|
||||
assert "?preview=1" in live_html
|
||||
assert "function createMainFrame(previousFrame)" in live_html
|
||||
assert "function activateCardPreview(card, link)" in live_html
|
||||
assert "ensureMainFrame(true)" in live_html
|
||||
assert "previousFrame.parentNode.replaceChild(frame, previousFrame)" in live_html
|
||||
assert "previousFrame.src = 'about:blank'" in live_html
|
||||
assert "event.stopImmediatePropagation()" in live_html
|
||||
assert "const matchingLink = [...document.querySelectorAll('a[target=preview]')].find" in live_html
|
||||
assert "jQuery(link).click()" not in live_html
|
||||
assert "searchParams.delete('preview')" in live_html
|
||||
assert "doc.body.style.flexDirection = 'column'" in live_html
|
||||
assert "doc.body.style.alignItems = 'center'" in live_html
|
||||
assert "img.style.margin = '0 auto'" in live_html
|
||||
assert "window.location.hash = getPreviewHashValue(link)" in live_html
|
||||
assert "const selectedPreviewHash = decodeURIComponent(window.location.hash.slice(1)).toLowerCase()" in live_html
|
||||
assert "pointer-events: none;" in live_html
|
||||
assert "pointer-events: auto;" in live_html
|
||||
assert 'class="thumbnail-click-overlay"' in live_html
|
||||
assert "window.location.hash = getPreviewTypeFromPath(link)" not in live_html
|
||||
assert ">WARC<" not in live_html
|
||||
assert ">Media<" not in live_html
|
||||
assert ">Git<" not in live_html
|
||||
@@ -517,6 +753,25 @@ class TestUrlRouting:
|
||||
static_html = Path(snapshot.output_dir, "index.html").read_text(encoding="utf-8", errors="ignore")
|
||||
assert f"http://{snapshot_host}/" in static_html
|
||||
assert f"http://{public_host}/static/archive.png" in static_html
|
||||
assert "?preview=1" in static_html
|
||||
assert "function createMainFrame(previousFrame)" in static_html
|
||||
assert "function activateCardPreview(card, link)" in static_html
|
||||
assert "ensureMainFrame(true)" in static_html
|
||||
assert "previousFrame.parentNode.replaceChild(frame, previousFrame)" in static_html
|
||||
assert "previousFrame.src = 'about:blank'" in static_html
|
||||
assert "e.stopImmediatePropagation()" in static_html
|
||||
assert "const matchingLink = [...document.querySelectorAll('a[target=preview]')].find" in static_html
|
||||
assert "jQuery(link).click()" not in static_html
|
||||
assert "searchParams.delete('preview')" in static_html
|
||||
assert "doc.body.style.flexDirection = 'column'" in static_html
|
||||
assert "doc.body.style.alignItems = 'center'" in static_html
|
||||
assert "img.style.margin = '0 auto'" in static_html
|
||||
assert "window.location.hash = getPreviewHashValue(link)" in static_html
|
||||
assert "const selectedPreviewHash = decodeURIComponent(window.location.hash.slice(1)).toLowerCase()" in static_html
|
||||
assert "pointer-events: none;" in static_html
|
||||
assert "pointer-events: auto;" in static_html
|
||||
assert 'class="thumbnail-click-overlay"' in static_html
|
||||
assert "window.location.hash = getPreviewTypeFromPath(link)" not in static_html
|
||||
assert ">WARC<" not in static_html
|
||||
assert ">Media<" not in static_html
|
||||
assert ">Git<" not in static_html
|
||||
@@ -536,7 +791,53 @@ class TestUrlRouting:
|
||||
assert f"http://{snapshot_host}/" in ar_html
|
||||
|
||||
print("OK")
|
||||
""",
|
||||
)
|
||||
|
||||
def test_snapshot_pages_preview_filesystem_text_outputs(self) -> None:
    """Check that files written straight to the snapshot directory are previewable.

    The embedded script writes a ``consolelog/console.jsonl`` file and a small
    ``screenshot/screenshot.png``, then asserts that the snapshot page on the
    web host shows the console log card and text, and that ``?preview=1``
    requests on the snapshot host return ``text/html`` for both files.
    """
    script = """
        snapshot = get_snapshot()
        web_host = get_web_host()

        consolelog_dir = Path(snapshot.output_dir) / "consolelog"
        consolelog_dir.mkdir(parents=True, exist_ok=True)
        (consolelog_dir / "console.jsonl").write_text(
            '{"level":"log","text":"console preview works"}\\n'
            '{"level":"warn","text":"second line"}\\n',
            encoding="utf-8",
        )

        client = Client()
        resp = client.get(f"/{snapshot.url_path}/index.html", HTTP_HOST=web_host)
        assert resp.status_code == 200
        live_html = response_body(resp).decode("utf-8", "ignore")
        assert 'data-plugin="consolelog" data-compact="1"' in live_html
        assert "console preview works" in live_html
        snapshot_host = get_snapshot_host(str(snapshot.id))
        resp = client.get("/consolelog/console.jsonl?preview=1", HTTP_HOST=snapshot_host)
        assert resp.status_code == 200
        assert resp["Content-Type"].startswith("text/html")
        preview_html = response_body(resp).decode("utf-8", "ignore")
        assert "archivebox-text-preview" in preview_html
        assert "console preview works" in preview_html

        screenshot_dir = Path(snapshot.output_dir) / "screenshot"
        screenshot_dir.mkdir(parents=True, exist_ok=True)
        (screenshot_dir / "screenshot.png").write_bytes(
            bytes.fromhex(
                "89504e470d0a1a0a"
                "0000000d49484452000000010000000108060000001f15c489"
                "0000000d49444154789c63f8ffffff7f0009fb03fd2a86e38a"
                "0000000049454e44ae426082",
            ),
        )
        resp = client.get("/screenshot/screenshot.png?preview=1", HTTP_HOST=snapshot_host)
        assert resp.status_code == 200
        assert resp["Content-Type"].startswith("text/html")

        print("OK")
        """
    self._run(script)
|
||||
|
||||
def test_api_available_on_admin_and_api_hosts(self) -> None:
|
||||
@@ -553,7 +854,7 @@ class TestUrlRouting:
|
||||
assert resp.status_code == 200
|
||||
|
||||
print("OK")
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
def test_api_auth_token_endpoint_available_on_admin_and_api_hosts(self) -> None:
|
||||
@@ -587,7 +888,7 @@ class TestUrlRouting:
|
||||
assert data.get("token")
|
||||
|
||||
print("OK")
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
def test_api_post_with_token_on_admin_and_api_hosts(self) -> None:
|
||||
@@ -631,5 +932,5 @@ class TestUrlRouting:
|
||||
assert data.get("tag_name") == "apitest-tag"
|
||||
|
||||
print("OK")
|
||||
"""
|
||||
""",
|
||||
)
|
||||
|
||||
@@ -16,6 +16,7 @@ class _ExampleHandler(BaseHTTPRequestHandler):
|
||||
def log_message(self, format, *args):
    """Discard the message: nothing is formatted or written."""
    return None
|
||||
|
||||
|
||||
def test_download_url_downloads_content():
|
||||
server = ThreadingHTTPServer(("127.0.0.1", 0), _ExampleHandler)
|
||||
thread = Thread(target=server.serve_forever, daemon=True)
|
||||
|
||||
Reference in New Issue
Block a user