This commit is contained in:
Nick Sweeting
2026-03-23 03:58:32 -07:00
parent 268856bcfb
commit b749b26c5d
286 changed files with 21704 additions and 13480 deletions

View File

@@ -8,7 +8,7 @@ import textwrap
import time
import shutil
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
from typing import Any
import pytest
@@ -24,13 +24,14 @@ os.environ.setdefault("DATA_DIR", str(SESSION_DATA_DIR))
# CLI Helpers (defined before fixtures that use them)
# =============================================================================
def run_archivebox_cmd(
args: List[str],
args: list[str],
data_dir: Path,
stdin: Optional[str] = None,
stdin: str | None = None,
timeout: int = 60,
env: Optional[Dict[str, str]] = None,
) -> Tuple[str, str, int]:
env: dict[str, str] | None = None,
) -> tuple[str, str, int]:
"""
Run archivebox command via subprocess, return (stdout, stderr, returncode).
@@ -44,28 +45,28 @@ def run_archivebox_cmd(
Returns:
Tuple of (stdout, stderr, returncode)
"""
cmd = [sys.executable, '-m', 'archivebox'] + args
cmd = [sys.executable, "-m", "archivebox"] + args
base_env = os.environ.copy()
base_env['DATA_DIR'] = str(data_dir)
base_env['USE_COLOR'] = 'False'
base_env['SHOW_PROGRESS'] = 'False'
base_env["DATA_DIR"] = str(data_dir)
base_env["USE_COLOR"] = "False"
base_env["SHOW_PROGRESS"] = "False"
# Disable slow extractors for faster tests
base_env['SAVE_ARCHIVEDOTORG'] = 'False'
base_env['SAVE_TITLE'] = 'False'
base_env['SAVE_FAVICON'] = 'False'
base_env['SAVE_WGET'] = 'False'
base_env['SAVE_WARC'] = 'False'
base_env['SAVE_PDF'] = 'False'
base_env['SAVE_SCREENSHOT'] = 'False'
base_env['SAVE_DOM'] = 'False'
base_env['SAVE_SINGLEFILE'] = 'False'
base_env['SAVE_READABILITY'] = 'False'
base_env['SAVE_MERCURY'] = 'False'
base_env['SAVE_GIT'] = 'False'
base_env['SAVE_YTDLP'] = 'False'
base_env['SAVE_HEADERS'] = 'False'
base_env['SAVE_HTMLTOTEXT'] = 'False'
base_env["SAVE_ARCHIVEDOTORG"] = "False"
base_env["SAVE_TITLE"] = "False"
base_env["SAVE_FAVICON"] = "False"
base_env["SAVE_WGET"] = "False"
base_env["SAVE_WARC"] = "False"
base_env["SAVE_PDF"] = "False"
base_env["SAVE_SCREENSHOT"] = "False"
base_env["SAVE_DOM"] = "False"
base_env["SAVE_SINGLEFILE"] = "False"
base_env["SAVE_READABILITY"] = "False"
base_env["SAVE_MERCURY"] = "False"
base_env["SAVE_GIT"] = "False"
base_env["SAVE_YTDLP"] = "False"
base_env["SAVE_HEADERS"] = "False"
base_env["SAVE_HTMLTOTEXT"] = "False"
if env:
base_env.update(env)
@@ -87,6 +88,7 @@ def run_archivebox_cmd(
# Fixtures
# =============================================================================
@pytest.fixture(autouse=True)
def isolate_test_runtime(tmp_path):
"""
@@ -117,6 +119,7 @@ def isolate_test_runtime(tmp_path):
def pytest_sessionfinish(session, exitstatus):
    """Remove the session-wide data directory.

    The name matches pytest's session-finish hook; ``session`` and
    ``exitstatus`` are accepted for that signature and are not used here.
    """
    # ignore_errors=True keeps cleanup best-effort: a missing or partially
    # removed directory does not raise.
    shutil.rmtree(SESSION_DATA_DIR, ignore_errors=True)
@pytest.fixture
def isolated_data_dir(tmp_path):
"""
@@ -124,7 +127,7 @@ def isolated_data_dir(tmp_path):
Uses tmp_path for complete isolation.
"""
data_dir = tmp_path / 'archivebox_data'
data_dir = tmp_path / "archivebox_data"
data_dir.mkdir()
return data_dir
@@ -137,7 +140,7 @@ def initialized_archive(isolated_data_dir):
Runs `archivebox init` via subprocess to set up database and directories.
"""
stdout, stderr, returncode = run_archivebox_cmd(
['init', '--quick'],
["init", "--quick"],
data_dir=isolated_data_dir,
timeout=60,
)
@@ -149,23 +152,24 @@ def initialized_archive(isolated_data_dir):
# CWD-based CLI Helpers (no DATA_DIR env)
# =============================================================================
def run_archivebox_cmd_cwd(
args: List[str],
args: list[str],
cwd: Path,
stdin: Optional[str] = None,
stdin: str | None = None,
timeout: int = 60,
env: Optional[Dict[str, str]] = None,
) -> Tuple[str, str, int]:
env: dict[str, str] | None = None,
) -> tuple[str, str, int]:
"""
Run archivebox command via subprocess using cwd as DATA_DIR (no DATA_DIR env).
Returns (stdout, stderr, returncode).
"""
cmd = [sys.executable, '-m', 'archivebox'] + args
cmd = [sys.executable, "-m", "archivebox"] + args
base_env = os.environ.copy()
base_env.pop('DATA_DIR', None)
base_env['USE_COLOR'] = 'False'
base_env['SHOW_PROGRESS'] = 'False'
base_env.pop("DATA_DIR", None)
base_env["USE_COLOR"] = "False"
base_env["SHOW_PROGRESS"] = "False"
if env:
base_env.update(env)
@@ -183,7 +187,7 @@ def run_archivebox_cmd_cwd(
return result.stdout, result.stderr, result.returncode
def stop_process(proc: subprocess.Popen[str]) -> Tuple[str, str]:
def stop_process(proc: subprocess.Popen[str]) -> tuple[str, str]:
if proc.poll() is None:
proc.terminate()
try:
@@ -197,11 +201,11 @@ def run_python_cwd(
script: str,
cwd: Path,
timeout: int = 60,
) -> Tuple[str, str, int]:
) -> tuple[str, str, int]:
base_env = os.environ.copy()
base_env.pop('DATA_DIR', None)
base_env.pop("DATA_DIR", None)
result = subprocess.run(
[sys.executable, '-'],
[sys.executable, "-"],
input=script,
capture_output=True,
text=True,
@@ -253,7 +257,7 @@ def wait_for_archive_outputs(
rel_path = candidate.relative_to(snapshot_dir)
if rel_path.parts and rel_path.parts[0] == 'responses':
continue
if rel_path.name in {'stdout.log', 'stderr.log', 'cmd.sh'}:
if rel_path.name in {"stdout.log", "stderr.log", "cmd.sh"}:
continue
output_rel = str(rel_path)
break
@@ -267,64 +271,68 @@ def wait_for_archive_outputs(
raise SystemExit(1)
print('READY')
"""
""",
)
deadline = time.time() + timeout
while time.time() < deadline:
stdout, _stderr, returncode = run_python_cwd(script, cwd=cwd, timeout=30)
if returncode == 0 and 'READY' in stdout:
if returncode == 0 and "READY" in stdout:
return True
time.sleep(interval)
return False
def _get_machine_type() -> str:
    """Return an ``<arch>-<os>`` identifier for the current machine.

    Both parts are lower-cased values from ``platform.machine()`` and
    ``platform.system()``. A ``-docker`` suffix is appended when the
    ``IN_DOCKER`` environment variable is set to ``1``, ``true`` or ``yes``
    (case-insensitive).

    Returns:
        A string such as ``x86_64-linux`` or ``arm64-darwin-docker``.
    """
    import platform

    os_name = platform.system().lower()
    arch = platform.machine().lower()
    in_docker = os.environ.get("IN_DOCKER", "").lower() in ("1", "true", "yes")
    suffix = "-docker" if in_docker else ""
    return f"{arch}-{os_name}{suffix}"
def _find_cached_chromium(lib_dir: Path) -> Optional[Path]:
def _find_cached_chromium(lib_dir: Path) -> Path | None:
candidates = [
lib_dir / 'puppeteer',
lib_dir / 'npm' / 'node_modules' / 'puppeteer' / '.local-chromium',
lib_dir / "puppeteer",
lib_dir / "npm" / "node_modules" / "puppeteer" / ".local-chromium",
]
for base in candidates:
if not base.exists():
continue
for path in base.rglob('Chromium.app/Contents/MacOS/Chromium'):
for path in base.rglob("Chromium.app/Contents/MacOS/Chromium"):
return path
for path in base.rglob('chrome-linux/chrome'):
for path in base.rglob("chrome-linux/chrome"):
return path
for path in base.rglob('chrome-linux64/chrome'):
for path in base.rglob("chrome-linux64/chrome"):
return path
return None
def _find_system_browser() -> Optional[Path]:
def _find_system_browser() -> Path | None:
candidates = [
Path('/Applications/Chromium.app/Contents/MacOS/Chromium'),
Path('/usr/bin/chromium'),
Path('/usr/bin/chromium-browser'),
Path("/Applications/Chromium.app/Contents/MacOS/Chromium"),
Path("/usr/bin/chromium"),
Path("/usr/bin/chromium-browser"),
]
for candidate in candidates:
if candidate.exists():
return candidate
return None
def _ensure_puppeteer(shared_lib: Path) -> None:
npm_prefix = shared_lib / 'npm'
node_modules = npm_prefix / 'node_modules'
puppeteer_dir = node_modules / 'puppeteer'
npm_prefix = shared_lib / "npm"
node_modules = npm_prefix / "node_modules"
puppeteer_dir = node_modules / "puppeteer"
if puppeteer_dir.exists():
return
npm_prefix.mkdir(parents=True, exist_ok=True)
env = os.environ.copy()
env['PUPPETEER_SKIP_DOWNLOAD'] = '1'
env["PUPPETEER_SKIP_DOWNLOAD"] = "1"
subprocess.run(
['npm', 'install', 'puppeteer'],
["npm", "install", "puppeteer"],
cwd=str(npm_prefix),
env=env,
check=True,
@@ -345,7 +353,7 @@ def real_archive_with_example(tmp_path_factory, request):
request.cls.data_dir = tmp_path
stdout, stderr, returncode = run_archivebox_cmd_cwd(
['init', '--quick'],
["init", "--quick"],
cwd=tmp_path,
timeout=120,
)
@@ -353,28 +361,28 @@ def real_archive_with_example(tmp_path_factory, request):
stdout, stderr, returncode = run_archivebox_cmd_cwd(
[
'config',
'--set',
'LISTEN_HOST=archivebox.localhost:8000',
'PUBLIC_INDEX=True',
'PUBLIC_SNAPSHOTS=True',
'PUBLIC_ADD_VIEW=True',
"config",
"--set",
"LISTEN_HOST=archivebox.localhost:8000",
"PUBLIC_INDEX=True",
"PUBLIC_SNAPSHOTS=True",
"PUBLIC_ADD_VIEW=True",
],
cwd=tmp_path,
)
assert returncode == 0, f"archivebox config failed: {stderr}"
add_env = {
'RESPONSES_ENABLED': 'True',
'SHOW_PROGRESS': 'False',
'USE_COLOR': 'False',
'RESPONSES_TIMEOUT': '30',
"RESPONSES_ENABLED": "True",
"SHOW_PROGRESS": "False",
"USE_COLOR": "False",
"RESPONSES_TIMEOUT": "30",
}
cmd = [sys.executable, '-m', 'archivebox', 'add', '--depth=0', '--plugins=responses', 'https://example.com']
cmd = [sys.executable, "-m", "archivebox", "add", "--depth=0", "--plugins=responses", "https://example.com"]
base_env = os.environ.copy()
base_env.pop('DATA_DIR', None)
base_env['USE_COLOR'] = 'False'
base_env['SHOW_PROGRESS'] = 'False'
base_env.pop("DATA_DIR", None)
base_env["USE_COLOR"] = "False"
base_env["SHOW_PROGRESS"] = "False"
base_env.update(add_env)
proc = subprocess.Popen(
@@ -386,7 +394,7 @@ def real_archive_with_example(tmp_path_factory, request):
env=base_env,
)
ready = wait_for_archive_outputs(tmp_path, 'https://example.com', timeout=600)
ready = wait_for_archive_outputs(tmp_path, "https://example.com", timeout=600)
stdout, stderr = stop_process(proc)
assert ready, f"archivebox add did not produce required outputs within timeout:\nSTDOUT:\n{stdout}\nSTDERR:\n{stderr}"
@@ -397,34 +405,34 @@ def real_archive_with_example(tmp_path_factory, request):
# Output Assertions
# =============================================================================
def parse_jsonl_output(stdout: str) -> list[dict[str, Any]]:
    """Parse JSONL command output into a list of dicts via the Process parser.

    Args:
        stdout: Captured command output; a falsy value is treated as ``""``.

    Returns:
        Whatever ``Process.parse_records_from_text`` yields for the text.
    """
    # Imported inside the function, as in the original, so the model module
    # is only loaded when the parser is actually called.
    from archivebox.machine.models import Process

    return Process.parse_records_from_text(stdout or "")
def assert_jsonl_contains_type(stdout: str, record_type: str, min_count: int = 1):
    """Assert the output has at least ``min_count`` records of ``record_type``.

    Args:
        stdout: Captured JSONL output to parse.
        record_type: Value each matching record must carry under ``"type"``.
        min_count: Minimum number of matching records required.

    Returns:
        The list of matching records, so callers can inspect them further.

    Raises:
        AssertionError: If fewer than ``min_count`` records match.
    """
    records = parse_jsonl_output(stdout)
    matching = [r for r in records if r.get("type") == record_type]
    assert len(matching) >= min_count, f"Expected >= {min_count} {record_type}, got {len(matching)}"
    return matching
def assert_jsonl_pass_through(stdout: str, input_records: list[dict[str, Any]]):
    """Assert that every input record's id appears in the parsed output.

    Input records with no ``"id"`` (or a falsy one) are not checked, and
    output records with a falsy id are ignored.

    Args:
        stdout: Captured JSONL output to parse.
        input_records: Records that were fed to the command.

    Raises:
        AssertionError: If an input id is missing from the output.
    """
    output_records = parse_jsonl_output(stdout)
    output_ids = {r.get("id") for r in output_records if r.get("id")}

    for input_rec in input_records:
        input_id = input_rec.get("id")
        if input_id:
            assert input_id in output_ids, f"Input record {input_id} not found in output (pass-through failed)"
def assert_record_has_fields(record: Dict[str, Any], required_fields: List[str]):
def assert_record_has_fields(record: dict[str, Any], required_fields: list[str]):
"""Assert record has all required fields with non-None values."""
for field in required_fields:
assert field in record, f"Record missing field: {field}"
@@ -435,31 +443,32 @@ def assert_record_has_fields(record: Dict[str, Any], required_fields: List[str])
# Test Data Factories
# =============================================================================
def create_test_url(domain: str = 'example.com', path: str | None = None) -> str:
def create_test_url(domain: str = "example.com", path: str | None = None) -> str:
"""Generate unique test URL."""
path = path or uuid7().hex[:8]
return f'https://{domain}/{path}'
return f"https://{domain}/{path}"
def create_test_crawl_json(urls: List[str] | None = None, **kwargs) -> Dict[str, Any]:
def create_test_crawl_json(urls: list[str] | None = None, **kwargs) -> dict[str, Any]:
"""Create Crawl JSONL record for testing."""
urls = urls or [create_test_url()]
return {
'type': 'Crawl',
'urls': '\n'.join(urls),
'max_depth': kwargs.get('max_depth', 0),
'tags_str': kwargs.get('tags_str', ''),
'status': kwargs.get('status', 'queued'),
**{k: v for k, v in kwargs.items() if k not in ('max_depth', 'tags_str', 'status')},
"type": "Crawl",
"urls": "\n".join(urls),
"max_depth": kwargs.get("max_depth", 0),
"tags_str": kwargs.get("tags_str", ""),
"status": kwargs.get("status", "queued"),
**{k: v for k, v in kwargs.items() if k not in ("max_depth", "tags_str", "status")},
}
def create_test_snapshot_json(url: str | None = None, **kwargs) -> Dict[str, Any]:
def create_test_snapshot_json(url: str | None = None, **kwargs) -> dict[str, Any]:
"""Create Snapshot JSONL record for testing."""
return {
'type': 'Snapshot',
'url': url or create_test_url(),
'tags_str': kwargs.get('tags_str', ''),
'status': kwargs.get('status', 'queued'),
**{k: v for k, v in kwargs.items() if k not in ('tags_str', 'status')},
"type": "Snapshot",
"url": url or create_test_url(),
"tags_str": kwargs.get("tags_str", ""),
"status": kwargs.get("status", "queued"),
**{k: v for k, v in kwargs.items() if k not in ("tags_str", "status")},
}

View File

@@ -5,34 +5,38 @@ from threading import Thread
import pytest
@pytest.fixture
def process(tmp_path):
    """Run ``archivebox init`` inside ``tmp_path`` and return the result.

    Returns:
        The ``subprocess.CompletedProcess`` with stdout/stderr captured as
        bytes. The exit status is not checked here; callers inspect it.
    """
    # Returned directly rather than bound to a local called ``process``,
    # which would shadow the fixture's own name.
    return subprocess.run(
        ["archivebox", "init"],
        capture_output=True,
        cwd=tmp_path,
    )
@pytest.fixture
def disable_extractors_dict():
    """Return a copy of ``os.environ`` with every ``SAVE_*`` flag set to "false".

    Returns:
        A new dict that can be passed as ``env=`` to a subprocess; the
        real process environment is left untouched.
    """
    save_flags = (
        "SAVE_WGET",
        "SAVE_SINGLEFILE",
        "SAVE_READABILITY",
        "SAVE_MERCURY",
        "SAVE_HTMLTOTEXT",
        "SAVE_PDF",
        "SAVE_SCREENSHOT",
        "SAVE_DOM",
        "SAVE_HEADERS",
        "SAVE_GIT",
        "SAVE_YTDLP",
        "SAVE_ARCHIVEDOTORG",
        "SAVE_TITLE",
        "SAVE_FAVICON",
    )
    env = os.environ.copy()
    env.update(dict.fromkeys(save_flags, "false"))
    return env

View File

@@ -15,7 +15,6 @@ import sqlite3
import subprocess
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Tuple
from archivebox.uuid_compat import uuid7
@@ -494,6 +493,7 @@ INSERT INTO django_content_type (app_label, model) VALUES
# Test Data Generators
# =============================================================================
def generate_uuid() -> str:
    """Return a fresh ``uuid7`` as a dash-free hex string, for use in SQLite."""
    new_id = uuid7()
    return new_id.hex
@@ -501,45 +501,50 @@ def generate_uuid() -> str:
def generate_timestamp() -> str:
    """Generate a timestamp string like ArchiveBox uses.

    Returns:
        The current UTC time formatted as ``YYYYMMDDHHMMSS`` followed by a
        literal ``.000000`` (the microseconds part is always zero).
    """
    return datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S") + ".000000"
def seed_0_4_data(db_path: Path) -> Dict[str, List[Dict]]:
def seed_0_4_data(db_path: Path) -> dict[str, list[dict]]:
"""Seed a 0.4.x database with realistic test data."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
created_data = {
'snapshots': [],
'tags_str': [],
"snapshots": [],
"tags_str": [],
}
test_urls = [
('https://example.com/page1', 'Example Page 1', 'news,tech'),
('https://example.org/article', 'Article Title', 'blog,reading'),
('https://github.com/user/repo', 'GitHub Repository', 'code,github'),
('https://news.ycombinator.com/item?id=12345', 'HN Discussion', 'news,discussion'),
('https://en.wikipedia.org/wiki/Test', 'Wikipedia Test', 'reference,wiki'),
("https://example.com/page1", "Example Page 1", "news,tech"),
("https://example.org/article", "Article Title", "blog,reading"),
("https://github.com/user/repo", "GitHub Repository", "code,github"),
("https://news.ycombinator.com/item?id=12345", "HN Discussion", "news,discussion"),
("https://en.wikipedia.org/wiki/Test", "Wikipedia Test", "reference,wiki"),
]
for i, (url, title, tags) in enumerate(test_urls):
snapshot_id = generate_uuid()
timestamp = f'2024010{i+1}120000.000000'
added = f'2024-01-0{i+1} 12:00:00'
timestamp = f"2024010{i + 1}120000.000000"
added = f"2024-01-0{i + 1} 12:00:00"
cursor.execute("""
cursor.execute(
"""
INSERT INTO core_snapshot (id, url, timestamp, title, tags, added, updated)
VALUES (?, ?, ?, ?, ?, ?, ?)
""", (snapshot_id, url, timestamp, title, tags, added, added))
""",
(snapshot_id, url, timestamp, title, tags, added, added),
)
created_data['snapshots'].append({
'id': snapshot_id,
'url': url,
'timestamp': timestamp,
'title': title,
'tags': tags,
})
created_data['tags_str'].append(tags)
created_data["snapshots"].append(
{
"id": snapshot_id,
"url": url,
"timestamp": timestamp,
"title": title,
"tags": tags,
},
)
created_data["tags_str"].append(tags)
cursor.execute("""
INSERT INTO django_migrations (app, name, applied)
@@ -552,16 +557,16 @@ def seed_0_4_data(db_path: Path) -> Dict[str, List[Dict]]:
return created_data
def seed_0_7_data(db_path: Path) -> Dict[str, List[Dict]]:
def seed_0_7_data(db_path: Path) -> dict[str, list[dict]]:
"""Seed a 0.7.x database with realistic test data."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
created_data = {
'users': [],
'snapshots': [],
'tags': [],
'archiveresults': [],
"users": [],
"snapshots": [],
"tags": [],
"archiveresults": [],
}
# Create a user
@@ -572,125 +577,145 @@ def seed_0_7_data(db_path: Path) -> Dict[str, List[Dict]]:
'admin@example.com', 1, 1, datetime('now'))
""")
user_id = cursor.lastrowid
created_data['users'].append({'id': user_id, 'username': 'admin'})
created_data["users"].append({"id": user_id, "username": "admin"})
# Create 5 tags
tag_names = ['news', 'tech', 'blog', 'reference', 'code']
tag_names = ["news", "tech", "blog", "reference", "code"]
for name in tag_names:
cursor.execute("""
cursor.execute(
"""
INSERT INTO core_tag (name, slug) VALUES (?, ?)
""", (name, name.lower()))
""",
(name, name.lower()),
)
tag_id = cursor.lastrowid
created_data['tags'].append({'id': tag_id, 'name': name, 'slug': name.lower()})
created_data["tags"].append({"id": tag_id, "name": name, "slug": name.lower()})
# Create 5 snapshots
test_urls = [
('https://example.com/page1', 'Example Page 1'),
('https://example.org/article', 'Article Title'),
('https://github.com/user/repo', 'GitHub Repository'),
('https://news.ycombinator.com/item?id=12345', 'HN Discussion'),
('https://en.wikipedia.org/wiki/Test', 'Wikipedia Test'),
("https://example.com/page1", "Example Page 1"),
("https://example.org/article", "Article Title"),
("https://github.com/user/repo", "GitHub Repository"),
("https://news.ycombinator.com/item?id=12345", "HN Discussion"),
("https://en.wikipedia.org/wiki/Test", "Wikipedia Test"),
]
for i, (url, title) in enumerate(test_urls):
snapshot_id = generate_uuid()
timestamp = f'2024010{i+1}120000.000000'
added = f'2024-01-0{i+1} 12:00:00'
timestamp = f"2024010{i + 1}120000.000000"
added = f"2024-01-0{i + 1} 12:00:00"
cursor.execute("""
cursor.execute(
"""
INSERT INTO core_snapshot (id, url, timestamp, title, added, updated)
VALUES (?, ?, ?, ?, ?, ?)
""", (snapshot_id, url, timestamp, title, added, added))
""",
(snapshot_id, url, timestamp, title, added, added),
)
created_data['snapshots'].append({
'id': snapshot_id,
'url': url,
'timestamp': timestamp,
'title': title,
})
created_data["snapshots"].append(
{
"id": snapshot_id,
"url": url,
"timestamp": timestamp,
"title": title,
},
)
# Assign 2 tags to each snapshot
tag_ids = [created_data['tags'][i % 5]['id'], created_data['tags'][(i + 1) % 5]['id']]
tag_ids = [created_data["tags"][i % 5]["id"], created_data["tags"][(i + 1) % 5]["id"]]
for tag_id in tag_ids:
cursor.execute("""
cursor.execute(
"""
INSERT INTO core_snapshot_tags (snapshot_id, tag_id) VALUES (?, ?)
""", (snapshot_id, tag_id))
""",
(snapshot_id, tag_id),
)
# Create 5 archive results for each snapshot
extractors = ['title', 'favicon', 'screenshot', 'singlefile', 'wget']
statuses = ['succeeded', 'succeeded', 'failed', 'succeeded', 'skipped']
extractors = ["title", "favicon", "screenshot", "singlefile", "wget"]
statuses = ["succeeded", "succeeded", "failed", "succeeded", "skipped"]
for j, (extractor, status) in enumerate(zip(extractors, statuses)):
cursor.execute("""
cursor.execute(
"""
INSERT INTO core_archiveresult
(snapshot_id, extractor, cmd, pwd, cmd_version, output, start_ts, end_ts, status)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
snapshot_id, extractor,
json.dumps([extractor, '--version']),
f'/data/archive/{timestamp}',
'1.0.0',
f'{extractor}/index.html' if status == 'succeeded' else '',
f'2024-01-0{i+1} 12:00:0{j}',
f'2024-01-0{i+1} 12:00:1{j}',
status
))
""",
(
snapshot_id,
extractor,
json.dumps([extractor, "--version"]),
f"/data/archive/{timestamp}",
"1.0.0",
f"{extractor}/index.html" if status == "succeeded" else "",
f"2024-01-0{i + 1} 12:00:0{j}",
f"2024-01-0{i + 1} 12:00:1{j}",
status,
),
)
created_data['archiveresults'].append({
'snapshot_id': snapshot_id,
'extractor': extractor,
'status': status,
})
created_data["archiveresults"].append(
{
"snapshot_id": snapshot_id,
"extractor": extractor,
"status": status,
},
)
# Record migrations as applied (0.7.x migrations up to 0022)
migrations = [
('contenttypes', '0001_initial'),
('contenttypes', '0002_remove_content_type_name'),
('auth', '0001_initial'),
('auth', '0002_alter_permission_name_max_length'),
('auth', '0003_alter_user_email_max_length'),
('auth', '0004_alter_user_username_opts'),
('auth', '0005_alter_user_last_login_null'),
('auth', '0006_require_contenttypes_0002'),
('auth', '0007_alter_validators_add_error_messages'),
('auth', '0008_alter_user_username_max_length'),
('auth', '0009_alter_user_last_name_max_length'),
('auth', '0010_alter_group_name_max_length'),
('auth', '0011_update_proxy_permissions'),
('auth', '0012_alter_user_first_name_max_length'),
('admin', '0001_initial'),
('admin', '0002_logentry_remove_auto_add'),
('admin', '0003_logentry_add_action_flag_choices'),
('sessions', '0001_initial'),
('core', '0001_initial'),
('core', '0002_auto_20200625_1521'),
('core', '0003_auto_20200630_1034'),
('core', '0004_auto_20200713_1552'),
('core', '0005_auto_20200728_0326'),
('core', '0006_auto_20201012_1520'),
('core', '0007_archiveresult'),
('core', '0008_auto_20210105_1421'),
('core', '0009_auto_20210216_1038'),
('core', '0010_auto_20210216_1055'),
('core', '0011_auto_20210216_1331'),
('core', '0012_auto_20210216_1425'),
('core', '0013_auto_20210218_0729'),
('core', '0014_auto_20210218_0729'),
('core', '0015_auto_20210218_0730'),
('core', '0016_auto_20210218_1204'),
('core', '0017_auto_20210219_0211'),
('core', '0018_auto_20210327_0952'),
('core', '0019_auto_20210401_0654'),
('core', '0020_auto_20210410_1031'),
('core', '0021_auto_20220914_0934'),
('core', '0022_auto_20231023_2008'),
("contenttypes", "0001_initial"),
("contenttypes", "0002_remove_content_type_name"),
("auth", "0001_initial"),
("auth", "0002_alter_permission_name_max_length"),
("auth", "0003_alter_user_email_max_length"),
("auth", "0004_alter_user_username_opts"),
("auth", "0005_alter_user_last_login_null"),
("auth", "0006_require_contenttypes_0002"),
("auth", "0007_alter_validators_add_error_messages"),
("auth", "0008_alter_user_username_max_length"),
("auth", "0009_alter_user_last_name_max_length"),
("auth", "0010_alter_group_name_max_length"),
("auth", "0011_update_proxy_permissions"),
("auth", "0012_alter_user_first_name_max_length"),
("admin", "0001_initial"),
("admin", "0002_logentry_remove_auto_add"),
("admin", "0003_logentry_add_action_flag_choices"),
("sessions", "0001_initial"),
("core", "0001_initial"),
("core", "0002_auto_20200625_1521"),
("core", "0003_auto_20200630_1034"),
("core", "0004_auto_20200713_1552"),
("core", "0005_auto_20200728_0326"),
("core", "0006_auto_20201012_1520"),
("core", "0007_archiveresult"),
("core", "0008_auto_20210105_1421"),
("core", "0009_auto_20210216_1038"),
("core", "0010_auto_20210216_1055"),
("core", "0011_auto_20210216_1331"),
("core", "0012_auto_20210216_1425"),
("core", "0013_auto_20210218_0729"),
("core", "0014_auto_20210218_0729"),
("core", "0015_auto_20210218_0730"),
("core", "0016_auto_20210218_1204"),
("core", "0017_auto_20210219_0211"),
("core", "0018_auto_20210327_0952"),
("core", "0019_auto_20210401_0654"),
("core", "0020_auto_20210410_1031"),
("core", "0021_auto_20220914_0934"),
("core", "0022_auto_20231023_2008"),
]
for app, name in migrations:
cursor.execute("""
cursor.execute(
"""
INSERT INTO django_migrations (app, name, applied)
VALUES (?, ?, datetime('now'))
""", (app, name))
""",
(app, name),
)
conn.commit()
conn.close()
@@ -698,17 +723,17 @@ def seed_0_7_data(db_path: Path) -> Dict[str, List[Dict]]:
return created_data
def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
def seed_0_8_data(db_path: Path) -> dict[str, list[dict]]:
"""Seed a 0.8.x database with realistic test data including Crawls."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
created_data = {
'users': [],
'crawls': [],
'snapshots': [],
'tags': [],
'archiveresults': [],
"users": [],
"crawls": [],
"snapshots": [],
"tags": [],
"archiveresults": [],
}
# Create a user
@@ -719,243 +744,271 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
'admin@example.com', 1, 1, datetime('now'))
""")
user_id = cursor.lastrowid
created_data['users'].append({'id': user_id, 'username': 'admin'})
created_data["users"].append({"id": user_id, "username": "admin"})
# Create 5 tags
tag_names = ['news', 'tech', 'blog', 'reference', 'code']
tag_names = ["news", "tech", "blog", "reference", "code"]
for name in tag_names:
cursor.execute("""
cursor.execute(
"""
INSERT INTO core_tag (name, slug, created_at, modified_at, created_by_id)
VALUES (?, ?, datetime('now'), datetime('now'), ?)
""", (name, name.lower(), user_id))
""",
(name, name.lower(), user_id),
)
tag_id = cursor.lastrowid
created_data['tags'].append({'id': tag_id, 'name': name, 'slug': name.lower()})
created_data["tags"].append({"id": tag_id, "name": name, "slug": name.lower()})
# Create 2 Crawls (0.9.0 schema - no seeds)
test_crawls = [
('https://example.com\nhttps://example.org', 0, 'Example Crawl'),
('https://github.com/ArchiveBox', 1, 'GitHub Crawl'),
("https://example.com\nhttps://example.org", 0, "Example Crawl"),
("https://github.com/ArchiveBox", 1, "GitHub Crawl"),
]
for i, (urls, max_depth, label) in enumerate(test_crawls):
crawl_id = generate_uuid()
cursor.execute("""
cursor.execute(
"""
INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, urls,
config, max_depth, tags_str, label, status, retry_at,
num_uses_failed, num_uses_succeeded)
VALUES (?, datetime('now'), ?, datetime('now'), ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0)
""", (crawl_id, user_id, urls, max_depth, label))
""",
(crawl_id, user_id, urls, max_depth, label),
)
created_data['crawls'].append({
'id': crawl_id,
'urls': urls,
'max_depth': max_depth,
'label': label,
})
created_data["crawls"].append(
{
"id": crawl_id,
"urls": urls,
"max_depth": max_depth,
"label": label,
},
)
# Create 5 snapshots linked to crawls
test_urls = [
('https://example.com/page1', 'Example Page 1', created_data['crawls'][0]['id']),
('https://example.org/article', 'Article Title', created_data['crawls'][0]['id']),
('https://github.com/user/repo', 'GitHub Repository', created_data['crawls'][1]['id']),
('https://news.ycombinator.com/item?id=12345', 'HN Discussion', None),
('https://en.wikipedia.org/wiki/Test', 'Wikipedia Test', None),
("https://example.com/page1", "Example Page 1", created_data["crawls"][0]["id"]),
("https://example.org/article", "Article Title", created_data["crawls"][0]["id"]),
("https://github.com/user/repo", "GitHub Repository", created_data["crawls"][1]["id"]),
("https://news.ycombinator.com/item?id=12345", "HN Discussion", None),
("https://en.wikipedia.org/wiki/Test", "Wikipedia Test", None),
]
for i, (url, title, crawl_id) in enumerate(test_urls):
snapshot_id = generate_uuid()
timestamp = f'2024010{i+1}120000.000000'
created_at = f'2024-01-0{i+1} 12:00:00'
timestamp = f"2024010{i + 1}120000.000000"
created_at = f"2024-01-0{i + 1} 12:00:00"
cursor.execute("""
cursor.execute(
"""
INSERT INTO core_snapshot (id, created_by_id, created_at, modified_at, url, timestamp,
bookmarked_at, crawl_id, title, depth, status, config, notes)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 0, 'queued', '{}', '')
""", (snapshot_id, user_id, created_at, created_at, url, timestamp, created_at, crawl_id, title))
""",
(snapshot_id, user_id, created_at, created_at, url, timestamp, created_at, crawl_id, title),
)
created_data['snapshots'].append({
'id': snapshot_id,
'url': url,
'timestamp': timestamp,
'title': title,
'crawl_id': crawl_id,
})
created_data["snapshots"].append(
{
"id": snapshot_id,
"url": url,
"timestamp": timestamp,
"title": title,
"crawl_id": crawl_id,
},
)
# Assign 2 tags to each snapshot
tag_ids = [created_data['tags'][i % 5]['id'], created_data['tags'][(i + 1) % 5]['id']]
tag_ids = [created_data["tags"][i % 5]["id"], created_data["tags"][(i + 1) % 5]["id"]]
for tag_id in tag_ids:
cursor.execute("""
cursor.execute(
"""
INSERT INTO core_snapshot_tags (snapshot_id, tag_id) VALUES (?, ?)
""", (snapshot_id, tag_id))
""",
(snapshot_id, tag_id),
)
# Create 5 archive results for each snapshot
extractors = ['title', 'favicon', 'screenshot', 'singlefile', 'wget']
statuses = ['succeeded', 'succeeded', 'failed', 'succeeded', 'skipped']
extractors = ["title", "favicon", "screenshot", "singlefile", "wget"]
statuses = ["succeeded", "succeeded", "failed", "succeeded", "skipped"]
for j, (extractor, status) in enumerate(zip(extractors, statuses)):
result_uuid = generate_uuid()
cursor.execute("""
cursor.execute(
"""
INSERT INTO core_archiveresult
(uuid, created_by_id, created_at, modified_at, snapshot_id, extractor, pwd,
cmd, cmd_version, output, start_ts, end_ts, status, retry_at, notes, output_dir)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'), '', ?)
""", (
result_uuid, user_id, f'2024-01-0{i+1} 12:00:0{j}', f'2024-01-0{i+1} 12:00:1{j}',
snapshot_id, extractor,
f'/data/archive/{timestamp}',
json.dumps([extractor, '--version']),
'1.0.0',
f'{extractor}/index.html' if status == 'succeeded' else '',
f'2024-01-0{i+1} 12:00:0{j}',
f'2024-01-0{i+1} 12:00:1{j}',
status,
f'{extractor}',
))
""",
(
result_uuid,
user_id,
f"2024-01-0{i + 1} 12:00:0{j}",
f"2024-01-0{i + 1} 12:00:1{j}",
snapshot_id,
extractor,
f"/data/archive/{timestamp}",
json.dumps([extractor, "--version"]),
"1.0.0",
f"{extractor}/index.html" if status == "succeeded" else "",
f"2024-01-0{i + 1} 12:00:0{j}",
f"2024-01-0{i + 1} 12:00:1{j}",
status,
f"{extractor}",
),
)
created_data['archiveresults'].append({
'uuid': result_uuid,
'snapshot_id': snapshot_id,
'extractor': extractor,
'status': status,
})
created_data["archiveresults"].append(
{
"uuid": result_uuid,
"snapshot_id": snapshot_id,
"extractor": extractor,
"status": status,
},
)
# Record migrations as applied (0.8.x migrations)
migrations = [
('contenttypes', '0001_initial'),
('contenttypes', '0002_remove_content_type_name'),
('auth', '0001_initial'),
('auth', '0002_alter_permission_name_max_length'),
('auth', '0003_alter_user_email_max_length'),
('auth', '0004_alter_user_username_opts'),
('auth', '0005_alter_user_last_login_null'),
('auth', '0006_require_contenttypes_0002'),
('auth', '0007_alter_validators_add_error_messages'),
('auth', '0008_alter_user_username_max_length'),
('auth', '0009_alter_user_last_name_max_length'),
('auth', '0010_alter_group_name_max_length'),
('auth', '0011_update_proxy_permissions'),
('auth', '0012_alter_user_first_name_max_length'),
('admin', '0001_initial'),
('admin', '0002_logentry_remove_auto_add'),
('admin', '0003_logentry_add_action_flag_choices'),
('sessions', '0001_initial'),
('core', '0001_initial'),
('core', '0002_auto_20200625_1521'),
('core', '0003_auto_20200630_1034'),
('core', '0004_auto_20200713_1552'),
('core', '0005_auto_20200728_0326'),
('core', '0006_auto_20201012_1520'),
('core', '0007_archiveresult'),
('core', '0008_auto_20210105_1421'),
('core', '0009_auto_20210216_1038'),
('core', '0010_auto_20210216_1055'),
('core', '0011_auto_20210216_1331'),
('core', '0012_auto_20210216_1425'),
('core', '0013_auto_20210218_0729'),
('core', '0014_auto_20210218_0729'),
('core', '0015_auto_20210218_0730'),
('core', '0016_auto_20210218_1204'),
('core', '0017_auto_20210219_0211'),
('core', '0018_auto_20210327_0952'),
('core', '0019_auto_20210401_0654'),
('core', '0020_auto_20210410_1031'),
('core', '0021_auto_20220914_0934'),
('core', '0022_auto_20231023_2008'),
("contenttypes", "0001_initial"),
("contenttypes", "0002_remove_content_type_name"),
("auth", "0001_initial"),
("auth", "0002_alter_permission_name_max_length"),
("auth", "0003_alter_user_email_max_length"),
("auth", "0004_alter_user_username_opts"),
("auth", "0005_alter_user_last_login_null"),
("auth", "0006_require_contenttypes_0002"),
("auth", "0007_alter_validators_add_error_messages"),
("auth", "0008_alter_user_username_max_length"),
("auth", "0009_alter_user_last_name_max_length"),
("auth", "0010_alter_group_name_max_length"),
("auth", "0011_update_proxy_permissions"),
("auth", "0012_alter_user_first_name_max_length"),
("admin", "0001_initial"),
("admin", "0002_logentry_remove_auto_add"),
("admin", "0003_logentry_add_action_flag_choices"),
("sessions", "0001_initial"),
("core", "0001_initial"),
("core", "0002_auto_20200625_1521"),
("core", "0003_auto_20200630_1034"),
("core", "0004_auto_20200713_1552"),
("core", "0005_auto_20200728_0326"),
("core", "0006_auto_20201012_1520"),
("core", "0007_archiveresult"),
("core", "0008_auto_20210105_1421"),
("core", "0009_auto_20210216_1038"),
("core", "0010_auto_20210216_1055"),
("core", "0011_auto_20210216_1331"),
("core", "0012_auto_20210216_1425"),
("core", "0013_auto_20210218_0729"),
("core", "0014_auto_20210218_0729"),
("core", "0015_auto_20210218_0730"),
("core", "0016_auto_20210218_1204"),
("core", "0017_auto_20210219_0211"),
("core", "0018_auto_20210327_0952"),
("core", "0019_auto_20210401_0654"),
("core", "0020_auto_20210410_1031"),
("core", "0021_auto_20220914_0934"),
("core", "0022_auto_20231023_2008"),
# For 0.8.x (dev branch), record the migrations that 0023_new_schema replaces
('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
('core', '0024_auto_20240513_1143'),
('core', '0025_alter_archiveresult_uuid'),
('core', '0026_archiveresult_created_archiveresult_created_by_and_more'),
('core', '0027_update_snapshot_ids'),
('core', '0028_alter_archiveresult_uuid'),
('core', '0029_alter_archiveresult_id'),
('core', '0030_alter_archiveresult_uuid'),
('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'),
('core', '0032_alter_archiveresult_id'),
('core', '0033_rename_id_archiveresult_old_id'),
('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'),
('core', '0035_remove_archiveresult_uuid_archiveresult_id'),
('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'),
('core', '0037_rename_id_snapshot_old_id'),
('core', '0038_rename_uuid_snapshot_id'),
('core', '0039_rename_snapshot_archiveresult_snapshot_old'),
('core', '0040_archiveresult_snapshot'),
('core', '0041_alter_archiveresult_snapshot_and_more'),
('core', '0042_remove_archiveresult_snapshot_old'),
('core', '0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'),
('core', '0045_alter_snapshot_old_id'),
('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
('core', '0047_alter_snapshottag_unique_together_and_more'),
('core', '0048_alter_archiveresult_snapshot_and_more'),
('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'),
('core', '0050_alter_snapshottag_snapshot_old'),
('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'),
('core', '0052_alter_snapshottag_unique_together_and_more'),
('core', '0053_remove_snapshottag_snapshot_old'),
('core', '0054_alter_snapshot_timestamp'),
('core', '0055_alter_tag_slug'),
('core', '0056_remove_tag_uuid'),
('core', '0057_rename_id_tag_old_id'),
('core', '0058_alter_tag_old_id'),
('core', '0059_tag_id'),
('core', '0060_alter_tag_id'),
('core', '0061_rename_tag_snapshottag_old_tag_and_more'),
('core', '0062_alter_snapshottag_old_tag'),
('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'),
('core', '0064_alter_snapshottag_unique_together_and_more'),
('core', '0065_remove_snapshottag_old_tag'),
('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'),
('core', '0067_alter_snapshottag_tag'),
('core', '0068_alter_archiveresult_options'),
('core', '0069_alter_archiveresult_created_alter_snapshot_added_and_more'),
('core', '0070_alter_archiveresult_created_by_alter_snapshot_added_and_more'),
('core', '0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more'),
('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
('core', '0073_rename_created_archiveresult_created_at_and_more'),
('core', '0074_alter_snapshot_downloaded_at'),
("core", "0023_alter_archiveresult_options_archiveresult_abid_and_more"),
("core", "0024_auto_20240513_1143"),
("core", "0025_alter_archiveresult_uuid"),
("core", "0026_archiveresult_created_archiveresult_created_by_and_more"),
("core", "0027_update_snapshot_ids"),
("core", "0028_alter_archiveresult_uuid"),
("core", "0029_alter_archiveresult_id"),
("core", "0030_alter_archiveresult_uuid"),
("core", "0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more"),
("core", "0032_alter_archiveresult_id"),
("core", "0033_rename_id_archiveresult_old_id"),
("core", "0034_alter_archiveresult_old_id_alter_archiveresult_uuid"),
("core", "0035_remove_archiveresult_uuid_archiveresult_id"),
("core", "0036_alter_archiveresult_id_alter_archiveresult_old_id"),
("core", "0037_rename_id_snapshot_old_id"),
("core", "0038_rename_uuid_snapshot_id"),
("core", "0039_rename_snapshot_archiveresult_snapshot_old"),
("core", "0040_archiveresult_snapshot"),
("core", "0041_alter_archiveresult_snapshot_and_more"),
("core", "0042_remove_archiveresult_snapshot_old"),
("core", "0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more"),
("core", "0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more"),
("core", "0045_alter_snapshot_old_id"),
("core", "0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more"),
("core", "0047_alter_snapshottag_unique_together_and_more"),
("core", "0048_alter_archiveresult_snapshot_and_more"),
("core", "0049_rename_snapshot_snapshottag_snapshot_old_and_more"),
("core", "0050_alter_snapshottag_snapshot_old"),
("core", "0051_snapshottag_snapshot_alter_snapshottag_snapshot_old"),
("core", "0052_alter_snapshottag_unique_together_and_more"),
("core", "0053_remove_snapshottag_snapshot_old"),
("core", "0054_alter_snapshot_timestamp"),
("core", "0055_alter_tag_slug"),
("core", "0056_remove_tag_uuid"),
("core", "0057_rename_id_tag_old_id"),
("core", "0058_alter_tag_old_id"),
("core", "0059_tag_id"),
("core", "0060_alter_tag_id"),
("core", "0061_rename_tag_snapshottag_old_tag_and_more"),
("core", "0062_alter_snapshottag_old_tag"),
("core", "0063_snapshottag_tag_alter_snapshottag_old_tag"),
("core", "0064_alter_snapshottag_unique_together_and_more"),
("core", "0065_remove_snapshottag_old_tag"),
("core", "0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id"),
("core", "0067_alter_snapshottag_tag"),
("core", "0068_alter_archiveresult_options"),
("core", "0069_alter_archiveresult_created_alter_snapshot_added_and_more"),
("core", "0070_alter_archiveresult_created_by_alter_snapshot_added_and_more"),
("core", "0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more"),
("core", "0072_rename_added_snapshot_bookmarked_at_and_more"),
("core", "0073_rename_created_archiveresult_created_at_and_more"),
("core", "0074_alter_snapshot_downloaded_at"),
# For 0.8.x: DO NOT record 0023_new_schema - it replaces 0023-0074 for fresh installs
# We already recorded 0023-0074 above, so Django will know the state
# For 0.8.x: Record original machine migrations (before squashing)
# DO NOT record 0001_squashed here - it replaces 0001-0004 for fresh installs
('machine', '0001_initial'),
('machine', '0002_alter_machine_stats_installedbinary'),
('machine', '0003_alter_installedbinary_options_and_more'),
('machine', '0004_alter_installedbinary_abspath_and_more'),
("machine", "0001_initial"),
("machine", "0002_alter_machine_stats_installedbinary"),
("machine", "0003_alter_installedbinary_options_and_more"),
("machine", "0004_alter_installedbinary_abspath_and_more"),
# Then the new migrations after squashing
('machine', '0002_rename_custom_cmds_to_overrides'),
('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
('machine', '0004_drop_dependency_table'),
("machine", "0002_rename_custom_cmds_to_overrides"),
("machine", "0003_alter_dependency_id_alter_installedbinary_dependency_and_more"),
("machine", "0004_drop_dependency_table"),
# Crawls must come before core.0024 because 0024_b depends on it
('crawls', '0001_initial'),
("crawls", "0001_initial"),
# Core 0024 migrations chain (in dependency order)
('core', '0024_b_clear_config_fields'),
('core', '0024_c_disable_fk_checks'),
('core', '0024_d_fix_crawls_config'),
('core', '0024_snapshot_crawl'),
('core', '0024_f_add_snapshot_config'),
('core', '0025_allow_duplicate_urls_per_crawl'),
("core", "0024_b_clear_config_fields"),
("core", "0024_c_disable_fk_checks"),
("core", "0024_d_fix_crawls_config"),
("core", "0024_snapshot_crawl"),
("core", "0024_f_add_snapshot_config"),
("core", "0025_allow_duplicate_urls_per_crawl"),
# For 0.8.x: Record original api migration (before squashing)
# DO NOT record 0001_squashed here - it replaces 0001 for fresh installs
('api', '0001_initial'),
('api', '0002_alter_apitoken_options'),
('api', '0003_rename_user_apitoken_created_by_apitoken_abid_and_more'),
('api', '0004_alter_apitoken_id_alter_apitoken_uuid'),
('api', '0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more'),
('api', '0006_remove_outboundwebhook_uuid_apitoken_id_and_more'),
('api', '0007_alter_apitoken_created_by'),
('api', '0008_alter_apitoken_created_alter_apitoken_created_by_and_more'),
('api', '0009_rename_created_apitoken_created_at_and_more'),
("api", "0001_initial"),
("api", "0002_alter_apitoken_options"),
("api", "0003_rename_user_apitoken_created_by_apitoken_abid_and_more"),
("api", "0004_alter_apitoken_id_alter_apitoken_uuid"),
("api", "0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more"),
("api", "0006_remove_outboundwebhook_uuid_apitoken_id_and_more"),
("api", "0007_alter_apitoken_created_by"),
("api", "0008_alter_apitoken_created_alter_apitoken_created_by_and_more"),
("api", "0009_rename_created_apitoken_created_at_and_more"),
# Note: crawls.0001_initial moved earlier (before core.0024) due to dependencies
# Stop here - 0.8.x ends at core.0025, crawls.0001, and we want to TEST the later migrations
# Do NOT record 0026+ as they need to be tested during migration
]
for app, name in migrations:
cursor.execute("""
cursor.execute(
"""
INSERT INTO django_migrations (app, name, applied)
VALUES (?, ?, datetime('now'))
""", (app, name))
""",
(app, name),
)
conn.commit()
conn.close()
@@ -967,33 +1020,34 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
# Helper Functions
# =============================================================================
def run_archivebox(data_dir: Path, args: list, timeout: int = 60, env: dict | None = None) -> subprocess.CompletedProcess:
"""Run archivebox command in subprocess with given data directory."""
base_env = os.environ.copy()
base_env['DATA_DIR'] = str(data_dir)
base_env['USE_COLOR'] = 'False'
base_env['SHOW_PROGRESS'] = 'False'
base_env["DATA_DIR"] = str(data_dir)
base_env["USE_COLOR"] = "False"
base_env["SHOW_PROGRESS"] = "False"
# Disable ALL extractors for faster tests (can be overridden by env parameter)
base_env['SAVE_ARCHIVEDOTORG'] = 'False'
base_env['SAVE_TITLE'] = 'False'
base_env['SAVE_FAVICON'] = 'False'
base_env['SAVE_WGET'] = 'False'
base_env['SAVE_SINGLEFILE'] = 'False'
base_env['SAVE_SCREENSHOT'] = 'False'
base_env['SAVE_PDF'] = 'False'
base_env['SAVE_DOM'] = 'False'
base_env['SAVE_READABILITY'] = 'False'
base_env['SAVE_MERCURY'] = 'False'
base_env['SAVE_GIT'] = 'False'
base_env['SAVE_YTDLP'] = 'False'
base_env['SAVE_HEADERS'] = 'False'
base_env['SAVE_HTMLTOTEXT'] = 'False'
base_env["SAVE_ARCHIVEDOTORG"] = "False"
base_env["SAVE_TITLE"] = "False"
base_env["SAVE_FAVICON"] = "False"
base_env["SAVE_WGET"] = "False"
base_env["SAVE_SINGLEFILE"] = "False"
base_env["SAVE_SCREENSHOT"] = "False"
base_env["SAVE_PDF"] = "False"
base_env["SAVE_DOM"] = "False"
base_env["SAVE_READABILITY"] = "False"
base_env["SAVE_MERCURY"] = "False"
base_env["SAVE_GIT"] = "False"
base_env["SAVE_YTDLP"] = "False"
base_env["SAVE_HEADERS"] = "False"
base_env["SAVE_HTMLTOTEXT"] = "False"
# Override with any custom env vars
if env:
base_env.update(env)
cmd = [sys.executable, '-m', 'archivebox'] + args
cmd = [sys.executable, "-m", "archivebox"] + args
return subprocess.run(
cmd,
@@ -1007,12 +1061,12 @@ def run_archivebox(data_dir: Path, args: list, timeout: int = 60, env: dict | No
def create_data_dir_structure(data_dir: Path):
    """Ensure the ``archive``, ``sources`` and ``logs`` directories exist under *data_dir*.

    Missing parent directories are created as needed, and directories that
    already exist are left as they are.
    """
    for subdir_name in ("archive", "sources", "logs"):
        (data_dir / subdir_name).mkdir(parents=True, exist_ok=True)
def verify_snapshot_count(db_path: Path, expected: int) -> Tuple[bool, str]:
def verify_snapshot_count(db_path: Path, expected: int) -> tuple[bool, str]:
"""Verify the number of snapshots in the database."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
@@ -1025,7 +1079,7 @@ def verify_snapshot_count(db_path: Path, expected: int) -> Tuple[bool, str]:
return False, f"Snapshot count mismatch: expected {expected}, got {count}"
def verify_tag_count(db_path: Path, expected: int) -> Tuple[bool, str]:
def verify_tag_count(db_path: Path, expected: int) -> tuple[bool, str]:
"""Verify the number of tags in the database (exact match)."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
@@ -1038,7 +1092,7 @@ def verify_tag_count(db_path: Path, expected: int) -> Tuple[bool, str]:
return False, f"Tag count mismatch: expected {expected}, got {count}"
def verify_archiveresult_count(db_path: Path, expected: int) -> Tuple[bool, str]:
def verify_archiveresult_count(db_path: Path, expected: int) -> tuple[bool, str]:
"""Verify the number of archive results in the database."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
@@ -1051,7 +1105,7 @@ def verify_archiveresult_count(db_path: Path, expected: int) -> Tuple[bool, str]
return False, f"ArchiveResult count mismatch: expected {expected}, got {count}"
def verify_snapshot_urls(db_path: Path, expected_urls: List[str]) -> Tuple[bool, str]:
def verify_snapshot_urls(db_path: Path, expected_urls: list[str]) -> tuple[bool, str]:
"""Verify ALL expected URLs exist in snapshots."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
@@ -1065,7 +1119,7 @@ def verify_snapshot_urls(db_path: Path, expected_urls: List[str]) -> Tuple[bool,
return False, f"Missing URLs: {missing}"
def verify_snapshot_titles(db_path: Path, expected_titles: Dict[str, str]) -> Tuple[bool, str]:
def verify_snapshot_titles(db_path: Path, expected_titles: dict[str, str]) -> tuple[bool, str]:
"""Verify ALL snapshot titles are preserved."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
@@ -1085,7 +1139,7 @@ def verify_snapshot_titles(db_path: Path, expected_titles: Dict[str, str]) -> Tu
return False, f"Title mismatches: {mismatches}"
def verify_foreign_keys(db_path: Path) -> Tuple[bool, str]:
def verify_foreign_keys(db_path: Path) -> tuple[bool, str]:
"""Verify foreign key relationships are intact."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
@@ -1104,21 +1158,21 @@ def verify_foreign_keys(db_path: Path) -> Tuple[bool, str]:
return False, f"Found {orphaned_results} orphaned ArchiveResults"
def verify_all_snapshots_in_output(output: str, snapshots: list[dict]) -> tuple[bool, str]:
    """Check that every snapshot is mentioned somewhere in *output*.

    A snapshot counts as present when either the first 30 characters of its
    ``url`` or its non-empty ``title`` occurs in ``output``.

    Returns:
        ``(True, message)`` when nothing is missing, otherwise
        ``(False, message)`` where the message lists the URLs not found.
    """

    def _is_mentioned(entry: dict) -> bool:
        # A URL prefix match is sufficient on its own.
        if entry["url"][:30] in output:
            return True
        # Otherwise fall back to the title, but only when one is set.
        label = entry.get("title", "")
        return bool(label) and label in output

    absent = [entry["url"] for entry in snapshots if not _is_mentioned(entry)]
    if absent:
        return False, f"Missing snapshots in output: {absent}"
    return True, "All snapshots found in output"
def verify_crawl_count(db_path: Path, expected: int) -> Tuple[bool, str]:
def verify_crawl_count(db_path: Path, expected: int) -> tuple[bool, str]:
"""Verify the number of crawls in the database."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
@@ -1131,7 +1185,7 @@ def verify_crawl_count(db_path: Path, expected: int) -> Tuple[bool, str]:
return False, f"Crawl count mismatch: expected {expected}, got {count}"
def verify_process_migration(db_path: Path, expected_archiveresult_count: int) -> Tuple[bool, str]:
def verify_process_migration(db_path: Path, expected_archiveresult_count: int) -> tuple[bool, str]:
"""
Verify that ArchiveResults were properly migrated to Process records.
@@ -1170,13 +1224,13 @@ def verify_process_migration(db_path: Path, expected_archiveresult_count: int) -
status_errors = []
for ar_status, p_status, p_exit_code in cursor.fetchall():
expected_p_status, expected_exit_code = {
'queued': ('queued', None),
'started': ('running', None),
'backoff': ('queued', None),
'succeeded': ('exited', 0),
'failed': ('exited', 1),
'skipped': ('exited', None),
}.get(ar_status, ('queued', None))
"queued": ("queued", None),
"started": ("running", None),
"backoff": ("queued", None),
"succeeded": ("exited", 0),
"failed": ("exited", 1),
"skipped": ("exited", None),
}.get(ar_status, ("queued", None))
if p_status != expected_p_status:
status_errors.append(f"AR status {ar_status} → Process {p_status}, expected {expected_p_status}")

View File

@@ -12,48 +12,50 @@ from archivebox.crawls.models import Crawl
# Apply the django_db marker to every test in this module.
pytestmark = pytest.mark.django_db
# User model class used by the admin_user fixture to create a superuser.
User = get_user_model()
# Host names passed as HTTP_HOST by the tests in this module.
WEB_HOST = 'web.archivebox.localhost:8000'
ADMIN_HOST = 'admin.archivebox.localhost:8000'
WEB_HOST = "web.archivebox.localhost:8000"
ADMIN_HOST = "admin.archivebox.localhost:8000"
@pytest.fixture
def admin_user(db):
    """Create and return the superuser account used by the tests in this module."""
    credentials = {
        "username": "addviewadmin",
        "email": "addviewadmin@test.com",
        "password": "testpassword",
    }
    return User.objects.create_superuser(**credentials)
def test_add_view_renders_tag_editor_and_url_filter_fields(client, admin_user, monkeypatch):
    """GET the add page with PUBLIC_ADD_VIEW enabled and check the rendered form markup.

    The client is not logged in here; ``admin_user`` is only requested as a fixture.
    """
    monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)

    page = client.get(reverse("add"), HTTP_HOST=WEB_HOST)
    markup = page.content.decode()
    assert page.status_code == 200

    # (fragment, expected presence) pairs, checked in this order.
    form_fragments = (
        ("tag-editor-container", True),
        ('name="url_filters_allowlist"', True),
        ('name="url_filters_denylist"', True),
        ("Same domain only", True),
        ('name="persona"', True),
        ("Overwrite existing snapshots", False),
        ("Update/retry previously failed URLs", False),
        ("Index only dry run (add crawl but don&#x27;t archive yet)", True),
        ('name="notes"', True),
        ('name="max_urls"', True),
        ('name="max_size"', True),
        ('<input type="text" name="notes"', True),
    )
    for fragment, expected in form_fragments:
        assert (fragment in markup) is expected, fragment

    # The persona field must be rendered before the "Crawl Plugins" heading.
    persona_pos = markup.index('name="persona"')
    plugins_pos = markup.index("<h3>Crawl Plugins</h3>")
    assert persona_pos < plugins_pos

    url_detection_fragments = (
        "data-url-regex=",
        'id="url-highlight-layer"',
        'id="detected-urls-list"',
        "detected-url-toggle-btn",
    )
    for fragment in url_detection_fragments:
        assert fragment in markup, fragment
def test_add_view_checks_configured_search_backend_by_default(client, monkeypatch):
monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
monkeypatch.setattr(SEARCH_BACKEND_CONFIG, 'SEARCH_BACKEND_ENGINE', 'sqlite')
monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
monkeypatch.setattr(SEARCH_BACKEND_CONFIG, "SEARCH_BACKEND_ENGINE", "sqlite")
response = client.get(reverse('add'), HTTP_HOST=WEB_HOST)
response = client.get(reverse("add"), HTTP_HOST=WEB_HOST)
body = response.content.decode()
assert response.status_code == 200
@@ -65,99 +67,181 @@ def test_add_view_checks_configured_search_backend_by_default(client, monkeypatc
def test_add_view_creates_crawl_with_tag_and_url_filter_overrides(client, admin_user, monkeypatch):
    """POST the add form as a superuser and check the newest Crawl stores the submitted overrides."""
    monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
    client.force_login(admin_user)

    allowlist = "example.com\n*.example.com"
    denylist = "cdn.example.com"
    note_text = "Created from /add/"
    # The form submits "45mb"; the assertions below expect it stored as a byte count.
    size_limit_bytes = 45 * 1024 * 1024

    form_payload = {
        "url": "https://example.com\nhttps://cdn.example.com/asset.js",
        "tag": "alpha,beta",
        "depth": "1",
        "max_urls": "3",
        "max_size": "45mb",
        "url_filters_allowlist": allowlist,
        "url_filters_denylist": denylist,
        "notes": note_text,
        "schedule": "",
        "persona": "Default",
        "index_only": "",
        "config": "{}",
    }
    reply = client.post(reverse("add"), data=form_payload, HTTP_HOST=WEB_HOST)

    assert reply.status_code == 302

    newest_crawl = Crawl.objects.order_by("-created_at").first()
    assert newest_crawl is not None
    assert newest_crawl.tags_str == "alpha,beta"
    assert newest_crawl.notes == note_text
    assert newest_crawl.max_urls == 3
    assert newest_crawl.max_size == size_limit_bytes

    stored_config = newest_crawl.config
    assert stored_config.get("DEFAULT_PERSONA") == "Default"
    assert stored_config["MAX_URLS"] == 3
    assert stored_config["MAX_SIZE"] == size_limit_bytes
    assert stored_config["URL_ALLOWLIST"] == allowlist
    assert stored_config["URL_DENYLIST"] == denylist
    # These keys must not be written into the crawl config.
    for absent_key in ("OVERWRITE", "ONLY_NEW"):
        assert absent_key not in stored_config
def test_add_view_starts_background_runner_after_creating_crawl(client, admin_user, monkeypatch):
    """POST a minimal add form and check ``ensure_background_runner`` is invoked exactly once."""
    monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
    client.force_login(admin_user)

    runner_calls = []

    def _record_runner_start():
        # Stand-in for the real runner starter: note the call and report success.
        runner_calls.append(True)
        return True

    monkeypatch.setattr("archivebox.services.runner.ensure_background_runner", _record_runner_start)

    form_payload = {
        "url": "https://example.com",
        "tag": "",
        "depth": "0",
        "max_urls": "0",
        "max_size": "0",
        "url_filters_allowlist": "",
        "url_filters_denylist": "",
        "notes": "",
        "schedule": "",
        "persona": "Default",
        "index_only": "",
        "config": "{}",
    }
    reply = client.post(reverse("add"), data=form_payload, HTTP_HOST=WEB_HOST)

    assert reply.status_code == 302
    assert runner_calls == [True]
def test_add_view_extracts_urls_from_mixed_text_input(client, admin_user, monkeypatch):
    """POST free-form text (CSV, markdown, JSON) and check the Crawl stores one URL per line."""
    monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
    client.force_login(admin_user)

    # Input mixing comma-separated URLs, markdown links, a JSON snippet and a CSV row.
    pasted_lines = [
        "https://sweeting.me,https://google.com",
        "Notes: [ArchiveBox](https://github.com/ArchiveBox/ArchiveBox), https://news.ycombinator.com",
        "[Wiki](https://en.wikipedia.org/wiki/Classification_(machine_learning))",
        '{"items":["https://example.com/three"]}',
        "csv,https://example.com/four",
    ]
    form_payload = {
        "url": "\n".join(pasted_lines),
        "tag": "",
        "depth": "0",
        "max_urls": "0",
        "max_size": "0",
        "url_filters_allowlist": "",
        "url_filters_denylist": "",
        "notes": "",
        "schedule": "",
        "persona": "Default",
        "index_only": "",
        "config": "{}",
    }
    reply = client.post(reverse("add"), data=form_payload, HTTP_HOST=WEB_HOST)

    assert reply.status_code == 302

    newest_crawl = Crawl.objects.order_by("-created_at").first()
    assert newest_crawl is not None

    expected_urls = [
        "https://sweeting.me",
        "https://google.com",
        "https://github.com/ArchiveBox/ArchiveBox",
        "https://news.ycombinator.com",
        "https://en.wikipedia.org/wiki/Classification_(machine_learning)",
        "https://example.com/three",
        "https://example.com/four",
    ]
    assert newest_crawl.urls == "\n".join(expected_urls)
def test_add_view_trims_trailing_punctuation_from_markdown_urls(client, admin_user, monkeypatch):
    """POST URLs followed by sentence punctuation and check the stored URLs have it stripped."""
    monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
    client.force_login(admin_user)

    pasted_lines = [
        "Docs: https://github.com/ArchiveBox/ArchiveBox.",
        "Issue: https://github.com/abc?abc#234234?.",
    ]
    form_payload = {
        "url": "\n".join(pasted_lines),
        "tag": "",
        "depth": "0",
        "max_urls": "0",
        "max_size": "0",
        "url_filters_allowlist": "",
        "url_filters_denylist": "",
        "notes": "",
        "schedule": "",
        "persona": "Default",
        "index_only": "",
        "config": "{}",
    }
    reply = client.post(reverse("add"), data=form_payload, HTTP_HOST=WEB_HOST)

    assert reply.status_code == 302

    newest_crawl = Crawl.objects.order_by("-created_at").first()
    assert newest_crawl is not None

    expected_urls = [
        "https://github.com/ArchiveBox/ArchiveBox",
        "https://github.com/abc?abc#234234",
    ]
    assert newest_crawl.urls == "\n".join(expected_urls)
def test_add_view_exposes_api_token_for_tag_widget_autocomplete(client, admin_user, monkeypatch):
    """GET the add page as a logged-in superuser and check the API key global is in the response."""
    monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
    client.force_login(admin_user)

    add_url = reverse("add")
    page = client.get(add_url, HTTP_HOST=WEB_HOST)

    assert page.status_code == 200
    raw_body = page.content
    assert b"window.ARCHIVEBOX_API_KEY" in raw_body
def test_tags_autocomplete_requires_auth_when_public_snapshots_list_disabled(client, settings):
settings.PUBLIC_SNAPSHOTS_LIST = False
settings.PUBLIC_INDEX = False
Tag.objects.create(name='archive')
Tag.objects.create(name="archive")
response = client.get(
reverse('api-1:tags_autocomplete'),
{'q': 'a'},
reverse("api-1:tags_autocomplete"),
{"q": "a"},
HTTP_HOST=ADMIN_HOST,
)
@@ -167,29 +251,29 @@ def test_tags_autocomplete_requires_auth_when_public_snapshots_list_disabled(cli
def test_tags_autocomplete_allows_public_access_when_public_snapshots_list_enabled(client, settings):
    """Query tag autocomplete without logging in while PUBLIC_SNAPSHOTS_LIST is True; expect a 200 with the tag."""
    settings.PUBLIC_SNAPSHOTS_LIST = True
    settings.PUBLIC_INDEX = False
    Tag.objects.create(name="archive")

    endpoint = reverse("api-1:tags_autocomplete")
    reply = client.get(endpoint, {"q": "a"}, HTTP_HOST=ADMIN_HOST)

    assert reply.status_code == 200
    first_match = reply.json()["tags"][0]
    assert first_match["name"] == "archive"
def test_tags_autocomplete_allows_authenticated_user_when_public_snapshots_list_disabled(client, admin_user, settings):
    """Query tag autocomplete as a logged-in superuser with public listing off; expect a 200 with the tag."""
    settings.PUBLIC_SNAPSHOTS_LIST = False
    settings.PUBLIC_INDEX = False
    Tag.objects.create(name="archive")
    client.force_login(admin_user)

    endpoint = reverse("api-1:tags_autocomplete")
    reply = client.get(endpoint, {"q": "a"}, HTTP_HOST=ADMIN_HOST)

    assert reply.status_code == 200
    first_match = reply.json()["tags"][0]
    assert first_match["name"] == "archive"

View File

@@ -4,83 +4,83 @@ from archivebox.base_models.admin import KeyValueWidget
def test_key_value_widget_renders_enum_autocomplete_metadata(monkeypatch):
    """Render KeyValueWidget with a stubbed enum option and check the emitted metadata and JS hook names."""

    def _stub_config_options(self):
        # Return a fresh dict on every call so no state is shared between calls.
        return {
            "CHROME_WAIT_FOR": {
                "plugin": "chrome",
                "type": "string",
                "default": "networkidle2",
                "description": "Page load completion condition",
                "enum": ["domcontentloaded", "load", "networkidle0", "networkidle2"],
            },
        }

    monkeypatch.setattr(KeyValueWidget, "_get_config_options", _stub_config_options)

    widget = KeyValueWidget()
    rendered = str(widget.render("config", {"CHROME_WAIT_FOR": "load"}, attrs={"id": "id_config"}))

    expected_fragments = (
        '"enum": ["domcontentloaded", "load", "networkidle0", "networkidle2"]',
        'class="kv-value-options"',
        'class="kv-help"',
        "configureValueInput_id_config",
        "describeMeta_id_config",
        "validateValueAgainstMeta_id_config",
    )
    for fragment in expected_fragments:
        assert fragment in rendered, fragment
def test_key_value_widget_renders_numeric_and_pattern_constraints(monkeypatch):
    """Render KeyValueWidget with stubbed integer and pattern options and check the constraints appear in the HTML."""

    def _stub_config_options(self):
        # Return a fresh dict on every call so no state is shared between calls.
        return {
            "TIMEOUT": {
                "plugin": "base",
                "type": "integer",
                "default": 60,
                "description": "Timeout in seconds",
                "minimum": 5,
                "maximum": 120,
            },
            "CHROME_RESOLUTION": {
                "plugin": "chrome",
                "type": "string",
                "default": "1440,2000",
                "description": "Viewport resolution",
                "pattern": "^\\d+,\\d+$",
            },
        }

    monkeypatch.setattr(KeyValueWidget, "_get_config_options", _stub_config_options)

    widget = KeyValueWidget()
    rendered = str(widget.render("config", {}, attrs={"id": "id_config"}))

    expected_fragments = (
        '"minimum": 5',
        '"maximum": 120',
        # The asserted fragment has each backslash of the pattern doubled.
        '"pattern": "^\\\\d+,\\\\d+$"',
        "Expected: ",
        "Example: ",
        "setValueValidationState_id_config",
        "coerceValueForStorage_id_config",
    )
    for fragment in expected_fragments:
        assert fragment in rendered, fragment
def test_key_value_widget_accepts_common_boolean_spellings(monkeypatch):
    """Check that a boolean option renders True/False choices and lowercases input.

    ``KeyValueWidget._get_config_options`` is patched to expose a single
    boolean ``DEBUG`` option, and the widget is rendered with ``DEBUG`` set to
    the string ``"True"``.
    """
    monkeypatch.setattr(
        KeyValueWidget,
        "_get_config_options",
        lambda self: {
            "DEBUG": {
                "plugin": "base",
                "type": "boolean",
                "default": False,
                "description": "Enable debug mode",
            },
        },
    )

    html = str(KeyValueWidget().render("config", {"DEBUG": "True"}, attrs={"id": "id_config"}))

    assert "enumValues = ['True', 'False']" in html
    # presumably the emitted script normalizes case before comparing -- only the
    # presence of the call is checked here.
    assert "raw.toLowerCase()" in html
@@ -91,35 +91,35 @@ def test_key_value_widget_accepts_common_boolean_spellings(monkeypatch):
def test_key_value_widget_shows_array_and_object_examples_and_binary_rules(monkeypatch):
    """Check the example hints for array/object options and the binary-path rules.

    ``KeyValueWidget._get_config_options`` is patched to return an array
    option, an object option and a ``*_BINARY`` string option; the rendered
    HTML must contain an example for each plus the binary validation snippets.
    """
    monkeypatch.setattr(
        KeyValueWidget,
        "_get_config_options",
        lambda self: {
            "WGET_ARGS_EXTRA": {
                "plugin": "wget",
                "type": "array",
                "default": [],
                "description": "Extra arguments to append to wget command",
            },
            "SAVE_ALLOWLIST": {
                "plugin": "base",
                "type": "object",
                "default": {},
                "description": "Regex allowlist mapped to enabled methods",
            },
            "WGET_BINARY": {
                "plugin": "wget",
                "type": "string",
                "default": "wget",
                "description": "Path to wget binary",
            },
        },
    )

    html = str(KeyValueWidget().render("config", {}, attrs={"id": "id_config"}))

    assert 'Example: ["--extra-arg"]' in html
    assert 'Example: {"^https://example\\\\.com": ["wget"]}' in html
    assert "Example: wget or /usr/bin/wget" in html
    assert "validateBinaryValue_id_config" in html
    assert "meta.key.endsWith('_BINARY')" in html
    assert "Binary paths cannot contain quotes" in html
@@ -127,25 +127,25 @@ def test_key_value_widget_shows_array_and_object_examples_and_binary_rules(monke
def test_key_value_widget_falls_back_to_binary_validation_for_unknown_binary_keys(monkeypatch):
    """Check that a ``*_BINARY`` key missing from the schema still gets binary handling.

    Only ``CHROME_BINARY`` is declared via the patched
    ``KeyValueWidget._get_config_options``; the widget is rendered with an
    undeclared ``NODE_BINARY`` value and the HTML must include the suffix-based
    fallback lookup and its generic description.
    """
    monkeypatch.setattr(
        KeyValueWidget,
        "_get_config_options",
        lambda self: {
            "CHROME_BINARY": {
                "plugin": "base",
                "type": "string",
                "default": "",
                "description": "Resolved Chromium/Chrome binary path shared across plugins",
            },
        },
    )

    html = str(
        KeyValueWidget().render(
            "config",
            {"NODE_BINARY": "/opt/homebrew/bin/node"},
            attrs={"id": "id_config"},
        ),
    )

    assert "function getMetaForKey_id_config" in html
    assert "if (key.endsWith('_BINARY'))" in html
    assert "Path to binary executable" in html

View File

@@ -1,5 +1,8 @@
import pytest
from django.contrib.admin.sites import AdminSite
from django.test import RequestFactory
from django.urls import reverse
import html
from uuid import uuid4
@@ -26,18 +29,18 @@ def _create_machine():
from archivebox.machine.models import Machine
return Machine.objects.create(
guid=f'test-guid-{uuid4()}',
hostname='test-host',
guid=f"test-guid-{uuid4()}",
hostname="test-host",
hw_in_docker=False,
hw_in_vm=False,
hw_manufacturer='Test',
hw_product='Test Product',
hw_uuid=f'test-hw-{uuid4()}',
os_arch='arm64',
os_family='darwin',
os_platform='macOS',
os_release='14.0',
os_kernel='Darwin',
hw_manufacturer="Test",
hw_product="Test Product",
hw_uuid=f"test-hw-{uuid4()}",
os_arch="arm64",
os_family="darwin",
os_platform="macOS",
os_release="14.0",
os_kernel="Darwin",
stats={},
config={},
)
@@ -48,16 +51,16 @@ def _create_iface(machine):
return NetworkInterface.objects.create(
machine=machine,
mac_address='00:11:22:33:44:66',
ip_public='203.0.113.11',
ip_local='10.0.0.11',
dns_server='1.1.1.1',
hostname='test-host',
iface='en0',
isp='Test ISP',
city='Test City',
region='Test Region',
country='Test Country',
mac_address="00:11:22:33:44:66",
ip_public="203.0.113.11",
ip_local="10.0.0.11",
dns_server="1.1.1.1",
hostname="test-host",
iface="en0",
isp="Test ISP",
city="Test City",
region="Test Region",
country="Test Country",
)
@@ -72,14 +75,14 @@ def test_archiveresult_admin_links_plugin_and_process():
machine=iface.machine,
iface=iface,
process_type=Process.TypeChoices.HOOK,
pwd=str(snapshot.output_dir / 'wget'),
cmd=['/tmp/on_Snapshot__06_wget.finite.bg.py', '--url=https://example.com'],
pwd=str(snapshot.output_dir / "wget"),
cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
status=Process.StatusChoices.EXITED,
)
result = ArchiveResult.objects.create(
snapshot=snapshot,
plugin='wget',
hook_name='on_Snapshot__06_wget.finite.bg.py',
plugin="wget",
hook_name="on_Snapshot__06_wget.finite.bg.py",
process=process,
status=ArchiveResult.StatusChoices.SUCCEEDED,
)
@@ -89,8 +92,85 @@ def test_archiveresult_admin_links_plugin_and_process():
plugin_html = str(admin.plugin_with_icon(result))
process_html = str(admin.process_link(result))
assert '/admin/environment/plugins/builtin.wget/' in plugin_html
assert f'/admin/machine/process/{process.id}/change' in process_html
assert "/admin/environment/plugins/builtin.wget/" in plugin_html
assert f"/admin/machine/process/{process.id}/change" in process_html
def test_snapshot_admin_zip_links():
    """Check which SnapshotAdmin columns embed the snapshot ZIP download URL.

    The HTML-escaped URL from ``get_snapshot_zip_url`` must be absent from the
    ``files`` column and present in ``size_with_stats`` and ``admin_actions``.
    """
    from archivebox.core.admin_snapshots import SnapshotAdmin
    from archivebox.core.models import Snapshot

    snapshot = _create_snapshot()
    snapshot_admin = SnapshotAdmin(Snapshot, AdminSite())

    # Escape once; the rendered columns contain the attribute-escaped form.
    escaped_zip_url = html.escape(snapshot_admin.get_snapshot_zip_url(snapshot), quote=True)

    files_html = str(snapshot_admin.files(snapshot))
    assert escaped_zip_url not in files_html

    size_html = str(snapshot_admin.size_with_stats(snapshot))
    assert escaped_zip_url in size_html

    actions_html = str(snapshot_admin.admin_actions(snapshot))
    assert escaped_zip_url in actions_html
def test_archiveresult_admin_zip_links():
    """Check that ArchiveResultAdmin exposes the output ZIP URL in both link columns.

    The HTML-escaped URL from ``get_output_zip_url`` must appear in the
    ``zip_link`` column and in ``admin_actions``.
    """
    from archivebox.core.admin_archiveresults import ArchiveResultAdmin
    from archivebox.core.models import ArchiveResult

    archive_result = ArchiveResult.objects.create(
        snapshot=_create_snapshot(),
        plugin="wget",
        hook_name="on_Snapshot__06_wget.finite.bg.py",
        status=ArchiveResult.StatusChoices.SUCCEEDED,
        output_str="Saved output",
    )
    result_admin = ArchiveResultAdmin(ArchiveResult, AdminSite())

    escaped_zip_url = html.escape(result_admin.get_output_zip_url(archive_result), quote=True)

    for rendered in (
        str(result_admin.zip_link(archive_result)),
        str(result_admin.admin_actions(archive_result)),
    ):
        assert escaped_zip_url in rendered
def test_archiveresult_admin_copy_command_redacts_sensitive_env_keys():
    """Check that ``cmd_str`` keeps harmless env vars and drops secret-looking ones.

    The process env holds two ordinary entries plus ``API_KEY``,
    ``ACCESS_TOKEN`` and ``SHARED_SECRET``; neither the names nor the values of
    the latter three may appear in the rendered command.
    """
    from archivebox.core.admin_archiveresults import ArchiveResultAdmin
    from archivebox.core.models import ArchiveResult
    from archivebox.machine.models import Process

    snapshot = _create_snapshot()
    iface = _create_iface(_create_machine())
    process_env = {
        "SOURCE_URL": "https://example.com",
        "SAFE_FLAG": "1",
        "API_KEY": "super-secret-key",
        "ACCESS_TOKEN": "super-secret-token",
        "SHARED_SECRET": "super-secret-secret",
    }
    process = Process.objects.create(
        machine=iface.machine,
        iface=iface,
        process_type=Process.TypeChoices.HOOK,
        pwd=str(snapshot.output_dir / "wget"),
        cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
        env=process_env,
        status=Process.StatusChoices.EXITED,
    )
    result = ArchiveResult.objects.create(
        snapshot=snapshot,
        plugin="wget",
        hook_name="on_Snapshot__06_wget.finite.bg.py",
        process=process,
        status=ArchiveResult.StatusChoices.SUCCEEDED,
    )

    cmd_html = str(ArchiveResultAdmin(ArchiveResult, AdminSite()).cmd_str(result))

    # Non-sensitive variables are rendered as KEY=value.
    for expected in ("SAFE_FLAG=1", "SOURCE_URL=https://example.com"):
        assert expected in cmd_html

    # Sensitive variable names must be gone, then their values.
    for redacted in (
        "API_KEY",
        "ACCESS_TOKEN",
        "SHARED_SECRET",
        "super-secret-key",
        "super-secret-token",
        "super-secret-secret",
    ):
        assert redacted not in cmd_html
def test_process_admin_links_binary_and_iface():
@@ -101,11 +181,11 @@ def test_process_admin_links_binary_and_iface():
iface = _create_iface(machine)
binary = Binary.objects.create(
machine=machine,
name='wget',
abspath='/usr/local/bin/wget',
version='1.21.2',
binprovider='env',
binproviders='env',
name="wget",
abspath="/usr/local/bin/wget",
version="1.21.2",
binprovider="env",
binproviders="env",
status=Binary.StatusChoices.INSTALLED,
)
process = Process.objects.create(
@@ -113,8 +193,8 @@ def test_process_admin_links_binary_and_iface():
iface=iface,
binary=binary,
process_type=Process.TypeChoices.HOOK,
pwd='/tmp/wget',
cmd=['/tmp/on_Snapshot__06_wget.finite.bg.py', '--url=https://example.com'],
pwd="/tmp/wget",
cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
status=Process.StatusChoices.EXITED,
)
@@ -123,5 +203,107 @@ def test_process_admin_links_binary_and_iface():
binary_html = str(admin.binary_link(process))
iface_html = str(admin.iface_link(process))
assert f'/admin/machine/binary/{binary.id}/change' in binary_html
assert f'/admin/machine/networkinterface/{iface.id}/change' in iface_html
assert f"/admin/machine/binary/{binary.id}/change" in binary_html
assert f"/admin/machine/networkinterface/{iface.id}/change" in iface_html
def test_process_admin_kill_actions_only_terminate_running_processes(monkeypatch):
    """Check that the bulk kill action terminates running rows and skips the rest.

    ``Process.is_running`` and ``Process.terminate`` are patched so no real
    process is touched; ``message_user`` is patched to capture flash messages.
    """
    from archivebox.machine.admin import ProcessAdmin
    from archivebox.machine.models import Process

    machine = _create_machine()
    hook_cmd = ["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"]
    running = Process.objects.create(
        machine=machine,
        process_type=Process.TypeChoices.HOOK,
        pwd="/tmp/running",
        cmd=list(hook_cmd),
        status=Process.StatusChoices.RUNNING,
    )
    exited = Process.objects.create(
        machine=machine,
        process_type=Process.TypeChoices.HOOK,
        pwd="/tmp/exited",
        cmd=list(hook_cmd),
        status=Process.StatusChoices.EXITED,
    )

    admin = ProcessAdmin(Process, AdminSite())
    request = RequestFactory().post("/admin/machine/process/")

    terminated = []
    flashed = []

    def fake_is_running(self):
        # Only the row created with RUNNING status reports as alive.
        return self.pk == running.pk

    def fake_terminate(self, graceful_timeout=5.0):
        terminated.append(self.pk)
        return True

    def record_message(req, msg, level=None):
        flashed.append((msg, level))

    monkeypatch.setattr(Process, "is_running", property(fake_is_running), raising=False)
    monkeypatch.setattr(Process, "terminate", fake_terminate)
    monkeypatch.setattr(admin, "message_user", record_message)

    queryset = Process.objects.filter(pk__in=[running.pk, exited.pk]).order_by("created_at")
    admin.kill_processes(request, queryset)

    assert terminated == [running.pk]
    messages_seen = [msg for msg, _level in flashed]
    assert any("Killed 1 running process" in msg for msg in messages_seen)
    assert any("Skipped 1 process" in msg for msg in messages_seen)
def test_process_admin_object_kill_action_redirects_and_skips_exited(monkeypatch):
    """Check the per-object kill action on a process that is not running.

    The action must answer with a 302 to the process change page, must not
    call ``terminate``, and must flash a "Skipped 1 process" message.
    """
    from archivebox.machine.admin import ProcessAdmin
    from archivebox.machine.models import Process

    process = Process.objects.create(
        machine=_create_machine(),
        process_type=Process.TypeChoices.HOOK,
        pwd="/tmp/exited",
        cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
        status=Process.StatusChoices.EXITED,
    )

    admin = ProcessAdmin(Process, AdminSite())
    request = RequestFactory().post(f"/admin/machine/process/{process.pk}/change/")

    terminated = []
    flashed = []

    def never_running(self):
        return False

    def fake_terminate(self, graceful_timeout=5.0):
        terminated.append(self.pk)
        return True

    def record_message(req, msg, level=None):
        flashed.append((msg, level))

    monkeypatch.setattr(Process, "is_running", property(never_running), raising=False)
    monkeypatch.setattr(Process, "terminate", fake_terminate)
    monkeypatch.setattr(admin, "message_user", record_message)

    response = admin.kill_process(request, process)

    expected_url = reverse("admin:machine_process_change", args=[process.pk])
    assert response.status_code == 302
    assert response.url == expected_url
    assert terminated == []
    assert any("Skipped 1 process" in msg for msg, _level in flashed)
def test_process_admin_output_summary_uses_archiveresult_output_files():
    """Check that ``output_summary`` reports file count and total size from ``output_files``.

    The linked ArchiveResult declares two files of 1024 and 512 bytes; the
    summary must mention "2 files" and "1.5 KB".
    """
    from archivebox.core.models import ArchiveResult
    from archivebox.machine.admin import ProcessAdmin
    from archivebox.machine.models import Process

    snapshot = _create_snapshot()
    machine = _create_machine()
    process = Process.objects.create(
        machine=machine,
        process_type=Process.TypeChoices.HOOK,
        pwd=str(snapshot.output_dir / "wget"),
        cmd=["/tmp/on_Snapshot__06_wget.finite.bg.py", "--url=https://example.com"],
        status=Process.StatusChoices.EXITED,
    )

    declared_files = {
        "index.html": {"extension": "html", "mimetype": "text/html", "size": 1024},
        # NOTE(review): size is a str here, presumably to exercise numeric coercion -- confirm.
        "title.txt": {"extension": "txt", "mimetype": "text/plain", "size": "512"},
    }
    ArchiveResult.objects.create(
        snapshot=snapshot,
        plugin="wget",
        hook_name="on_Snapshot__06_wget.finite.bg.py",
        process=process,
        status=ArchiveResult.StatusChoices.SUCCEEDED,
        output_files=declared_files,
    )

    process_admin = ProcessAdmin(Process, AdminSite())
    output_html = str(process_admin.output_summary(process))

    assert "2 files" in output_html
    assert "1.5 KB" in output_html

File diff suppressed because it is too large Load Diff

View File

@@ -12,25 +12,25 @@ User = get_user_model()
class CLIScheduleAPITests(TestCase):
    """Tests for the ``cli_schedule`` API endpoint function."""

    def setUp(self):
        """Create the user that is attached to the request in each test."""
        self.user = User.objects.create_user(
            username="api-user",
            password="testpass123",
            email="api@example.com",
        )

    def test_schedule_api_creates_schedule(self):
        """Calling ``cli_schedule`` with a daily feed import creates one CrawlSchedule."""
        request = RequestFactory().post("/api/v1/cli/schedule")
        request.user = self.user
        # NOTE(review): setattr is presumably used because the request type does
        # not declare stdout/stderr attributes -- confirm before simplifying.
        setattr(request, "stdout", StringIO())
        setattr(request, "stderr", StringIO())
        args = ScheduleCommandSchema(
            every="daily",
            import_path="https://example.com/feed.xml",
            quiet=True,
        )

        response = cli_schedule(request, args)

        self.assertTrue(response["success"])
        self.assertEqual(response["result_format"], "json")
        self.assertEqual(CrawlSchedule.objects.count(), 1)
        self.assertEqual(len(response["result"]["created_schedule_ids"]), 1)

View File

@@ -4,8 +4,10 @@ from uuid import uuid4
import pytest
from django.db import connection
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
from abx_dl.events import BinaryEvent, ProcessCompletedEvent, ProcessStartedEvent
from abx_dl.orchestrator import create_bus
from abx_dl.output_files import OutputFile
pytestmark = pytest.mark.django_db
@@ -36,18 +38,18 @@ def _create_machine():
from archivebox.machine.models import Machine
return Machine.objects.create(
guid=f'test-guid-{uuid4()}',
hostname='test-host',
guid=f"test-guid-{uuid4()}",
hostname="test-host",
hw_in_docker=False,
hw_in_vm=False,
hw_manufacturer='Test',
hw_product='Test Product',
hw_uuid=f'test-hw-{uuid4()}',
os_arch='arm64',
os_family='darwin',
os_platform='macOS',
os_release='14.0',
os_kernel='Darwin',
hw_manufacturer="Test",
hw_product="Test Product",
hw_uuid=f"test-hw-{uuid4()}",
os_arch="arm64",
os_family="darwin",
os_platform="macOS",
os_release="14.0",
os_kernel="Darwin",
stats={},
config={},
)
@@ -58,16 +60,16 @@ def _create_iface(machine):
return NetworkInterface.objects.create(
machine=machine,
mac_address='00:11:22:33:44:55',
ip_public='203.0.113.10',
ip_local='10.0.0.10',
dns_server='1.1.1.1',
hostname='test-host',
iface='en0',
isp='Test ISP',
city='Test City',
region='Test Region',
country='Test Country',
mac_address="00:11:22:33:44:55",
ip_public="203.0.113.10",
ip_local="10.0.0.10",
dns_server="1.1.1.1",
hostname="test-host",
iface="en0",
isp="Test ISP",
city="Test City",
region="Test Region",
country="Test Country",
)
@@ -92,7 +94,7 @@ def test_process_completed_projects_inline_archiveresult():
stderr="",
exit_code=0,
output_dir=str(plugin_dir),
output_files=["index.html"],
output_files=[OutputFile(path="index.html", extension="html", mimetype="text/html", size=15)],
process_id="proc-inline",
snapshot_id=str(snapshot.id),
start_ts="2026-03-22T12:00:00+00:00",
@@ -118,6 +120,8 @@ def test_process_completed_projects_inline_archiveresult():
assert result.status == ArchiveResult.StatusChoices.SUCCEEDED
assert result.output_str == "wget/index.html"
assert "index.html" in result.output_files
assert result.output_files["index.html"] == {"extension": "html", "mimetype": "text/html", "size": 15}
assert result.output_size == 15
_cleanup_machine_process_rows()
@@ -215,24 +219,212 @@ def test_process_completed_projects_noresults_archiveresult():
result = ArchiveResult.objects.get(snapshot=snapshot, plugin="title", hook_name="on_Snapshot__54_title.js")
assert result.status == ArchiveResult.StatusChoices.NORESULTS
assert result.output_str == "No title found"
def test_retry_failed_archiveresults_requeues_snapshot_in_queued_state():
    """Check that retrying failed results resets them and requeues the snapshot.

    After ``retry_failed_archiveresults`` the one failed result must be back in
    QUEUED state with its output fields and timestamps cleared, and the
    snapshot must be QUEUED with a ``retry_at`` set and ``current_step`` 0.
    """
    from archivebox.core.models import ArchiveResult, Snapshot

    snapshot = _create_snapshot()
    lookup = {"snapshot": snapshot, "plugin": "chrome", "hook_name": "on_Snapshot__11_chrome_wait"}
    ArchiveResult.objects.create(
        status=ArchiveResult.StatusChoices.FAILED,
        output_str="timed out",
        output_files={"stderr.log": {}},
        output_size=123,
        output_mimetypes="text/plain",
        **lookup,
    )

    reset_count = snapshot.retry_failed_archiveresults()

    snapshot.refresh_from_db()
    result = ArchiveResult.objects.get(**lookup)

    assert reset_count == 1

    # Snapshot-level state after the retry.
    assert snapshot.status == Snapshot.StatusChoices.QUEUED
    assert snapshot.retry_at is not None
    assert snapshot.current_step == 0

    # Result-level state: everything produced by the failed run is cleared.
    assert result.status == ArchiveResult.StatusChoices.QUEUED
    assert result.output_str == ""
    assert result.output_json is None
    assert result.output_files == {}
    assert result.output_size == 0
    assert result.output_mimetypes == ""
    assert result.start_ts is None
    assert result.end_ts is None

    snapshot.refresh_from_db()
    assert snapshot.title in (None, "")
    _cleanup_machine_process_rows()
def test_process_completed_projects_snapshot_title_from_output_str():
    """Check that a succeeded title result copies its ``output_str`` onto the snapshot.

    A ProcessCompletedEvent for the title hook is projected through
    ``ArchiveResultService._project_from_process_completed`` and the snapshot
    title must then equal "Example Domain".
    """
    from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
    from archivebox.services.process_service import ProcessService

    snapshot = _create_snapshot()
    plugin_dir = Path(snapshot.output_dir) / "title"
    plugin_dir.mkdir(parents=True, exist_ok=True)

    bus = create_bus(name="test_snapshot_title_output_str")
    service = ArchiveResultService(bus, process_service=ProcessService(bus))

    event = ProcessCompletedEvent(
        plugin_name="title",
        hook_name="on_Snapshot__54_title.js",
        stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"succeeded","output_str":"Example Domain"}\n' % snapshot.id,
        stderr="",
        exit_code=0,
        output_dir=str(plugin_dir),
        output_files=[],
        process_id="proc-title-output-str",
        snapshot_id=str(snapshot.id),
        start_ts="2026-03-22T12:00:00+00:00",
        end_ts="2026-03-22T12:00:01+00:00",
    )
    # The record passed alongside the event mirrors the JSON line in stdout.
    record = {
        "snapshot_id": str(snapshot.id),
        "plugin": "title",
        "hook_name": "on_Snapshot__54_title.js",
        "status": "succeeded",
        "output_str": "Example Domain",
    }

    output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
    service._project_from_process_completed(event, record, output_files, output_size, output_mimetypes)

    snapshot.refresh_from_db()
    assert snapshot.title == "Example Domain"
    _cleanup_machine_process_rows()
def test_process_completed_projects_snapshot_title_from_title_file():
    """Check that the snapshot title is taken from ``title.txt`` when the hook reports no result.

    The hook record has status "noresults" and ``output_str`` "No title found",
    but ``title.txt`` in the plugin directory contains "Example Domain"; after
    projection the snapshot title must be "Example Domain".
    """
    from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
    from archivebox.services.process_service import ProcessService

    snapshot = _create_snapshot()
    plugin_dir = Path(snapshot.output_dir) / "title"
    plugin_dir.mkdir(parents=True, exist_ok=True)
    title_file = plugin_dir / "title.txt"
    title_file.write_text("Example Domain")

    bus = create_bus(name="test_snapshot_title_file")
    service = ArchiveResultService(bus, process_service=ProcessService(bus))

    event = ProcessCompletedEvent(
        plugin_name="title",
        hook_name="on_Snapshot__54_title.js",
        stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"noresults","output_str":"No title found"}\n' % snapshot.id,
        stderr="",
        exit_code=0,
        output_dir=str(plugin_dir),
        output_files=[OutputFile(path="title.txt", extension="txt", mimetype="text/plain", size=14)],
        process_id="proc-title-file",
        snapshot_id=str(snapshot.id),
        start_ts="2026-03-22T12:00:00+00:00",
        end_ts="2026-03-22T12:00:01+00:00",
    )
    record = {
        "snapshot_id": str(snapshot.id),
        "plugin": "title",
        "hook_name": "on_Snapshot__54_title.js",
        "status": "noresults",
        "output_str": "No title found",
    }

    output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
    service._project_from_process_completed(event, record, output_files, output_size, output_mimetypes)

    snapshot.refresh_from_db()
    assert snapshot.title == "Example Domain"
    _cleanup_machine_process_rows()
def test_snapshot_resolved_title_falls_back_to_title_file_without_db_title():
    """Check that ``resolved_title`` reads ``title.txt`` when the DB title is empty.

    The ArchiveResult row is created directly, so ``snapshot.title`` stays
    unset while ``snapshot.resolved_title`` must return the file's content.
    """
    from archivebox.core.models import ArchiveResult

    snapshot = _create_snapshot()

    title_dir = Path(snapshot.output_dir) / "title"
    title_dir.mkdir(parents=True, exist_ok=True)
    (title_dir / "title.txt").write_text("Example Domain")

    ArchiveResult.objects.create(
        snapshot=snapshot,
        plugin="title",
        hook_name="on_Snapshot__54_title.js",
        status="noresults",
        output_str="No title found",
        output_files={"title.txt": {}},
    )

    snapshot.refresh_from_db()

    stored_title = snapshot.title
    assert stored_title in (None, "")
    assert snapshot.resolved_title == "Example Domain"
    _cleanup_machine_process_rows()
def test_collect_output_metadata_preserves_file_metadata():
    """Check that ``_resolve_output_metadata`` passes declared OutputFile metadata through.

    The directory argument points at a path the test does not create; the
    returned mapping, total size and mimetype string must come from the
    OutputFile entry alone.
    """
    from archivebox.services.archive_result_service import _resolve_output_metadata

    declared = OutputFile(path="index.html", extension="html", mimetype="text/html", size=42)
    unused_dir = Path("/tmp/does-not-need-to-exist")

    output_files, output_size, output_mimetypes = _resolve_output_metadata([declared], unused_dir)

    expected_entry = {"extension": "html", "mimetype": "text/html", "size": 42}
    assert output_files == {"index.html": expected_entry}
    assert output_size == 42
    assert output_mimetypes == "text/html"
def test_collect_output_metadata_detects_warc_gz_mimetype(tmp_path):
    """Check that a ``.warc.gz`` file is reported with mimetype ``application/warc``.

    A 10-byte file is written under ``wget/warc/`` and the directory is scanned
    with ``_collect_output_metadata``.
    """
    from archivebox.services.archive_result_service import _collect_output_metadata

    plugin_dir = tmp_path / "wget"
    warc_dir = plugin_dir / "warc"
    warc_dir.mkdir(parents=True, exist_ok=True)
    (warc_dir / "capture.warc.gz").write_bytes(b"warc-bytes")

    output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)

    # Keys are paths relative to the scanned plugin directory.
    warc_entry = output_files["warc/capture.warc.gz"]
    assert warc_entry == {
        "extension": "gz",
        "mimetype": "application/warc",
        "size": 10,
    }
    assert output_size == 10
    assert output_mimetypes == "application/warc"
def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(monkeypatch):
from archivebox.machine.models import Binary, NetworkInterface
from archivebox.services.process_service import ProcessService
machine = _create_machine()
iface = _create_iface(machine)
monkeypatch.setattr(NetworkInterface, 'current', classmethod(lambda cls, refresh=False: iface))
monkeypatch.setattr(NetworkInterface, "current", classmethod(lambda cls, refresh=False: iface))
binary = Binary.objects.create(
machine=machine,
name='postlight-parser',
abspath='/tmp/postlight-parser',
version='2.2.3',
binprovider='npm',
binproviders='npm',
name="postlight-parser",
abspath="/tmp/postlight-parser",
version="2.2.3",
binprovider="npm",
binproviders="npm",
status=Binary.StatusChoices.INSTALLED,
)
@@ -268,15 +460,15 @@ def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(mon
machine = _create_machine()
iface = _create_iface(machine)
monkeypatch.setattr(NetworkInterface, 'current', classmethod(lambda cls, refresh=False: iface))
monkeypatch.setattr(NetworkInterface, "current", classmethod(lambda cls, refresh=False: iface))
node = Binary.objects.create(
machine=machine,
name='node',
abspath='/tmp/node',
version='22.0.0',
binprovider='env',
binproviders='env',
name="node",
abspath="/tmp/node",
version="22.0.0",
binprovider="env",
binproviders="env",
status=Binary.StatusChoices.INSTALLED,
)
@@ -303,3 +495,40 @@ def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(mon
process = service._get_or_create_process(event)
assert process.binary_id == node.id
assert process.iface_id == iface.id
def test_binary_event_reuses_existing_installed_binary_row(monkeypatch):
    """Check that projecting a BinaryEvent updates the existing row instead of adding one.

    An installed ``wget`` Binary already exists for the machine; after
    ``_project_binary`` there must still be exactly one row, with its status,
    path, version and provider unchanged and ``binproviders`` taken from the
    event.
    """
    from archivebox.machine.models import Binary, Machine
    from archivebox.services.binary_service import BinaryService as ArchiveBoxBinaryService

    machine = _create_machine()
    # Make the service resolve "the current machine" to the test machine.
    monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))

    existing = Binary.objects.create(
        machine=machine,
        name="wget",
        abspath="/bin/sh",
        version="9.9.9",
        binprovider="env",
        binproviders="env,apt,brew",
        status=Binary.StatusChoices.INSTALLED,
    )

    bus = create_bus(name="test_binary_event_reuses_existing_installed_binary_row")
    service = ArchiveBoxBinaryService(bus)
    event = BinaryEvent(
        name="wget",
        plugin_name="wget",
        hook_name="on_Crawl__10_wget_install.finite.bg",
        output_dir="/tmp/wget",
        binproviders="provider",
    )

    service._project_binary(event)

    existing.refresh_from_db()
    matching_rows = Binary.objects.filter(machine=machine, name="wget")
    assert matching_rows.count() == 1
    assert existing.status == Binary.StatusChoices.INSTALLED
    assert existing.abspath == "/bin/sh"
    assert existing.version == "9.9.9"
    assert existing.binprovider == "env"
    assert existing.binproviders == "provider"

View File

@@ -78,8 +78,8 @@ class TestLDAPConfig(unittest.TestCase):
from archivebox.config import get_CONFIG
all_config = get_CONFIG()
self.assertIn('LDAP_CONFIG', all_config)
self.assertEqual(all_config['LDAP_CONFIG'].__class__.__name__, 'LDAPConfig')
self.assertIn("LDAP_CONFIG", all_config)
self.assertEqual(all_config["LDAP_CONFIG"].__class__.__name__, "LDAPConfig")
class TestLDAPIntegration(unittest.TestCase):
@@ -95,7 +95,7 @@ class TestLDAPIntegration(unittest.TestCase):
self.assertIn("django.contrib.auth.backends.ModelBackend", settings.AUTHENTICATION_BACKENDS)
# LDAP backend should not be present when disabled
ldap_backends = [b for b in settings.AUTHENTICATION_BACKENDS if 'ldap' in b.lower()]
ldap_backends = [b for b in settings.AUTHENTICATION_BACKENDS if "ldap" in b.lower()]
self.assertEqual(len(ldap_backends), 0, "LDAP backend should not be present when LDAP_ENABLED=False")
def test_django_settings_with_ldap_library_check(self):
@@ -106,7 +106,8 @@ class TestLDAPIntegration(unittest.TestCase):
if not ldap_available:
# Settings should have loaded without LDAP backend
from django.conf import settings
ldap_backends = [b for b in settings.AUTHENTICATION_BACKENDS if 'ldap' in b.lower()]
ldap_backends = [b for b in settings.AUTHENTICATION_BACKENDS if "ldap" in b.lower()]
self.assertEqual(len(ldap_backends), 0, "LDAP backend should not be present when libraries unavailable")
@@ -117,14 +118,14 @@ class TestLDAPAuthBackend(unittest.TestCase):
"""Test that ArchiveBoxLDAPBackend class is defined."""
from archivebox.ldap.auth import ArchiveBoxLDAPBackend
self.assertTrue(hasattr(ArchiveBoxLDAPBackend, 'authenticate_ldap_user'))
self.assertTrue(hasattr(ArchiveBoxLDAPBackend, "authenticate_ldap_user"))
def test_ldap_backend_inherits_correctly(self):
"""Test that ArchiveBoxLDAPBackend has correct inheritance."""
from archivebox.ldap.auth import ArchiveBoxLDAPBackend
# Should have authenticate_ldap_user method (from base or overridden)
self.assertTrue(callable(getattr(ArchiveBoxLDAPBackend, 'authenticate_ldap_user', None)))
self.assertTrue(callable(getattr(ArchiveBoxLDAPBackend, "authenticate_ldap_user", None)))
class TestArchiveBoxWithLDAP(unittest.TestCase):
@@ -132,7 +133,7 @@ class TestArchiveBoxWithLDAP(unittest.TestCase):
def setUp(self):
"""Set up test environment."""
self.work_dir = tempfile.mkdtemp(prefix='archivebox-ldap-test-')
self.work_dir = tempfile.mkdtemp(prefix="archivebox-ldap-test-")
def test_archivebox_init_without_ldap(self):
"""Test that archivebox init works without LDAP enabled."""
@@ -140,15 +141,15 @@ class TestArchiveBoxWithLDAP(unittest.TestCase):
# Run archivebox init
result = subprocess.run(
[sys.executable, '-m', 'archivebox', 'init'],
[sys.executable, "-m", "archivebox", "init"],
cwd=self.work_dir,
capture_output=True,
timeout=45,
env={
**os.environ,
'DATA_DIR': self.work_dir,
'LDAP_ENABLED': 'False',
}
"DATA_DIR": self.work_dir,
"LDAP_ENABLED": "False",
},
)
# Should succeed
@@ -160,16 +161,16 @@ class TestArchiveBoxWithLDAP(unittest.TestCase):
# Run archivebox version with LDAP config env vars
result = subprocess.run(
[sys.executable, '-m', 'archivebox', 'version'],
[sys.executable, "-m", "archivebox", "version"],
cwd=self.work_dir,
capture_output=True,
timeout=10,
env={
**os.environ,
'DATA_DIR': self.work_dir,
'LDAP_ENABLED': 'False',
'LDAP_SERVER_URI': 'ldap://ldap-test.localhost:389',
}
"DATA_DIR": self.work_dir,
"LDAP_ENABLED": "False",
"LDAP_SERVER_URI": "ldap://ldap-test.localhost:389",
},
)
# Should succeed
@@ -181,7 +182,7 @@ class TestLDAPConfigValidationInArchiveBox(unittest.TestCase):
def setUp(self):
"""Set up test environment."""
self.work_dir = tempfile.mkdtemp(prefix='archivebox-ldap-validation-')
self.work_dir = tempfile.mkdtemp(prefix="archivebox-ldap-validation-")
def test_archivebox_init_with_incomplete_ldap_config(self):
"""Test that archivebox init fails with helpful error when LDAP config is incomplete."""
@@ -189,16 +190,16 @@ class TestLDAPConfigValidationInArchiveBox(unittest.TestCase):
# Run archivebox init with LDAP enabled but missing required fields
result = subprocess.run(
[sys.executable, '-m', 'archivebox', 'init'],
[sys.executable, "-m", "archivebox", "init"],
cwd=self.work_dir,
capture_output=True,
timeout=45,
env={
**os.environ,
'DATA_DIR': self.work_dir,
'LDAP_ENABLED': 'True',
"DATA_DIR": self.work_dir,
"LDAP_ENABLED": "True",
# Missing: LDAP_SERVER_URI, LDAP_BIND_DN, etc.
}
},
)
# Should fail with validation error
@@ -206,9 +207,12 @@ class TestLDAPConfigValidationInArchiveBox(unittest.TestCase):
# Check error message
stderr = result.stderr.decode()
self.assertIn("LDAP_* config options must all be set", stderr,
f"Expected validation error message in: {stderr}")
self.assertIn(
"LDAP_* config options must all be set",
stderr,
f"Expected validation error message in: {stderr}",
)
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()

View File

@@ -14,8 +14,8 @@ def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
candidates = {snapshot_id}
if len(snapshot_id) == 32:
candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}")
elif len(snapshot_id) == 36 and '-' in snapshot_id:
candidates.add(snapshot_id.replace('-', ''))
elif len(snapshot_id) == 36 and "-" in snapshot_id:
candidates.add(snapshot_id.replace("-", ""))
for needle in candidates:
for path in data_dir.rglob(needle):
@@ -28,7 +28,7 @@ def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extrac
"""Test that adding a single URL creates a snapshot in the database."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -41,14 +41,14 @@ def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extrac
conn.close()
assert len(snapshots) == 1
assert snapshots[0][0] == 'https://example.com'
assert snapshots[0][0] == "https://example.com"
def test_add_bg_creates_root_snapshot_rows_immediately(tmp_path, process, disable_extractors_dict):
"""Background add should create root snapshots immediately so the queue is visible in the DB."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'add', '--bg', '--depth=0', 'https://example.com'],
["archivebox", "add", "--bg", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -61,15 +61,15 @@ def test_add_bg_creates_root_snapshot_rows_immediately(tmp_path, process, disabl
conn.close()
assert len(snapshots) == 1
assert snapshots[0][0] == 'https://example.com'
assert snapshots[0][1] == 'queued'
assert snapshots[0][0] == "https://example.com"
assert snapshots[0][1] == "queued"
def test_add_creates_crawl_record(tmp_path, process, disable_extractors_dict):
"""Test that add command creates a Crawl record in the database."""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -86,7 +86,7 @@ def test_add_creates_source_file(tmp_path, process, disable_extractors_dict):
"""Test that add creates a source file with the URL."""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -105,7 +105,7 @@ def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_
"""Test adding multiple URLs in a single command."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com', 'https://example.org'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com", "https://example.org"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -119,8 +119,8 @@ def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_
conn.close()
assert snapshot_count == 2
assert urls[0][0] == 'https://example.com'
assert urls[1][0] == 'https://example.org'
assert urls[0][0] == "https://example.com"
assert urls[1][0] == "https://example.org"
def test_add_from_file(tmp_path, process, disable_extractors_dict):
@@ -136,7 +136,7 @@ def test_add_from_file(tmp_path, process, disable_extractors_dict):
urls_file.write_text("https://example.com\nhttps://example.org\n")
result = subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', str(urls_file)],
["archivebox", "add", "--index-only", "--depth=0", str(urls_file)],
capture_output=True,
env=disable_extractors_dict,
)
@@ -158,41 +158,41 @@ def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):
"""Test that --depth=0 flag is accepted and works."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
assert result.returncode == 0
assert 'unrecognized arguments: --depth' not in result.stderr.decode('utf-8')
assert "unrecognized arguments: --depth" not in result.stderr.decode("utf-8")
# NOTE(review): flattened diff view - each single-quoted line is immediately followed by its
# double-quoted replacement; the two differ only in quote style.
def test_add_with_depth_1_flag(tmp_path, process, disable_extractors_dict):
"""Test that --depth=1 flag is accepted."""
# Run from the pytest temp dir; the CLI is invoked without an explicit data-dir argument.
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'add', '--index-only', '--depth=1', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=1", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
# Accepted means: clean exit and no argparse-style complaint about --depth on stderr.
assert result.returncode == 0
assert 'unrecognized arguments: --depth' not in result.stderr.decode('utf-8')
assert "unrecognized arguments: --depth" not in result.stderr.decode("utf-8")
# NOTE(review): flattened diff view - each single-quoted line is immediately followed by its
# double-quoted replacement; the two differ only in quote style.
def test_add_rejects_invalid_depth_values(tmp_path, process, disable_extractors_dict):
"""Test that add rejects depth values outside the supported range."""
os.chdir(tmp_path)
# Two depth values the CLI is expected to refuse are each tried in a separate invocation.
for depth in ('5', '-1'):
for depth in ("5", "-1"):
result = subprocess.run(
['archivebox', 'add', '--index-only', f'--depth={depth}', 'https://example.com'],
["archivebox", "add", "--index-only", f"--depth={depth}", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
# stderr is lower-cased so the wording checks below are case-insensitive.
stderr = result.stderr.decode('utf-8').lower()
stderr = result.stderr.decode("utf-8").lower()
# Rejection means a non-zero exit plus an error mentioning "invalid" or "not one of".
assert result.returncode != 0
assert 'invalid' in stderr or 'not one of' in stderr
assert "invalid" in stderr or "not one of" in stderr
def test_add_with_tags(tmp_path, process, disable_extractors_dict):
@@ -203,7 +203,7 @@ def test_add_with_tags(tmp_path, process, disable_extractors_dict):
"""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', '--tag=test,example', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "--tag=test,example", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -214,14 +214,14 @@ def test_add_with_tags(tmp_path, process, disable_extractors_dict):
conn.close()
# Tags are stored as a comma-separated string in crawl
assert 'test' in tags_str or 'example' in tags_str
assert "test" in tags_str or "example" in tags_str
def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extractors_dict):
"""Test add persists the selected persona so browser config derives from it later."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', '--persona=Default', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "--persona=Default", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -231,12 +231,12 @@ def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extrac
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
persona_id, default_persona = c.execute(
"SELECT persona_id, json_extract(config, '$.DEFAULT_PERSONA') FROM crawls_crawl LIMIT 1"
"SELECT persona_id, json_extract(config, '$.DEFAULT_PERSONA') FROM crawls_crawl LIMIT 1",
).fetchone()
conn.close()
assert persona_id
assert default_persona == 'Default'
assert default_persona == "Default"
assert (tmp_path / "personas" / "Default" / "chrome_user_data").is_dir()
@@ -244,10 +244,13 @@ def test_add_records_url_filter_overrides_on_crawl(tmp_path, process, disable_ex
os.chdir(tmp_path)
result = subprocess.run(
[
'archivebox', 'add', '--index-only', '--depth=0',
'--domain-allowlist=example.com,*.example.com',
'--domain-denylist=static.example.com',
'https://example.com',
"archivebox",
"add",
"--index-only",
"--depth=0",
"--domain-allowlist=example.com,*.example.com",
"--domain-denylist=static.example.com",
"https://example.com",
],
capture_output=True,
env=disable_extractors_dict,
@@ -258,12 +261,12 @@ def test_add_records_url_filter_overrides_on_crawl(tmp_path, process, disable_ex
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
allowlist, denylist = c.execute(
"SELECT json_extract(config, '$.URL_ALLOWLIST'), json_extract(config, '$.URL_DENYLIST') FROM crawls_crawl LIMIT 1"
"SELECT json_extract(config, '$.URL_ALLOWLIST'), json_extract(config, '$.URL_DENYLIST') FROM crawls_crawl LIMIT 1",
).fetchone()
conn.close()
assert allowlist == 'example.com,*.example.com'
assert denylist == 'static.example.com'
assert allowlist == "example.com,*.example.com"
assert denylist == "static.example.com"
assert (tmp_path / "personas" / "Default" / "chrome_extensions").is_dir()
@@ -277,14 +280,14 @@ def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_ex
# Add URL first time
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
# Add same URL second time
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -306,27 +309,27 @@ def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
# Add URL first time
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
# Add with overwrite
result = subprocess.run(
['archivebox', 'add', '--index-only', '--overwrite', 'https://example.com'],
["archivebox", "add", "--index-only", "--overwrite", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
assert result.returncode == 0
assert 'unrecognized arguments: --overwrite' not in result.stderr.decode('utf-8')
assert "unrecognized arguments: --overwrite" not in result.stderr.decode("utf-8")
def test_add_creates_snapshot_output_directory(tmp_path, process, disable_extractors_dict):
"""Test that add creates the current snapshot output directory on disk."""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -346,14 +349,39 @@ def test_add_help_shows_depth_and_tag_options(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'add', '--help'],
["archivebox", "add", "--help"],
capture_output=True,
text=True,
)
assert result.returncode == 0
assert '--depth' in result.stdout
assert '--tag' in result.stdout
assert "--depth" in result.stdout
assert "--max-urls" in result.stdout
assert "--max-size" in result.stdout
assert "--tag" in result.stdout
def test_add_records_max_url_and_size_limits_on_crawl(tmp_path, process, disable_extractors_dict):
    """Test that --max-urls and --max-size are persisted on the crawl.

    Runs ``archivebox add`` with ``--max-urls=3`` and ``--max-size=45mb`` and
    checks that the first ``crawls_crawl`` row stores both limits, once in the
    ``max_urls`` / ``max_size`` columns and once under the ``MAX_URLS`` /
    ``MAX_SIZE`` keys of its JSON ``config`` column. The ``45mb`` argument is
    expected to be stored as ``45 * 1024 * 1024``.
    """
    os.chdir(tmp_path)
    result = subprocess.run(
        ["archivebox", "add", "--index-only", "--depth=1", "--max-urls=3", "--max-size=45mb", "https://example.com"],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Surface the CLI's stderr on failure so a broken flag is diagnosable from the test report.
    assert result.returncode == 0, result.stderr.decode("utf-8", errors="replace")

    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()
        row = c.execute(
            "SELECT max_urls, max_size, json_extract(config, '$.MAX_URLS'), json_extract(config, '$.MAX_SIZE') FROM crawls_crawl LIMIT 1",
        ).fetchone()
    finally:
        # Close even when the query raises, so the connection is never leaked.
        conn.close()

    # fetchone() returns None when no crawl exists; fail with a clear message instead of a TypeError on unpacking.
    assert row is not None, "archivebox add did not create a crawls_crawl row"
    max_urls, max_size, config_max_urls, config_max_size = row

    assert max_urls == 3
    assert max_size == 45 * 1024 * 1024
    assert config_max_urls == 3
    assert config_max_size == 45 * 1024 * 1024
def test_add_without_args_shows_usage(tmp_path, process):
@@ -361,21 +389,21 @@ def test_add_without_args_shows_usage(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'add'],
["archivebox", "add"],
capture_output=True,
text=True,
)
combined = result.stdout + result.stderr
assert result.returncode != 0
assert 'usage' in combined.lower() or 'url' in combined.lower()
assert "usage" in combined.lower() or "url" in combined.lower()
def test_add_index_only_skips_extraction(tmp_path, process, disable_extractors_dict):
"""Test that --index-only flag skips extraction (fast)."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
timeout=30, # Should be fast
@@ -396,7 +424,7 @@ def test_add_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dict)
"""Test that add links the snapshot to the crawl via crawl_id."""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -419,7 +447,7 @@ def test_add_sets_snapshot_timestamp(tmp_path, process, disable_extractors_dict)
"""Test that add sets a timestamp on the snapshot."""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)

View File

@@ -17,10 +17,10 @@ from archivebox.tests.conftest import (
)
# Environment overrides handed to run_archivebox_cmd(..., env=PROJECTOR_TEST_ENV) by the tests in
# this file that invoke `archivebox run`. Presumably restricts the run to the favicon plugin and
# turns off coloured/progress output - confirm against how run_archivebox_cmd applies `env`.
# NOTE(review): flattened diff view - the four single-quoted entries are followed by their
# double-quoted replacements; keys and values are otherwise identical.
PROJECTOR_TEST_ENV = {
'PLUGINS': 'favicon',
'SAVE_FAVICON': 'True',
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
"PLUGINS": "favicon",
"SAVE_FAVICON": "True",
"USE_COLOR": "False",
"SHOW_PROGRESS": "False",
}
@@ -32,12 +32,12 @@ class TestArchiveResultCreate:
url = create_test_url()
# Create a snapshot first
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
snapshot = parse_jsonl_output(stdout1)[0]
# Pipe snapshot to archiveresult create
stdout2, stderr, code = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=title'],
["archiveresult", "create", "--plugin=title"],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
@@ -46,49 +46,49 @@ class TestArchiveResultCreate:
records = parse_jsonl_output(stdout2)
# Should have the Snapshot passed through and an ArchiveResult request emitted
types = [r.get('type') for r in records]
assert 'Snapshot' in types
assert 'ArchiveResult' in types
types = [r.get("type") for r in records]
assert "Snapshot" in types
assert "ArchiveResult" in types
ar = next(r for r in records if r['type'] == 'ArchiveResult')
assert ar['plugin'] == 'title'
assert 'id' not in ar
ar = next(r for r in records if r["type"] == "ArchiveResult")
assert ar["plugin"] == "title"
assert "id" not in ar
# NOTE(review): flattened diff view - each single-quoted line is immediately followed by its
# double-quoted replacement; the two differ only in quote style.
def test_create_with_specific_plugin(self, initialized_archive):
"""Create archive result for specific plugin."""
url = create_test_url()
# Step 1: create a snapshot and take the first JSONL record the command printed.
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
snapshot = parse_jsonl_output(stdout1)[0]
# Step 2: pipe that snapshot record on stdin into `archiveresult create` for one plugin.
stdout2, stderr, code = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=screenshot'],
["archiveresult", "create", "--plugin=screenshot"],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
assert code == 0
records = parse_jsonl_output(stdout2)
# Output may contain other record types too; keep only the ArchiveResult records.
ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
ar_records = [r for r in records if r.get("type") == "ArchiveResult"]
assert len(ar_records) >= 1
assert ar_records[0]['plugin'] == 'screenshot'
assert ar_records[0]["plugin"] == "screenshot"
def test_create_pass_through_crawl(self, initialized_archive):
"""Pass-through Crawl records unchanged."""
url = create_test_url()
# Create crawl and snapshot
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
crawl = parse_jsonl_output(stdout1)[0]
stdout2, _, _ = run_archivebox_cmd(
['snapshot', 'create'],
["snapshot", "create"],
stdin=json.dumps(crawl),
data_dir=initialized_archive,
)
# Now pipe all to archiveresult create
stdout3, stderr, code = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=title'],
["archiveresult", "create", "--plugin=title"],
stdin=stdout2,
data_dir=initialized_archive,
)
@@ -96,23 +96,23 @@ class TestArchiveResultCreate:
assert code == 0
records = parse_jsonl_output(stdout3)
types = [r.get('type') for r in records]
assert 'Crawl' in types
assert 'Snapshot' in types
assert 'ArchiveResult' in types
types = [r.get("type") for r in records]
assert "Crawl" in types
assert "Snapshot" in types
assert "ArchiveResult" in types
# NOTE(review): flattened diff view - each single-quoted line is immediately followed by its
# double-quoted replacement; the two differ only in quote style.
def test_create_pass_through_only_when_no_snapshots(self, initialized_archive):
"""Only pass-through records but no new snapshots returns success."""
# Hand-built Crawl record with a made-up id: the input deliberately contains no Snapshot record.
crawl_record = {'type': 'Crawl', 'id': 'fake-id', 'urls': 'https://example.com'}
crawl_record = {"type": "Crawl", "id": "fake-id", "urls": "https://example.com"}
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'create'],
["archiveresult", "create"],
stdin=json.dumps(crawl_record),
data_dir=initialized_archive,
)
# Success is a zero exit code plus a "Passed through" notice on stderr.
assert code == 0
assert 'Passed through' in stderr
assert "Passed through" in stderr
class TestArchiveResultList:
@@ -121,26 +121,26 @@ class TestArchiveResultList:
# NOTE(review): flattened diff view - each single-quoted line is immediately followed by its
# double-quoted replacement; the two differ only in quote style.
def test_list_empty(self, initialized_archive):
"""List with no archive results returns empty."""
# Nothing is created first, so the list command runs against the archive as the fixture provides it.
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'list'],
["archiveresult", "list"],
data_dir=initialized_archive,
)
assert code == 0
# The count summary is expected on stderr, not stdout.
assert 'Listed 0 archive results' in stderr
assert "Listed 0 archive results" in stderr
def test_list_filter_by_status(self, initialized_archive):
"""Filter archive results by status."""
# Create snapshot and materialize an archive result via the runner
url = create_test_url()
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
snapshot = parse_jsonl_output(stdout1)[0]
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=favicon'],
["archiveresult", "create", "--plugin=favicon"],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
run_archivebox_cmd(
['run'],
["run"],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
@@ -148,38 +148,38 @@ class TestArchiveResultList:
)
created = parse_jsonl_output(
run_archivebox_cmd(
['archiveresult', 'list', '--plugin=favicon'],
["archiveresult", "list", "--plugin=favicon"],
data_dir=initialized_archive,
)[0]
)[0],
)[0]
run_archivebox_cmd(
['archiveresult', 'update', '--status=queued'],
["archiveresult", "update", "--status=queued"],
stdin=json.dumps(created),
data_dir=initialized_archive,
)
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'list', '--status=queued'],
["archiveresult", "list", "--status=queued"],
data_dir=initialized_archive,
)
assert code == 0
records = parse_jsonl_output(stdout)
for r in records:
assert r['status'] == 'queued'
assert r["status"] == "queued"
def test_list_filter_by_plugin(self, initialized_archive):
"""Filter archive results by plugin."""
url = create_test_url()
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
snapshot = parse_jsonl_output(stdout1)[0]
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=favicon'],
["archiveresult", "create", "--plugin=favicon"],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
run_archivebox_cmd(
['run'],
["run"],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
@@ -187,29 +187,29 @@ class TestArchiveResultList:
)
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'list', '--plugin=favicon'],
["archiveresult", "list", "--plugin=favicon"],
data_dir=initialized_archive,
)
assert code == 0
records = parse_jsonl_output(stdout)
for r in records:
assert r['plugin'] == 'favicon'
assert r["plugin"] == "favicon"
def test_list_with_limit(self, initialized_archive):
"""Limit number of results."""
# Create multiple archive results
for _ in range(3):
url = create_test_url()
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
snapshot = parse_jsonl_output(stdout1)[0]
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=favicon'],
["archiveresult", "create", "--plugin=favicon"],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
run_archivebox_cmd(
['run'],
["run"],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
@@ -217,7 +217,7 @@ class TestArchiveResultList:
)
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'list', '--limit=2'],
["archiveresult", "list", "--limit=2"],
data_dir=initialized_archive,
)
@@ -232,38 +232,38 @@ class TestArchiveResultUpdate:
# NOTE(review): flattened diff view - each single-quoted line is immediately followed by its
# double-quoted replacement; the two differ only in quote style.
def test_update_status(self, initialized_archive):
"""Update archive result status."""
url = create_test_url()
# Step 1: create a snapshot and take its first JSONL record.
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
snapshot = parse_jsonl_output(stdout1)[0]
# Step 2: request a favicon archive result for that snapshot.
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=favicon'],
["archiveresult", "create", "--plugin=favicon"],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
# Step 3: feed the request into `archivebox run`; stdout_run is not read again in this test.
stdout_run, _, _ = run_archivebox_cmd(
['run'],
["run"],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
env=PROJECTOR_TEST_ENV,
)
# Step 4: list the favicon results and use the first one as the record to update.
stdout_list, _, _ = run_archivebox_cmd(
['archiveresult', 'list', '--plugin=favicon'],
["archiveresult", "list", "--plugin=favicon"],
data_dir=initialized_archive,
)
ar = parse_jsonl_output(stdout_list)[0]
# Step 5: pipe that record into `archiveresult update` with the new status.
stdout3, stderr, code = run_archivebox_cmd(
['archiveresult', 'update', '--status=failed'],
["archiveresult", "update", "--status=failed"],
stdin=json.dumps(ar),
data_dir=initialized_archive,
)
assert code == 0
# The summary goes to stderr; the updated record itself is emitted on stdout.
assert 'Updated 1 archive results' in stderr
assert "Updated 1 archive results" in stderr
records = parse_jsonl_output(stdout3)
assert records[0]['status'] == 'failed'
assert records[0]["status"] == "failed"
class TestArchiveResultDelete:
@@ -272,65 +272,65 @@ class TestArchiveResultDelete:
# NOTE(review): flattened diff view - each single-quoted line is immediately followed by its
# double-quoted replacement; the two differ only in quote style.
def test_delete_requires_yes(self, initialized_archive):
"""Delete requires --yes flag."""
url = create_test_url()
# Setup: snapshot -> favicon archive result request -> run, so a result exists to delete.
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
snapshot = parse_jsonl_output(stdout1)[0]
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=favicon'],
["archiveresult", "create", "--plugin=favicon"],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
# stdout_run is not read again in this test; the call is made for its effect on the archive.
stdout_run, _, _ = run_archivebox_cmd(
['run'],
["run"],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
env=PROJECTOR_TEST_ENV,
)
stdout_list, _, _ = run_archivebox_cmd(
['archiveresult', 'list', '--plugin=favicon'],
["archiveresult", "list", "--plugin=favicon"],
data_dir=initialized_archive,
)
ar = parse_jsonl_output(stdout_list)[0]
# Delete WITHOUT --yes: the command must refuse.
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'delete'],
["archiveresult", "delete"],
stdin=json.dumps(ar),
data_dir=initialized_archive,
)
# Refusal is exit code 1 with stderr mentioning the missing --yes flag.
assert code == 1
assert '--yes' in stderr
assert "--yes" in stderr
# NOTE(review): flattened diff view - each single-quoted line is immediately followed by its
# double-quoted replacement; the two differ only in quote style.
def test_delete_with_yes(self, initialized_archive):
"""Delete with --yes flag works."""
url = create_test_url()
# Setup: snapshot -> favicon archive result request -> run, so a result exists to delete.
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive)
snapshot = parse_jsonl_output(stdout1)[0]
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=favicon'],
["archiveresult", "create", "--plugin=favicon"],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
# stdout_run is not read again in this test; the call is made for its effect on the archive.
stdout_run, _, _ = run_archivebox_cmd(
['run'],
["run"],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
env=PROJECTOR_TEST_ENV,
)
stdout_list, _, _ = run_archivebox_cmd(
['archiveresult', 'list', '--plugin=favicon'],
["archiveresult", "list", "--plugin=favicon"],
data_dir=initialized_archive,
)
ar = parse_jsonl_output(stdout_list)[0]
# Delete WITH --yes: the single listed record is piped in and must be removed.
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'delete', '--yes'],
["archiveresult", "delete", "--yes"],
stdin=json.dumps(ar),
data_dir=initialized_archive,
)
assert code == 0
# The deletion summary is expected on stderr.
assert 'Deleted 1 archive results' in stderr
assert "Deleted 1 archive results" in stderr

View File

@@ -11,27 +11,27 @@ import subprocess
# NOTE(review): flattened diff view - each single-quoted line is immediately followed by its
# double-quoted replacement; the two differ only in quote style.
def test_config_displays_all_config(tmp_path, process):
"""Test that config without args displays all configuration."""
os.chdir(tmp_path)
# text=True makes result.stdout a str, so no decode step is needed below.
result = subprocess.run(['archivebox', 'config'], capture_output=True, text=True)
result = subprocess.run(["archivebox", "config"], capture_output=True, text=True)
assert result.returncode == 0
output = result.stdout
# Should show config sections
# (checked only loosely: the output must be longer than 100 characters)
assert len(output) > 100
# Should show at least some standard config keys
assert 'TIMEOUT' in output or 'OUTPUT_PERMISSIONS' in output
assert "TIMEOUT" in output or "OUTPUT_PERMISSIONS" in output
# NOTE(review): flattened diff view - each single-quoted line is immediately followed by its
# double-quoted replacement; the two differ only in quote style.
def test_config_get_specific_key(tmp_path, process):
"""Test that config --get KEY retrieves specific value."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'config', '--get', 'TIMEOUT'],
["archivebox", "config", "--get", "TIMEOUT"],
capture_output=True,
text=True,
)
assert result.returncode == 0
# Only the key name is checked in stdout; the value itself is not asserted here.
assert 'TIMEOUT' in result.stdout
assert "TIMEOUT" in result.stdout
def test_config_set_writes_to_file(tmp_path, process):
@@ -39,7 +39,7 @@ def test_config_set_writes_to_file(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'config', '--set', 'TIMEOUT=120'],
["archivebox", "config", "--set", "TIMEOUT=120"],
capture_output=True,
text=True,
)
@@ -47,11 +47,11 @@ def test_config_set_writes_to_file(tmp_path, process):
assert result.returncode == 0
# Verify config file was updated
config_file = tmp_path / 'ArchiveBox.conf'
config_file = tmp_path / "ArchiveBox.conf"
assert config_file.exists()
content = config_file.read_text()
assert 'TIMEOUT' in content or '120' in content
assert "TIMEOUT" in content or "120" in content
def test_config_set_and_get_roundtrip(tmp_path, process):
@@ -60,19 +60,19 @@ def test_config_set_and_get_roundtrip(tmp_path, process):
# Set a unique value
subprocess.run(
['archivebox', 'config', '--set', 'TIMEOUT=987'],
["archivebox", "config", "--set", "TIMEOUT=987"],
capture_output=True,
text=True,
)
# Get the value back
result = subprocess.run(
['archivebox', 'config', '--get', 'TIMEOUT'],
["archivebox", "config", "--get", "TIMEOUT"],
capture_output=True,
text=True,
)
assert '987' in result.stdout
assert "987" in result.stdout
def test_config_set_multiple_values(tmp_path, process):
@@ -80,7 +80,7 @@ def test_config_set_multiple_values(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'config', '--set', 'TIMEOUT=111', 'YTDLP_TIMEOUT=222'],
["archivebox", "config", "--set", "TIMEOUT=111", "YTDLP_TIMEOUT=222"],
capture_output=True,
text=True,
)
@@ -88,10 +88,10 @@ def test_config_set_multiple_values(tmp_path, process):
assert result.returncode == 0
# Verify both were written
config_file = tmp_path / 'ArchiveBox.conf'
config_file = tmp_path / "ArchiveBox.conf"
content = config_file.read_text()
assert '111' in content
assert '222' in content
assert "111" in content
assert "222" in content
def test_config_set_invalid_key_fails(tmp_path, process):
@@ -99,7 +99,7 @@ def test_config_set_invalid_key_fails(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'config', '--set', 'TOTALLY_INVALID_KEY_XYZ=value'],
["archivebox", "config", "--set", "TOTALLY_INVALID_KEY_XYZ=value"],
capture_output=True,
text=True,
)
@@ -112,7 +112,7 @@ def test_config_set_requires_equals_sign(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'config', '--set', 'TIMEOUT'],
["archivebox", "config", "--set", "TIMEOUT"],
capture_output=True,
text=True,
)
@@ -125,13 +125,13 @@ def test_config_search_finds_keys(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'config', '--search', 'TIMEOUT'],
["archivebox", "config", "--search", "TIMEOUT"],
capture_output=True,
text=True,
)
# Should find timeout-related config
assert 'TIMEOUT' in result.stdout
assert "TIMEOUT" in result.stdout
def test_config_preserves_existing_values(tmp_path, process):
@@ -140,21 +140,21 @@ def test_config_preserves_existing_values(tmp_path, process):
# Set first value
subprocess.run(
['archivebox', 'config', '--set', 'TIMEOUT=100'],
["archivebox", "config", "--set", "TIMEOUT=100"],
capture_output=True,
)
# Set second value
subprocess.run(
['archivebox', 'config', '--set', 'YTDLP_TIMEOUT=200'],
["archivebox", "config", "--set", "YTDLP_TIMEOUT=200"],
capture_output=True,
)
# Verify both are in config file
config_file = tmp_path / 'ArchiveBox.conf'
config_file = tmp_path / "ArchiveBox.conf"
content = config_file.read_text()
assert 'TIMEOUT' in content
assert 'YTDLP_TIMEOUT' in content
assert "TIMEOUT" in content
assert "YTDLP_TIMEOUT" in content
def test_config_file_is_valid_toml(tmp_path, process):
@@ -162,15 +162,15 @@ def test_config_file_is_valid_toml(tmp_path, process):
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'config', '--set', 'TIMEOUT=150'],
["archivebox", "config", "--set", "TIMEOUT=150"],
capture_output=True,
)
config_file = tmp_path / 'ArchiveBox.conf'
config_file = tmp_path / "ArchiveBox.conf"
content = config_file.read_text()
# Basic TOML validation - should have sections and key=value pairs
assert '[' in content or '=' in content
assert "[" in content or "=" in content
def test_config_updates_existing_value(tmp_path, process):
@@ -179,22 +179,22 @@ def test_config_updates_existing_value(tmp_path, process):
# Set initial value
subprocess.run(
['archivebox', 'config', '--set', 'TIMEOUT=100'],
["archivebox", "config", "--set", "TIMEOUT=100"],
capture_output=True,
)
# Update to new value
subprocess.run(
['archivebox', 'config', '--set', 'TIMEOUT=200'],
["archivebox", "config", "--set", "TIMEOUT=200"],
capture_output=True,
)
# Get current value
result = subprocess.run(
['archivebox', 'config', '--get', 'TIMEOUT'],
["archivebox", "config", "--get", "TIMEOUT"],
capture_output=True,
text=True,
)
# Should show updated value
assert '200' in result.stdout
assert "200" in result.stdout

View File

@@ -25,26 +25,26 @@ class TestCrawlCreate:
url = create_test_url()
stdout, stderr, code = run_archivebox_cmd(
['crawl', 'create', url],
["crawl", "create", url],
data_dir=initialized_archive,
)
assert code == 0, f"Command failed: {stderr}"
assert 'Created crawl' in stderr
assert "Created crawl" in stderr
# Check JSONL output
records = parse_jsonl_output(stdout)
assert len(records) == 1
assert records[0]['type'] == 'Crawl'
assert url in records[0]['urls']
assert records[0]["type"] == "Crawl"
assert url in records[0]["urls"]
def test_create_from_stdin_urls(self, initialized_archive):
"""Create crawl from stdin URLs (one per line)."""
urls = [create_test_url() for _ in range(3)]
stdin = '\n'.join(urls)
stdin = "\n".join(urls)
stdout, stderr, code = run_archivebox_cmd(
['crawl', 'create'],
["crawl", "create"],
stdin=stdin,
data_dir=initialized_archive,
)
@@ -54,45 +54,45 @@ class TestCrawlCreate:
records = parse_jsonl_output(stdout)
assert len(records) == 1
crawl = records[0]
assert crawl['type'] == 'Crawl'
assert crawl["type"] == "Crawl"
# All URLs should be in the crawl
for url in urls:
assert url in crawl['urls']
assert url in crawl["urls"]
# NOTE(review): flattened diff view - each single-quoted line is immediately followed by its
# double-quoted replacement; the two differ only in quote style.
def test_create_with_depth(self, initialized_archive):
"""Create crawl with --depth flag."""
url = create_test_url()
stdout, stderr, code = run_archivebox_cmd(
['crawl', 'create', '--depth=2', url],
["crawl", "create", "--depth=2", url],
data_dir=initialized_archive,
)
assert code == 0
records = parse_jsonl_output(stdout)
# The --depth value is expected to surface as the integer max_depth field of the first record.
assert records[0]['max_depth'] == 2
assert records[0]["max_depth"] == 2
# NOTE(review): flattened diff view - each single-quoted line is immediately followed by its
# double-quoted replacement. Unlike the other pairs here, the final assertion pair also changes
# the key that is read ('tags' before, "tags_str" after), not just the quote style.
def test_create_with_tag(self, initialized_archive):
"""Create crawl with --tag flag."""
url = create_test_url()
stdout, stderr, code = run_archivebox_cmd(
['crawl', 'create', '--tag=test-tag', url],
["crawl", "create", "--tag=test-tag", url],
data_dir=initialized_archive,
)
assert code == 0
records = parse_jsonl_output(stdout)
# .get(..., "") keeps a missing key from raising: the substring check then simply fails.
assert 'test-tag' in records[0].get('tags', '')
assert "test-tag" in records[0].get("tags_str", "")
def test_create_pass_through_other_types(self, initialized_archive):
"""Pass-through records of other types unchanged."""
tag_record = {'type': 'Tag', 'id': 'fake-tag-id', 'name': 'test'}
tag_record = {"type": "Tag", "id": "fake-tag-id", "name": "test"}
url = create_test_url()
stdin = json.dumps(tag_record) + '\n' + json.dumps({'url': url})
stdin = json.dumps(tag_record) + "\n" + json.dumps({"url": url})
stdout, stderr, code = run_archivebox_cmd(
['crawl', 'create'],
["crawl", "create"],
stdin=stdin,
data_dir=initialized_archive,
)
@@ -101,20 +101,20 @@ class TestCrawlCreate:
records = parse_jsonl_output(stdout)
# Should have both the passed-through Tag and the new Crawl
types = [r.get('type') for r in records]
assert 'Tag' in types
assert 'Crawl' in types
types = [r.get("type") for r in records]
assert "Tag" in types
assert "Crawl" in types
def test_create_pass_through_existing_crawl(self, initialized_archive):
"""Existing Crawl records (with id) are passed through."""
# First create a crawl
url = create_test_url()
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
crawl = parse_jsonl_output(stdout1)[0]
# Now pipe it back - should pass through
stdout2, stderr, code = run_archivebox_cmd(
['crawl', 'create'],
["crawl", "create"],
stdin=json.dumps(crawl),
data_dir=initialized_archive,
)
@@ -122,7 +122,7 @@ class TestCrawlCreate:
assert code == 0
records = parse_jsonl_output(stdout2)
assert len(records) == 1
assert records[0]['id'] == crawl['id']
assert records[0]["id"] == crawl["id"]
class TestCrawlList:
@@ -131,51 +131,51 @@ class TestCrawlList:
def test_list_empty(self, initialized_archive):
    """List with no crawls returns empty.

    Runs ``crawl list`` on a freshly initialized archive and expects a zero
    exit code plus the summary line ``Listed 0 crawls`` on stderr.
    """
    _stdout, stderr, code = run_archivebox_cmd(
        ["crawl", "list"],
        data_dir=initialized_archive,
    )

    assert code == 0, stderr
    assert "Listed 0 crawls" in stderr
def test_list_returns_created(self, initialized_archive):
    """List returns previously created crawls.

    Creates one crawl for a fresh test URL, then checks that ``crawl list``
    emits at least one record whose ``urls`` field contains that URL.
    """
    url = create_test_url()
    run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)

    stdout, stderr, code = run_archivebox_cmd(
        ["crawl", "list"],
        data_dir=initialized_archive,
    )

    assert code == 0, stderr
    records = parse_jsonl_output(stdout)
    assert len(records) >= 1
    assert any(url in r.get("urls", "") for r in records)
def test_list_filter_by_status(self, initialized_archive):
    """Filter crawls by status.

    Creates a crawl, lists with ``--status=queued`` and checks that every
    returned record has ``status == "queued"``.
    """
    url = create_test_url()
    run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)

    stdout, stderr, code = run_archivebox_cmd(
        ["crawl", "list", "--status=queued"],
        data_dir=initialized_archive,
    )

    assert code == 0, stderr
    records = parse_jsonl_output(stdout)
    # NOTE(review): this loop passes vacuously when no records are returned.
    for r in records:
        assert r["status"] == "queued"
def test_list_with_limit(self, initialized_archive):
"""Limit number of results."""
# Create multiple crawls
for _ in range(3):
run_archivebox_cmd(['crawl', 'create', create_test_url()], data_dir=initialized_archive)
run_archivebox_cmd(["crawl", "create", create_test_url()], data_dir=initialized_archive)
stdout, stderr, code = run_archivebox_cmd(
['crawl', 'list', '--limit=2'],
["crawl", "list", "--limit=2"],
data_dir=initialized_archive,
)
@@ -191,21 +191,21 @@ class TestCrawlUpdate:
"""Update crawl status."""
# Create a crawl
url = create_test_url()
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
crawl = parse_jsonl_output(stdout1)[0]
# Update it
stdout2, stderr, code = run_archivebox_cmd(
['crawl', 'update', '--status=started'],
["crawl", "update", "--status=started"],
stdin=json.dumps(crawl),
data_dir=initialized_archive,
)
assert code == 0
assert 'Updated 1 crawls' in stderr
assert "Updated 1 crawls" in stderr
records = parse_jsonl_output(stdout2)
assert records[0]['status'] == 'started'
assert records[0]["status"] == "started"
class TestCrawlDelete:
@@ -214,45 +214,45 @@ class TestCrawlDelete:
def test_delete_requires_yes(self, initialized_archive):
    """Delete requires --yes flag.

    Pipes a freshly created crawl record into ``crawl delete`` without any
    confirmation flag and expects exit code 1 with ``--yes`` mentioned on stderr.
    """
    url = create_test_url()
    stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
    crawl = parse_jsonl_output(stdout1)[0]

    _stdout, stderr, code = run_archivebox_cmd(
        ["crawl", "delete"],
        stdin=json.dumps(crawl),
        data_dir=initialized_archive,
    )

    assert code == 1
    assert "--yes" in stderr
def test_delete_with_yes(self, initialized_archive):
    """Delete with --yes flag works.

    Pipes a freshly created crawl record into ``crawl delete --yes`` and
    expects a zero exit code plus ``Deleted 1 crawls`` on stderr.
    """
    url = create_test_url()
    stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
    crawl = parse_jsonl_output(stdout1)[0]

    _stdout, stderr, code = run_archivebox_cmd(
        ["crawl", "delete", "--yes"],
        stdin=json.dumps(crawl),
        data_dir=initialized_archive,
    )

    assert code == 0, stderr
    assert "Deleted 1 crawls" in stderr
def test_delete_dry_run(self, initialized_archive):
    """Dry run shows what would be deleted.

    Pipes a freshly created crawl record into ``crawl delete --dry-run`` and
    expects a zero exit code with both ``Would delete`` and a (case-insensitive)
    ``dry run`` notice on stderr.
    """
    url = create_test_url()
    stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
    crawl = parse_jsonl_output(stdout1)[0]

    _stdout, stderr, code = run_archivebox_cmd(
        ["crawl", "delete", "--dry-run"],
        stdin=json.dumps(crawl),
        data_dir=initialized_archive,
    )

    assert code == 0, stderr
    assert "Would delete" in stderr
    assert "dry run" in stderr.lower()

View File

@@ -15,14 +15,14 @@ def test_extract_runs_on_existing_snapshots(tmp_path, process, disable_extractor
# Add a snapshot first
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
# Run extract
result = subprocess.run(
['archivebox', 'extract'],
["archivebox", "extract"],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
@@ -38,7 +38,7 @@ def test_extract_preserves_snapshot_count(tmp_path, process, disable_extractors_
# Add snapshot
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -50,7 +50,7 @@ def test_extract_preserves_snapshot_count(tmp_path, process, disable_extractors_
# Run extract
subprocess.run(
['archivebox', 'extract', '--overwrite'],
["archivebox", "extract", "--overwrite"],
capture_output=True,
env=disable_extractors_dict,
timeout=30,

View File

@@ -6,34 +6,33 @@ import sqlite3
import json
def test_extract_runs_on_snapshot_id(tmp_path, process, disable_extractors_dict):
    """Test that extract command accepts a snapshot ID.

    Adds one index-only snapshot, reads its id straight from ``index.sqlite3``
    and passes it to ``archivebox extract --no-wait`` as a positional argument.
    The only expectation is that stderr does not contain "not found".
    """
    os.chdir(tmp_path)

    # First create a snapshot
    subprocess.run(
        ["archivebox", "add", "--index-only", "https://example.com"],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Get the snapshot ID; close the connection even if the lookup raises.
    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()
        snapshot_id = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0]
    finally:
        conn.close()

    # Run extract on the snapshot
    result = subprocess.run(
        ["archivebox", "extract", "--no-wait", str(snapshot_id)],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    # Should not error about invalid snapshot ID
    assert "not found" not in result.stderr.lower()
def test_extract_with_enabled_extractor_creates_archiveresult(tmp_path, process, disable_extractors_dict):
@@ -42,33 +41,35 @@ def test_extract_with_enabled_extractor_creates_archiveresult(tmp_path, process,
# First create a snapshot
subprocess.run(
['archivebox', 'add', '--index-only', 'https://example.com'],
["archivebox", "add", "--index-only", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
# Get the snapshot ID
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot_id = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0]
conn.close()
# Run extract with title extractor enabled
env = disable_extractors_dict.copy()
env['SAVE_TITLE'] = 'true'
env["SAVE_TITLE"] = "true"
subprocess.run(
['archivebox', 'extract', '--no-wait', str(snapshot_id)],
["archivebox", "extract", "--no-wait", str(snapshot_id)],
capture_output=True,
text=True,
env=env,
)
# Check for archiveresults (may be queued, not completed with --no-wait)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count = c.execute("SELECT COUNT(*) FROM core_archiveresult WHERE snapshot_id = ?",
(snapshot_id,)).fetchone()[0]
count = c.execute(
"SELECT COUNT(*) FROM core_archiveresult WHERE snapshot_id = ?",
(snapshot_id,),
).fetchone()[0]
conn.close()
# May or may not have results depending on timing
@@ -81,25 +82,25 @@ def test_extract_plugin_option_accepted(tmp_path, process, disable_extractors_di
# First create a snapshot
subprocess.run(
['archivebox', 'add', '--index-only', 'https://example.com'],
["archivebox", "add", "--index-only", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
# Get the snapshot ID
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot_id = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0]
conn.close()
result = subprocess.run(
['archivebox', 'extract', '--plugin=title', '--no-wait', str(snapshot_id)],
["archivebox", "extract", "--plugin=title", "--no-wait", str(snapshot_id)],
capture_output=True,
text=True,
env=disable_extractors_dict,
)
assert 'unrecognized arguments: --plugin' not in result.stderr
assert "unrecognized arguments: --plugin" not in result.stderr
def test_extract_stdin_snapshot_id(tmp_path, process, disable_extractors_dict):
@@ -108,27 +109,27 @@ def test_extract_stdin_snapshot_id(tmp_path, process, disable_extractors_dict):
# First create a snapshot
subprocess.run(
['archivebox', 'add', '--index-only', 'https://example.com'],
["archivebox", "add", "--index-only", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
# Get the snapshot ID
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot_id = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0]
conn.close()
result = subprocess.run(
['archivebox', 'extract', '--no-wait'],
input=f'{snapshot_id}\n',
["archivebox", "extract", "--no-wait"],
input=f"{snapshot_id}\n",
capture_output=True,
text=True,
env=disable_extractors_dict,
)
# Should not show "not found" error
assert 'not found' not in result.stderr.lower() or result.returncode == 0
assert "not found" not in result.stderr.lower() or result.returncode == 0
def test_extract_stdin_jsonl_input(tmp_path, process, disable_extractors_dict):
@@ -137,21 +138,21 @@ def test_extract_stdin_jsonl_input(tmp_path, process, disable_extractors_dict):
# First create a snapshot
subprocess.run(
['archivebox', 'add', '--index-only', 'https://example.com'],
["archivebox", "add", "--index-only", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
# Get the snapshot ID
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot_id = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0]
conn.close()
jsonl_input = json.dumps({"type": "Snapshot", "id": str(snapshot_id)}) + '\n'
jsonl_input = json.dumps({"type": "Snapshot", "id": str(snapshot_id)}) + "\n"
result = subprocess.run(
['archivebox', 'extract', '--no-wait'],
["archivebox", "extract", "--no-wait"],
input=jsonl_input,
capture_output=True,
text=True,
@@ -159,7 +160,7 @@ def test_extract_stdin_jsonl_input(tmp_path, process, disable_extractors_dict):
)
# Should not show "not found" error
assert 'not found' not in result.stderr.lower() or result.returncode == 0
assert "not found" not in result.stderr.lower() or result.returncode == 0
def test_extract_pipeline_from_snapshot(tmp_path, process, disable_extractors_dict):
@@ -168,14 +169,14 @@ def test_extract_pipeline_from_snapshot(tmp_path, process, disable_extractors_di
# Create snapshot and pipe to extract
snapshot_proc = subprocess.Popen(
['archivebox', 'snapshot', 'https://example.com'],
["archivebox", "snapshot", "https://example.com"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=disable_extractors_dict,
)
subprocess.run(
['archivebox', 'extract', '--no-wait'],
["archivebox", "extract", "--no-wait"],
stdin=snapshot_proc.stdout,
capture_output=True,
text=True,
@@ -185,10 +186,12 @@ def test_extract_pipeline_from_snapshot(tmp_path, process, disable_extractors_di
snapshot_proc.wait()
# Check database for snapshot
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot = c.execute("SELECT id, url FROM core_snapshot WHERE url = ?",
('https://example.com',)).fetchone()
snapshot = c.execute(
"SELECT id, url FROM core_snapshot WHERE url = ?",
("https://example.com",),
).fetchone()
conn.close()
assert snapshot is not None, "Snapshot should be created by pipeline"
@@ -200,18 +203,18 @@ def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict):
# Create multiple snapshots one at a time to avoid deduplication issues
subprocess.run(
['archivebox', 'add', '--index-only', 'https://example.com'],
["archivebox", "add", "--index-only", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
subprocess.run(
['archivebox', 'add', '--index-only', 'https://iana.org'],
["archivebox", "add", "--index-only", "https://iana.org"],
capture_output=True,
env=disable_extractors_dict,
)
# Get all snapshot IDs
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot_ids = c.execute("SELECT id FROM core_snapshot").fetchall()
conn.close()
@@ -219,9 +222,9 @@ def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict):
assert len(snapshot_ids) >= 2, "Should have at least 2 snapshots"
# Extract from all snapshots
ids_input = '\n'.join(str(s[0]) for s in snapshot_ids) + '\n'
ids_input = "\n".join(str(s[0]) for s in snapshot_ids) + "\n"
result = subprocess.run(
['archivebox', 'extract', '--no-wait'],
["archivebox", "extract", "--no-wait"],
input=ids_input,
capture_output=True,
text=True,
@@ -230,7 +233,7 @@ def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict):
assert result.returncode == 0, result.stderr
# Should not error
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
conn.close()
@@ -246,25 +249,25 @@ class TestExtractCLI:
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'extract', '--help'],
["archivebox", "extract", "--help"],
capture_output=True,
text=True,
)
assert result.returncode == 0
assert '--plugin' in result.stdout or '-p' in result.stdout
assert '--wait' in result.stdout or '--no-wait' in result.stdout
assert "--plugin" in result.stdout or "-p" in result.stdout
assert "--wait" in result.stdout or "--no-wait" in result.stdout
def test_cli_no_snapshots_shows_warning(self, tmp_path, process):
    """Test that running without snapshots shows a warning.

    Feeds empty stdin to ``archivebox extract --no-wait`` and accepts either a
    clean exit or a stderr message containing "No".
    """
    os.chdir(tmp_path)

    result = subprocess.run(
        ["archivebox", "extract", "--no-wait"],
        input="",
        capture_output=True,
        text=True,
    )

    # Should show warning about no snapshots or exit normally (empty input)
    assert result.returncode == 0 or "No" in result.stderr

View File

@@ -11,20 +11,20 @@ import subprocess
def test_help_runs_successfully(tmp_path):
    """Test that help command runs and produces output.

    Runs ``archivebox help`` from ``tmp_path`` (no ``process`` fixture is
    requested) and checks that the combined stdout/stderr is longer than 100
    characters and mentions "archivebox".
    """
    os.chdir(tmp_path)
    result = subprocess.run(["archivebox", "help"], capture_output=True, text=True)

    assert result.returncode == 0, result.stderr
    combined = result.stdout + result.stderr
    assert len(combined) > 100
    assert "archivebox" in combined.lower()
def test_help_in_initialized_dir(tmp_path, process):
    """Test help command in initialized data directory.

    Runs ``archivebox help`` from ``tmp_path`` with the ``process`` fixture
    requested, and checks that the output mentions the ``init`` and ``add``
    subcommands.
    """
    os.chdir(tmp_path)
    result = subprocess.run(["archivebox", "help"], capture_output=True, text=True)

    assert result.returncode == 0, result.stderr
    combined = result.stdout + result.stderr
    assert "init" in combined
    assert "add" in combined

View File

@@ -11,13 +11,13 @@ import subprocess
from archivebox.config.common import STORAGE_CONFIG
# Directory mode derived from the configured file mode string by swapping the
# digits 6 -> 7 and 4 -> 5 (presumably adding the execute/search bit that
# directories need — confirm against STORAGE_CONFIG.OUTPUT_PERMISSIONS' format).
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace("6", "7").replace("4", "5")
def test_init_creates_database_file(tmp_path):
"""Test that init creates index.sqlite3 database file."""
os.chdir(tmp_path)
result = subprocess.run(['archivebox', 'init'], capture_output=True)
result = subprocess.run(["archivebox", "init"], capture_output=True)
assert result.returncode == 0
db_path = tmp_path / "index.sqlite3"
@@ -28,7 +28,7 @@ def test_init_creates_database_file(tmp_path):
def test_init_creates_archive_directory(tmp_path):
"""Test that init creates archive directory."""
os.chdir(tmp_path)
subprocess.run(['archivebox', 'init'], capture_output=True)
subprocess.run(["archivebox", "init"], capture_output=True)
archive_dir = tmp_path / "archive"
assert archive_dir.exists()
@@ -38,7 +38,7 @@ def test_init_creates_archive_directory(tmp_path):
def test_init_creates_sources_directory(tmp_path):
"""Test that init creates sources directory."""
os.chdir(tmp_path)
subprocess.run(['archivebox', 'init'], capture_output=True)
subprocess.run(["archivebox", "init"], capture_output=True)
sources_dir = tmp_path / "sources"
assert sources_dir.exists()
@@ -48,7 +48,7 @@ def test_init_creates_sources_directory(tmp_path):
def test_init_creates_logs_directory(tmp_path):
"""Test that init creates logs directory."""
os.chdir(tmp_path)
subprocess.run(['archivebox', 'init'], capture_output=True)
subprocess.run(["archivebox", "init"], capture_output=True)
logs_dir = tmp_path / "logs"
assert logs_dir.exists()
@@ -58,7 +58,7 @@ def test_init_creates_logs_directory(tmp_path):
def test_init_creates_config_file(tmp_path):
"""Test that init creates ArchiveBox.conf config file."""
os.chdir(tmp_path)
subprocess.run(['archivebox', 'init'], capture_output=True)
subprocess.run(["archivebox", "init"], capture_output=True)
config_file = tmp_path / "ArchiveBox.conf"
assert config_file.exists()
@@ -68,7 +68,7 @@ def test_init_creates_config_file(tmp_path):
def test_init_runs_migrations(tmp_path):
"""Test that init runs Django migrations and creates core tables."""
os.chdir(tmp_path)
subprocess.run(['archivebox', 'init'], capture_output=True)
subprocess.run(["archivebox", "init"], capture_output=True)
# Check that migrations were applied
conn = sqlite3.connect("index.sqlite3")
@@ -76,7 +76,7 @@ def test_init_runs_migrations(tmp_path):
# Check django_migrations table exists
migrations = c.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='django_migrations'"
"SELECT name FROM sqlite_master WHERE type='table' AND name='django_migrations'",
).fetchall()
assert len(migrations) == 1
@@ -90,14 +90,14 @@ def test_init_runs_migrations(tmp_path):
def test_init_creates_core_snapshot_table(tmp_path):
"""Test that init creates core_snapshot table."""
os.chdir(tmp_path)
subprocess.run(['archivebox', 'init'], capture_output=True)
subprocess.run(["archivebox", "init"], capture_output=True)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
# Check core_snapshot table exists
tables = c.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot'"
"SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot'",
).fetchall()
assert len(tables) == 1
@@ -107,14 +107,14 @@ def test_init_creates_core_snapshot_table(tmp_path):
def test_init_creates_crawls_crawl_table(tmp_path):
"""Test that init creates crawls_crawl table."""
os.chdir(tmp_path)
subprocess.run(['archivebox', 'init'], capture_output=True)
subprocess.run(["archivebox", "init"], capture_output=True)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
# Check crawls_crawl table exists
tables = c.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_crawl'"
"SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_crawl'",
).fetchall()
assert len(tables) == 1
@@ -124,14 +124,14 @@ def test_init_creates_crawls_crawl_table(tmp_path):
def test_init_creates_core_archiveresult_table(tmp_path):
"""Test that init creates core_archiveresult table."""
os.chdir(tmp_path)
subprocess.run(['archivebox', 'init'], capture_output=True)
subprocess.run(["archivebox", "init"], capture_output=True)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
# Check core_archiveresult table exists
tables = c.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'"
"SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'",
).fetchall()
assert len(tables) == 1
@@ -141,7 +141,7 @@ def test_init_creates_core_archiveresult_table(tmp_path):
def test_init_sets_correct_file_permissions(tmp_path):
"""Test that init sets correct permissions on created files."""
os.chdir(tmp_path)
subprocess.run(['archivebox', 'init'], capture_output=True)
subprocess.run(["archivebox", "init"], capture_output=True)
# Check database permissions
db_path = tmp_path / "index.sqlite3"
@@ -157,12 +157,12 @@ def test_init_is_idempotent(tmp_path):
os.chdir(tmp_path)
# First init
result1 = subprocess.run(['archivebox', 'init'], capture_output=True, text=True)
result1 = subprocess.run(["archivebox", "init"], capture_output=True, text=True)
assert result1.returncode == 0
assert "Initializing a new ArchiveBox" in result1.stdout
# Second init should update, not fail
result2 = subprocess.run(['archivebox', 'init'], capture_output=True, text=True)
result2 = subprocess.run(["archivebox", "init"], capture_output=True, text=True)
assert result2.returncode == 0
assert "updating existing ArchiveBox" in result2.stdout or "up-to-date" in result2.stdout.lower()
@@ -180,7 +180,7 @@ def test_init_with_existing_data_preserves_snapshots(tmp_path, process, disable_
# Add a snapshot
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -193,7 +193,7 @@ def test_init_with_existing_data_preserves_snapshots(tmp_path, process, disable_
conn.close()
# Run init again
result = subprocess.run(['archivebox', 'init'], capture_output=True)
result = subprocess.run(["archivebox", "init"], capture_output=True)
assert result.returncode == 0
# Snapshot should still exist
@@ -208,7 +208,7 @@ def test_init_quick_flag_skips_checks(tmp_path):
"""Test that init --quick runs faster by skipping some checks."""
os.chdir(tmp_path)
result = subprocess.run(['archivebox', 'init', '--quick'], capture_output=True, text=True)
result = subprocess.run(["archivebox", "init", "--quick"], capture_output=True, text=True)
assert result.returncode == 0
# Database should still be created
@@ -219,14 +219,14 @@ def test_init_quick_flag_skips_checks(tmp_path):
def test_init_creates_machine_table(tmp_path):
"""Test that init creates the machine_machine table."""
os.chdir(tmp_path)
subprocess.run(['archivebox', 'init'], capture_output=True)
subprocess.run(["archivebox", "init"], capture_output=True)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
# Check machine_machine table exists
tables = c.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='machine_machine'"
"SELECT name FROM sqlite_master WHERE type='table' AND name='machine_machine'",
).fetchall()
conn.close()
@@ -236,18 +236,18 @@ def test_init_creates_machine_table(tmp_path):
def test_init_output_shows_collection_info(tmp_path):
    """Test that init output shows helpful collection information.

    Runs ``archivebox init`` in an empty ``tmp_path`` and checks that stdout
    mentions at least one of "ArchiveBox", "collection" (any case) or
    "Initializing". The exit code is intentionally not asserted here.
    """
    os.chdir(tmp_path)
    result = subprocess.run(["archivebox", "init"], capture_output=True, text=True)

    output = result.stdout
    # Should show some helpful info about the collection
    assert "ArchiveBox" in output or "collection" in output.lower() or "Initializing" in output
def test_init_ignores_unrecognized_archive_directories(tmp_path, process, disable_extractors_dict):
"""Test that init upgrades existing dirs without choking on extra folders."""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
check=True,
@@ -255,7 +255,7 @@ def test_init_ignores_unrecognized_archive_directories(tmp_path, process, disabl
(tmp_path / "archive" / "some_random_folder").mkdir(parents=True, exist_ok=True)
result = subprocess.run(
['archivebox', 'init'],
["archivebox", "init"],
capture_output=True,
text=True,
env=disable_extractors_dict,

View File

@@ -14,7 +14,7 @@ def test_install_runs_successfully(tmp_path, process):
"""Test that install command runs without error."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'install', '--dry-run'],
["archivebox", "install", "--dry-run"],
capture_output=True,
text=True,
timeout=60,
@@ -29,7 +29,7 @@ def test_install_creates_binary_records_in_db(tmp_path, process):
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'install', '--dry-run'],
["archivebox", "install", "--dry-run"],
capture_output=True,
timeout=60,
)
@@ -40,7 +40,7 @@ def test_install_creates_binary_records_in_db(tmp_path, process):
# Check machine_binary table exists
tables = c.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='machine_binary'"
"SELECT name FROM sqlite_master WHERE type='table' AND name='machine_binary'",
).fetchall()
conn.close()
@@ -52,14 +52,14 @@ def test_install_dry_run_does_not_install(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'install', '--dry-run'],
["archivebox", "install", "--dry-run"],
capture_output=True,
text=True,
timeout=60,
)
# Should complete without actually installing
assert 'dry' in result.stdout.lower() or result.returncode in [0, 1]
assert "dry" in result.stdout.lower() or result.returncode in [0, 1]
def test_install_detects_system_binaries(tmp_path, process):
@@ -67,7 +67,7 @@ def test_install_detects_system_binaries(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'install', '--dry-run'],
["archivebox", "install", "--dry-run"],
capture_output=True,
text=True,
timeout=60,
@@ -82,7 +82,7 @@ def test_install_shows_binary_status(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'install', '--dry-run'],
["archivebox", "install", "--dry-run"],
capture_output=True,
text=True,
timeout=60,
@@ -97,34 +97,34 @@ def test_install_dry_run_prints_dry_run_message(tmp_path, process):
"""Test that install --dry-run clearly reports that no changes will be made."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'install', '--dry-run'],
["archivebox", "install", "--dry-run"],
capture_output=True,
text=True,
timeout=60,
)
assert result.returncode == 0
assert 'dry run' in result.stdout.lower()
assert "dry run" in result.stdout.lower()
def test_install_help_lists_dry_run_flag(tmp_path):
    """Test that install --help documents the dry-run option.

    Runs ``archivebox install --help`` and expects a zero exit code with either
    ``--dry-run`` or its short form ``-d`` present in stdout.
    """
    os.chdir(tmp_path)

    result = subprocess.run(
        ["archivebox", "install", "--help"],
        capture_output=True,
        text=True,
    )

    assert result.returncode == 0, result.stderr
    assert "--dry-run" in result.stdout or "-d" in result.stdout
def test_install_invalid_option_fails(tmp_path):
"""Test that invalid install options fail cleanly."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'install', '--invalid-option'],
["archivebox", "install", "--invalid-option"],
capture_output=True,
text=True,
)
@@ -136,29 +136,31 @@ def test_install_from_empty_dir_initializes_collection(tmp_path):
"""Test that install bootstraps an empty dir before performing work."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'install', '--dry-run'],
["archivebox", "install", "--dry-run"],
capture_output=True,
text=True,
)
output = result.stdout + result.stderr
assert result.returncode == 0
assert 'Initializing' in output or 'Dry run' in output or 'init' in output.lower()
assert "Initializing" in output or "Dry run" in output or "init" in output.lower()
def test_install_updates_binary_table(tmp_path, process):
"""Test that install completes and only mutates dependency state."""
os.chdir(tmp_path)
env = os.environ.copy()
tmp_short = Path('/tmp') / f'abx-install-{tmp_path.name}'
tmp_short = Path("/tmp") / f"abx-install-{tmp_path.name}"
tmp_short.mkdir(parents=True, exist_ok=True)
env.update({
'TMP_DIR': str(tmp_short),
'ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS': 'true',
})
env.update(
{
"TMP_DIR": str(tmp_short),
"ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS": "true",
},
)
result = subprocess.run(
['archivebox', 'install'],
["archivebox", "install"],
capture_output=True,
text=True,
timeout=420,
@@ -171,16 +173,18 @@ def test_install_updates_binary_table(tmp_path, process):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
binary_counts = dict(c.execute(
"SELECT status, COUNT(*) FROM machine_binary GROUP BY status"
).fetchall())
binary_counts = dict(
c.execute(
"SELECT status, COUNT(*) FROM machine_binary GROUP BY status",
).fetchall(),
)
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
sealed_crawls = c.execute(
"SELECT COUNT(*) FROM crawls_crawl WHERE status='sealed'"
"SELECT COUNT(*) FROM crawls_crawl WHERE status='sealed'",
).fetchone()[0]
conn.close()
assert sealed_crawls >= 1
assert snapshot_count == 0
assert binary_counts.get('queued', 0) == 0
assert binary_counts.get('installed', 0) > 0
assert binary_counts.get("queued", 0) == 0
assert binary_counts.get("installed", 0) > 0

View File

@@ -11,52 +11,48 @@ import subprocess
def _parse_jsonl(stdout: str) -> list[dict]:
return [
json.loads(line)
for line in stdout.splitlines()
if line.strip().startswith('{')
]
return [json.loads(line) for line in stdout.splitlines() if line.strip().startswith("{")]
def test_list_outputs_existing_snapshots_as_jsonl(tmp_path, process, disable_extractors_dict):
    """Test that list prints one JSON object per stored snapshot.

    Adds two index-only snapshots, runs ``archivebox list`` and checks that both
    URLs appear among the ``url`` fields of the emitted JSONL rows.
    """
    os.chdir(tmp_path)
    for url in ["https://example.com", "https://iana.org"]:
        subprocess.run(
            ["archivebox", "add", "--index-only", "--depth=0", url],
            capture_output=True,
            env=disable_extractors_dict,
            check=True,
        )

    result = subprocess.run(
        ["archivebox", "list"],
        capture_output=True,
        text=True,
        timeout=30,
    )

    # Check the exit code before parsing so a CLI failure reports its stderr
    # instead of a JSON decoding error from partial output.
    assert result.returncode == 0, result.stderr

    rows = _parse_jsonl(result.stdout)
    urls = {row["url"] for row in rows}

    assert "https://example.com" in urls
    assert "https://iana.org" in urls
def test_list_filters_by_url_icontains(tmp_path, process, disable_extractors_dict):
"""Test that list --url__icontains returns only matching snapshots."""
os.chdir(tmp_path)
for url in ['https://example.com', 'https://iana.org']:
for url in ["https://example.com", "https://iana.org"]:
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', url],
["archivebox", "add", "--index-only", "--depth=0", url],
capture_output=True,
env=disable_extractors_dict,
check=True,
)
result = subprocess.run(
['archivebox', 'list', '--url__icontains', 'example.com'],
["archivebox", "list", "--url__icontains", "example.com"],
capture_output=True,
text=True,
timeout=30,
@@ -65,15 +61,15 @@ def test_list_filters_by_url_icontains(tmp_path, process, disable_extractors_dic
rows = _parse_jsonl(result.stdout)
assert result.returncode == 0, result.stderr
assert len(rows) == 1
assert rows[0]['url'] == 'https://example.com'
assert rows[0]["url"] == "https://example.com"
def test_list_filters_by_crawl_id_and_limit(tmp_path, process, disable_extractors_dict):
"""Test that crawl-id and limit filters constrain the result set."""
os.chdir(tmp_path)
for url in ['https://example.com', 'https://iana.org']:
for url in ["https://example.com", "https://iana.org"]:
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', url],
["archivebox", "add", "--index-only", "--depth=0", url],
capture_output=True,
env=disable_extractors_dict,
check=True,
@@ -81,14 +77,16 @@ def test_list_filters_by_crawl_id_and_limit(tmp_path, process, disable_extractor
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
crawl_id = str(c.execute(
"SELECT crawl_id FROM core_snapshot WHERE url = ?",
('https://example.com',),
).fetchone()[0])
crawl_id = str(
c.execute(
"SELECT crawl_id FROM core_snapshot WHERE url = ?",
("https://example.com",),
).fetchone()[0],
)
conn.close()
result = subprocess.run(
['archivebox', 'list', '--crawl-id', crawl_id, '--limit', '1'],
["archivebox", "list", "--crawl-id", crawl_id, "--limit", "1"],
capture_output=True,
text=True,
timeout=30,
@@ -97,15 +95,15 @@ def test_list_filters_by_crawl_id_and_limit(tmp_path, process, disable_extractor
rows = _parse_jsonl(result.stdout)
assert result.returncode == 0, result.stderr
assert len(rows) == 1
assert rows[0]['crawl_id'].replace('-', '') == crawl_id.replace('-', '')
assert rows[0]['url'] == 'https://example.com'
assert rows[0]["crawl_id"].replace("-", "") == crawl_id.replace("-", "")
assert rows[0]["url"] == "https://example.com"
def test_list_filters_by_status(tmp_path, process, disable_extractors_dict):
"""Test that list can filter using the current snapshot status."""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
check=True,
@@ -117,7 +115,7 @@ def test_list_filters_by_status(tmp_path, process, disable_extractors_dict):
conn.close()
result = subprocess.run(
['archivebox', 'list', '--status', status],
["archivebox", "list", "--status", status],
capture_output=True,
text=True,
timeout=30,
@@ -126,7 +124,7 @@ def test_list_filters_by_status(tmp_path, process, disable_extractors_dict):
rows = _parse_jsonl(result.stdout)
assert result.returncode == 0, result.stderr
assert len(rows) == 1
assert rows[0]['status'] == status
assert rows[0]["status"] == status
def test_list_help_lists_filter_options(tmp_path, process):
@@ -134,13 +132,60 @@ def test_list_help_lists_filter_options(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'list', '--help'],
["archivebox", "list", "--help"],
capture_output=True,
text=True,
timeout=30,
)
assert result.returncode == 0
assert '--url__icontains' in result.stdout
assert '--crawl-id' in result.stdout
assert '--limit' in result.stdout
assert "--url__icontains" in result.stdout
assert "--crawl-id" in result.stdout
assert "--limit" in result.stdout
assert "--search" in result.stdout
def test_list_allows_sort_with_limit(tmp_path, process, disable_extractors_dict):
    """Check that `list --sort ... --limit N` succeeds and emits exactly N rows."""
    os.chdir(tmp_path)

    # Three snapshots are indexed so that a limit of 2 actually truncates the result.
    seed_urls = ["https://example.com", "https://iana.org", "https://example.net"]
    for seed_url in seed_urls:
        add_cmd = ["archivebox", "add", "--index-only", "--depth=0", seed_url]
        subprocess.run(add_cmd, capture_output=True, env=disable_extractors_dict, check=True)

    list_cmd = ["archivebox", "list", "--limit", "2", "--sort", "-created_at"]
    listed = subprocess.run(list_cmd, capture_output=True, text=True, timeout=30)

    snapshots = _parse_jsonl(listed.stdout)
    assert listed.returncode == 0, listed.stderr
    assert len(snapshots) == 2
def test_list_search_meta_matches_metadata(tmp_path, process, disable_extractors_dict):
    """Check that `list --search=meta <term>` returns the snapshot whose URL matches the term."""
    os.chdir(tmp_path)

    # Index a single snapshot to search for.
    add_cmd = ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"]
    subprocess.run(add_cmd, capture_output=True, env=disable_extractors_dict, check=True)

    search_cmd = ["archivebox", "list", "--search=meta", "example.com"]
    listed = subprocess.run(search_cmd, capture_output=True, text=True, timeout=30)

    matches = _parse_jsonl(listed.stdout)
    assert listed.returncode == 0, listed.stderr
    assert len(matches) == 1
    only_match = matches[0]
    assert only_match["url"] == "https://example.com"

View File

@@ -13,7 +13,7 @@ def test_manage_help_works(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'manage', 'help'],
["archivebox", "manage", "help"],
capture_output=True,
text=True,
timeout=30,
@@ -28,7 +28,7 @@ def test_manage_showmigrations_works(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'manage', 'showmigrations'],
["archivebox", "manage", "showmigrations"],
capture_output=True,
text=True,
timeout=30,
@@ -36,7 +36,7 @@ def test_manage_showmigrations_works(tmp_path, process):
assert result.returncode == 0
# Should show migration status
assert 'core' in result.stdout or '[' in result.stdout
assert "core" in result.stdout or "[" in result.stdout
def test_manage_dbshell_command_exists(tmp_path, process):
@@ -44,7 +44,7 @@ def test_manage_dbshell_command_exists(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'manage', 'help', 'dbshell'],
["archivebox", "manage", "help", "dbshell"],
capture_output=True,
text=True,
timeout=30,
@@ -52,7 +52,7 @@ def test_manage_dbshell_command_exists(tmp_path, process):
# Should show help for dbshell
assert result.returncode == 0
assert 'dbshell' in result.stdout or 'database' in result.stdout.lower()
assert "dbshell" in result.stdout or "database" in result.stdout.lower()
def test_manage_check_works(tmp_path, process):
@@ -60,7 +60,7 @@ def test_manage_check_works(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'manage', 'check'],
["archivebox", "manage", "check"],
capture_output=True,
text=True,
timeout=30,

View File

@@ -111,14 +111,14 @@ def test_read_args_or_stdin_handles_args_stdin_and_mixed_jsonl():
read_args_or_stdin(
(),
stream=MockTTYStringIO(
'https://plain-url.com\n'
"https://plain-url.com\n"
'{"type":"Snapshot","url":"https://jsonl-url.com","tags":"test"}\n'
'{"type":"Tag","id":"tag-1","name":"example"}\n'
'01234567-89ab-cdef-0123-456789abcdef\n'
'not valid json\n',
"01234567-89ab-cdef-0123-456789abcdef\n"
"not valid json\n",
is_tty=False,
),
)
),
)
assert len(stdin_records) == 4
assert stdin_records[0]["url"] == "https://plain-url.com"
@@ -135,7 +135,7 @@ def test_read_args_or_stdin_handles_args_stdin_and_mixed_jsonl():
'{"type":"Crawl","id":"crawl-1","urls":"https://example.com\\nhttps://foo.com"}\n',
is_tty=False,
),
)
),
)
assert len(crawl_records) == 1
assert crawl_records[0]["type"] == TYPE_CRAWL
@@ -151,14 +151,12 @@ def test_collect_urls_from_plugins_reads_only_parser_outputs(tmp_path):
(tmp_path / "wget").mkdir()
(tmp_path / "wget" / "urls.jsonl").write_text(
'{"url":"https://wget-link-1.com"}\n'
'{"url":"https://wget-link-2.com"}\n',
'{"url":"https://wget-link-1.com"}\n{"url":"https://wget-link-2.com"}\n',
encoding="utf-8",
)
(tmp_path / "parse_html_urls").mkdir()
(tmp_path / "parse_html_urls" / "urls.jsonl").write_text(
'{"url":"https://html-link-1.com"}\n'
'{"url":"https://html-link-2.com","title":"HTML Link 2"}\n',
'{"url":"https://html-link-1.com"}\n{"url":"https://html-link-2.com","title":"HTML Link 2"}\n',
encoding="utf-8",
)
(tmp_path / "screenshot").mkdir()
@@ -187,6 +185,22 @@ def test_collect_urls_from_plugins_trims_markdown_suffixes(tmp_path):
assert urls[0]["url"] == "https://docs.sweeting.me/s/youtube-favorites"
def test_collect_urls_from_plugins_trims_trailing_punctuation(tmp_path):
    """Check that collected URLs come back without the trailing `.` / `?.` present in the parser output."""
    from archivebox.hooks import collect_urls_from_plugins

    plugin_dir = tmp_path / "parse_html_urls"
    plugin_dir.mkdir()

    # Both fixture URLs end in punctuation that the expected values below omit.
    jsonl_payload = '{"url":"https://github.com/ArchiveBox/ArchiveBox."}\n{"url":"https://github.com/abc?abc#234234?."}\n'
    (plugin_dir / "urls.jsonl").write_text(jsonl_payload, encoding="utf-8")

    collected = collect_urls_from_plugins(tmp_path)
    found_urls = [record["url"] for record in collected]

    expected_urls = [
        "https://github.com/ArchiveBox/ArchiveBox",
        "https://github.com/abc?abc#234234",
    ]
    assert found_urls == expected_urls
def test_crawl_create_stdout_pipes_into_run(initialized_archive):
"""`archivebox crawl create | archivebox run` should queue and materialize snapshots."""
url = create_test_url()
@@ -311,10 +325,7 @@ def test_archiveresult_list_stdout_pipes_into_run(initialized_archive):
_assert_stdout_is_jsonl_only(run_stdout)
run_records = parse_jsonl_output(run_stdout)
assert any(
record.get("type") == "ArchiveResult" and record.get("id") == archiveresult["id"]
for record in run_records
)
assert any(record.get("type") == "ArchiveResult" and record.get("id") == archiveresult["id"] for record in run_records)
def test_binary_create_stdout_pipes_into_run(initialized_archive):

View File

@@ -14,8 +14,8 @@ def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
candidates = {snapshot_id}
if len(snapshot_id) == 32:
candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}")
elif len(snapshot_id) == 36 and '-' in snapshot_id:
candidates.add(snapshot_id.replace('-', ''))
elif len(snapshot_id) == 36 and "-" in snapshot_id:
candidates.add(snapshot_id.replace("-", ""))
for needle in candidates:
for path in data_dir.rglob(needle):
@@ -30,7 +30,7 @@ def test_remove_deletes_snapshot_from_db(tmp_path, process, disable_extractors_d
# Add a snapshot
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -44,7 +44,7 @@ def test_remove_deletes_snapshot_from_db(tmp_path, process, disable_extractors_d
# Remove it
subprocess.run(
['archivebox', 'remove', 'https://example.com', '--yes'],
["archivebox", "remove", "https://example.com", "--yes"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -64,7 +64,7 @@ def test_remove_deletes_archive_directory(tmp_path, process, disable_extractors_
# Add a snapshot
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -78,7 +78,7 @@ def test_remove_deletes_archive_directory(tmp_path, process, disable_extractors_
assert snapshot_dir is not None, f"Snapshot output directory not found for {snapshot_id}"
subprocess.run(
['archivebox', 'remove', 'https://example.com', '--yes', '--delete'],
["archivebox", "remove", "https://example.com", "--yes", "--delete"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -91,14 +91,14 @@ def test_remove_yes_flag_skips_confirmation(tmp_path, process, disable_extractor
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
# Remove with --yes should complete without interaction
result = subprocess.run(
['archivebox', 'remove', 'https://example.com', '--yes'],
["archivebox", "remove", "https://example.com", "--yes"],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
@@ -114,9 +114,9 @@ def test_remove_multiple_snapshots(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
# Add multiple snapshots
for url in ['https://example.com', 'https://example.org']:
for url in ["https://example.com", "https://example.org"]:
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', url],
["archivebox", "add", "--index-only", "--depth=0", url],
capture_output=True,
env=disable_extractors_dict,
)
@@ -130,7 +130,7 @@ def test_remove_multiple_snapshots(tmp_path, process, disable_extractors_dict):
# Remove both
subprocess.run(
['archivebox', 'remove', 'https://example.com', 'https://example.org', '--yes'],
["archivebox", "remove", "https://example.com", "https://example.org", "--yes"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -150,14 +150,14 @@ def test_remove_with_filter(tmp_path, process, disable_extractors_dict):
# Add snapshots
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
# Remove using filter
result = subprocess.run(
['archivebox', 'remove', '--filter-type=search', '--filter=example.com', '--yes'],
["archivebox", "remove", "--filter-type=search", "--filter=example.com", "--yes"],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
@@ -171,16 +171,16 @@ def test_remove_with_regex_filter_deletes_all_matches(tmp_path, process, disable
"""Test regex filters remove every matching snapshot."""
os.chdir(tmp_path)
for url in ['https://example.com', 'https://iana.org']:
for url in ["https://example.com", "https://iana.org"]:
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', url],
["archivebox", "add", "--index-only", "--depth=0", url],
capture_output=True,
env=disable_extractors_dict,
check=True,
)
result = subprocess.run(
['archivebox', 'remove', '--filter-type=regex', '.*', '--yes'],
["archivebox", "remove", "--filter-type=regex", ".*", "--yes"],
capture_output=True,
env=disable_extractors_dict,
check=True,
@@ -193,7 +193,7 @@ def test_remove_with_regex_filter_deletes_all_matches(tmp_path, process, disable
output = result.stdout.decode("utf-8") + result.stderr.decode("utf-8")
assert count_after == 0
assert 'Removed' in output or 'Found' in output
assert "Removed" in output or "Found" in output
def test_remove_nonexistent_url_fails_gracefully(tmp_path, process, disable_extractors_dict):
@@ -201,30 +201,30 @@ def test_remove_nonexistent_url_fails_gracefully(tmp_path, process, disable_extr
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'remove', 'https://nonexistent-url-12345.com', '--yes'],
["archivebox", "remove", "https://nonexistent-url-12345.com", "--yes"],
capture_output=True,
env=disable_extractors_dict,
)
# Should fail or show error
stdout_text = result.stdout.decode('utf-8', errors='replace').lower()
assert result.returncode != 0 or 'not found' in stdout_text or 'no matches' in stdout_text
stdout_text = result.stdout.decode("utf-8", errors="replace").lower()
assert result.returncode != 0 or "not found" in stdout_text or "no matches" in stdout_text
def test_remove_reports_remaining_link_count_correctly(tmp_path, process, disable_extractors_dict):
"""Test remove reports the remaining snapshot count after deletion."""
os.chdir(tmp_path)
for url in ['https://example.com', 'https://example.org']:
for url in ["https://example.com", "https://example.org"]:
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', url],
["archivebox", "add", "--index-only", "--depth=0", url],
capture_output=True,
env=disable_extractors_dict,
check=True,
)
result = subprocess.run(
['archivebox', 'remove', 'https://example.org', '--yes'],
["archivebox", "remove", "https://example.org", "--yes"],
capture_output=True,
env=disable_extractors_dict,
check=True,
@@ -240,14 +240,14 @@ def test_remove_after_flag(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
# Try remove with --after flag (should work or show usage)
result = subprocess.run(
['archivebox', 'remove', '--after=2020-01-01', '--yes'],
["archivebox", "remove", "--after=2020-01-01", "--yes"],
capture_output=True,
env=disable_extractors_dict,
timeout=30,

View File

@@ -21,8 +21,8 @@ from archivebox.tests.conftest import (
)
# Environment overrides handed to run_archivebox_cmd(..., env=RUN_TEST_ENV) by
# the tests in this module.
# NOTE(review): presumably restricts runs to the favicon plugin and re-enables
# favicon saving on top of the helper's defaults -- confirm against conftest.
RUN_TEST_ENV = {
    "PLUGINS": "favicon",
    "SAVE_FAVICON": "True",
}
@@ -34,7 +34,7 @@ class TestRunWithCrawl:
crawl_record = create_test_crawl_json()
stdout, stderr, code = run_archivebox_cmd(
['run'],
["run"],
stdin=json.dumps(crawl_record),
data_dir=initialized_archive,
timeout=120,
@@ -45,21 +45,21 @@ class TestRunWithCrawl:
# Should output the created Crawl
records = parse_jsonl_output(stdout)
crawl_records = [r for r in records if r.get('type') == 'Crawl']
crawl_records = [r for r in records if r.get("type") == "Crawl"]
assert len(crawl_records) >= 1
assert crawl_records[0].get('id') # Should have an id now
assert crawl_records[0].get("id") # Should have an id now
def test_run_with_existing_crawl(self, initialized_archive):
"""Run re-queues an existing Crawl (with id)."""
url = create_test_url()
# First create a crawl
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive, env=RUN_TEST_ENV)
stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive, env=RUN_TEST_ENV)
crawl = parse_jsonl_output(stdout1)[0]
# Run with the existing crawl
stdout2, stderr, code = run_archivebox_cmd(
['run'],
["run"],
stdin=json.dumps(crawl),
data_dir=initialized_archive,
timeout=120,
@@ -79,7 +79,7 @@ class TestRunWithSnapshot:
snapshot_record = create_test_snapshot_json()
stdout, stderr, code = run_archivebox_cmd(
['run'],
["run"],
stdin=json.dumps(snapshot_record),
data_dir=initialized_archive,
timeout=120,
@@ -89,21 +89,21 @@ class TestRunWithSnapshot:
assert code == 0, f"Command failed: {stderr}"
records = parse_jsonl_output(stdout)
snapshot_records = [r for r in records if r.get('type') == 'Snapshot']
snapshot_records = [r for r in records if r.get("type") == "Snapshot"]
assert len(snapshot_records) >= 1
assert snapshot_records[0].get('id')
assert snapshot_records[0].get("id")
def test_run_with_existing_snapshot(self, initialized_archive):
"""Run re-queues an existing Snapshot (with id)."""
url = create_test_url()
# First create a snapshot
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive, env=RUN_TEST_ENV)
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive, env=RUN_TEST_ENV)
snapshot = parse_jsonl_output(stdout1)[0]
# Run with the existing snapshot
stdout2, stderr, code = run_archivebox_cmd(
['run'],
["run"],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
timeout=120,
@@ -117,10 +117,10 @@ class TestRunWithSnapshot:
def test_run_with_plain_url(self, initialized_archive):
"""Run accepts plain URL records (no type field)."""
url = create_test_url()
url_record = {'url': url}
url_record = {"url": url}
stdout, stderr, code = run_archivebox_cmd(
['run'],
["run"],
stdin=json.dumps(url_record),
data_dir=initialized_archive,
timeout=120,
@@ -140,21 +140,21 @@ class TestRunWithArchiveResult:
url = create_test_url()
# Create snapshot and archive result
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive, env=RUN_TEST_ENV)
stdout1, _, _ = run_archivebox_cmd(["snapshot", "create", url], data_dir=initialized_archive, env=RUN_TEST_ENV)
snapshot = parse_jsonl_output(stdout1)[0]
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=favicon'],
["archiveresult", "create", "--plugin=favicon"],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
env=RUN_TEST_ENV,
)
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
ar = next(r for r in parse_jsonl_output(stdout2) if r.get("type") == "ArchiveResult")
# Update to failed
ar['status'] = 'failed'
ar["status"] = "failed"
run_archivebox_cmd(
['archiveresult', 'update', '--status=failed'],
["archiveresult", "update", "--status=failed"],
stdin=json.dumps(ar),
data_dir=initialized_archive,
env=RUN_TEST_ENV,
@@ -162,7 +162,7 @@ class TestRunWithArchiveResult:
# Now run should re-queue it
stdout3, stderr, code = run_archivebox_cmd(
['run'],
["run"],
stdin=json.dumps(ar),
data_dir=initialized_archive,
timeout=120,
@@ -171,7 +171,7 @@ class TestRunWithArchiveResult:
assert code == 0
records = parse_jsonl_output(stdout3)
ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
ar_records = [r for r in records if r.get("type") == "ArchiveResult"]
assert len(ar_records) >= 1
@@ -180,19 +180,19 @@ class TestRunPassThrough:
def test_run_passes_through_unknown_types(self, initialized_archive):
    """Run passes through records with unknown types."""
    unknown_record = {"type": "Unknown", "id": "fake-id", "data": "test"}

    stdout, stderr, code = run_archivebox_cmd(
        ["run"],
        stdin=json.dumps(unknown_record),
        data_dir=initialized_archive,
    )

    # Surface stderr on failure, matching the other exit-code assertions in this module.
    assert code == 0, f"Command failed: {stderr}"

    records = parse_jsonl_output(stdout)
    unknown_records = [r for r in records if r.get("type") == "Unknown"]

    # The record must be echoed exactly once with its payload intact.
    assert len(unknown_records) == 1
    assert unknown_records[0]["data"] == "test"
def test_run_outputs_all_processed_records(self, initialized_archive):
"""Run outputs all processed records for chaining."""
@@ -200,7 +200,7 @@ class TestRunPassThrough:
crawl_record = create_test_crawl_json(urls=[url])
stdout, stderr, code = run_archivebox_cmd(
['run'],
["run"],
stdin=json.dumps(crawl_record),
data_dir=initialized_archive,
timeout=120,
@@ -220,16 +220,18 @@ class TestRunMixedInput:
"""Run handles mixed Crawl/Snapshot/ArchiveResult input."""
crawl = create_test_crawl_json()
snapshot = create_test_snapshot_json()
unknown = {'type': 'Tag', 'id': 'fake', 'name': 'test'}
unknown = {"type": "Tag", "id": "fake", "name": "test"}
stdin = '\n'.join([
json.dumps(crawl),
json.dumps(snapshot),
json.dumps(unknown),
])
stdin = "\n".join(
[
json.dumps(crawl),
json.dumps(snapshot),
json.dumps(unknown),
],
)
stdout, stderr, code = run_archivebox_cmd(
['run'],
["run"],
stdin=stdin,
data_dir=initialized_archive,
timeout=120,
@@ -239,9 +241,9 @@ class TestRunMixedInput:
assert code == 0
records = parse_jsonl_output(stdout)
types = set(r.get('type') for r in records)
types = {r.get("type") for r in records}
# Should have processed Crawl and Snapshot, passed through Tag
assert 'Crawl' in types or 'Snapshot' in types or 'Tag' in types
assert "Crawl" in types or "Snapshot" in types or "Tag" in types
class TestRunEmpty:
@@ -250,8 +252,8 @@ class TestRunEmpty:
def test_run_empty_stdin(self, initialized_archive):
"""Run with empty stdin returns success."""
stdout, stderr, code = run_archivebox_cmd(
['run'],
stdin='',
["run"],
stdin="",
data_dir=initialized_archive,
)
@@ -259,16 +261,16 @@ class TestRunEmpty:
def test_run_no_records_to_process(self, initialized_archive):
    """Run with only pass-through records shows message."""
    unknown = {"type": "Unknown", "id": "fake"}

    stdout, stderr, code = run_archivebox_cmd(
        ["run"],
        stdin=json.dumps(unknown),
        data_dir=initialized_archive,
    )

    # Surface stderr on failure so a non-zero exit is diagnosable.
    assert code == 0, f"Command failed: {stderr}"
    # The notice is expected on stderr, not stdout.
    assert "No records to process" in stderr
class TestRunDaemonMode:
@@ -328,13 +330,13 @@ class TestRecoverOrphanedCrawls:
from archivebox.services.runner import recover_orphaned_crawls
crawl = Crawl.objects.create(
urls='https://example.com',
urls="https://example.com",
created_by_id=get_or_create_system_user_pk(),
status=Crawl.StatusChoices.STARTED,
retry_at=None,
)
Snapshot.objects.create(
url='https://example.com',
url="https://example.com",
crawl=crawl,
status=Snapshot.StatusChoices.QUEUED,
retry_at=None,
@@ -358,13 +360,13 @@ class TestRecoverOrphanedCrawls:
from archivebox.services.runner import recover_orphaned_crawls
crawl = Crawl.objects.create(
urls='https://example.com',
urls="https://example.com",
created_by_id=get_or_create_system_user_pk(),
status=Crawl.StatusChoices.STARTED,
retry_at=None,
)
snapshot = Snapshot.objects.create(
url='https://example.com',
url="https://example.com",
crawl=crawl,
status=Snapshot.StatusChoices.QUEUED,
retry_at=None,
@@ -376,10 +378,10 @@ class TestRecoverOrphanedCrawls:
machine=machine,
process_type=Process.TypeChoices.HOOK,
status=Process.StatusChoices.RUNNING,
cmd=['/plugins/chrome/on_Crawl__91_chrome_wait.js'],
cmd=["/plugins/chrome/on_Crawl__91_chrome_wait.js"],
env={
'CRAWL_ID': str(crawl.id),
'SNAPSHOT_ID': str(snapshot.id),
"CRAWL_ID": str(crawl.id),
"SNAPSHOT_ID": str(snapshot.id),
},
started_at=timezone.now(),
)
@@ -397,13 +399,13 @@ class TestRecoverOrphanedCrawls:
from archivebox.services.runner import recover_orphaned_crawls
crawl = Crawl.objects.create(
urls='https://example.com',
urls="https://example.com",
created_by_id=get_or_create_system_user_pk(),
status=Crawl.StatusChoices.STARTED,
retry_at=None,
)
Snapshot.objects.create(
url='https://example.com',
url="https://example.com",
crawl=crawl,
status=Snapshot.StatusChoices.SEALED,
retry_at=None,
@@ -426,13 +428,13 @@ class TestRecoverOrphanedSnapshots:
from archivebox.services.runner import recover_orphaned_snapshots
crawl = Crawl.objects.create(
urls='https://example.com',
urls="https://example.com",
created_by_id=get_or_create_system_user_pk(),
status=Crawl.StatusChoices.SEALED,
retry_at=None,
)
snapshot = Snapshot.objects.create(
url='https://example.com',
url="https://example.com",
crawl=crawl,
status=Snapshot.StatusChoices.STARTED,
retry_at=None,

View File

@@ -6,26 +6,25 @@ import sqlite3
import subprocess
def test_schedule_run_all_enqueues_scheduled_crawl(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'schedule', '--every=daily', '--depth=0', 'https://example.com'],
["archivebox", "schedule", "--every=daily", "--depth=0", "https://example.com"],
capture_output=True,
text=True,
check=True,
)
result = subprocess.run(
['archivebox', 'schedule', '--run-all'],
["archivebox", "schedule", "--run-all"],
capture_output=True,
text=True,
env=disable_extractors_dict,
)
assert result.returncode == 0
assert 'Enqueued 1 scheduled crawl' in result.stdout
assert "Enqueued 1 scheduled crawl" in result.stdout
conn = sqlite3.connect(tmp_path / "index.sqlite3")
try:
@@ -42,20 +41,20 @@ def test_schedule_without_import_path_creates_maintenance_schedule(tmp_path, pro
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'schedule', '--every=day'],
["archivebox", "schedule", "--every=day"],
capture_output=True,
text=True,
)
assert result.returncode == 0
assert 'Created scheduled maintenance update' in result.stdout
assert "Created scheduled maintenance update" in result.stdout
conn = sqlite3.connect(tmp_path / "index.sqlite3")
try:
row = conn.execute(
"SELECT urls, status FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
"SELECT urls, status FROM crawls_crawl ORDER BY created_at DESC LIMIT 1",
).fetchone()
finally:
conn.close()
assert row == ('archivebox://update', 'sealed')
assert row == ("archivebox://update", "sealed")

View File

@@ -15,21 +15,21 @@ def test_search_finds_snapshots(tmp_path, process, disable_extractors_dict):
# Add snapshots
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
# Search for it
result = subprocess.run(
['archivebox', 'search', 'example'],
["archivebox", "search", "example"],
capture_output=True,
text=True,
timeout=30,
)
assert result.returncode == 0
assert 'example' in result.stdout
assert "example" in result.stdout
def test_search_returns_no_results_for_missing_term(tmp_path, process, disable_extractors_dict):
@@ -37,13 +37,13 @@ def test_search_returns_no_results_for_missing_term(tmp_path, process, disable_e
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
result = subprocess.run(
['archivebox', 'search', 'nonexistentterm12345'],
["archivebox", "search", "nonexistentterm12345"],
capture_output=True,
text=True,
timeout=30,
@@ -58,7 +58,7 @@ def test_search_on_empty_archive(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'search', 'anything'],
["archivebox", "search", "anything"],
capture_output=True,
text=True,
timeout=30,
@@ -72,14 +72,14 @@ def test_search_json_outputs_matching_snapshots(tmp_path, process, disable_extra
"""Test that search --json returns parseable matching snapshot rows."""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
check=True,
)
result = subprocess.run(
['archivebox', 'search', '--json'],
["archivebox", "search", "--json"],
capture_output=True,
text=True,
timeout=30,
@@ -87,21 +87,21 @@ def test_search_json_outputs_matching_snapshots(tmp_path, process, disable_extra
assert result.returncode == 0, result.stderr
payload = json.loads(result.stdout)
assert any('example.com' in row.get('url', '') for row in payload)
assert any("example.com" in row.get("url", "") for row in payload)
def test_search_json_with_headers_wraps_links_payload(tmp_path, process, disable_extractors_dict):
"""Test that search --json --with-headers returns a headers envelope."""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
check=True,
)
result = subprocess.run(
['archivebox', 'search', '--json', '--with-headers'],
["archivebox", "search", "--json", "--with-headers"],
capture_output=True,
text=True,
timeout=30,
@@ -109,51 +109,51 @@ def test_search_json_with_headers_wraps_links_payload(tmp_path, process, disable
assert result.returncode == 0, result.stderr
payload = json.loads(result.stdout)
links = payload.get('links', payload)
assert any('example.com' in row.get('url', '') for row in links)
links = payload.get("links", payload)
assert any("example.com" in row.get("url", "") for row in links)
def test_search_html_outputs_markup(tmp_path, process, disable_extractors_dict):
    """Test that search --html renders an HTML response."""
    os.chdir(tmp_path)

    # Index one snapshot so the search has something to render; check=True
    # aborts the test if the `add` invocation exits non-zero.
    subprocess.run(
        ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
        capture_output=True,
        env=disable_extractors_dict,
        check=True,
    )

    result = subprocess.run(
        ["archivebox", "search", "--html"],
        capture_output=True,
        text=True,
        timeout=30,
    )

    assert result.returncode == 0, result.stderr
    # Loose check: only requires that some markup-like character is present.
    assert "<" in result.stdout
def test_search_csv_outputs_requested_column(tmp_path, process, disable_extractors_dict):
    """Check that `search --csv url --with-headers` prints the column name and the URL."""
    os.chdir(tmp_path)

    # One index-only snapshot is enough to produce a data row.
    add_cmd = ["archivebox", "add", "--index-only", "--depth=0", "https://example.com"]
    subprocess.run(add_cmd, capture_output=True, env=disable_extractors_dict, check=True)

    search_cmd = ["archivebox", "search", "--csv", "url", "--with-headers"]
    completed = subprocess.run(search_cmd, capture_output=True, text=True, timeout=30)

    assert completed.returncode == 0, completed.stderr
    csv_text = completed.stdout
    assert "url" in csv_text
    assert "example.com" in csv_text
def test_search_with_headers_requires_structured_output_format(tmp_path, process):
@@ -161,36 +161,36 @@ def test_search_with_headers_requires_structured_output_format(tmp_path, process
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'search', '--with-headers'],
["archivebox", "search", "--with-headers"],
capture_output=True,
text=True,
timeout=30,
)
assert result.returncode != 0
assert 'requires' in result.stderr.lower() or 'json' in result.stderr.lower()
assert "requires" in result.stderr.lower() or "json" in result.stderr.lower()
def test_search_sort_option_runs_successfully(tmp_path, process, disable_extractors_dict):
    """Smoke-test `search --sort=url`: it must exit 0 and print at least one seeded URL."""
    os.chdir(tmp_path)

    # Add two snapshots so the sort has more than one row to order.
    seed_urls = ["https://iana.org", "https://example.com"]
    for seed_url in seed_urls:
        add_cmd = ["archivebox", "add", "--index-only", "--depth=0", seed_url]
        subprocess.run(add_cmd, capture_output=True, env=disable_extractors_dict, check=True)

    search_cmd = ["archivebox", "search", "--csv", "url", "--sort=url"]
    completed = subprocess.run(search_cmd, capture_output=True, text=True, timeout=30)

    assert completed.returncode == 0, completed.stderr
    # Only presence is checked here, not the relative order of the rows.
    assert any(host in completed.stdout for host in ("example.com", "iana.org"))
def test_search_help_lists_supported_filters(tmp_path, process):
@@ -198,13 +198,13 @@ def test_search_help_lists_supported_filters(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'search', '--help'],
["archivebox", "search", "--help"],
capture_output=True,
text=True,
timeout=30,
)
assert result.returncode == 0
assert '--filter-type' in result.stdout or '-f' in result.stdout
assert '--status' in result.stdout
assert '--sort' in result.stdout
assert "--filter-type" in result.stdout or "-f" in result.stdout
assert "--status" in result.stdout
assert "--sort" in result.stdout

View File

@@ -24,14 +24,14 @@ def test_server_shows_usage_info(tmp_path, process):
# Just check that the command is recognized
# We won't actually start a full server in tests
result = subprocess.run(
['archivebox', 'server', '--help'],
["archivebox", "server", "--help"],
capture_output=True,
text=True,
timeout=10,
)
assert result.returncode == 0
assert 'server' in result.stdout.lower() or 'http' in result.stdout.lower()
assert "server" in result.stdout.lower() or "http" in result.stdout.lower()
def test_server_init_flag(tmp_path, process):
@@ -40,14 +40,14 @@ def test_server_init_flag(tmp_path, process):
# Check init flag is recognized
result = subprocess.run(
['archivebox', 'server', '--help'],
["archivebox", "server", "--help"],
capture_output=True,
text=True,
timeout=10,
)
assert result.returncode == 0
assert '--init' in result.stdout or 'init' in result.stdout.lower()
assert "--init" in result.stdout or "init" in result.stdout.lower()
def test_runner_worker_uses_current_interpreter():
@@ -109,3 +109,61 @@ def test_stop_existing_background_runner_cleans_up_and_stops_orchestrators():
runner_a.kill_tree.assert_called_once_with(graceful_timeout=2.0)
runner_b.terminate.assert_called_once_with(graceful_timeout=2.0)
log.assert_called_once()
def test_stop_existing_server_workers_takes_over_same_runserver_port(monkeypatch):
    """A RUNNING runserver worker whose command targets the requested host:port gets stopped."""
    from archivebox.cli.archivebox_server import stop_existing_server_workers

    def fake_process_info(name):
        # Only the runserver worker is reported as running; unknown names yield None.
        return {
            "worker_runserver": {"statename": "RUNNING"},
            "worker_daphne": {"statename": "STOPPED"},
        }.get(name, None)

    def fake_worker_command(worker_name):
        # Pretend the runserver worker was launched on the same address being requested.
        if worker_name == "worker_runserver":
            return f"{sys.executable} -m archivebox manage runserver 0.0.0.0:8000"
        return ""

    supervisor = Mock()
    supervisor.getProcessInfo.side_effect = fake_process_info
    stop_worker = Mock()
    log = Mock()
    monkeypatch.setattr(
        "archivebox.cli.archivebox_server._read_supervisor_worker_command",
        fake_worker_command,
    )

    stopped = stop_existing_server_workers(
        supervisor=supervisor,
        stop_worker_fn=stop_worker,
        host="0.0.0.0",
        port="8000",
        log=log,
    )

    assert stopped == 1
    stop_worker.assert_called_once_with(supervisor, "worker_runserver")
    log.assert_called_once()
def test_stop_existing_server_workers_leaves_different_port_running(monkeypatch):
    """A runserver worker whose command targets another host:port is left untouched."""
    from archivebox.cli.archivebox_server import stop_existing_server_workers

    def fake_process_info(name):
        # Only the runserver worker is reported as running; unknown names yield None.
        return {
            "worker_runserver": {"statename": "RUNNING"},
            "worker_daphne": {"statename": "STOPPED"},
        }.get(name, None)

    def fake_worker_command(worker_name):
        # The existing worker listens on 127.0.0.1:9000, not the 0.0.0.0:8000 requested below.
        if worker_name == "worker_runserver":
            return f"{sys.executable} -m archivebox manage runserver 127.0.0.1:9000"
        return ""

    supervisor = Mock()
    supervisor.getProcessInfo.side_effect = fake_process_info
    stop_worker = Mock()
    log = Mock()
    monkeypatch.setattr(
        "archivebox.cli.archivebox_server._read_supervisor_worker_command",
        fake_worker_command,
    )

    stopped = stop_existing_server_workers(
        supervisor=supervisor,
        stop_worker_fn=stop_worker,
        host="0.0.0.0",
        port="8000",
        log=log,
    )

    assert stopped == 0
    stop_worker.assert_not_called()
    log.assert_not_called()

View File

@@ -14,7 +14,7 @@ def test_shell_command_exists(tmp_path, process):
# Test that the command exists (will fail without input but should recognize command)
result = subprocess.run(
['archivebox', 'shell', '--help'],
["archivebox", "shell", "--help"],
capture_output=True,
text=True,
timeout=10,
@@ -29,11 +29,11 @@ def test_shell_c_executes_python(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'shell', '-c', 'print("shell-ok")'],
["archivebox", "shell", "-c", 'print("shell-ok")'],
capture_output=True,
text=True,
timeout=30,
)
assert result.returncode == 0, result.stderr
assert 'shell-ok' in result.stdout
assert "shell-ok" in result.stdout

View File

@@ -25,29 +25,29 @@ class TestSnapshotCreate:
url = create_test_url()
stdout, stderr, code = run_archivebox_cmd(
['snapshot', 'create', url],
["snapshot", "create", url],
data_dir=initialized_archive,
)
assert code == 0, f"Command failed: {stderr}"
assert 'Created' in stderr
assert "Created" in stderr
records = parse_jsonl_output(stdout)
assert len(records) == 1
assert records[0]['type'] == 'Snapshot'
assert records[0]['url'] == url
assert records[0]["type"] == "Snapshot"
assert records[0]["url"] == url
def test_create_from_crawl_jsonl(self, initialized_archive):
"""Create snapshots from Crawl JSONL input."""
url = create_test_url()
# First create a crawl
stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], data_dir=initialized_archive)
stdout1, _, _ = run_archivebox_cmd(["crawl", "create", url], data_dir=initialized_archive)
crawl = parse_jsonl_output(stdout1)[0]
# Pipe crawl to snapshot create
stdout2, stderr, code = run_archivebox_cmd(
['snapshot', 'create'],
["snapshot", "create"],
stdin=json.dumps(crawl),
data_dir=initialized_archive,
)
@@ -56,34 +56,34 @@ class TestSnapshotCreate:
records = parse_jsonl_output(stdout2)
# Should have the Crawl passed through and the Snapshot created
types = [r.get('type') for r in records]
assert 'Crawl' in types
assert 'Snapshot' in types
types = [r.get("type") for r in records]
assert "Crawl" in types
assert "Snapshot" in types
snapshot = next(r for r in records if r['type'] == 'Snapshot')
assert snapshot['url'] == url
snapshot = next(r for r in records if r["type"] == "Snapshot")
assert snapshot["url"] == url
def test_create_with_tag(self, initialized_archive):
    """`snapshot create --tag=test-tag` should emit a record carrying that tag."""
    target_url = create_test_url()
    argv = ["snapshot", "create", "--tag=test-tag", target_url]

    out, _err, exit_code = run_archivebox_cmd(argv, data_dir=initialized_archive)

    assert exit_code == 0
    first_record = parse_jsonl_output(out)[0]
    # A missing "tags" key is treated as an empty tag string.
    assert "test-tag" in first_record.get("tags", "")
def test_create_pass_through_other_types(self, initialized_archive):
"""Pass-through records of other types unchanged."""
tag_record = {'type': 'Tag', 'id': 'fake-tag-id', 'name': 'test'}
tag_record = {"type": "Tag", "id": "fake-tag-id", "name": "test"}
url = create_test_url()
stdin = json.dumps(tag_record) + '\n' + json.dumps({'url': url})
stdin = json.dumps(tag_record) + "\n" + json.dumps({"url": url})
stdout, stderr, code = run_archivebox_cmd(
['snapshot', 'create'],
["snapshot", "create"],
stdin=stdin,
data_dir=initialized_archive,
)
@@ -91,16 +91,16 @@ class TestSnapshotCreate:
assert code == 0
records = parse_jsonl_output(stdout)
types = [r.get('type') for r in records]
assert 'Tag' in types
assert 'Snapshot' in types
types = [r.get("type") for r in records]
assert "Tag" in types
assert "Snapshot" in types
def test_create_multiple_urls(self, initialized_archive):
"""Create snapshots from multiple URLs."""
urls = [create_test_url() for _ in range(3)]
stdout, stderr, code = run_archivebox_cmd(
['snapshot', 'create'] + urls,
["snapshot", "create"] + urls,
data_dir=initialized_archive,
)
@@ -108,7 +108,7 @@ class TestSnapshotCreate:
records = parse_jsonl_output(stdout)
assert len(records) == 3
created_urls = {r['url'] for r in records}
created_urls = {r["url"] for r in records}
for url in urls:
assert url in created_urls
@@ -119,65 +119,65 @@ class TestSnapshotList:
def test_list_empty(self, initialized_archive):
    """Listing a collection with no snapshots reports a zero count on stderr."""
    argv = ["snapshot", "list"]
    _out, err, exit_code = run_archivebox_cmd(argv, data_dir=initialized_archive)

    assert exit_code == 0
    assert "Listed 0 snapshots" in err
def test_list_returns_created(self, initialized_archive):
    """A snapshot created earlier shows up in the `snapshot list` output."""
    target_url = create_test_url()
    run_archivebox_cmd(["snapshot", "create", target_url], data_dir=initialized_archive)

    out, _err, exit_code = run_archivebox_cmd(["snapshot", "list"], data_dir=initialized_archive)

    assert exit_code == 0
    listed = parse_jsonl_output(out)
    assert len(listed) >= 1
    listed_urls = [rec.get("url") for rec in listed]
    assert any(listed_url == target_url for listed_url in listed_urls)
def test_list_filter_by_status(self, initialized_archive):
    """`snapshot list --status=queued` must only return records with that status."""
    target_url = create_test_url()
    run_archivebox_cmd(["snapshot", "create", target_url], data_dir=initialized_archive)

    argv = ["snapshot", "list", "--status=queued"]
    out, _err, exit_code = run_archivebox_cmd(argv, data_dir=initialized_archive)

    assert exit_code == 0
    # An empty result also passes: the check is only that nothing non-queued leaks through.
    assert all(rec["status"] == "queued" for rec in parse_jsonl_output(out))
def test_list_filter_by_url_contains(self, initialized_archive):
    """`--url__icontains` narrows the listing to the single matching snapshot."""
    target_url = create_test_url(domain="unique-domain-12345.com")
    run_archivebox_cmd(["snapshot", "create", target_url], data_dir=initialized_archive)

    argv = ["snapshot", "list", "--url__icontains=unique-domain-12345"]
    out, _err, exit_code = run_archivebox_cmd(argv, data_dir=initialized_archive)

    assert exit_code == 0
    matches = parse_jsonl_output(out)
    assert len(matches) == 1
    only_match = matches[0]
    assert "unique-domain-12345" in only_match["url"]
def test_list_with_limit(self, initialized_archive):
"""Limit number of results."""
for _ in range(3):
run_archivebox_cmd(['snapshot', 'create', create_test_url()], data_dir=initialized_archive)
run_archivebox_cmd(["snapshot", "create", create_test_url()], data_dir=initialized_archive)
stdout, stderr, code = run_archivebox_cmd(
['snapshot', 'list', '--limit=2'],
["snapshot", "list", "--limit=2"],
data_dir=initialized_archive,
)
@@ -185,6 +185,35 @@ class TestSnapshotList:
records = parse_jsonl_output(stdout)
assert len(records) == 2
def test_list_with_sort_and_limit(self, initialized_archive):
    """Sorting should be applied before limiting.

    Three snapshots are created, then listed with ``--sort=-created_at`` both
    without and with ``--limit=2``. The limited listing must be exactly the
    first two rows of the unlimited sorted listing; checking only the row
    count would also pass if the limit were applied before the sort.
    """
    for _ in range(3):
        run_archivebox_cmd(["snapshot", "create", create_test_url()], data_dir=initialized_archive)

    # Reference ordering: the same sort with no limit applied.
    full_stdout, full_stderr, full_code = run_archivebox_cmd(
        ["snapshot", "list", "--sort=-created_at"],
        data_dir=initialized_archive,
    )
    assert full_code == 0, f"Command failed: {full_stderr}"
    expected_urls = [r.get("url") for r in parse_jsonl_output(full_stdout)][:2]

    stdout, stderr, code = run_archivebox_cmd(
        ["snapshot", "list", "--limit=2", "--sort=-created_at"],
        data_dir=initialized_archive,
    )

    assert code == 0, f"Command failed: {stderr}"
    records = parse_jsonl_output(stdout)
    assert len(records) == 2
    # The limited rows must be the head of the sorted listing, in the same order.
    assert [r.get("url") for r in records] == expected_urls
def test_list_search_meta(self, initialized_archive):
    """`snapshot list --search=meta <term>` finds the snapshot whose URL contains the term."""
    target_url = create_test_url(domain="meta-search-example.com")
    run_archivebox_cmd(["snapshot", "create", target_url], data_dir=initialized_archive)

    argv = ["snapshot", "list", "--search=meta", "meta-search-example.com"]
    out, err, exit_code = run_archivebox_cmd(argv, data_dir=initialized_archive)

    assert exit_code == 0, f"Command failed: {err}"
    matches = parse_jsonl_output(out)
    assert len(matches) == 1
    assert "meta-search-example.com" in matches[0]["url"]
class TestSnapshotUpdate:
"""Tests for `archivebox snapshot update`."""
@@ -192,35 +221,35 @@ class TestSnapshotUpdate:
def test_update_status(self, initialized_archive):
    """Piping a snapshot record into `snapshot update --status=started` changes its status."""
    target_url = create_test_url()
    created_out, _, _ = run_archivebox_cmd(["snapshot", "create", target_url], data_dir=initialized_archive)
    created = parse_jsonl_output(created_out)[0]

    # The created record is fed back on stdin as a single JSON line.
    updated_out, err, exit_code = run_archivebox_cmd(
        ["snapshot", "update", "--status=started"],
        stdin=json.dumps(created),
        data_dir=initialized_archive,
    )

    assert exit_code == 0
    assert "Updated 1 snapshots" in err

    updated = parse_jsonl_output(updated_out)[0]
    assert updated["status"] == "started"
def test_update_add_tag(self, initialized_archive):
    """`snapshot update --tag=new-tag` succeeds and reports one updated snapshot."""
    target_url = create_test_url()
    created_out, _, _ = run_archivebox_cmd(["snapshot", "create", target_url], data_dir=initialized_archive)
    created = parse_jsonl_output(created_out)[0]

    # Only the exit code and the stderr summary are checked; stdout is not inspected.
    _updated_out, err, exit_code = run_archivebox_cmd(
        ["snapshot", "update", "--tag=new-tag"],
        stdin=json.dumps(created),
        data_dir=initialized_archive,
    )

    assert exit_code == 0
    assert "Updated 1 snapshots" in err
class TestSnapshotDelete:
@@ -229,44 +258,44 @@ class TestSnapshotDelete:
def test_delete_requires_yes(self, initialized_archive):
    """Without `--yes`, `snapshot delete` exits 1 and mentions the flag on stderr."""
    target_url = create_test_url()
    created_out, _, _ = run_archivebox_cmd(["snapshot", "create", target_url], data_dir=initialized_archive)
    created = parse_jsonl_output(created_out)[0]

    _out, err, exit_code = run_archivebox_cmd(
        ["snapshot", "delete"],
        stdin=json.dumps(created),
        data_dir=initialized_archive,
    )

    assert exit_code == 1
    assert "--yes" in err
def test_delete_with_yes(self, initialized_archive):
    """With `--yes`, `snapshot delete` exits 0 and reports one deleted snapshot."""
    target_url = create_test_url()
    created_out, _, _ = run_archivebox_cmd(["snapshot", "create", target_url], data_dir=initialized_archive)
    created = parse_jsonl_output(created_out)[0]

    _out, err, exit_code = run_archivebox_cmd(
        ["snapshot", "delete", "--yes"],
        stdin=json.dumps(created),
        data_dir=initialized_archive,
    )

    assert exit_code == 0
    assert "Deleted 1 snapshots" in err
def test_delete_dry_run(self, initialized_archive):
    """With `--dry-run`, `snapshot delete` exits 0 and prints a "Would delete" notice."""
    target_url = create_test_url()
    created_out, _, _ = run_archivebox_cmd(["snapshot", "create", target_url], data_dir=initialized_archive)
    created = parse_jsonl_output(created_out)[0]

    _out, err, exit_code = run_archivebox_cmd(
        ["snapshot", "delete", "--dry-run"],
        stdin=json.dumps(created),
        data_dir=initialized_archive,
    )

    assert exit_code == 0
    assert "Would delete" in err

View File

@@ -14,8 +14,8 @@ def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
candidates = {snapshot_id}
if len(snapshot_id) == 32:
candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}")
elif len(snapshot_id) == 36 and '-' in snapshot_id:
candidates.add(snapshot_id.replace('-', ''))
elif len(snapshot_id) == 36 and "-" in snapshot_id:
candidates.add(snapshot_id.replace("-", ""))
for needle in candidates:
for path in data_dir.rglob(needle):
@@ -27,7 +27,7 @@ def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
def test_status_runs_successfully(tmp_path, process):
    """`archivebox status` exits 0 and prints a non-trivial amount of output."""
    os.chdir(tmp_path)
    status_cmd = ["archivebox", "status"]
    completed = subprocess.run(status_cmd, capture_output=True, text=True)

    assert completed.returncode == 0
    # More than 100 characters of stdout is used as a proxy for a real report.
    assert len(completed.stdout) > 100
@@ -36,11 +36,11 @@ def test_status_runs_successfully(tmp_path, process):
def test_status_shows_zero_snapshots_in_empty_archive(tmp_path, process):
    """In a collection with nothing added, the status report contains a "0"."""
    os.chdir(tmp_path)
    status_cmd = ["archivebox", "status"]
    report = subprocess.run(status_cmd, capture_output=True, text=True).stdout

    # Loose check: any "0" character in the report counts as the zero state.
    assert "0" in report
def test_status_shows_correct_snapshot_count(tmp_path, process, disable_extractors_dict):
@@ -48,14 +48,14 @@ def test_status_shows_correct_snapshot_count(tmp_path, process, disable_extracto
os.chdir(tmp_path)
# Add 3 snapshots
for url in ['https://example.com', 'https://example.org', 'https://example.net']:
for url in ["https://example.com", "https://example.org", "https://example.net"]:
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', url],
["archivebox", "add", "--index-only", "--depth=0", url],
capture_output=True,
env=disable_extractors_dict,
)
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
# Verify DB has 3 snapshots
conn = sqlite3.connect("index.sqlite3")
@@ -65,7 +65,7 @@ def test_status_shows_correct_snapshot_count(tmp_path, process, disable_extracto
assert db_count == 3
# Status output should show 3
assert '3' in result.stdout
assert "3" in result.stdout
def test_status_shows_archived_count(tmp_path, process, disable_extractors_dict):
@@ -73,25 +73,25 @@ def test_status_shows_archived_count(tmp_path, process, disable_extractors_dict)
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
# Should show archived/unarchived categories
assert 'archived' in result.stdout.lower() or 'queued' in result.stdout.lower()
assert "archived" in result.stdout.lower() or "queued" in result.stdout.lower()
def test_status_shows_archive_directory_size(tmp_path, process):
    """The status report mentions a size, spelled either "Size" or "size"."""
    os.chdir(tmp_path)
    status_cmd = ["archivebox", "status"]
    report = subprocess.run(status_cmd, capture_output=True, text=True).stdout

    # Only these two spellings are accepted; other casings do not count.
    assert any(label in report for label in ("Size", "size"))
def test_status_counts_archive_directories(tmp_path, process, disable_extractors_dict):
@@ -99,15 +99,15 @@ def test_status_counts_archive_directories(tmp_path, process, disable_extractors
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
# Should show directory count
assert 'present' in result.stdout.lower() or 'directories' in result.stdout
assert "present" in result.stdout.lower() or "directories" in result.stdout
def test_status_detects_orphaned_directories(tmp_path, process, disable_extractors_dict):
@@ -116,7 +116,7 @@ def test_status_detects_orphaned_directories(tmp_path, process, disable_extracto
# Add a snapshot
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -124,10 +124,10 @@ def test_status_detects_orphaned_directories(tmp_path, process, disable_extracto
# Create an orphaned directory
(tmp_path / "archive" / "fake_orphaned_dir").mkdir(parents=True, exist_ok=True)
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
# Should mention orphaned dirs
assert 'orphan' in result.stdout.lower() or '1' in result.stdout
assert "orphan" in result.stdout.lower() or "1" in result.stdout
def test_status_counts_new_snapshot_output_dirs_as_archived(tmp_path, process, disable_extractors_dict):
@@ -137,7 +137,7 @@ def test_status_counts_new_snapshot_output_dirs_as_archived(tmp_path, process, d
env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=env,
check=True,
@@ -145,7 +145,7 @@ def test_status_counts_new_snapshot_output_dirs_as_archived(tmp_path, process, d
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot_id = c.execute("SELECT id FROM core_snapshot WHERE url = ?", ('https://example.com',)).fetchone()[0]
snapshot_id = c.execute("SELECT id FROM core_snapshot WHERE url = ?", ("https://example.com",)).fetchone()[0]
conn.close()
snapshot_dir = _find_snapshot_dir(tmp_path, str(snapshot_id))
@@ -154,21 +154,21 @@ def test_status_counts_new_snapshot_output_dirs_as_archived(tmp_path, process, d
title_dir.mkdir(parents=True, exist_ok=True)
(title_dir / "title.txt").write_text("Example Domain")
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True, env=env)
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True, env=env)
assert result.returncode == 0, result.stdout + result.stderr
assert 'archived: 1' in result.stdout
assert 'present: 1' in result.stdout
assert "archived: 1" in result.stdout
assert "present: 1" in result.stdout
def test_status_shows_user_info(tmp_path, process):
    """The status report mentions "user" or "login" (case-insensitive)."""
    os.chdir(tmp_path)
    status_cmd = ["archivebox", "status"]
    report = subprocess.run(status_cmd, capture_output=True, text=True).stdout

    lowered = report.lower()
    assert any(word in lowered for word in ("user", "login"))
def test_status_reads_from_db_not_filesystem(tmp_path, process, disable_extractors_dict):
@@ -177,7 +177,7 @@ def test_status_reads_from_db_not_filesystem(tmp_path, process, disable_extracto
# Add snapshot to DB
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
@@ -191,35 +191,35 @@ def test_status_reads_from_db_not_filesystem(tmp_path, process, disable_extracto
assert db_count == 1
# Status should reflect DB count
result = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
assert '1' in result.stdout
result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)
assert "1" in result.stdout
def test_status_shows_index_file_info(tmp_path, process):
    """Test status shows index file information.

    Passes when the word "index" appears anywhere in stdout, in any casing.
    """
    os.chdir(tmp_path)
    result = subprocess.run(["archivebox", "status"], capture_output=True, text=True)

    # The lower-cased check already covers "Index", so no separate
    # capitalised comparison is needed.
    assert "index" in result.stdout.lower()
def test_status_help_lists_available_options(tmp_path, process):
    """`archivebox status --help` exits 0 and its text refers to status/statistics."""
    os.chdir(tmp_path)
    help_cmd = ["archivebox", "status", "--help"]
    completed = subprocess.run(help_cmd, capture_output=True, text=True)

    assert completed.returncode == 0
    help_text = completed.stdout.lower()
    assert any(word in help_text for word in ("status", "statistic"))
def test_status_shows_data_directory_path(tmp_path, process):
    """The status report mentions "archive" (any casing) or the collection path itself."""
    os.chdir(tmp_path)
    status_cmd = ["archivebox", "status"]
    report = subprocess.run(status_cmd, capture_output=True, text=True).stdout

    mentions_archive = "archive" in report.lower()
    mentions_path = str(tmp_path) in report
    assert mentions_archive or mentions_path

View File

@@ -13,7 +13,7 @@ def test_update_runs_successfully_on_empty_archive(tmp_path, process):
"""Test that update runs without error on empty archive."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'update'],
["archivebox", "update"],
capture_output=True,
text=True,
timeout=30,
@@ -29,14 +29,14 @@ def test_update_reconciles_existing_snapshots(tmp_path, process, disable_extract
# Add a snapshot (index-only for faster test)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
)
# Run update - should reconcile and queue
result = subprocess.run(
['archivebox', 'update'],
["archivebox", "update"],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
@@ -51,13 +51,13 @@ def test_update_specific_snapshot_by_filter(tmp_path, process, disable_extractor
# Add multiple snapshots
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
timeout=90,
)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.org'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.org"],
capture_output=True,
env=disable_extractors_dict,
timeout=90,
@@ -65,7 +65,7 @@ def test_update_specific_snapshot_by_filter(tmp_path, process, disable_extractor
# Update with filter pattern (uses filter_patterns argument)
result = subprocess.run(
['archivebox', 'update', '--filter-type=substring', 'example.com'],
["archivebox", "update", "--filter-type=substring", "example.com"],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
@@ -81,7 +81,7 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d
# Add snapshots
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
timeout=90,
@@ -97,7 +97,7 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d
# Run update (should reconcile + queue, not create new snapshots)
subprocess.run(
['archivebox', 'update'],
["archivebox", "update"],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
@@ -118,7 +118,7 @@ def test_update_queues_snapshots_for_archiving(tmp_path, process, disable_extrac
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,
timeout=90,
@@ -126,7 +126,7 @@ def test_update_queues_snapshots_for_archiving(tmp_path, process, disable_extrac
# Run update
result = subprocess.run(
['archivebox', 'update'],
["archivebox", "update"],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
@@ -140,4 +140,4 @@ def test_update_queues_snapshots_for_archiving(tmp_path, process, disable_extrac
status = c.execute("SELECT status FROM core_snapshot").fetchone()[0]
conn.close()
assert status == 'queued'
assert status == "queued"

View File

@@ -67,56 +67,56 @@ def _extract_location_path(output: str, key: str) -> Path:
def test_version_quiet_outputs_version_number(tmp_path):
    """`archivebox version --quiet` exits 0 and prints a dotted version string."""
    os.chdir(tmp_path)
    quiet_cmd = ["archivebox", "version", "--quiet"]
    completed = subprocess.run(quiet_cmd, capture_output=True, text=True)

    assert completed.returncode == 0
    printed = completed.stdout.strip()
    assert printed
    # At least one dot means at least two dot-separated parts (e.g. 0.8 or 0.8.0).
    assert printed.count(".") >= 1
def test_version_flag_outputs_version_number(tmp_path):
    """The top-level `archivebox --version` flag exits 0 and prints a dotted version string."""
    os.chdir(tmp_path)
    flag_cmd = ["archivebox", "--version"]
    completed = subprocess.run(flag_cmd, capture_output=True, text=True)

    assert completed.returncode == 0
    printed = completed.stdout.strip()
    assert printed
    # At least one dot means at least two dot-separated parts.
    assert printed.count(".") >= 1
def test_version_shows_system_info_in_initialized_dir(tmp_path, process):
    """`archivebox version` output names ArchiveBox and includes a system-info key."""
    os.chdir(tmp_path)
    version_cmd = ["archivebox", "version"]
    report = subprocess.run(version_cmd, capture_output=True, text=True).stdout

    assert "ArchiveBox" in report
    # Any one of these KEY= markers is enough.
    system_markers = ("ARCH=", "OS=", "PYTHON=")
    assert any(marker in report for marker in system_markers)
def test_version_shows_binaries_after_init(tmp_path, process):
    """`archivebox version` output mentions "Binary" or "Dependencies"."""
    os.chdir(tmp_path)
    version_cmd = ["archivebox", "version"]
    report = subprocess.run(version_cmd, capture_output=True, text=True).stdout

    # Case-sensitive match on either heading word.
    assert any(heading in report for heading in ("Binary", "Dependencies"))
def test_version_shows_data_locations(tmp_path, process):
"""Test that version shows data directory locations."""
os.chdir(tmp_path)
result = subprocess.run(['archivebox', 'version'], capture_output=True, text=True)
result = subprocess.run(["archivebox", "version"], capture_output=True, text=True)
output = result.stdout
# Should show paths
assert any(x in output for x in ['Data', 'Code', 'location'])
assert any(x in output for x in ["Data", "Code", "location"])
def test_version_in_uninitialized_dir_still_works(tmp_path):
@@ -125,7 +125,7 @@ def test_version_in_uninitialized_dir_still_works(tmp_path):
empty_dir.mkdir()
os.chdir(empty_dir)
result = subprocess.run(['archivebox', 'version', '--quiet'], capture_output=True, text=True)
result = subprocess.run(["archivebox", "version", "--quiet"], capture_output=True, text=True)
# Should still output version
assert result.returncode == 0
@@ -164,15 +164,15 @@ def test_version_auto_selects_short_tmp_dir_for_deep_collection_path(tmp_path):
def test_version_help_lists_quiet_flag(tmp_path):
"""Test that version --help documents the quiet output mode."""
os.chdir(tmp_path)
result = subprocess.run(['archivebox', 'version', '--help'], capture_output=True, text=True)
result = subprocess.run(["archivebox", "version", "--help"], capture_output=True, text=True)
assert result.returncode == 0
assert '--quiet' in result.stdout or '-q' in result.stdout
assert "--quiet" in result.stdout or "-q" in result.stdout
def test_version_invalid_option_fails(tmp_path):
"""Test that invalid version options fail cleanly."""
os.chdir(tmp_path)
result = subprocess.run(['archivebox', 'version', '--invalid-option'], capture_output=True, text=True)
result = subprocess.run(["archivebox", "version", "--invalid-option"], capture_output=True, text=True)
assert result.returncode != 0

View File

@@ -7,19 +7,18 @@ import subprocess
import pytest
def test_config_shows_all_config_values(tmp_path, process):
"""Test that config without args shows all config values."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'config'],
["archivebox", "config"],
capture_output=True,
text=True,
)
# Should show various config sections
assert 'TIMEOUT' in result.stdout or 'timeout' in result.stdout.lower()
assert "TIMEOUT" in result.stdout or "timeout" in result.stdout.lower()
# Config should show some output
assert len(result.stdout) > 100
@@ -29,13 +28,13 @@ def test_config_get_specific_key(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'config', '--get', 'TIMEOUT'],
["archivebox", "config", "--get", "TIMEOUT"],
capture_output=True,
text=True,
)
# Should show the TIMEOUT value
assert 'TIMEOUT' in result.stdout or result.returncode == 0
assert "TIMEOUT" in result.stdout or result.returncode == 0
def test_config_set_value_writes_to_config_file(tmp_path, process):
@@ -44,18 +43,18 @@ def test_config_set_value_writes_to_config_file(tmp_path, process):
# Set a config value
result = subprocess.run(
['archivebox', 'config', '--set', 'TIMEOUT=120'],
["archivebox", "config", "--set", "TIMEOUT=120"],
capture_output=True,
text=True,
)
assert result.returncode == 0, result.stderr
# Read the config file directly to verify it was written
config_file = tmp_path / 'ArchiveBox.conf'
config_file = tmp_path / "ArchiveBox.conf"
if config_file.exists():
config_content = config_file.read_text()
# Config should contain the set value
assert 'TIMEOUT' in config_content or 'timeout' in config_content.lower()
assert "TIMEOUT" in config_content or "timeout" in config_content.lower()
def test_config_set_and_get_roundtrip(tmp_path, process):
@@ -64,19 +63,19 @@ def test_config_set_and_get_roundtrip(tmp_path, process):
# Set a value
set_result = subprocess.run(
['archivebox', 'config', '--set', 'TIMEOUT=999'],
["archivebox", "config", "--set", "TIMEOUT=999"],
capture_output=True,
text=True,
)
# Verify set was successful
assert set_result.returncode == 0 or '999' in set_result.stdout
assert set_result.returncode == 0 or "999" in set_result.stdout
# Read the config file directly to verify
config_file = tmp_path / 'ArchiveBox.conf'
config_file = tmp_path / "ArchiveBox.conf"
if config_file.exists():
config_content = config_file.read_text()
assert '999' in config_content or 'TIMEOUT' in config_content
assert "999" in config_content or "TIMEOUT" in config_content
def test_config_search_finds_matching_keys(tmp_path, process):
@@ -84,13 +83,13 @@ def test_config_search_finds_matching_keys(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'config', '--search', 'TIMEOUT'],
["archivebox", "config", "--search", "TIMEOUT"],
capture_output=True,
text=True,
)
# Should find TIMEOUT-related config
assert 'TIMEOUT' in result.stdout or result.returncode == 0
assert "TIMEOUT" in result.stdout or result.returncode == 0
def test_config_invalid_key_fails(tmp_path, process):
@@ -98,13 +97,13 @@ def test_config_invalid_key_fails(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'config', '--set', 'INVALID_KEY_THAT_DOES_NOT_EXIST=value'],
["archivebox", "config", "--set", "INVALID_KEY_THAT_DOES_NOT_EXIST=value"],
capture_output=True,
text=True,
)
# Should fail
assert result.returncode != 0 or 'failed' in result.stdout.lower()
assert result.returncode != 0 or "failed" in result.stdout.lower()
def test_config_set_requires_equals_sign(tmp_path, process):
@@ -112,7 +111,7 @@ def test_config_set_requires_equals_sign(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'config', '--set', 'TIMEOUT'],
["archivebox", "config", "--set", "TIMEOUT"],
capture_output=True,
text=True,
)
@@ -129,15 +128,15 @@ class TestConfigCLI:
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'config', '--help'],
["archivebox", "config", "--help"],
capture_output=True,
text=True,
)
assert result.returncode == 0
assert '--get' in result.stdout
assert '--set' in result.stdout
assert "--get" in result.stdout
assert "--set" in result.stdout
if __name__ == '__main__':
pytest.main([__file__, '-v'])
if __name__ == "__main__":
pytest.main([__file__, "-v"])

View File

@@ -17,310 +17,317 @@ def test_get_db_binaries_by_name_collapses_youtube_dl_aliases(monkeypatch):
now = timezone.now()
records = [
SimpleNamespace(
name='youtube-dl',
version='',
binprovider='',
abspath='/usr/bin/youtube-dl',
name="youtube-dl",
version="",
binprovider="",
abspath="/usr/bin/youtube-dl",
status=Binary.StatusChoices.INSTALLED,
modified_at=now,
),
SimpleNamespace(
name='yt-dlp',
version='2026.03.01',
binprovider='pip',
abspath='/usr/bin/yt-dlp',
name="yt-dlp",
version="2026.03.01",
binprovider="pip",
abspath="/usr/bin/yt-dlp",
status=Binary.StatusChoices.INSTALLED,
modified_at=now + timedelta(seconds=1),
),
]
monkeypatch.setattr(config_views.Binary, 'objects', SimpleNamespace(all=lambda: records))
monkeypatch.setattr(config_views.Binary, "objects", SimpleNamespace(all=lambda: records))
binaries = config_views.get_db_binaries_by_name()
assert 'yt-dlp' in binaries
assert 'youtube-dl' not in binaries
assert binaries['yt-dlp'].version == '2026.03.01'
assert "yt-dlp" in binaries
assert "youtube-dl" not in binaries
assert binaries["yt-dlp"].version == "2026.03.01"
def test_binaries_list_view_uses_db_version_and_hides_youtube_dl_alias(monkeypatch):
request = RequestFactory().get('/admin/environment/binaries/')
request = RequestFactory().get("/admin/environment/binaries/")
request.user = SimpleNamespace(is_superuser=True)
db_binary = SimpleNamespace(
name='youtube-dl',
version='2026.03.01',
binprovider='pip',
abspath='/usr/bin/yt-dlp',
name="youtube-dl",
version="2026.03.01",
binprovider="pip",
abspath="/usr/bin/yt-dlp",
status=Binary.StatusChoices.INSTALLED,
sha256='',
sha256="",
modified_at=timezone.now(),
)
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {'yt-dlp': db_binary})
monkeypatch.setattr(config_views, "get_db_binaries_by_name", lambda: {"yt-dlp": db_binary})
context = config_views.binaries_list_view.__wrapped__(request)
assert len(context['table']['Binary Name']) == 1
assert str(context['table']['Binary Name'][0].link_item) == 'yt-dlp'
assert context['table']['Found Version'][0] == '✅ 2026.03.01'
assert context['table']['Provided By'][0] == 'pip'
assert context['table']['Found Abspath'][0] == '/usr/bin/yt-dlp'
assert len(context["table"]["Binary Name"]) == 1
assert str(context["table"]["Binary Name"][0].link_item) == "yt-dlp"
assert context["table"]["Found Version"][0] == "✅ 2026.03.01"
assert context["table"]["Provided By"][0] == "pip"
assert context["table"]["Found Abspath"][0] == "/usr/bin/yt-dlp"
def test_binaries_list_view_only_shows_persisted_records(monkeypatch):
request = RequestFactory().get('/admin/environment/binaries/')
request = RequestFactory().get("/admin/environment/binaries/")
request.user = SimpleNamespace(is_superuser=True)
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {})
monkeypatch.setattr(config_views, "get_db_binaries_by_name", lambda: {})
context = config_views.binaries_list_view.__wrapped__(request)
assert context['table']['Binary Name'] == []
assert context['table']['Found Version'] == []
assert context['table']['Provided By'] == []
assert context['table']['Found Abspath'] == []
assert context["table"]["Binary Name"] == []
assert context["table"]["Found Version"] == []
assert context["table"]["Provided By"] == []
assert context["table"]["Found Abspath"] == []
def test_binary_detail_view_uses_canonical_db_record(monkeypatch):
request = RequestFactory().get('/admin/environment/binaries/youtube-dl/')
request = RequestFactory().get("/admin/environment/binaries/youtube-dl/")
request.user = SimpleNamespace(is_superuser=True)
db_binary = SimpleNamespace(
id='019d14cc-6c40-7793-8ff1-0f8bb050e8a3',
name='yt-dlp',
version='2026.03.01',
binprovider='pip',
abspath='/usr/bin/yt-dlp',
sha256='abc123',
id="019d14cc-6c40-7793-8ff1-0f8bb050e8a3",
name="yt-dlp",
version="2026.03.01",
binprovider="pip",
abspath="/usr/bin/yt-dlp",
sha256="abc123",
status=Binary.StatusChoices.INSTALLED,
modified_at=timezone.now(),
)
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {'yt-dlp': db_binary})
monkeypatch.setattr(config_views, "get_db_binaries_by_name", lambda: {"yt-dlp": db_binary})
context = config_views.binary_detail_view.__wrapped__(request, key='youtube-dl')
section = context['data'][0]
context = config_views.binary_detail_view.__wrapped__(request, key="youtube-dl")
section = context["data"][0]
assert context['title'] == 'yt-dlp'
assert section['fields']['name'] == 'yt-dlp'
assert section['fields']['version'] == '2026.03.01'
assert section['fields']['binprovider'] == 'pip'
assert section['fields']['abspath'] == '/usr/bin/yt-dlp'
assert '/admin/machine/binary/019d14cc-6c40-7793-8ff1-0f8bb050e8a3/change/?_changelist_filters=q%3Dyt-dlp' in section['description']
assert context["title"] == "yt-dlp"
assert section["fields"]["name"] == "yt-dlp"
assert section["fields"]["version"] == "2026.03.01"
assert section["fields"]["binprovider"] == "pip"
assert section["fields"]["abspath"] == "/usr/bin/yt-dlp"
assert "/admin/machine/binary/019d14cc-6c40-7793-8ff1-0f8bb050e8a3/change/?_changelist_filters=q%3Dyt-dlp" in section["description"]
def test_binary_detail_view_marks_unrecorded_binary(monkeypatch):
request = RequestFactory().get('/admin/environment/binaries/wget/')
request = RequestFactory().get("/admin/environment/binaries/wget/")
request.user = SimpleNamespace(is_superuser=True)
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {})
monkeypatch.setattr(config_views, "get_db_binaries_by_name", lambda: {})
context = config_views.binary_detail_view.__wrapped__(request, key='wget')
section = context['data'][0]
context = config_views.binary_detail_view.__wrapped__(request, key="wget")
section = context["data"][0]
assert section['description'] == 'No persisted Binary record found'
assert section['fields']['status'] == 'unrecorded'
assert section['fields']['binprovider'] == 'not recorded'
assert section["description"] == "No persisted Binary record found"
assert section["fields"]["status"] == "unrecorded"
assert section["fields"]["binprovider"] == "not recorded"
def test_plugin_detail_view_renders_config_in_dedicated_sections(monkeypatch):
request = RequestFactory().get('/admin/environment/plugins/builtin.example/')
request = RequestFactory().get("/admin/environment/plugins/builtin.example/")
request.user = SimpleNamespace(is_superuser=True)
plugin_config = {
'title': 'Example Plugin',
'description': 'Example config used to verify plugin metadata rendering.',
'type': 'object',
'required_plugins': ['chrome'],
'required_binaries': ['example-cli'],
'output_mimetypes': ['text/plain', 'application/json'],
'properties': {
'EXAMPLE_ENABLED': {
'type': 'boolean',
'description': 'Enable the example plugin.',
'x-fallback': 'CHECK_SSL_VALIDITY',
"title": "Example Plugin",
"description": "Example config used to verify plugin metadata rendering.",
"type": "object",
"required_plugins": ["chrome"],
"required_binaries": ["example-cli"],
"output_mimetypes": ["text/plain", "application/json"],
"properties": {
"EXAMPLE_ENABLED": {
"type": "boolean",
"description": "Enable the example plugin.",
"x-fallback": "CHECK_SSL_VALIDITY",
},
'EXAMPLE_BINARY': {
'type': 'string',
'default': 'gallery-dl',
'description': 'Filesystem path for example output.',
'x-aliases': ['USE_EXAMPLE_BINARY'],
"EXAMPLE_BINARY": {
"type": "string",
"default": "gallery-dl",
"description": "Filesystem path for example output.",
"x-aliases": ["USE_EXAMPLE_BINARY"],
},
},
}
monkeypatch.setattr(config_views, 'get_filesystem_plugins', lambda: {
'builtin.example': {
'id': 'builtin.example',
'name': 'example',
'source': 'builtin',
'path': '/plugins/example',
'hooks': ['on_Snapshot__01_example.py'],
'config': plugin_config,
}
})
monkeypatch.setattr(config_views, 'get_machine_admin_url', lambda: '/admin/machine/machine/test-machine/change/')
monkeypatch.setattr(
config_views,
"get_filesystem_plugins",
lambda: {
"builtin.example": {
"id": "builtin.example",
"name": "example",
"source": "builtin",
"path": "/plugins/example",
"hooks": ["on_Snapshot__01_example.py"],
"config": plugin_config,
},
},
)
monkeypatch.setattr(config_views, "get_machine_admin_url", lambda: "/admin/machine/machine/test-machine/change/")
context = config_views.plugin_detail_view.__wrapped__(request, key='builtin.example')
context = config_views.plugin_detail_view.__wrapped__(request, key="builtin.example")
assert context['title'] == 'example'
assert len(context['data']) == 5
assert context["title"] == "example"
assert len(context["data"]) == 5
summary_section, hooks_section, metadata_section, config_section, properties_section = context['data']
summary_section, hooks_section, metadata_section, config_section, properties_section = context["data"]
assert summary_section['fields'] == {
'id': 'builtin.example',
'name': 'example',
'source': 'builtin',
assert summary_section["fields"] == {
"id": "builtin.example",
"name": "example",
"source": "builtin",
}
assert '/plugins/example' in summary_section['description']
assert 'https://archivebox.github.io/abx-plugins/#example' in summary_section['description']
assert "/plugins/example" in summary_section["description"]
assert "https://archivebox.github.io/abx-plugins/#example" in summary_section["description"]
assert hooks_section['name'] == 'Hooks'
assert hooks_section['fields'] == {}
assert 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/example/on_Snapshot__01_example.py' in hooks_section['description']
assert 'on_Snapshot__01_example.py' in hooks_section['description']
assert hooks_section["name"] == "Hooks"
assert hooks_section["fields"] == {}
assert (
"https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/example/on_Snapshot__01_example.py"
in hooks_section["description"]
)
assert "on_Snapshot__01_example.py" in hooks_section["description"]
assert metadata_section['name'] == 'Plugin Metadata'
assert metadata_section['fields'] == {}
assert 'Example Plugin' in metadata_section['description']
assert 'Example config used to verify plugin metadata rendering.' in metadata_section['description']
assert 'https://archivebox.github.io/abx-plugins/#chrome' in metadata_section['description']
assert '/admin/environment/binaries/example-cli/' in metadata_section['description']
assert 'text/plain' in metadata_section['description']
assert 'application/json' in metadata_section['description']
assert metadata_section["name"] == "Plugin Metadata"
assert metadata_section["fields"] == {}
assert "Example Plugin" in metadata_section["description"]
assert "Example config used to verify plugin metadata rendering." in metadata_section["description"]
assert "https://archivebox.github.io/abx-plugins/#chrome" in metadata_section["description"]
assert "/admin/environment/binaries/example-cli/" in metadata_section["description"]
assert "text/plain" in metadata_section["description"]
assert "application/json" in metadata_section["description"]
assert config_section['name'] == 'config.json'
assert config_section['fields'] == {}
assert '<pre style=' in config_section['description']
assert 'EXAMPLE_ENABLED' in config_section['description']
assert '<span style="color: #0550ae;">"properties"</span>' in config_section['description']
assert config_section["name"] == "config.json"
assert config_section["fields"] == {}
assert "<pre style=" in config_section["description"]
assert "EXAMPLE_ENABLED" in config_section["description"]
assert '<span style="color: #0550ae;">"properties"</span>' in config_section["description"]
assert properties_section['name'] == 'Config Properties'
assert properties_section['fields'] == {}
assert '/admin/machine/machine/test-machine/change/' in properties_section['description']
assert '/admin/machine/binary/' in properties_section['description']
assert '/admin/environment/binaries/' in properties_section['description']
assert 'EXAMPLE_ENABLED' in properties_section['description']
assert 'boolean' in properties_section['description']
assert 'Enable the example plugin.' in properties_section['description']
assert '/admin/environment/config/EXAMPLE_ENABLED/' in properties_section['description']
assert '/admin/environment/config/CHECK_SSL_VALIDITY/' in properties_section['description']
assert '/admin/environment/config/USE_EXAMPLE_BINARY/' in properties_section['description']
assert '/admin/environment/binaries/gallery-dl/' in properties_section['description']
assert 'EXAMPLE_BINARY' in properties_section['description']
assert properties_section["name"] == "Config Properties"
assert properties_section["fields"] == {}
assert "/admin/machine/machine/test-machine/change/" in properties_section["description"]
assert "/admin/machine/binary/" in properties_section["description"]
assert "/admin/environment/binaries/" in properties_section["description"]
assert "EXAMPLE_ENABLED" in properties_section["description"]
assert "boolean" in properties_section["description"]
assert "Enable the example plugin." in properties_section["description"]
assert "/admin/environment/config/EXAMPLE_ENABLED/" in properties_section["description"]
assert "/admin/environment/config/CHECK_SSL_VALIDITY/" in properties_section["description"]
assert "/admin/environment/config/USE_EXAMPLE_BINARY/" in properties_section["description"]
assert "/admin/environment/binaries/gallery-dl/" in properties_section["description"]
assert "EXAMPLE_BINARY" in properties_section["description"]
def test_get_config_definition_link_keeps_core_config_search_link(monkeypatch):
monkeypatch.setattr(core_views, 'find_plugin_for_config_key', lambda key: None)
monkeypatch.setattr(core_views, "find_plugin_for_config_key", lambda key: None)
url, label = core_views.get_config_definition_link('CHECK_SSL_VALIDITY')
url, label = core_views.get_config_definition_link("CHECK_SSL_VALIDITY")
assert 'github.com/search' in url
assert 'CHECK_SSL_VALIDITY' in url
assert label == 'archivebox/config'
assert "github.com/search" in url
assert "CHECK_SSL_VALIDITY" in url
assert label == "archivebox/config"
def test_get_config_definition_link_uses_plugin_config_json_for_plugin_options(monkeypatch):
plugin_dir = core_views.BUILTIN_PLUGINS_DIR / 'parse_dom_outlinks'
plugin_dir = core_views.BUILTIN_PLUGINS_DIR / "parse_dom_outlinks"
monkeypatch.setattr(core_views, 'find_plugin_for_config_key', lambda key: 'parse_dom_outlinks')
monkeypatch.setattr(core_views, 'iter_plugin_dirs', lambda: [plugin_dir])
monkeypatch.setattr(core_views, "find_plugin_for_config_key", lambda key: "parse_dom_outlinks")
monkeypatch.setattr(core_views, "iter_plugin_dirs", lambda: [plugin_dir])
url, label = core_views.get_config_definition_link('PARSE_DOM_OUTLINKS_ENABLED')
url, label = core_views.get_config_definition_link("PARSE_DOM_OUTLINKS_ENABLED")
assert url == 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json'
assert label == 'abx_plugins/plugins/parse_dom_outlinks/config.json'
assert url == "https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json"
assert label == "abx_plugins/plugins/parse_dom_outlinks/config.json"
def test_live_config_value_view_renames_source_field_and_uses_plugin_definition_link(monkeypatch):
request = RequestFactory().get('/admin/environment/config/PARSE_DOM_OUTLINKS_ENABLED/')
request = RequestFactory().get("/admin/environment/config/PARSE_DOM_OUTLINKS_ENABLED/")
request.user = SimpleNamespace(is_superuser=True)
monkeypatch.setattr(core_views, 'get_all_configs', lambda: {})
monkeypatch.setattr(core_views, 'get_flat_config', lambda: {})
monkeypatch.setattr(core_views, 'get_config', lambda: {'PARSE_DOM_OUTLINKS_ENABLED': True})
monkeypatch.setattr(core_views, 'find_config_default', lambda key: 'True')
monkeypatch.setattr(core_views, 'find_config_type', lambda key: 'bool')
monkeypatch.setattr(core_views, 'find_config_source', lambda key, merged: 'Default')
monkeypatch.setattr(core_views, 'key_is_safe', lambda key: True)
monkeypatch.setattr(core_views.CONSTANTS, 'CONFIG_FILE', SimpleNamespace(exists=lambda: False))
monkeypatch.setattr(core_views, "get_all_configs", lambda: {})
monkeypatch.setattr(core_views, "get_flat_config", lambda: {})
monkeypatch.setattr(core_views, "get_config", lambda: {"PARSE_DOM_OUTLINKS_ENABLED": True})
monkeypatch.setattr(core_views, "find_config_default", lambda key: "True")
monkeypatch.setattr(core_views, "find_config_type", lambda key: "bool")
monkeypatch.setattr(core_views, "find_config_source", lambda key, merged: "Default")
monkeypatch.setattr(core_views, "key_is_safe", lambda key: True)
monkeypatch.setattr(core_views.CONSTANTS, "CONFIG_FILE", SimpleNamespace(exists=lambda: False))
from archivebox.machine.models import Machine
from archivebox.config.configset import BaseConfigSet
monkeypatch.setattr(Machine, 'current', classmethod(lambda cls: SimpleNamespace(id='machine-id', config={})))
monkeypatch.setattr(BaseConfigSet, 'load_from_file', classmethod(lambda cls, path: {}))
monkeypatch.setattr(Machine, "current", classmethod(lambda cls: SimpleNamespace(id="machine-id", config={})))
monkeypatch.setattr(BaseConfigSet, "load_from_file", classmethod(lambda cls, path: {}))
monkeypatch.setattr(
core_views,
'get_config_definition_link',
"get_config_definition_link",
lambda key: (
'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json',
'abx_plugins/plugins/parse_dom_outlinks/config.json',
"https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json",
"abx_plugins/plugins/parse_dom_outlinks/config.json",
),
)
context = core_views.live_config_value_view.__wrapped__(request, key='PARSE_DOM_OUTLINKS_ENABLED')
section = context['data'][0]
context = core_views.live_config_value_view.__wrapped__(request, key="PARSE_DOM_OUTLINKS_ENABLED")
section = context["data"][0]
assert 'Currently read from' in section['fields']
assert 'Source' not in section['fields']
assert section['fields']['Currently read from'] == 'Default'
assert 'abx_plugins/plugins/parse_dom_outlinks/config.json' in section['help_texts']['Type']
assert "Currently read from" in section["fields"]
assert "Source" not in section["fields"]
assert section["fields"]["Currently read from"] == "Default"
assert "abx_plugins/plugins/parse_dom_outlinks/config.json" in section["help_texts"]["Type"]
def test_find_config_source_prefers_environment_over_machine_and_file(monkeypatch):
monkeypatch.setenv('CHECK_SSL_VALIDITY', 'false')
monkeypatch.setenv("CHECK_SSL_VALIDITY", "false")
from archivebox.machine.models import Machine
from archivebox.config.configset import BaseConfigSet
monkeypatch.setattr(
Machine,
'current',
classmethod(lambda cls: SimpleNamespace(id='machine-id', config={'CHECK_SSL_VALIDITY': 'true'})),
"current",
classmethod(lambda cls: SimpleNamespace(id="machine-id", config={"CHECK_SSL_VALIDITY": "true"})),
)
monkeypatch.setattr(
BaseConfigSet,
'load_from_file',
classmethod(lambda cls, path: {'CHECK_SSL_VALIDITY': 'true'}),
"load_from_file",
classmethod(lambda cls, path: {"CHECK_SSL_VALIDITY": "true"}),
)
assert core_views.find_config_source('CHECK_SSL_VALIDITY', {'CHECK_SSL_VALIDITY': False}) == 'Environment'
assert core_views.find_config_source("CHECK_SSL_VALIDITY", {"CHECK_SSL_VALIDITY": False}) == "Environment"
def test_live_config_value_view_priority_text_matches_runtime_precedence(monkeypatch):
request = RequestFactory().get('/admin/environment/config/CHECK_SSL_VALIDITY/')
request = RequestFactory().get("/admin/environment/config/CHECK_SSL_VALIDITY/")
request.user = SimpleNamespace(is_superuser=True)
monkeypatch.setattr(core_views, 'get_all_configs', lambda: {})
monkeypatch.setattr(core_views, 'get_flat_config', lambda: {'CHECK_SSL_VALIDITY': True})
monkeypatch.setattr(core_views, 'get_config', lambda: {'CHECK_SSL_VALIDITY': False})
monkeypatch.setattr(core_views, 'find_config_default', lambda key: 'True')
monkeypatch.setattr(core_views, 'find_config_type', lambda key: 'bool')
monkeypatch.setattr(core_views, 'key_is_safe', lambda key: True)
monkeypatch.setattr(core_views, "get_all_configs", lambda: {})
monkeypatch.setattr(core_views, "get_flat_config", lambda: {"CHECK_SSL_VALIDITY": True})
monkeypatch.setattr(core_views, "get_config", lambda: {"CHECK_SSL_VALIDITY": False})
monkeypatch.setattr(core_views, "find_config_default", lambda key: "True")
monkeypatch.setattr(core_views, "find_config_type", lambda key: "bool")
monkeypatch.setattr(core_views, "key_is_safe", lambda key: True)
from archivebox.machine.models import Machine
from archivebox.config.configset import BaseConfigSet
monkeypatch.setattr(
Machine,
'current',
classmethod(lambda cls: SimpleNamespace(id='machine-id', config={'CHECK_SSL_VALIDITY': 'true'})),
"current",
classmethod(lambda cls: SimpleNamespace(id="machine-id", config={"CHECK_SSL_VALIDITY": "true"})),
)
monkeypatch.setattr(
BaseConfigSet,
'load_from_file',
classmethod(lambda cls, path: {'CHECK_SSL_VALIDITY': 'true'}),
"load_from_file",
classmethod(lambda cls, path: {"CHECK_SSL_VALIDITY": "true"}),
)
monkeypatch.setattr(core_views.CONSTANTS, 'CONFIG_FILE', SimpleNamespace(exists=lambda: True))
monkeypatch.setenv('CHECK_SSL_VALIDITY', 'false')
monkeypatch.setattr(core_views.CONSTANTS, "CONFIG_FILE", SimpleNamespace(exists=lambda: True))
monkeypatch.setenv("CHECK_SSL_VALIDITY", "false")
context = core_views.live_config_value_view.__wrapped__(request, key='CHECK_SSL_VALIDITY')
section = context['data'][0]
context = core_views.live_config_value_view.__wrapped__(request, key="CHECK_SSL_VALIDITY")
section = context["data"][0]
assert section['fields']['Currently read from'] == 'Environment'
help_text = section['help_texts']['Currently read from']
assert help_text.index('Environment') < help_text.index('Machine') < help_text.index('Config File') < help_text.index('Default')
assert 'Configuration Sources (highest priority first):' in section['help_texts']['Value']
assert section["fields"]["Currently read from"] == "Environment"
help_text = section["help_texts"]["Currently read from"]
assert help_text.index("Environment") < help_text.index("Machine") < help_text.index("Config File") < help_text.index("Default")
assert "Configuration Sources (highest priority first):" in section["help_texts"]["Value"]

View File

@@ -8,19 +8,18 @@ import sqlite3
import pytest
def test_crawl_creates_crawl_object(tmp_path, process, disable_extractors_dict):
"""Test that crawl command creates a Crawl object."""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'crawl', '--no-wait', 'https://example.com'],
["archivebox", "crawl", "--no-wait", "https://example.com"],
capture_output=True,
text=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
crawl = c.execute("SELECT id, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
conn.close()
@@ -33,13 +32,13 @@ def test_crawl_depth_sets_max_depth_in_crawl(tmp_path, process, disable_extracto
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'crawl', '--depth=2', '--no-wait', 'https://example.com'],
["archivebox", "crawl", "--depth=2", "--no-wait", "https://example.com"],
capture_output=True,
text=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
crawl = c.execute("SELECT max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
conn.close()
@@ -53,16 +52,18 @@ def test_crawl_creates_snapshot_for_url(tmp_path, process, disable_extractors_di
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'crawl', '--no-wait', 'https://example.com'],
["archivebox", "crawl", "--no-wait", "https://example.com"],
capture_output=True,
text=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot = c.execute("SELECT url FROM core_snapshot WHERE url = ?",
('https://example.com',)).fetchone()
snapshot = c.execute(
"SELECT url FROM core_snapshot WHERE url = ?",
("https://example.com",),
).fetchone()
conn.close()
assert snapshot is not None, "Snapshot should be created for input URL"
@@ -73,13 +74,13 @@ def test_crawl_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dic
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'crawl', '--no-wait', 'https://example.com'],
["archivebox", "crawl", "--no-wait", "https://example.com"],
capture_output=True,
text=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
# Get the crawl ID
@@ -88,8 +89,10 @@ def test_crawl_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dic
crawl_id = crawl[0]
# Check snapshot has correct crawl_id
snapshot = c.execute("SELECT crawl_id FROM core_snapshot WHERE url = ?",
('https://example.com',)).fetchone()
snapshot = c.execute(
"SELECT crawl_id FROM core_snapshot WHERE url = ?",
("https://example.com",),
).fetchone()
conn.close()
assert snapshot is not None
@@ -101,22 +104,26 @@ def test_crawl_multiple_urls_creates_multiple_snapshots(tmp_path, process, disab
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'crawl', '--no-wait',
'https://example.com',
'https://iana.org'],
[
"archivebox",
"crawl",
"--no-wait",
"https://example.com",
"https://iana.org",
],
capture_output=True,
text=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall()
conn.close()
urls = [u[0] for u in urls]
assert 'https://example.com' in urls
assert 'https://iana.org' in urls
assert "https://example.com" in urls
assert "https://iana.org" in urls
def test_crawl_from_file_creates_snapshot(tmp_path, process, disable_extractors_dict):
@@ -124,17 +131,17 @@ def test_crawl_from_file_creates_snapshot(tmp_path, process, disable_extractors_
os.chdir(tmp_path)
# Write URLs to a file
urls_file = tmp_path / 'urls.txt'
urls_file.write_text('https://example.com\n')
urls_file = tmp_path / "urls.txt"
urls_file.write_text("https://example.com\n")
subprocess.run(
['archivebox', 'crawl', '--no-wait', str(urls_file)],
["archivebox", "crawl", "--no-wait", str(urls_file)],
capture_output=True,
text=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot = c.execute("SELECT url FROM core_snapshot").fetchone()
conn.close()
@@ -148,19 +155,19 @@ def test_crawl_persists_input_urls_on_crawl(tmp_path, process, disable_extractor
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'crawl', '--no-wait', 'https://example.com'],
["archivebox", "crawl", "--no-wait", "https://example.com"],
capture_output=True,
text=True,
env=disable_extractors_dict,
)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
crawl_urls = c.execute("SELECT urls FROM crawls_crawl ORDER BY created_at DESC LIMIT 1").fetchone()
conn.close()
assert crawl_urls is not None, "Crawl should be created for crawl input"
assert 'https://example.com' in crawl_urls[0], "Crawl should persist input URLs"
assert "https://example.com" in crawl_urls[0], "Crawl should persist input URLs"
class TestCrawlCLI:
@@ -171,14 +178,14 @@ class TestCrawlCLI:
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'crawl', '--help'],
["archivebox", "crawl", "--help"],
capture_output=True,
text=True,
)
assert result.returncode == 0
assert 'create' in result.stdout
assert "create" in result.stdout
if __name__ == "__main__":
    # Allow running this module directly. pytest.main() returns the exit code
    # instead of exiting, so propagate it; otherwise a failing run would still
    # exit with status 0.
    raise SystemExit(pytest.main([__file__, "-v"]))

View File

@@ -14,75 +14,77 @@ pytestmark = pytest.mark.django_db
User = get_user_model()
# Host header sent with every admin request issued by the tests in this module.
ADMIN_HOST = "admin.archivebox.localhost:8000"
@pytest.fixture
def admin_user(db):
    """Create the ``crawladmin`` superuser that the admin tests log in as."""
    return cast(UserManager, User.objects).create_superuser(
        username="crawladmin",
        email="crawladmin@test.com",
        password="testpassword",
    )
@pytest.fixture
def crawl(admin_user):
    """Create a crawl with two input URLs and tags ``alpha,beta`` owned by the admin user."""
    return Crawl.objects.create(
        urls="https://example.com\nhttps://example.org",
        tags_str="alpha,beta",
        created_by=admin_user,
    )
def test_crawl_admin_change_view_renders_tag_editor_widget(client, admin_user, crawl):
    """The crawl change page should render the tag editor along with the crawl's tags."""
    client.login(username="crawladmin", password="testpassword")

    response = client.get(
        reverse("admin:crawls_crawl_change", args=[crawl.pk]),
        HTTP_HOST=ADMIN_HOST,
    )

    assert response.status_code == 200
    assert b'name="tags_editor"' in response.content
    assert b"tag-editor-container" in response.content
    assert b"alpha" in response.content
    assert b"beta" in response.content
def test_crawl_admin_add_view_renders_url_filter_alias_fields(client, admin_user):
    """The crawl add page should render the allowlist/denylist URL filter fields."""
    client.login(username="crawladmin", password="testpassword")

    response = client.get(
        reverse("admin:crawls_crawl_add"),
        HTTP_HOST=ADMIN_HOST,
    )

    assert response.status_code == 200
    assert b'name="url_filters_allowlist"' in response.content
    assert b'name="url_filters_denylist"' in response.content
    assert b"Same domain only" in response.content
def test_crawl_admin_form_saves_tags_editor_to_tags_str(crawl, admin_user):
form = CrawlAdminForm(
data={
'created_at': crawl.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'urls': crawl.urls,
'config': '{}',
'max_depth': '0',
'tags_editor': 'alpha, beta, Alpha, gamma',
'url_filters_allowlist': 'example.com\n*.example.com',
'url_filters_denylist': 'static.example.com',
'persona_id': '',
'label': '',
'notes': '',
'schedule': '',
'status': crawl.status,
'retry_at': crawl.retry_at.strftime('%Y-%m-%d %H:%M:%S'),
'created_by': str(admin_user.pk),
'num_uses_failed': '0',
'num_uses_succeeded': '0',
"created_at": crawl.created_at.strftime("%Y-%m-%d %H:%M:%S"),
"urls": crawl.urls,
"config": "{}",
"max_depth": "0",
"max_urls": "3",
"max_size": str(45 * 1024 * 1024),
"tags_editor": "alpha, beta, Alpha, gamma",
"url_filters_allowlist": "example.com\n*.example.com",
"url_filters_denylist": "static.example.com",
"persona_id": "",
"label": "",
"notes": "",
"schedule": "",
"status": crawl.status,
"retry_at": crawl.retry_at.strftime("%Y-%m-%d %H:%M:%S"),
"created_by": str(admin_user.pk),
"num_uses_failed": "0",
"num_uses_succeeded": "0",
},
instance=crawl,
)
@@ -91,130 +93,140 @@ def test_crawl_admin_form_saves_tags_editor_to_tags_str(crawl, admin_user):
updated = form.save()
updated.refresh_from_db()
assert updated.tags_str == 'alpha,beta,gamma'
assert updated.config['URL_ALLOWLIST'] == 'example.com\n*.example.com'
assert updated.config['URL_DENYLIST'] == 'static.example.com'
assert updated.tags_str == "alpha,beta,gamma"
assert updated.max_urls == 3
assert updated.max_size == 45 * 1024 * 1024
assert updated.config["MAX_URLS"] == 3
assert updated.config["MAX_SIZE"] == 45 * 1024 * 1024
assert updated.config["URL_ALLOWLIST"] == "example.com\n*.example.com"
assert updated.config["URL_DENYLIST"] == "static.example.com"
def test_crawl_admin_delete_snapshot_action_removes_snapshot_and_url(client, admin_user):
    """The snapshot-delete admin action should delete the snapshot and drop its URL from the crawl."""
    crawl = Crawl.objects.create(
        urls="https://example.com/remove-me",
        created_by=admin_user,
    )
    snapshot = Snapshot.objects.create(
        crawl=crawl,
        url="https://example.com/remove-me",
    )

    client.login(username="crawladmin", password="testpassword")
    response = client.post(
        reverse("admin:crawls_crawl_snapshot_delete", args=[crawl.pk, snapshot.pk]),
        HTTP_HOST=ADMIN_HOST,
    )

    assert response.status_code == 200
    assert response.json()["ok"] is True
    assert not Snapshot.objects.filter(pk=snapshot.pk).exists()

    # Reload so the assertion sees the URL list as persisted by the action.
    crawl.refresh_from_db()
    assert "https://example.com/remove-me" not in crawl.urls
def test_crawl_admin_exclude_domain_action_prunes_urls_and_pending_snapshots(client, admin_user):
    """The exclude-domain admin action should denylist the domain and prune its pending work.

    After the action the snapshot's domain is on the crawl's own denylist, every
    URL on that domain is gone from ``crawl.urls``, the queued snapshot on that
    domain is deleted, and the sealed snapshot on another domain is preserved.
    """
    crawl = Crawl.objects.create(
        urls="\n".join(
            [
                "https://cdn.example.com/asset.js",
                "https://cdn.example.com/second.js",
                "https://example.com/root",
            ],
        ),
        created_by=admin_user,
    )
    queued_snapshot = Snapshot.objects.create(
        crawl=crawl,
        url="https://cdn.example.com/asset.js",
        status=Snapshot.StatusChoices.QUEUED,
    )
    preserved_snapshot = Snapshot.objects.create(
        crawl=crawl,
        url="https://example.com/root",
        status=Snapshot.StatusChoices.SEALED,
    )

    client.login(username="crawladmin", password="testpassword")
    response = client.post(
        reverse("admin:crawls_crawl_snapshot_exclude_domain", args=[crawl.pk, queued_snapshot.pk]),
        HTTP_HOST=ADMIN_HOST,
    )

    assert response.status_code == 200
    payload = response.json()
    assert payload["ok"] is True
    assert payload["domain"] == "cdn.example.com"

    # Reload so the assertions see the crawl as persisted by the action.
    crawl.refresh_from_db()
    assert crawl.get_url_denylist(use_effective_config=False) == ["cdn.example.com"]
    assert "https://cdn.example.com/asset.js" not in crawl.urls
    assert "https://cdn.example.com/second.js" not in crawl.urls
    assert "https://example.com/root" in crawl.urls
    assert not Snapshot.objects.filter(pk=queued_snapshot.pk).exists()
    assert Snapshot.objects.filter(pk=preserved_snapshot.pk).exists()
def test_snapshot_from_json_trims_markdown_suffixes_on_discovered_urls(crawl):
    """Snapshot.from_json() should strip a trailing ``)**`` markdown suffix from the URL."""
    snapshot = Snapshot.from_json(
        {"url": "https://docs.sweeting.me/s/youtube-favorites)**"},
        overrides={"crawl": crawl},
        queue_for_extraction=False,
    )

    assert snapshot is not None
    assert snapshot.url == "https://docs.sweeting.me/s/youtube-favorites"
def test_create_snapshots_from_urls_respects_url_allowlist_and_denylist(admin_user):
    """Only URLs matching the allowlist and not matching the denylist should become snapshots."""
    crawl = Crawl.objects.create(
        urls="\n".join(
            [
                "https://example.com/root",
                "https://static.example.com/app.js",
                "https://other.test/page",
            ],
        ),
        created_by=admin_user,
        config={
            "URL_ALLOWLIST": "example.com",
            "URL_DENYLIST": "static.example.com",
        },
    )

    created = crawl.create_snapshots_from_urls()

    assert [snapshot.url for snapshot in created] == ["https://example.com/root"]
def test_url_filter_regex_lists_preserve_commas_and_split_on_newlines_only(admin_user):
    """Regex filter lists should be split on newlines only, keeping commas inside a pattern."""
    crawl = Crawl.objects.create(
        urls="\n".join(
            [
                "https://example.com/root",
                "https://example.com/path,with,commas",
                "https://other.test/page",
            ],
        ),
        created_by=admin_user,
        config={
            "URL_ALLOWLIST": r"^https://example\.com/(root|path,with,commas)$" + "\n" + r"^https://other\.test/page$",
            "URL_DENYLIST": r"^https://example\.com/path,with,commas$",
        },
    )

    assert crawl.get_url_allowlist(use_effective_config=False) == [
        r"^https://example\.com/(root|path,with,commas)$",
        r"^https://other\.test/page$",
    ]
    assert crawl.get_url_denylist(use_effective_config=False) == [
        r"^https://example\.com/path,with,commas$",
    ]

    created = crawl.create_snapshots_from_urls()

    # The comma-containing URL is allowlisted but also denylisted, so it is skipped.
    assert [snapshot.url for snapshot in created] == [
        "https://example.com/root",
        "https://other.test/page",
    ]

View File

@@ -19,7 +19,7 @@ from pathlib import Path
from unittest.mock import patch
# Set up Django before importing any Django-dependent modules
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "archivebox.settings")
class TestBackgroundHookDetection(unittest.TestCase):
@@ -28,32 +28,38 @@ class TestBackgroundHookDetection(unittest.TestCase):
def test_bg_js_suffix_detected(self):
    """Hooks with .bg.js suffix should be detected as background."""
    from archivebox.hooks import is_background_hook

    self.assertTrue(is_background_hook("on_Snapshot__21_consolelog.daemon.bg.js"))
def test_bg_py_suffix_detected(self):
    """Hooks with .bg.py suffix should be detected as background."""
    from archivebox.hooks import is_background_hook

    self.assertTrue(is_background_hook("on_Snapshot__24_responses.finite.bg.py"))
def test_bg_sh_suffix_detected(self):
    """Hooks with .bg.sh suffix should be detected as background."""
    from archivebox.hooks import is_background_hook

    self.assertTrue(is_background_hook("on_Snapshot__23_ssl.daemon.bg.sh"))
def test_legacy_background_suffix_detected(self):
    """Hooks with __background in stem should be detected (backwards compat)."""
    from archivebox.hooks import is_background_hook

    self.assertTrue(is_background_hook("on_Snapshot__21_consolelog__background.js"))
def test_foreground_hook_not_detected(self):
    """Hooks without .bg. or __background should NOT be detected as background."""
    from archivebox.hooks import is_background_hook

    self.assertFalse(is_background_hook("on_Snapshot__11_favicon.js"))
def test_foreground_py_hook_not_detected(self):
    """Python hooks without .bg. should NOT be detected as background."""
    from archivebox.hooks import is_background_hook

    self.assertFalse(is_background_hook("on_Snapshot__50_wget.py"))
class TestJSONLParsing(unittest.TestCase):
@@ -63,56 +69,61 @@ class TestJSONLParsing(unittest.TestCase):
"""Clean JSONL format should be parsed correctly."""
stdout = '{"type": "ArchiveResult", "status": "succeeded", "output_str": "Done"}'
from archivebox.machine.models import Process
records = Process.parse_records_from_text(stdout)
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['type'], 'ArchiveResult')
self.assertEqual(records[0]['status'], 'succeeded')
self.assertEqual(records[0]['output_str'], 'Done')
self.assertEqual(records[0]["type"], "ArchiveResult")
self.assertEqual(records[0]["status"], "succeeded")
self.assertEqual(records[0]["output_str"], "Done")
def test_parse_multiple_jsonl_records(self):
    """Multiple JSONL records should all be parsed."""
    # Continuation lines stay at column 0 so the parsed text has no leading whitespace.
    stdout = """{"type": "ArchiveResult", "status": "succeeded", "output_str": "Done"}
{"type": "Binary", "name": "wget", "abspath": "/usr/bin/wget"}"""
    from archivebox.machine.models import Process

    records = Process.parse_records_from_text(stdout)

    self.assertEqual(len(records), 2)
    self.assertEqual(records[0]["type"], "ArchiveResult")
    self.assertEqual(records[1]["type"], "Binary")
def test_parse_jsonl_with_log_output(self):
    """JSONL should be extracted from mixed stdout with log lines."""
    # Continuation lines stay at column 0 so the parsed text has no leading whitespace.
    stdout = """Starting hook execution...
Processing URL: https://example.com
{"type": "ArchiveResult", "status": "succeeded", "output_str": "Downloaded"}
Hook completed successfully"""
    from archivebox.machine.models import Process

    records = Process.parse_records_from_text(stdout)

    self.assertEqual(len(records), 1)
    self.assertEqual(records[0]["status"], "succeeded")
def test_ignore_invalid_json(self):
    """Invalid JSON should be silently ignored."""
    # Two valid records surround two lines that are not parseable JSON.
    stdout = """{"type": "ArchiveResult", "status": "succeeded"}
{invalid json here}
not json at all
{"type": "Binary", "name": "wget"}"""
    from archivebox.machine.models import Process

    records = Process.parse_records_from_text(stdout)

    self.assertEqual(len(records), 2)
def test_json_without_type_ignored(self):
    """JSON objects without 'type' field should be ignored."""
    stdout = """{"status": "succeeded", "output_str": "Done"}
{"type": "ArchiveResult", "status": "succeeded"}"""
    from archivebox.machine.models import Process

    records = Process.parse_records_from_text(stdout)

    self.assertEqual(len(records), 1)
    self.assertEqual(records[0]["type"], "ArchiveResult")
class TestInstallHookEnvVarHandling(unittest.TestCase):
@@ -121,7 +132,7 @@ class TestInstallHookEnvVarHandling(unittest.TestCase):
def setUp(self):
    """Set up test environment."""
    # Fresh scratch directory per test; test_hook is only a path, no file is created here.
    self.work_dir = Path(tempfile.mkdtemp())
    self.test_hook = self.work_dir / "test_hook.py"
def tearDown(self):
"""Clean up test environment."""
@@ -130,37 +141,37 @@ class TestInstallHookEnvVarHandling(unittest.TestCase):
def test_binary_env_var_absolute_path_handling(self):
"""Install hooks should handle absolute paths in XYZ_BINARY."""
# Test the logic that install hooks use
configured_binary = '/custom/path/to/wget2'
if '/' in configured_binary:
configured_binary = "/custom/path/to/wget2"
if "/" in configured_binary:
bin_name = Path(configured_binary).name
else:
bin_name = configured_binary
self.assertEqual(bin_name, 'wget2')
self.assertEqual(bin_name, "wget2")
def test_binary_env_var_name_only_handling(self):
"""Install hooks should handle binary names in XYZ_BINARY."""
# Test the logic that install hooks use
configured_binary = 'wget2'
if '/' in configured_binary:
configured_binary = "wget2"
if "/" in configured_binary:
bin_name = Path(configured_binary).name
else:
bin_name = configured_binary
self.assertEqual(bin_name, 'wget2')
self.assertEqual(bin_name, "wget2")
def test_binary_env_var_empty_default(self):
"""Install hooks should use default when XYZ_BINARY is empty."""
configured_binary = ''
configured_binary = ""
if configured_binary:
if '/' in configured_binary:
if "/" in configured_binary:
bin_name = Path(configured_binary).name
else:
bin_name = configured_binary
else:
bin_name = 'wget' # default
bin_name = "wget" # default
self.assertEqual(bin_name, 'wget')
self.assertEqual(bin_name, "wget")
class TestHookDiscovery(unittest.TestCase):
@@ -169,22 +180,22 @@ class TestHookDiscovery(unittest.TestCase):
def setUp(self):
    """Set up test plugin directory."""
    self.test_dir = Path(tempfile.mkdtemp())
    self.plugins_dir = self.test_dir / "plugins"
    self.plugins_dir.mkdir()

    # Create test plugin structure
    wget_dir = self.plugins_dir / "wget"
    wget_dir.mkdir()
    (wget_dir / "on_Snapshot__50_wget.py").write_text("# test hook")
    (wget_dir / "on_Crawl__10_wget_install.finite.bg.py").write_text("# install hook")

    chrome_dir = self.plugins_dir / "chrome"
    chrome_dir.mkdir(exist_ok=True)
    (chrome_dir / "on_Snapshot__20_chrome_tab.daemon.bg.js").write_text("// background hook")

    consolelog_dir = self.plugins_dir / "consolelog"
    consolelog_dir.mkdir()
    (consolelog_dir / "on_Snapshot__21_consolelog.daemon.bg.js").write_text("// background hook")
def tearDown(self):
"""Clean up test directory."""
@@ -194,109 +205,118 @@ class TestHookDiscovery(unittest.TestCase):
"""discover_hooks() should find all hooks for an event."""
# Use the local implementation since we can't easily mock BUILTIN_PLUGINS_DIR
hooks = []
for ext in ('sh', 'py', 'js'):
pattern = f'*/on_Snapshot__*.{ext}'
for ext in ("sh", "py", "js"):
pattern = f"*/on_Snapshot__*.{ext}"
hooks.extend(self.plugins_dir.glob(pattern))
hooks = sorted(set(hooks), key=lambda p: p.name)
self.assertEqual(len(hooks), 3)
hook_names = [h.name for h in hooks]
self.assertIn('on_Snapshot__20_chrome_tab.daemon.bg.js', hook_names)
self.assertIn('on_Snapshot__21_consolelog.daemon.bg.js', hook_names)
self.assertIn('on_Snapshot__50_wget.py', hook_names)
self.assertIn("on_Snapshot__20_chrome_tab.daemon.bg.js", hook_names)
self.assertIn("on_Snapshot__21_consolelog.daemon.bg.js", hook_names)
self.assertIn("on_Snapshot__50_wget.py", hook_names)
def test_discover_hooks_sorted_by_name(self):
"""Hooks should be sorted by filename (numeric prefix ordering)."""
hooks = []
for ext in ('sh', 'py', 'js'):
pattern = f'*/on_Snapshot__*.{ext}'
for ext in ("sh", "py", "js"):
pattern = f"*/on_Snapshot__*.{ext}"
hooks.extend(self.plugins_dir.glob(pattern))
hooks = sorted(set(hooks), key=lambda p: p.name)
# Check numeric ordering
self.assertEqual(hooks[0].name, 'on_Snapshot__20_chrome_tab.daemon.bg.js')
self.assertEqual(hooks[1].name, 'on_Snapshot__21_consolelog.daemon.bg.js')
self.assertEqual(hooks[2].name, 'on_Snapshot__50_wget.py')
self.assertEqual(hooks[0].name, "on_Snapshot__20_chrome_tab.daemon.bg.js")
self.assertEqual(hooks[1].name, "on_Snapshot__21_consolelog.daemon.bg.js")
self.assertEqual(hooks[2].name, "on_Snapshot__50_wget.py")
def test_get_plugins_includes_non_snapshot_plugin_dirs(self):
    """get_plugins() should include binary-only plugins with standardized metadata."""
    env_dir = self.plugins_dir / "env"
    env_dir.mkdir()
    (env_dir / "on_Binary__15_env_discover.py").write_text("# binary hook")
    (env_dir / "config.json").write_text('{"type": "object", "properties": {}}')

    from archivebox import hooks as hooks_module

    # Clear the cache before the call so it sees the patched directories, and
    # again on cleanup so the result computed here does not leak to other tests.
    hooks_module.get_plugins.cache_clear()
    self.addCleanup(hooks_module.get_plugins.cache_clear)
    with (
        patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
        patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
    ):
        plugins = hooks_module.get_plugins()

    self.assertIn("env", plugins)
def test_discover_binary_hooks_ignores_plugins_whitelist(self):
    """Binary provider hooks should remain discoverable under --plugins filtering."""
    singlefile_dir = self.plugins_dir / "singlefile"
    singlefile_dir.mkdir()
    (singlefile_dir / "config.json").write_text(
        json.dumps(
            {
                "type": "object",
                "required_plugins": ["chrome"],
                "properties": {},
            },
        ),
    )

    npm_dir = self.plugins_dir / "npm"
    npm_dir.mkdir()
    (npm_dir / "on_Binary__10_npm_install.py").write_text("# npm binary hook")
    (npm_dir / "config.json").write_text('{"type": "object", "properties": {}}')

    from archivebox import hooks as hooks_module

    # Clear the cache before the call so it sees the patched directories, and
    # again on cleanup so the result computed here does not leak to other tests.
    hooks_module.get_plugins.cache_clear()
    self.addCleanup(hooks_module.get_plugins.cache_clear)
    with (
        patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
        patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
    ):
        # PLUGINS names only singlefile; the npm Binary hook must still be found.
        hooks = hooks_module.discover_hooks("Binary", config={"PLUGINS": "singlefile"})

    hook_names = [hook.name for hook in hooks]
    self.assertIn("on_Binary__10_npm_install.py", hook_names)
def test_discover_crawl_hooks_only_include_declared_plugin_dependencies(self):
    """Crawl hook discovery should include required_plugins without broadening to provider plugins."""
    responses_dir = self.plugins_dir / "responses"
    responses_dir.mkdir()
    (responses_dir / "config.json").write_text(
        json.dumps(
            {
                "type": "object",
                "required_plugins": ["chrome"],
                "properties": {},
            },
        ),
    )

    chrome_dir = self.plugins_dir / "chrome"
    chrome_dir.mkdir(exist_ok=True)
    (chrome_dir / "config.json").write_text('{"type": "object", "properties": {}}')
    (chrome_dir / "on_Crawl__70_chrome_install.finite.bg.py").write_text("# chrome crawl hook")

    npm_dir = self.plugins_dir / "npm"
    npm_dir.mkdir()
    (npm_dir / "on_Binary__10_npm_install.py").write_text("# npm binary hook")
    (npm_dir / "on_Crawl__00_npm_install.py").write_text("# npm crawl hook")
    (npm_dir / "config.json").write_text('{"type": "object", "properties": {}}')

    from archivebox import hooks as hooks_module

    # Clear the cache before the call so it sees the patched directories, and
    # again on cleanup so the result computed here does not leak to other tests.
    hooks_module.get_plugins.cache_clear()
    self.addCleanup(hooks_module.get_plugins.cache_clear)
    with (
        patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
        patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
    ):
        hooks = hooks_module.discover_hooks("Crawl", config={"PLUGINS": "responses"})

    hook_names = [hook.name for hook in hooks]
    # chrome is declared in responses' required_plugins; npm is not.
    self.assertIn("on_Crawl__70_chrome_install.finite.bg.py", hook_names)
    self.assertNotIn("on_Crawl__00_npm_install.py", hook_names)
class TestGetExtractorName(unittest.TestCase):
@@ -304,27 +324,29 @@ class TestGetExtractorName(unittest.TestCase):
def test_strip_numeric_prefix(self):
"""Numeric prefix should be stripped from extractor name."""
# Inline implementation of get_extractor_name
def get_extractor_name(extractor: str) -> str:
parts = extractor.split('_', 1)
parts = extractor.split("_", 1)
if len(parts) == 2 and parts[0].isdigit():
return parts[1]
return extractor
self.assertEqual(get_extractor_name('10_title'), 'title')
self.assertEqual(get_extractor_name('26_readability'), 'readability')
self.assertEqual(get_extractor_name('50_parse_html_urls'), 'parse_html_urls')
self.assertEqual(get_extractor_name("10_title"), "title")
self.assertEqual(get_extractor_name("26_readability"), "readability")
self.assertEqual(get_extractor_name("50_parse_html_urls"), "parse_html_urls")
def test_no_prefix_unchanged(self):
"""Extractor without numeric prefix should be unchanged."""
def get_extractor_name(extractor: str) -> str:
parts = extractor.split('_', 1)
parts = extractor.split("_", 1)
if len(parts) == 2 and parts[0].isdigit():
return parts[1]
return extractor
self.assertEqual(get_extractor_name('title'), 'title')
self.assertEqual(get_extractor_name('readability'), 'readability')
self.assertEqual(get_extractor_name("title"), "title")
self.assertEqual(get_extractor_name("readability"), "readability")
class TestHookExecution(unittest.TestCase):
@@ -340,14 +362,14 @@ class TestHookExecution(unittest.TestCase):
def test_python_hook_execution(self):
"""Python hook should execute and output JSONL."""
hook_path = self.work_dir / 'test_hook.py'
hook_path.write_text('''#!/usr/bin/env python3
hook_path = self.work_dir / "test_hook.py"
hook_path.write_text("""#!/usr/bin/env python3
import json
print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "output_str": "Test passed"}))
''')
""")
result = subprocess.run(
['python3', str(hook_path)],
["python3", str(hook_path)],
cwd=str(self.work_dir),
capture_output=True,
text=True,
@@ -355,24 +377,25 @@ print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "output_str":
self.assertEqual(result.returncode, 0)
from archivebox.machine.models import Process
records = Process.parse_records_from_text(result.stdout)
self.assertTrue(records)
self.assertEqual(records[0]['type'], 'ArchiveResult')
self.assertEqual(records[0]['status'], 'succeeded')
self.assertEqual(records[0]["type"], "ArchiveResult")
self.assertEqual(records[0]["status"], "succeeded")
def test_js_hook_execution(self):
"""JavaScript hook should execute and output JSONL."""
# Skip if node not available
if shutil.which('node') is None:
self.skipTest('Node.js not available')
if shutil.which("node") is None:
self.skipTest("Node.js not available")
hook_path = self.work_dir / 'test_hook.js'
hook_path.write_text('''#!/usr/bin/env node
hook_path = self.work_dir / "test_hook.js"
hook_path.write_text("""#!/usr/bin/env node
console.log(JSON.stringify({type: 'ArchiveResult', status: 'succeeded', output_str: 'JS test'}));
''')
""")
result = subprocess.run(
['node', str(hook_path)],
["node", str(hook_path)],
cwd=str(self.work_dir),
capture_output=True,
text=True,
@@ -380,15 +403,16 @@ console.log(JSON.stringify({type: 'ArchiveResult', status: 'succeeded', output_s
self.assertEqual(result.returncode, 0)
from archivebox.machine.models import Process
records = Process.parse_records_from_text(result.stdout)
self.assertTrue(records)
self.assertEqual(records[0]['type'], 'ArchiveResult')
self.assertEqual(records[0]['status'], 'succeeded')
self.assertEqual(records[0]["type"], "ArchiveResult")
self.assertEqual(records[0]["status"], "succeeded")
def test_hook_receives_cli_args(self):
"""Hook should receive CLI arguments."""
hook_path = self.work_dir / 'test_hook.py'
hook_path.write_text('''#!/usr/bin/env python3
hook_path = self.work_dir / "test_hook.py"
hook_path.write_text("""#!/usr/bin/env python3
import sys
import json
# Simple arg parsing
@@ -398,10 +422,10 @@ for arg in sys.argv[1:]:
key, val = arg[2:].split('=', 1)
args[key.replace('-', '_')] = val
print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "url": args.get("url", "")}))
''')
""")
result = subprocess.run(
['python3', str(hook_path), '--url=https://example.com'],
["python3", str(hook_path), "--url=https://example.com"],
cwd=str(self.work_dir),
capture_output=True,
text=True,
@@ -409,9 +433,10 @@ print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "url": args.ge
self.assertEqual(result.returncode, 0)
from archivebox.machine.models import Process
records = Process.parse_records_from_text(result.stdout)
self.assertTrue(records)
self.assertEqual(records[0]['url'], 'https://example.com')
self.assertEqual(records[0]["url"], "https://example.com")
class TestInstallHookOutput(unittest.TestCase):
@@ -427,35 +452,41 @@ class TestInstallHookOutput(unittest.TestCase):
def test_install_hook_outputs_binary(self):
    """Install hook should output Binary JSONL when binary found."""
    hook_output = json.dumps(
        {
            "type": "Binary",
            "name": "wget",
            "abspath": "/usr/bin/wget",
            "version": "1.21.3",
            "sha256": None,
            "binprovider": "apt",
        },
    )

    from archivebox.machine.models import Process

    data = Process.parse_records_from_text(hook_output)[0]
    self.assertEqual(data["type"], "Binary")
    self.assertEqual(data["name"], "wget")
    self.assertTrue(data["abspath"].startswith("/"))
def test_install_hook_outputs_machine_config(self):
    """Install hook should output Machine config update JSONL."""
    hook_output = json.dumps(
        {
            "type": "Machine",
            "config": {
                "WGET_BINARY": "/usr/bin/wget",
            },
        },
    )

    from archivebox.machine.models import Process

    data = Process.parse_records_from_text(hook_output)[0]
    self.assertEqual(data["type"], "Machine")
    self.assertIn("config", data)
    self.assertEqual(data["config"]["WGET_BINARY"], "/usr/bin/wget")
class TestSnapshotHookOutput(unittest.TestCase):
@@ -463,75 +494,90 @@ class TestSnapshotHookOutput(unittest.TestCase):
def test_snapshot_hook_basic_output(self):
"""Snapshot hook should output clean ArchiveResult JSONL."""
hook_output = json.dumps({
'type': 'ArchiveResult',
'status': 'succeeded',
'output_str': 'Downloaded 5 files',
})
hook_output = json.dumps(
{
"type": "ArchiveResult",
"status": "succeeded",
"output_str": "Downloaded 5 files",
},
)
from archivebox.machine.models import Process
data = Process.parse_records_from_text(hook_output)[0]
self.assertEqual(data['type'], 'ArchiveResult')
self.assertEqual(data['status'], 'succeeded')
self.assertIn('output_str', data)
self.assertEqual(data["type"], "ArchiveResult")
self.assertEqual(data["status"], "succeeded")
self.assertIn("output_str", data)
def test_snapshot_hook_with_cmd(self):
"""Snapshot hook should include cmd for binary FK lookup."""
hook_output = json.dumps({
'type': 'ArchiveResult',
'status': 'succeeded',
'output_str': 'Archived with wget',
'cmd': ['/usr/bin/wget', '-p', '-k', 'https://example.com'],
})
hook_output = json.dumps(
{
"type": "ArchiveResult",
"status": "succeeded",
"output_str": "Archived with wget",
"cmd": ["/usr/bin/wget", "-p", "-k", "https://example.com"],
},
)
from archivebox.machine.models import Process
data = Process.parse_records_from_text(hook_output)[0]
self.assertEqual(data['type'], 'ArchiveResult')
self.assertIsInstance(data['cmd'], list)
self.assertEqual(data['cmd'][0], '/usr/bin/wget')
self.assertEqual(data["type"], "ArchiveResult")
self.assertIsInstance(data["cmd"], list)
self.assertEqual(data["cmd"][0], "/usr/bin/wget")
def test_snapshot_hook_with_output_json(self):
"""Snapshot hook can include structured metadata in output_json."""
hook_output = json.dumps({
'type': 'ArchiveResult',
'status': 'succeeded',
'output_str': 'Got headers',
'output_json': {
'content-type': 'text/html',
'server': 'nginx',
'status-code': 200,
hook_output = json.dumps(
{
"type": "ArchiveResult",
"status": "succeeded",
"output_str": "Got headers",
"output_json": {
"content-type": "text/html",
"server": "nginx",
"status-code": 200,
},
},
})
)
from archivebox.machine.models import Process
data = Process.parse_records_from_text(hook_output)[0]
self.assertEqual(data['type'], 'ArchiveResult')
self.assertIsInstance(data['output_json'], dict)
self.assertEqual(data['output_json']['status-code'], 200)
self.assertEqual(data["type"], "ArchiveResult")
self.assertIsInstance(data["output_json"], dict)
self.assertEqual(data["output_json"]["status-code"], 200)
def test_snapshot_hook_skipped_status(self):
"""Snapshot hook should support skipped status."""
hook_output = json.dumps({
'type': 'ArchiveResult',
'status': 'skipped',
'output_str': 'SAVE_WGET=False',
})
hook_output = json.dumps(
{
"type": "ArchiveResult",
"status": "skipped",
"output_str": "SAVE_WGET=False",
},
)
from archivebox.machine.models import Process
data = Process.parse_records_from_text(hook_output)[0]
self.assertEqual(data['status'], 'skipped')
self.assertEqual(data["status"], "skipped")
def test_snapshot_hook_failed_status(self):
"""Snapshot hook should support failed status."""
hook_output = json.dumps({
'type': 'ArchiveResult',
'status': 'failed',
'output_str': '404 Not Found',
})
hook_output = json.dumps(
{
"type": "ArchiveResult",
"status": "failed",
"output_str": "404 Not Found",
},
)
from archivebox.machine.models import Process
data = Process.parse_records_from_text(hook_output)[0]
self.assertEqual(data['status'], 'failed')
self.assertEqual(data["status"], "failed")
class TestPluginMetadata(unittest.TestCase):
@@ -540,16 +586,16 @@ class TestPluginMetadata(unittest.TestCase):
def test_plugin_name_added(self):
"""run_hook() should add plugin name to records."""
# Simulate what run_hook() does
script = Path('/abx_plugins/plugins/wget/on_Snapshot__50_wget.py')
script = Path("/abx_plugins/plugins/wget/on_Snapshot__50_wget.py")
plugin_name = script.parent.name
record = {'type': 'ArchiveResult', 'status': 'succeeded'}
record['plugin'] = plugin_name
record['plugin_hook'] = str(script)
record = {"type": "ArchiveResult", "status": "succeeded"}
record["plugin"] = plugin_name
record["plugin_hook"] = str(script)
self.assertEqual(record['plugin'], 'wget')
self.assertIn('on_Snapshot__50_wget.py', record['plugin_hook'])
self.assertEqual(record["plugin"], "wget")
self.assertIn("on_Snapshot__50_wget.py", record["plugin_hook"])
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()

View File

@@ -30,6 +30,7 @@ from archivebox.machine.models import (
ProcessMachine,
MACHINE_RECHECK_INTERVAL,
PID_REUSE_WINDOW,
PROCESS_TIMEOUT_GRACE,
)
@@ -39,6 +40,7 @@ class TestMachineModel(TestCase):
def setUp(self):
"""Reset cached machine between tests."""
import archivebox.machine.models as models
models._CURRENT_MACHINE = None
def test_machine_current_creates_machine(self):
@@ -49,7 +51,7 @@ class TestMachineModel(TestCase):
self.assertIsNotNone(machine.id)
self.assertIsNotNone(machine.guid)
self.assertEqual(machine.hostname, os.uname().nodename)
self.assertIn(machine.os_family, ['linux', 'darwin', 'windows', 'freebsd'])
self.assertIn(machine.os_family, ["linux", "darwin", "windows", "freebsd"])
def test_machine_current_returns_cached(self):
"""Machine.current() should return cached machine within recheck interval."""
@@ -78,8 +80,8 @@ class TestMachineModel(TestCase):
"""Machine.from_json() should update machine config."""
Machine.current() # Ensure machine exists
record = {
'config': {
'WGET_BINARY': '/usr/bin/wget',
"config": {
"WGET_BINARY": "/usr/bin/wget",
},
}
@@ -87,15 +89,15 @@ class TestMachineModel(TestCase):
self.assertIsNotNone(result)
assert result is not None
self.assertEqual(result.config.get('WGET_BINARY'), '/usr/bin/wget')
self.assertEqual(result.config.get("WGET_BINARY"), "/usr/bin/wget")
def test_machine_from_jsonl_strips_legacy_chromium_version(self):
"""Machine.from_json() should ignore legacy browser version keys."""
Machine.current() # Ensure machine exists
record = {
'config': {
'WGET_BINARY': '/usr/bin/wget',
'CHROMIUM_VERSION': '123.4.5',
"config": {
"WGET_BINARY": "/usr/bin/wget",
"CHROMIUM_VERSION": "123.4.5",
},
}
@@ -103,12 +105,12 @@ class TestMachineModel(TestCase):
self.assertIsNotNone(result)
assert result is not None
self.assertEqual(result.config.get('WGET_BINARY'), '/usr/bin/wget')
self.assertNotIn('CHROMIUM_VERSION', result.config)
self.assertEqual(result.config.get("WGET_BINARY"), "/usr/bin/wget")
self.assertNotIn("CHROMIUM_VERSION", result.config)
def test_machine_from_jsonl_invalid(self):
"""Machine.from_json() should return None for invalid records."""
result = Machine.from_json({'invalid': 'record'})
result = Machine.from_json({"invalid": "record"})
self.assertIsNone(result)
def test_machine_current_strips_legacy_chromium_version(self):
@@ -117,16 +119,16 @@ class TestMachineModel(TestCase):
machine = Machine.current()
machine.config = {
'CHROME_BINARY': '/tmp/chromium',
'CHROMIUM_VERSION': '123.4.5',
"CHROME_BINARY": "/tmp/chromium",
"CHROMIUM_VERSION": "123.4.5",
}
machine.save(update_fields=['config'])
machine.save(update_fields=["config"])
models._CURRENT_MACHINE = machine
refreshed = Machine.current()
self.assertEqual(refreshed.config.get('CHROME_BINARY'), '/tmp/chromium')
self.assertNotIn('CHROMIUM_VERSION', refreshed.config)
self.assertEqual(refreshed.config.get("CHROME_BINARY"), "/tmp/chromium")
self.assertNotIn("CHROMIUM_VERSION", refreshed.config)
def test_machine_manager_current(self):
"""Machine.objects.current() should return current machine."""
@@ -141,6 +143,7 @@ class TestNetworkInterfaceModel(TestCase):
def setUp(self):
"""Reset cached interface between tests."""
import archivebox.machine.models as models
models._CURRENT_MACHINE = None
models._CURRENT_INTERFACE = None
@@ -170,24 +173,24 @@ class TestNetworkInterfaceModel(TestCase):
import archivebox.machine.models as models
first = {
'mac_address': 'aa:bb:cc:dd:ee:01',
'ip_public': '1.1.1.1',
'ip_local': '192.168.1.10',
'dns_server': '8.8.8.8',
'hostname': 'host-a',
'iface': 'en0',
'isp': 'ISP A',
'city': 'City',
'region': 'Region',
'country': 'Country',
"mac_address": "aa:bb:cc:dd:ee:01",
"ip_public": "1.1.1.1",
"ip_local": "192.168.1.10",
"dns_server": "8.8.8.8",
"hostname": "host-a",
"iface": "en0",
"isp": "ISP A",
"city": "City",
"region": "Region",
"country": "Country",
}
second = {
**first,
'ip_public': '2.2.2.2',
'ip_local': '10.0.0.5',
"ip_public": "2.2.2.2",
"ip_local": "10.0.0.5",
}
with patch.object(models, 'get_host_network', side_effect=[first, second]):
with patch.object(models, "get_host_network", side_effect=[first, second]):
interface1 = NetworkInterface.current(refresh=True)
interface2 = NetworkInterface.current(refresh=True)
@@ -202,6 +205,7 @@ class TestBinaryModel(TestCase):
def setUp(self):
"""Reset cached binaries and create a machine."""
import archivebox.machine.models as models
models._CURRENT_MACHINE = None
models._CURRENT_BINARIES = {}
self.machine = Machine.current()
@@ -210,22 +214,23 @@ class TestBinaryModel(TestCase):
"""Binary should be created with default values."""
binary = Binary.objects.create(
machine=self.machine,
name='wget',
binproviders='apt,brew,env',
name="wget",
binproviders="apt,brew,env",
)
self.assertIsNotNone(binary.id)
self.assertEqual(binary.name, 'wget')
self.assertEqual(binary.name, "wget")
self.assertEqual(binary.status, Binary.StatusChoices.QUEUED)
self.assertFalse(binary.is_valid)
def test_binary_is_valid(self):
"""Binary.is_valid should be True when abspath and version are set."""
"""Binary.is_valid should be True for installed binaries with a resolved path."""
binary = Binary.objects.create(
machine=self.machine,
name='wget',
abspath='/usr/bin/wget',
version='1.21',
name="wget",
abspath="/usr/bin/wget",
version="1.21",
status=Binary.StatusChoices.INSTALLED,
)
self.assertTrue(binary.is_valid)
@@ -233,25 +238,26 @@ class TestBinaryModel(TestCase):
def test_binary_manager_get_valid_binary(self):
"""BinaryManager.get_valid_binary() should find valid binaries."""
# Create invalid binary (no abspath)
Binary.objects.create(machine=self.machine, name='wget')
Binary.objects.create(machine=self.machine, name="wget")
# Create valid binary
Binary.objects.create(
machine=self.machine,
name='wget',
abspath='/usr/bin/wget',
version='1.21',
name="wget",
abspath="/usr/bin/wget",
version="1.21",
status=Binary.StatusChoices.INSTALLED,
)
result = cast(BinaryManager, Binary.objects).get_valid_binary('wget')
result = cast(BinaryManager, Binary.objects).get_valid_binary("wget")
self.assertIsNotNone(result)
assert result is not None
self.assertEqual(result.abspath, '/usr/bin/wget')
self.assertEqual(result.abspath, "/usr/bin/wget")
def test_binary_update_and_requeue(self):
"""Binary.update_and_requeue() should update fields and save."""
binary = Binary.objects.create(machine=self.machine, name='test')
binary = Binary.objects.create(machine=self.machine, name="test")
old_modified = binary.modified_at
binary.update_and_requeue(
@@ -266,16 +272,18 @@ class TestBinaryModel(TestCase):
def test_binary_from_json_preserves_install_args_overrides(self):
"""Binary.from_json() should persist canonical install_args overrides unchanged."""
overrides = {
'apt': {'install_args': ['chromium']},
'npm': {'install_args': 'puppeteer'},
'custom': {'install_args': ['bash', '-lc', 'echo ok']},
"apt": {"install_args": ["chromium"]},
"npm": {"install_args": "puppeteer"},
"custom": {"install_args": ["bash", "-lc", "echo ok"]},
}
binary = Binary.from_json({
'name': 'chrome',
'binproviders': 'apt,npm,custom',
'overrides': overrides,
})
binary = Binary.from_json(
{
"name": "chrome",
"binproviders": "apt,npm,custom",
"overrides": overrides,
},
)
self.assertIsNotNone(binary)
assert binary is not None
@@ -284,15 +292,17 @@ class TestBinaryModel(TestCase):
def test_binary_from_json_does_not_coerce_legacy_override_shapes(self):
"""Binary.from_json() should no longer translate legacy non-dict provider overrides."""
overrides = {
'apt': ['chromium'],
'npm': 'puppeteer',
"apt": ["chromium"],
"npm": "puppeteer",
}
binary = Binary.from_json({
'name': 'chrome',
'binproviders': 'apt,npm',
'overrides': overrides,
})
binary = Binary.from_json(
{
"name": "chrome",
"binproviders": "apt,npm",
"overrides": overrides,
},
)
self.assertIsNotNone(binary)
assert binary is not None
@@ -300,23 +310,25 @@ class TestBinaryModel(TestCase):
def test_binary_from_json_prefers_published_readability_package(self):
"""Binary.from_json() should rewrite readability's npm git URL to the published package."""
binary = Binary.from_json({
'name': 'readability-extractor',
'binproviders': 'env,npm',
'overrides': {
'npm': {
'install_args': ['https://github.com/ArchiveBox/readability-extractor'],
binary = Binary.from_json(
{
"name": "readability-extractor",
"binproviders": "env,npm",
"overrides": {
"npm": {
"install_args": ["https://github.com/ArchiveBox/readability-extractor"],
},
},
},
})
)
self.assertIsNotNone(binary)
assert binary is not None
self.assertEqual(
binary.overrides,
{
'npm': {
'install_args': ['readability-extractor'],
"npm": {
"install_args": ["readability-extractor"],
},
},
)
@@ -328,12 +340,13 @@ class TestBinaryStateMachine(TestCase):
def setUp(self):
"""Create a machine and binary for state machine tests."""
import archivebox.machine.models as models
models._CURRENT_MACHINE = None
self.machine = Machine.current()
self.binary = Binary.objects.create(
machine=self.machine,
name='test-binary',
binproviders='env',
name="test-binary",
binproviders="env",
)
def test_binary_state_machine_initial_state(self):
@@ -346,7 +359,7 @@ class TestBinaryStateMachine(TestCase):
sm = BinaryMachine(self.binary)
self.assertTrue(sm.can_install())
self.binary.binproviders = ''
self.binary.binproviders = ""
self.binary.save()
sm = BinaryMachine(self.binary)
self.assertFalse(sm.can_install())
@@ -358,6 +371,7 @@ class TestProcessModel(TestCase):
def setUp(self):
"""Create a machine for process tests."""
import archivebox.machine.models as models
models._CURRENT_MACHINE = None
models._CURRENT_PROCESS = None
self.machine = Machine.current()
@@ -366,12 +380,12 @@ class TestProcessModel(TestCase):
"""Process should be created with default values."""
process = Process.objects.create(
machine=self.machine,
cmd=['echo', 'hello'],
pwd='/tmp',
cmd=["echo", "hello"],
pwd="/tmp",
)
self.assertIsNotNone(process.id)
self.assertEqual(process.cmd, ['echo', 'hello'])
self.assertEqual(process.cmd, ["echo", "hello"])
self.assertEqual(process.status, Process.StatusChoices.QUEUED)
self.assertIsNone(process.pid)
self.assertIsNone(process.exit_code)
@@ -380,20 +394,20 @@ class TestProcessModel(TestCase):
"""Process.to_json() should serialize correctly."""
process = Process.objects.create(
machine=self.machine,
cmd=['echo', 'hello'],
pwd='/tmp',
cmd=["echo", "hello"],
pwd="/tmp",
timeout=60,
)
json_data = process.to_json()
self.assertEqual(json_data['type'], 'Process')
self.assertEqual(json_data['cmd'], ['echo', 'hello'])
self.assertEqual(json_data['pwd'], '/tmp')
self.assertEqual(json_data['timeout'], 60)
self.assertEqual(json_data["type"], "Process")
self.assertEqual(json_data["cmd"], ["echo", "hello"])
self.assertEqual(json_data["pwd"], "/tmp")
self.assertEqual(json_data["timeout"], 60)
def test_process_update_and_requeue(self):
"""Process.update_and_requeue() should update fields and save."""
process = Process.objects.create(machine=self.machine, cmd=['test'])
process = Process.objects.create(machine=self.machine, cmd=["test"])
process.update_and_requeue(
status=Process.StatusChoices.RUNNING,
@@ -413,6 +427,7 @@ class TestProcessCurrent(TestCase):
def setUp(self):
"""Reset caches."""
import archivebox.machine.models as models
models._CURRENT_MACHINE = None
models._CURRENT_PROCESS = None
@@ -437,25 +452,25 @@ class TestProcessCurrent(TestCase):
def test_process_detect_type_runner(self):
"""_detect_process_type should detect the background runner command."""
with patch('sys.argv', ['archivebox', 'run', '--daemon']):
with patch("sys.argv", ["archivebox", "run", "--daemon"]):
result = Process._detect_process_type()
self.assertEqual(result, Process.TypeChoices.ORCHESTRATOR)
def test_process_detect_type_runner_watch(self):
"""runner_watch should be classified as a worker, not the orchestrator itself."""
with patch('sys.argv', ['archivebox', 'manage', 'runner_watch', '--pidfile=/tmp/runserver.pid']):
with patch("sys.argv", ["archivebox", "manage", "runner_watch", "--pidfile=/tmp/runserver.pid"]):
result = Process._detect_process_type()
self.assertEqual(result, Process.TypeChoices.WORKER)
def test_process_detect_type_cli(self):
"""_detect_process_type should detect CLI commands."""
with patch('sys.argv', ['archivebox', 'add', 'http://example.com']):
with patch("sys.argv", ["archivebox", "add", "http://example.com"]):
result = Process._detect_process_type()
self.assertEqual(result, Process.TypeChoices.CLI)
def test_process_detect_type_binary(self):
"""_detect_process_type should detect non-ArchiveBox subprocesses as binary processes."""
with patch('sys.argv', ['/usr/bin/wget', 'https://example.com']):
with patch("sys.argv", ["/usr/bin/wget", "https://example.com"]):
result = Process._detect_process_type()
self.assertEqual(result, Process.TypeChoices.BINARY)
@@ -463,7 +478,7 @@ class TestProcessCurrent(TestCase):
"""Process.proc should accept a script recorded in DB when wrapped by an interpreter in psutil."""
proc = Process.objects.create(
machine=Machine.current(),
cmd=['/tmp/on_Crawl__90_chrome_launch.daemon.bg.js', '--url=https://example.com/'],
cmd=["/tmp/on_Crawl__90_chrome_launch.daemon.bg.js", "--url=https://example.com/"],
pid=12345,
status=Process.StatusChoices.RUNNING,
started_at=timezone.now(),
@@ -472,12 +487,12 @@ class TestProcessCurrent(TestCase):
os_proc = Mock()
os_proc.create_time.return_value = proc.started_at.timestamp()
os_proc.cmdline.return_value = [
'node',
'/tmp/on_Crawl__90_chrome_launch.daemon.bg.js',
'--url=https://example.com/',
"node",
"/tmp/on_Crawl__90_chrome_launch.daemon.bg.js",
"--url=https://example.com/",
]
with patch('archivebox.machine.models.psutil.Process', return_value=os_proc):
with patch("archivebox.machine.models.psutil.Process", return_value=os_proc):
self.assertIs(proc.proc, os_proc)
@@ -487,6 +502,7 @@ class TestProcessHierarchy(TestCase):
def setUp(self):
"""Create machine."""
import archivebox.machine.models as models
models._CURRENT_MACHINE = None
self.machine = Machine.current()
@@ -561,6 +577,7 @@ class TestProcessLifecycle(TestCase):
def setUp(self):
"""Create machine."""
import archivebox.machine.models as models
models._CURRENT_MACHINE = None
self.machine = Machine.current()
@@ -643,6 +660,7 @@ class TestProcessClassMethods(TestCase):
def setUp(self):
"""Create machine."""
import archivebox.machine.models as models
models._CURRENT_MACHINE = None
self.machine = Machine.current()
@@ -689,6 +707,77 @@ class TestProcessClassMethods(TestCase):
stale.refresh_from_db()
self.assertEqual(stale.status, Process.StatusChoices.EXITED)
def test_cleanup_stale_running_marks_timed_out_rows_exited(self):
"""cleanup_stale_running should retire RUNNING rows that exceed timeout + grace."""
stale = Process.objects.create(
machine=self.machine,
status=Process.StatusChoices.RUNNING,
pid=999998,
timeout=5,
started_at=timezone.now() - PROCESS_TIMEOUT_GRACE - timedelta(seconds=10),
)
cleaned = Process.cleanup_stale_running()
self.assertGreaterEqual(cleaned, 1)
stale.refresh_from_db()
self.assertEqual(stale.status, Process.StatusChoices.EXITED)
def test_cleanup_stale_running_marks_timed_out_live_hooks_exited(self):
"""Timed-out live hook rows should be retired in the DB without trying to kill the process."""
stale = Process.objects.create(
machine=self.machine,
process_type=Process.TypeChoices.HOOK,
status=Process.StatusChoices.RUNNING,
pid=os.getpid(),
timeout=5,
started_at=timezone.now() - PROCESS_TIMEOUT_GRACE - timedelta(seconds=10),
)
with (
patch.object(Process, "poll", return_value=None),
patch.object(Process, "kill_tree") as kill_tree,
patch.object(Process, "terminate") as terminate,
):
cleaned = Process.cleanup_stale_running()
self.assertGreaterEqual(cleaned, 1)
stale.refresh_from_db()
self.assertEqual(stale.status, Process.StatusChoices.EXITED)
kill_tree.assert_not_called()
terminate.assert_not_called()
def test_cleanup_orphaned_workers_marks_dead_root_children_exited(self):
"""cleanup_orphaned_workers should retire rows whose CLI/orchestrator root is gone."""
import psutil
from datetime import datetime
started_at = datetime.fromtimestamp(psutil.Process(os.getpid()).create_time(), tz=timezone.get_current_timezone())
parent = Process.objects.create(
machine=self.machine,
process_type=Process.TypeChoices.CLI,
status=Process.StatusChoices.RUNNING,
pid=999997,
started_at=timezone.now() - timedelta(minutes=5),
)
child = Process.objects.create(
machine=self.machine,
parent=parent,
process_type=Process.TypeChoices.HOOK,
status=Process.StatusChoices.RUNNING,
pid=os.getpid(),
started_at=started_at,
)
with patch.object(Process, "kill_tree") as kill_tree, patch.object(Process, "terminate") as terminate:
cleaned = Process.cleanup_orphaned_workers()
self.assertEqual(cleaned, 1)
child.refresh_from_db()
self.assertEqual(child.status, Process.StatusChoices.EXITED)
kill_tree.assert_not_called()
terminate.assert_not_called()
class TestProcessStateMachine(TestCase):
"""Test the ProcessMachine state machine."""
@@ -696,12 +785,13 @@ class TestProcessStateMachine(TestCase):
def setUp(self):
"""Create a machine and process for state machine tests."""
import archivebox.machine.models as models
models._CURRENT_MACHINE = None
self.machine = Machine.current()
self.process = Process.objects.create(
machine=self.machine,
cmd=['echo', 'test'],
pwd='/tmp',
cmd=["echo", "test"],
pwd="/tmp",
)
def test_process_state_machine_initial_state(self):
@@ -730,5 +820,5 @@ class TestProcessStateMachine(TestCase):
self.assertTrue(sm.is_exited())
if __name__ == '__main__':
pytest.main([__file__, '-v'])
if __name__ == "__main__":
pytest.main([__file__, "-v"])

View File

@@ -31,7 +31,7 @@ class TestMigrationFrom04x(unittest.TestCase):
def setUp(self):
"""Create a temporary directory with 0.4.x schema and data."""
self.work_dir = Path(tempfile.mkdtemp())
self.db_path = self.work_dir / 'index.sqlite3'
self.db_path = self.work_dir / "index.sqlite3"
# Create directory structure
create_data_dir_structure(self.work_dir)
@@ -50,9 +50,9 @@ class TestMigrationFrom04x(unittest.TestCase):
def test_migration_preserves_snapshot_count(self):
"""Migration should preserve all snapshots from 0.4.x."""
expected_count = len(self.original_data['snapshots'])
expected_count = len(self.original_data["snapshots"])
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
ok, msg = verify_snapshot_count(self.db_path, expected_count)
@@ -60,9 +60,9 @@ class TestMigrationFrom04x(unittest.TestCase):
def test_migration_preserves_snapshot_urls(self):
"""Migration should preserve all snapshot URLs from 0.4.x."""
expected_urls = [s['url'] for s in self.original_data['snapshots']]
expected_urls = [s["url"] for s in self.original_data["snapshots"]]
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
ok, msg = verify_snapshot_urls(self.db_path, expected_urls)
@@ -70,14 +70,14 @@ class TestMigrationFrom04x(unittest.TestCase):
def test_migration_converts_string_tags_to_model(self):
"""Migration should convert comma-separated tags to Tag model instances."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Collect unique tags from original data
original_tags = set()
for tags_str in cast(list[str], self.original_data['tags_str']):
for tags_str in cast(list[str], self.original_data["tags_str"]):
if tags_str:
for tag in tags_str.split(','):
for tag in tags_str.split(","):
original_tags.add(tag.strip())
# Tags should have been created
@@ -86,7 +86,7 @@ class TestMigrationFrom04x(unittest.TestCase):
def test_migration_preserves_snapshot_titles(self):
"""Migration should preserve all snapshot titles."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(self.db_path))
@@ -95,43 +95,46 @@ class TestMigrationFrom04x(unittest.TestCase):
actual = {row[0]: row[1] for row in cursor.fetchall()}
conn.close()
for snapshot in self.original_data['snapshots']:
for snapshot in self.original_data["snapshots"]:
self.assertEqual(
actual.get(snapshot['url']),
snapshot['title'],
f"Title mismatch for {snapshot['url']}"
actual.get(snapshot["url"]),
snapshot["title"],
f"Title mismatch for {snapshot['url']}",
)
def test_status_works_after_migration(self):
"""Status command should work after migration."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
result = run_archivebox(self.work_dir, ['status'])
result = run_archivebox(self.work_dir, ["status"])
self.assertEqual(result.returncode, 0, f"Status failed after migration: {result.stderr}")
def test_list_works_after_migration(self):
"""List command should work and show ALL migrated snapshots."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
result = run_archivebox(self.work_dir, ['list'])
result = run_archivebox(self.work_dir, ["list"])
self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
# Verify ALL snapshots appear in output
output = result.stdout + result.stderr
for snapshot in self.original_data['snapshots']:
url_fragment = snapshot['url'][:30]
self.assertIn(url_fragment, output,
f"Snapshot {snapshot['url']} not found in list output")
for snapshot in self.original_data["snapshots"]:
url_fragment = snapshot["url"][:30]
self.assertIn(
url_fragment,
output,
f"Snapshot {snapshot['url']} not found in list output",
)
def test_add_works_after_migration(self):
"""Adding new URLs should work after migration from 0.4.x."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Try to add a new URL after migration
result = run_archivebox(self.work_dir, ['add', '--index-only', 'https://example.com/new-page'], timeout=45)
result = run_archivebox(self.work_dir, ["add", "--index-only", "https://example.com/new-page"], timeout=45)
self.assertEqual(result.returncode, 0, f"Add failed after migration: {result.stderr}")
# Verify snapshot was added
@@ -145,7 +148,7 @@ class TestMigrationFrom04x(unittest.TestCase):
def test_new_schema_elements_created(self):
"""Migration should create new 0.9.x schema elements."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(self.db_path))
@@ -155,25 +158,25 @@ class TestMigrationFrom04x(unittest.TestCase):
conn.close()
# New tables should exist
self.assertIn('crawls_crawl', tables, "crawls_crawl table not created")
self.assertIn('core_tag', tables, "core_tag table not created")
self.assertIn('core_archiveresult', tables, "core_archiveresult table not created")
self.assertIn("crawls_crawl", tables, "crawls_crawl table not created")
self.assertIn("core_tag", tables, "core_tag table not created")
self.assertIn("core_archiveresult", tables, "core_archiveresult table not created")
def test_snapshots_have_new_fields(self):
"""Migrated snapshots should have new 0.9.x fields."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
cursor.execute('PRAGMA table_info(core_snapshot)')
cursor.execute("PRAGMA table_info(core_snapshot)")
columns = {row[1] for row in cursor.fetchall()}
conn.close()
required_columns = {'status', 'depth', 'created_at', 'modified_at'}
required_columns = {"status", "depth", "created_at", "modified_at"}
for col in required_columns:
self.assertIn(col, columns, f"Snapshot missing new column: {col}")
if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()

View File

@@ -35,7 +35,7 @@ class TestMigrationFrom07x(unittest.TestCase):
def setUp(self):
"""Create a temporary directory with 0.7.x schema and data."""
self.work_dir = Path(tempfile.mkdtemp())
self.db_path = self.work_dir / 'index.sqlite3'
self.db_path = self.work_dir / "index.sqlite3"
# Create directory structure
create_data_dir_structure(self.work_dir)
@@ -54,9 +54,9 @@ class TestMigrationFrom07x(unittest.TestCase):
def test_migration_preserves_snapshot_count(self):
"""Migration should preserve all snapshots."""
expected_count = len(self.original_data['snapshots'])
expected_count = len(self.original_data["snapshots"])
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
ok, msg = verify_snapshot_count(self.db_path, expected_count)
@@ -64,9 +64,9 @@ class TestMigrationFrom07x(unittest.TestCase):
def test_migration_preserves_snapshot_urls(self):
"""Migration should preserve all snapshot URLs."""
expected_urls = [s['url'] for s in self.original_data['snapshots']]
expected_urls = [s["url"] for s in self.original_data["snapshots"]]
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
ok, msg = verify_snapshot_urls(self.db_path, expected_urls)
@@ -74,9 +74,9 @@ class TestMigrationFrom07x(unittest.TestCase):
def test_migration_preserves_snapshot_titles(self):
"""Migration should preserve all snapshot titles."""
expected_titles = {s['url']: s['title'] for s in self.original_data['snapshots']}
expected_titles = {s["url"]: s["title"] for s in self.original_data["snapshots"]}
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
ok, msg = verify_snapshot_titles(self.db_path, expected_titles)
@@ -84,9 +84,9 @@ class TestMigrationFrom07x(unittest.TestCase):
def test_migration_preserves_tags(self):
"""Migration should preserve all tags."""
expected_count = len(self.original_data['tags'])
expected_count = len(self.original_data["tags"])
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
ok, msg = verify_tag_count(self.db_path, expected_count)
@@ -94,9 +94,9 @@ class TestMigrationFrom07x(unittest.TestCase):
def test_migration_preserves_archiveresults(self):
"""Migration should preserve all archive results."""
expected_count = len(self.original_data['archiveresults'])
expected_count = len(self.original_data["archiveresults"])
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
ok, msg = verify_archiveresult_count(self.db_path, expected_count)
@@ -104,7 +104,7 @@ class TestMigrationFrom07x(unittest.TestCase):
def test_migration_preserves_foreign_keys(self):
"""Migration should maintain foreign key relationships."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
ok, msg = verify_foreign_keys(self.db_path)
@@ -112,41 +112,41 @@ class TestMigrationFrom07x(unittest.TestCase):
def test_status_works_after_migration(self):
    """Status command should work after migration.

    Runs ``init`` against ``self.work_dir`` and then ``status``; both
    invocations must report exit code 0.
    """
    # Check `init` first so a broken migration is reported as an init
    # failure (with its stderr) rather than as a confusing status failure.
    result = run_archivebox(self.work_dir, ["init"], timeout=45)
    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")

    result = run_archivebox(self.work_dir, ["status"])
    self.assertEqual(result.returncode, 0, f"Status failed after migration: {result.stderr}")
def test_search_works_after_migration(self):
    """Search command should find ALL migrated snapshots.

    Runs ``init`` and then ``search`` with no filters, and checks the combined
    stdout/stderr against every snapshot in ``self.original_data``.
    """
    result = run_archivebox(self.work_dir, ["init"], timeout=45)
    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")

    result = run_archivebox(self.work_dir, ["search"])
    self.assertEqual(result.returncode, 0, f"Search failed after migration: {result.stderr}")

    # Verify ALL snapshots appear in output (both streams are inspected, so
    # the check does not depend on which one the command prints to).
    output = result.stdout + result.stderr
    ok, msg = verify_all_snapshots_in_output(output, self.original_data["snapshots"])
    self.assertTrue(ok, msg)
def test_list_works_after_migration(self):
    """List command should work and show ALL migrated data.

    Runs ``init`` and then ``snapshot list``, and checks the combined
    stdout/stderr against every snapshot in ``self.original_data``.
    """
    result = run_archivebox(self.work_dir, ["init"], timeout=45)
    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")

    result = run_archivebox(self.work_dir, ["snapshot", "list"])
    self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")

    # Verify ALL snapshots appear in output
    output = result.stdout + result.stderr
    ok, msg = verify_all_snapshots_in_output(output, self.original_data["snapshots"])
    self.assertTrue(ok, msg)
def test_new_schema_elements_created_after_migration(self):
"""Migration should create new 0.9.x schema elements (crawls_crawl, etc.)."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(self.db_path))
@@ -158,29 +158,29 @@ class TestMigrationFrom07x(unittest.TestCase):
conn.close()
# 0.9.x should have crawls_crawl table
self.assertIn('crawls_crawl', tables, "crawls_crawl table not created during migration")
self.assertIn("crawls_crawl", tables, "crawls_crawl table not created during migration")
def test_snapshots_have_new_fields_after_migration(self):
    """Migrated snapshots should have new 0.9.x fields (status, depth, etc.).

    Runs ``init``, then reads the column list of ``core_snapshot`` straight
    from SQLite and asserts that each required new column is present.
    """
    result = run_archivebox(self.work_dir, ["init"], timeout=45)
    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")

    conn = sqlite3.connect(str(self.db_path))
    try:
        cursor = conn.cursor()
        # Check snapshot table has new columns.
        # Each PRAGMA table_info row is (cid, name, type, ...); index 1 is the name.
        cursor.execute("PRAGMA table_info(core_snapshot)")
        columns = {row[1] for row in cursor.fetchall()}
    finally:
        # Close even if the query raises so the database file is not left open.
        conn.close()

    # 0.9.x snapshots should have status, depth, created_at, modified_at
    required_new_columns = {"status", "depth", "created_at", "modified_at"}
    # Sorted so the first reported missing column is the same on every run.
    for col in sorted(required_new_columns):
        self.assertIn(col, columns, f"Snapshot missing new column: {col}")
def test_add_works_after_migration(self):
"""Adding new URLs should work after migration from 0.7.x."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Verify that init created the crawls_crawl table before proceeding
@@ -192,7 +192,7 @@ class TestMigrationFrom07x(unittest.TestCase):
self.assertTrue(table_exists, f"Init failed to create crawls_crawl table. Init stderr: {result.stderr[-500:]}")
# Try to add a new URL after migration (use --index-only for speed)
result = run_archivebox(self.work_dir, ['add', '--index-only', 'https://example.com/new-page'], timeout=45)
result = run_archivebox(self.work_dir, ["add", "--index-only", "https://example.com/new-page"], timeout=45)
self.assertEqual(result.returncode, 0, f"Add failed after migration: {result.stderr}")
# Verify a Crawl was created for the new URL
@@ -206,7 +206,7 @@ class TestMigrationFrom07x(unittest.TestCase):
def test_archiveresult_status_preserved_after_migration(self):
"""Migration should preserve archive result status values."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(self.db_path))
@@ -218,35 +218,39 @@ class TestMigrationFrom07x(unittest.TestCase):
conn.close()
# Original data has known status distribution: succeeded, failed, skipped
self.assertIn('succeeded', status_counts, "Should have succeeded results")
self.assertIn('failed', status_counts, "Should have failed results")
self.assertIn('skipped', status_counts, "Should have skipped results")
self.assertIn("succeeded", status_counts, "Should have succeeded results")
self.assertIn("failed", status_counts, "Should have failed results")
self.assertIn("skipped", status_counts, "Should have skipped results")
def test_version_works_after_migration(self):
    """Version command should work after migration.

    Runs ``init`` and then ``version``; both must exit 0 and the version
    output must mention either "ArchiveBox" or "version" (case-insensitive).
    """
    result = run_archivebox(self.work_dir, ["init"], timeout=45)
    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")

    result = run_archivebox(self.work_dir, ["version"])
    self.assertEqual(result.returncode, 0, f"Version failed after migration: {result.stderr}")

    # Should show version info; only the first 500 chars are echoed on
    # failure to keep the assertion message readable.
    output = result.stdout + result.stderr
    self.assertTrue(
        "ArchiveBox" in output or "version" in output.lower(),
        f"Version output missing expected content: {output[:500]}",
    )
def test_help_works_after_migration(self):
    """Help command should work after migration.

    Runs ``init`` and then ``help``; both must exit 0 and the help output
    must mention the ``add`` and ``status`` commands (case-insensitive).
    """
    result = run_archivebox(self.work_dir, ["init"], timeout=45)
    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")

    result = run_archivebox(self.work_dir, ["help"])
    self.assertEqual(result.returncode, 0, f"Help failed after migration: {result.stderr}")

    # Should show available commands
    output = result.stdout + result.stderr
    lowered = output.lower()
    self.assertTrue(
        "add" in lowered and "status" in lowered,
        f"Help output missing expected commands: {output[:500]}",
    )
class TestMigrationDataIntegrity07x(unittest.TestCase):
@@ -255,7 +259,7 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
def test_no_duplicate_snapshots_after_migration(self):
"""Migration should not create duplicate snapshots."""
work_dir = Path(tempfile.mkdtemp())
db_path = work_dir / 'index.sqlite3'
db_path = work_dir / "index.sqlite3"
try:
create_data_dir_structure(work_dir)
@@ -264,7 +268,7 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
conn.close()
seed_0_7_data(db_path)
result = run_archivebox(work_dir, ['init'], timeout=45)
result = run_archivebox(work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Check for duplicate URLs
@@ -285,7 +289,7 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
def test_no_orphaned_archiveresults_after_migration(self):
"""Migration should not leave orphaned ArchiveResults."""
work_dir = Path(tempfile.mkdtemp())
db_path = work_dir / 'index.sqlite3'
db_path = work_dir / "index.sqlite3"
try:
create_data_dir_structure(work_dir)
@@ -294,7 +298,7 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
conn.close()
seed_0_7_data(db_path)
result = run_archivebox(work_dir, ['init'], timeout=45)
result = run_archivebox(work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
ok, msg = verify_foreign_keys(db_path)
@@ -306,7 +310,7 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
def test_timestamps_preserved_after_migration(self):
"""Migration should preserve original timestamps."""
work_dir = Path(tempfile.mkdtemp())
db_path = work_dir / 'index.sqlite3'
db_path = work_dir / "index.sqlite3"
try:
create_data_dir_structure(work_dir)
@@ -315,9 +319,9 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
conn.close()
original_data = seed_0_7_data(db_path)
original_timestamps = {s['url']: s['timestamp'] for s in original_data['snapshots']}
original_timestamps = {s["url"]: s["timestamp"] for s in original_data["snapshots"]}
result = run_archivebox(work_dir, ['init'], timeout=45)
result = run_archivebox(work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(db_path))
@@ -328,8 +332,9 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
for url, original_ts in original_timestamps.items():
self.assertEqual(
migrated_timestamps.get(url), original_ts,
f"Timestamp changed for {url}: {original_ts} -> {migrated_timestamps.get(url)}"
migrated_timestamps.get(url),
original_ts,
f"Timestamp changed for {url}: {original_ts} -> {migrated_timestamps.get(url)}",
)
finally:
@@ -338,7 +343,7 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
def test_tag_associations_preserved_after_migration(self):
"""Migration should preserve snapshot-tag associations."""
work_dir = Path(tempfile.mkdtemp())
db_path = work_dir / 'index.sqlite3'
db_path = work_dir / "index.sqlite3"
try:
create_data_dir_structure(work_dir)
@@ -354,7 +359,7 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
original_count = cursor.fetchone()[0]
conn.close()
result = run_archivebox(work_dir, ['init'], timeout=45)
result = run_archivebox(work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Count tag associations after migration
@@ -364,12 +369,15 @@ class TestMigrationDataIntegrity07x(unittest.TestCase):
migrated_count = cursor.fetchone()[0]
conn.close()
self.assertEqual(migrated_count, original_count,
f"Tag associations changed: {original_count} -> {migrated_count}")
self.assertEqual(
migrated_count,
original_count,
f"Tag associations changed: {original_count} -> {migrated_count}",
)
finally:
shutil.rmtree(work_dir, ignore_errors=True)
# Allow running this test module directly instead of through a test runner.
if __name__ == "__main__":
    unittest.main()

View File

@@ -39,7 +39,7 @@ class TestMigrationFrom08x(unittest.TestCase):
def setUp(self):
"""Create a temporary directory with 0.8.x schema and data."""
self.work_dir = Path(tempfile.mkdtemp())
self.db_path = self.work_dir / 'index.sqlite3'
self.db_path = self.work_dir / "index.sqlite3"
# Create directory structure
create_data_dir_structure(self.work_dir)
@@ -58,9 +58,9 @@ class TestMigrationFrom08x(unittest.TestCase):
def test_migration_preserves_snapshot_count(self):
"""Migration should preserve all snapshots from 0.8.x."""
expected_count = len(self.original_data['snapshots'])
expected_count = len(self.original_data["snapshots"])
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
ok, msg = verify_snapshot_count(self.db_path, expected_count)
@@ -68,9 +68,9 @@ class TestMigrationFrom08x(unittest.TestCase):
def test_migration_preserves_snapshot_urls(self):
"""Migration should preserve all snapshot URLs from 0.8.x."""
expected_urls = [s['url'] for s in self.original_data['snapshots']]
expected_urls = [s["url"] for s in self.original_data["snapshots"]]
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
ok, msg = verify_snapshot_urls(self.db_path, expected_urls)
@@ -78,14 +78,14 @@ class TestMigrationFrom08x(unittest.TestCase):
def test_migration_preserves_crawls(self):
"""Migration should preserve all Crawl records and create default crawl if needed."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Count snapshots with NULL crawl_id in original data
snapshots_without_crawl = sum(1 for s in self.original_data['snapshots'] if s['crawl_id'] is None)
snapshots_without_crawl = sum(1 for s in self.original_data["snapshots"] if s["crawl_id"] is None)
# Expected count: original crawls + 1 default crawl if any snapshots had NULL crawl_id
expected_count = len(self.original_data['crawls'])
expected_count = len(self.original_data["crawls"])
if snapshots_without_crawl > 0:
expected_count += 1 # Migration 0024 creates a default crawl
@@ -94,42 +94,47 @@ class TestMigrationFrom08x(unittest.TestCase):
def test_migration_preserves_snapshot_crawl_links(self):
    """Migration should preserve snapshot-to-crawl relationships and assign default crawl to orphans.

    For every snapshot in ``self.original_data``: the row must still exist
    (looked up by URL); a snapshot that had a ``crawl_id`` must keep exactly
    that value, and one that had none must end up with a non-NULL ``crawl_id``.
    """
    result = run_archivebox(self.work_dir, ["init"], timeout=45)
    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")

    conn = sqlite3.connect(str(self.db_path))
    try:
        cursor = conn.cursor()

        # Check EVERY snapshot has a crawl_id after migration
        for snapshot in self.original_data["snapshots"]:
            cursor.execute("SELECT crawl_id FROM core_snapshot WHERE url = ?", (snapshot["url"],))
            row = cursor.fetchone()
            self.assertIsNotNone(row, f"Snapshot {snapshot['url']} not found after migration")

            if snapshot["crawl_id"] is not None:
                # Snapshots that had a crawl should keep it
                self.assertEqual(
                    row[0],
                    snapshot["crawl_id"],
                    f"Crawl ID changed for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}",
                )
            else:
                # Snapshots without a crawl should now have one (the default crawl)
                self.assertIsNotNone(
                    row[0],
                    f"Snapshot {snapshot['url']} should have been assigned to default crawl but has NULL",
                )
    finally:
        # A failing assertion raises out of the loop; close here so the
        # connection is released on failure as well as on success.
        conn.close()
def test_migration_preserves_tags(self):
    """Migration should preserve all tags.

    Runs ``init`` and then checks the tag count in the database against the
    number of tags recorded in ``self.original_data``.
    """
    result = run_archivebox(self.work_dir, ["init"], timeout=45)
    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")

    ok, msg = verify_tag_count(self.db_path, len(self.original_data["tags"]))
    self.assertTrue(ok, msg)
def test_migration_preserves_archiveresults(self):
"""Migration should preserve all archive results."""
expected_count = len(self.original_data['archiveresults'])
expected_count = len(self.original_data["archiveresults"])
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
ok, msg = verify_archiveresult_count(self.db_path, expected_count)
@@ -137,7 +142,7 @@ class TestMigrationFrom08x(unittest.TestCase):
def test_migration_preserves_archiveresult_status(self):
"""Migration should preserve archive result status values."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(self.db_path))
@@ -149,49 +154,49 @@ class TestMigrationFrom08x(unittest.TestCase):
conn.close()
# Original data has known status distribution: succeeded, failed, skipped
self.assertIn('succeeded', status_counts, "Should have succeeded results")
self.assertIn('failed', status_counts, "Should have failed results")
self.assertIn('skipped', status_counts, "Should have skipped results")
self.assertIn("succeeded", status_counts, "Should have succeeded results")
self.assertIn("failed", status_counts, "Should have failed results")
self.assertIn("skipped", status_counts, "Should have skipped results")
def test_status_works_after_migration(self):
    """Status command should work after migration.

    Runs ``init`` against ``self.work_dir`` and then ``status``; both
    invocations must report exit code 0.
    """
    # Check `init` first so a broken migration is reported as an init
    # failure (with its stderr) rather than as a confusing status failure.
    result = run_archivebox(self.work_dir, ["init"], timeout=45)
    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")

    result = run_archivebox(self.work_dir, ["status"])
    self.assertEqual(result.returncode, 0, f"Status failed after migration: {result.stderr}")
def test_list_works_after_migration(self):
    """List command should work and show ALL migrated data.

    Runs ``init`` and then ``snapshot list``, and checks the combined
    stdout/stderr against every snapshot in ``self.original_data``.
    """
    result = run_archivebox(self.work_dir, ["init"], timeout=45)
    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")

    result = run_archivebox(self.work_dir, ["snapshot", "list"])
    self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")

    # Verify ALL snapshots appear in output
    output = result.stdout + result.stderr
    ok, msg = verify_all_snapshots_in_output(output, self.original_data["snapshots"])
    self.assertTrue(ok, msg)
def test_search_works_after_migration(self):
    """Search command should find ALL migrated snapshots.

    Runs ``init`` and then ``search`` with no filters, and checks the combined
    stdout/stderr against every snapshot in ``self.original_data``.
    """
    result = run_archivebox(self.work_dir, ["init"], timeout=45)
    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")

    result = run_archivebox(self.work_dir, ["search"])
    self.assertEqual(result.returncode, 0, f"Search failed after migration: {result.stderr}")

    # Verify ALL snapshots appear in output
    output = result.stdout + result.stderr
    ok, msg = verify_all_snapshots_in_output(output, self.original_data["snapshots"])
    self.assertTrue(ok, msg)
def test_migration_preserves_snapshot_titles(self):
"""Migration should preserve all snapshot titles."""
expected_titles = {s['url']: s['title'] for s in self.original_data['snapshots']}
expected_titles = {s["url"]: s["title"] for s in self.original_data["snapshots"]}
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
ok, msg = verify_snapshot_titles(self.db_path, expected_titles)
@@ -199,7 +204,7 @@ class TestMigrationFrom08x(unittest.TestCase):
def test_migration_preserves_foreign_keys(self):
"""Migration should maintain foreign key relationships."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
ok, msg = verify_foreign_keys(self.db_path)
@@ -207,7 +212,7 @@ class TestMigrationFrom08x(unittest.TestCase):
def test_migration_removes_seed_id_column(self):
"""Migration should remove seed_id column from archivebox.crawls.crawl."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(self.db_path))
@@ -216,12 +221,15 @@ class TestMigrationFrom08x(unittest.TestCase):
columns = [row[1] for row in cursor.fetchall()]
conn.close()
self.assertNotIn('seed_id', columns,
f"seed_id column should have been removed by migration. Columns: {columns}")
self.assertNotIn(
"seed_id",
columns,
f"seed_id column should have been removed by migration. Columns: {columns}",
)
def test_migration_removes_seed_table(self):
"""Migration should remove crawls_seed table."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(self.db_path))
@@ -234,10 +242,13 @@ class TestMigrationFrom08x(unittest.TestCase):
def test_add_works_after_migration(self):
"""Adding new URLs should work after migration from 0.8.x."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
# Check that init actually ran and applied migrations
self.assertIn('Applying', result.stdout + result.stderr,
f"Init did not apply migrations. stdout: {result.stdout[:500]}, stderr: {result.stderr[:500]}")
self.assertIn(
"Applying",
result.stdout + result.stderr,
f"Init did not apply migrations. stdout: {result.stdout[:500]}, stderr: {result.stderr[:500]}",
)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Count existing crawls
@@ -248,7 +259,7 @@ class TestMigrationFrom08x(unittest.TestCase):
conn.close()
# Try to add a new URL after migration (use --index-only for speed)
result = run_archivebox(self.work_dir, ['add', '--index-only', 'https://example.com/new-page'], timeout=45)
result = run_archivebox(self.work_dir, ["add", "--index-only", "https://example.com/new-page"], timeout=45)
self.assertEqual(result.returncode, 0, f"Add failed after migration: {result.stderr}")
# Verify a new Crawl was created
@@ -258,35 +269,40 @@ class TestMigrationFrom08x(unittest.TestCase):
new_crawl_count = cursor.fetchone()[0]
conn.close()
self.assertGreater(new_crawl_count, initial_crawl_count,
f"No new Crawl created when adding URL. Add stderr: {result.stderr[-500:]}")
self.assertGreater(
new_crawl_count,
initial_crawl_count,
f"No new Crawl created when adding URL. Add stderr: {result.stderr[-500:]}",
)
def test_version_works_after_migration(self):
    """Version command should work after migration.

    Runs ``init`` and then ``version``; both must exit 0 and the version
    output must mention either "ArchiveBox" or "version" (case-insensitive).
    """
    result = run_archivebox(self.work_dir, ["init"], timeout=45)
    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")

    result = run_archivebox(self.work_dir, ["version"])
    self.assertEqual(result.returncode, 0, f"Version failed after migration: {result.stderr}")

    # Should show version info; only the first 500 chars are echoed on
    # failure to keep the assertion message readable.
    output = result.stdout + result.stderr
    self.assertTrue(
        "ArchiveBox" in output or "version" in output.lower(),
        f"Version output missing expected content: {output[:500]}",
    )
def test_migration_creates_process_records(self):
    """Migration should create Process records for all ArchiveResults.

    Runs ``init`` and then delegates to ``verify_process_migration`` with the
    number of archive results in ``self.original_data`` as the expected count.
    """
    result = run_archivebox(self.work_dir, ["init"], timeout=45)
    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")

    # Verify Process records created
    expected_count = len(self.original_data["archiveresults"])
    ok, msg = verify_process_migration(self.db_path, expected_count)
    self.assertTrue(ok, msg)
def test_migration_creates_binary_records(self):
"""Migration should create Binary records from cmd_version data."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(self.db_path))
@@ -297,15 +313,18 @@ class TestMigrationFrom08x(unittest.TestCase):
binary_count = cursor.fetchone()[0]
# Should have at least one binary per unique extractor
extractors = set(ar['extractor'] for ar in self.original_data['archiveresults'])
self.assertGreaterEqual(binary_count, len(extractors),
f"Expected at least {len(extractors)} Binaries, got {binary_count}")
extractors = {ar["extractor"] for ar in self.original_data["archiveresults"]}
self.assertGreaterEqual(
binary_count,
len(extractors),
f"Expected at least {len(extractors)} Binaries, got {binary_count}",
)
conn.close()
def test_migration_preserves_cmd_data(self):
"""Migration should preserve cmd data in Process.cmd field."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
result = run_archivebox(self.work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(self.db_path))
@@ -316,9 +335,12 @@ class TestMigrationFrom08x(unittest.TestCase):
cmd_records = cursor.fetchall()
# All Processes should have non-empty cmd (test data has json.dumps([extractor, '--version']))
expected_count = len(self.original_data['archiveresults'])
self.assertEqual(len(cmd_records), expected_count,
f"Expected {expected_count} Processes with cmd, got {len(cmd_records)}")
expected_count = len(self.original_data["archiveresults"])
self.assertEqual(
len(cmd_records),
expected_count,
f"Expected {expected_count} Processes with cmd, got {len(cmd_records)}",
)
conn.close()
@@ -329,7 +351,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
def test_no_duplicate_snapshots_after_migration(self):
"""Migration should not create duplicate snapshots."""
work_dir = Path(tempfile.mkdtemp())
db_path = work_dir / 'index.sqlite3'
db_path = work_dir / "index.sqlite3"
try:
create_data_dir_structure(work_dir)
@@ -338,7 +360,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
conn.close()
seed_0_8_data(db_path)
result = run_archivebox(work_dir, ['init'], timeout=45)
result = run_archivebox(work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Check for duplicate URLs
@@ -359,7 +381,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
def test_no_orphaned_archiveresults_after_migration(self):
"""Migration should not leave orphaned ArchiveResults."""
work_dir = Path(tempfile.mkdtemp())
db_path = work_dir / 'index.sqlite3'
db_path = work_dir / "index.sqlite3"
try:
create_data_dir_structure(work_dir)
@@ -368,7 +390,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
conn.close()
seed_0_8_data(db_path)
result = run_archivebox(work_dir, ['init'], timeout=45)
result = run_archivebox(work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
ok, msg = verify_foreign_keys(db_path)
@@ -380,7 +402,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
def test_timestamps_preserved_after_migration(self):
"""Migration should preserve original timestamps."""
work_dir = Path(tempfile.mkdtemp())
db_path = work_dir / 'index.sqlite3'
db_path = work_dir / "index.sqlite3"
try:
create_data_dir_structure(work_dir)
@@ -389,9 +411,9 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
conn.close()
original_data = seed_0_8_data(db_path)
original_timestamps = {s['url']: s['timestamp'] for s in original_data['snapshots']}
original_timestamps = {s["url"]: s["timestamp"] for s in original_data["snapshots"]}
result = run_archivebox(work_dir, ['init'], timeout=45)
result = run_archivebox(work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(db_path))
@@ -402,8 +424,9 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
for url, original_ts in original_timestamps.items():
self.assertEqual(
migrated_timestamps.get(url), original_ts,
f"Timestamp changed for {url}: {original_ts} -> {migrated_timestamps.get(url)}"
migrated_timestamps.get(url),
original_ts,
f"Timestamp changed for {url}: {original_ts} -> {migrated_timestamps.get(url)}",
)
finally:
@@ -412,7 +435,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
def test_crawl_data_preserved_after_migration(self):
"""Migration should preserve crawl metadata (urls, label, status)."""
work_dir = Path(tempfile.mkdtemp())
db_path = work_dir / 'index.sqlite3'
db_path = work_dir / "index.sqlite3"
try:
create_data_dir_structure(work_dir)
@@ -421,19 +444,19 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
conn.close()
original_data = seed_0_8_data(db_path)
result = run_archivebox(work_dir, ['init'], timeout=45)
result = run_archivebox(work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
# Check each crawl's data is preserved
for crawl in original_data['crawls']:
cursor.execute("SELECT urls, label FROM crawls_crawl WHERE id = ?", (crawl['id'],))
for crawl in original_data["crawls"]:
cursor.execute("SELECT urls, label FROM crawls_crawl WHERE id = ?", (crawl["id"],))
row = cursor.fetchone()
self.assertIsNotNone(row, f"Crawl {crawl['id']} not found after migration")
self.assertEqual(row[0], crawl['urls'], f"URLs mismatch for crawl {crawl['id']}")
self.assertEqual(row[1], crawl['label'], f"Label mismatch for crawl {crawl['id']}")
self.assertEqual(row[0], crawl["urls"], f"URLs mismatch for crawl {crawl['id']}")
self.assertEqual(row[1], crawl["label"], f"Label mismatch for crawl {crawl['id']}")
conn.close()
@@ -443,7 +466,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
def test_tag_associations_preserved_after_migration(self):
"""Migration should preserve snapshot-tag associations."""
work_dir = Path(tempfile.mkdtemp())
db_path = work_dir / 'index.sqlite3'
db_path = work_dir / "index.sqlite3"
try:
create_data_dir_structure(work_dir)
@@ -459,7 +482,7 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
original_count = cursor.fetchone()[0]
conn.close()
result = run_archivebox(work_dir, ['init'], timeout=45)
result = run_archivebox(work_dir, ["init"], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Count tag associations after migration
@@ -469,8 +492,11 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
migrated_count = cursor.fetchone()[0]
conn.close()
self.assertEqual(migrated_count, original_count,
f"Tag associations changed: {original_count} -> {migrated_count}")
self.assertEqual(
migrated_count,
original_count,
f"Tag associations changed: {original_count} -> {migrated_count}",
)
finally:
shutil.rmtree(work_dir, ignore_errors=True)
@@ -482,7 +508,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
def setUp(self):
    """Create a temporary directory for testing."""
    # Fresh scratch directory per test.
    self.work_dir = Path(tempfile.mkdtemp())
    # Location of the SQLite index inside the scratch directory (not created here).
    self.db_path = self.work_dir / "index.sqlite3"
def tearDown(self):
"""Clean up temporary directory."""
@@ -500,12 +526,13 @@ class TestFilesystemMigration08to09(unittest.TestCase):
5. Old archive/timestamp/ directories are cleaned up
"""
# Use the real 0.7.2 database which has actual ArchiveResults with files
gold_db = Path('/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data')
gold_db = Path("/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data")
if not gold_db.exists():
self.skipTest(f"Gold standard database not found at {gold_db}")
# Copy gold database to test directory
import shutil
for item in gold_db.iterdir():
if item.is_dir():
shutil.copytree(item, self.work_dir / item.name, dirs_exist_ok=True)
@@ -513,23 +540,23 @@ class TestFilesystemMigration08to09(unittest.TestCase):
shutil.copy2(item, self.work_dir / item.name)
# Count archive directories and files BEFORE migration
archive_dir = self.work_dir / 'archive'
dirs_before = list(archive_dir.glob('*')) if archive_dir.exists() else []
archive_dir = self.work_dir / "archive"
dirs_before = list(archive_dir.glob("*")) if archive_dir.exists() else []
dirs_before_count = len([d for d in dirs_before if d.is_dir()])
# Count total files in all archive directories
files_before = []
for d in dirs_before:
if d.is_dir():
files_before.extend([f for f in d.rglob('*') if f.is_file()])
files_before.extend([f for f in d.rglob("*") if f.is_file()])
files_before_count = len(files_before)
# Sample some specific files to check they're preserved
sample_files = [
'favicon.ico',
'screenshot.png',
'singlefile.html',
'headers.json',
"favicon.ico",
"screenshot.png",
"singlefile.html",
"headers.json",
]
sample_paths_before = {}
for d in dirs_before:
@@ -544,17 +571,17 @@ class TestFilesystemMigration08to09(unittest.TestCase):
print(f"[*] Sample files found: {len(sample_paths_before)}")
# Run init to trigger migration
result = run_archivebox(self.work_dir, ['init'], timeout=60)
result = run_archivebox(self.work_dir, ["init"], timeout=60)
self.assertEqual(result.returncode, 0, f"Init (migration) failed: {result.stderr}")
# Count archive directories and files AFTER migration
dirs_after = list(archive_dir.glob('*')) if archive_dir.exists() else []
dirs_after = list(archive_dir.glob("*")) if archive_dir.exists() else []
dirs_after_count = len([d for d in dirs_after if d.is_dir()])
files_after = []
for d in dirs_after:
if d.is_dir():
files_after.extend([f for f in d.rglob('*') if f.is_file()])
files_after.extend([f for f in d.rglob("*") if f.is_file()])
files_after_count = len(files_after)
# Verify sample files still exist
@@ -571,26 +598,32 @@ class TestFilesystemMigration08to09(unittest.TestCase):
print(f"[*] Sample files found: {len(sample_paths_after)}")
# Verify files still in old structure after migration (not moved yet)
self.assertEqual(dirs_before_count, dirs_after_count,
f"Archive directories lost during migration: {dirs_before_count} -> {dirs_after_count}")
self.assertEqual(files_before_count, files_after_count,
f"Files lost during migration: {files_before_count} -> {files_after_count}")
self.assertEqual(
dirs_before_count,
dirs_after_count,
f"Archive directories lost during migration: {dirs_before_count} -> {dirs_after_count}",
)
self.assertEqual(
files_before_count,
files_after_count,
f"Files lost during migration: {files_before_count} -> {files_after_count}",
)
# Run update to trigger filesystem reorganization
print("\n[*] Running archivebox update to reorganize filesystem...")
result = run_archivebox(self.work_dir, ['update'], timeout=120)
result = run_archivebox(self.work_dir, ["update"], timeout=120)
self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}")
# Check new filesystem structure
# New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext
users_dir = self.work_dir / 'users'
users_dir = self.work_dir / "users"
snapshots_base = None
if users_dir.exists():
# Find the snapshots directory
for user_dir in users_dir.iterdir():
if user_dir.is_dir():
user_snapshots = user_dir / 'snapshots'
user_snapshots = user_dir / "snapshots"
if user_snapshots.exists():
snapshots_base = user_snapshots
break
@@ -610,7 +643,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
for snap_dir in domain_dir.iterdir():
if snap_dir.is_dir():
# Files are directly in snap-uuid/ directory (no plugin subdirs)
for f in snap_dir.rglob('*'):
for f in snap_dir.rglob("*"):
if f.is_file():
files_new_structure.append(f)
# Track sample files
@@ -622,15 +655,15 @@ class TestFilesystemMigration08to09(unittest.TestCase):
print(f"[*] Sample files in new structure: {len(new_sample_files)}")
# Check old structure (should be gone or empty)
old_archive_dir = self.work_dir / 'archive'
old_archive_dir = self.work_dir / "archive"
old_files_remaining = []
unmigrated_dirs = []
if old_archive_dir.exists():
for d in old_archive_dir.glob('*'):
for d in old_archive_dir.glob("*"):
# Only count REAL directories, not symlinks (symlinks are the migrated ones)
if d.is_dir(follow_symlinks=False) and d.name.replace('.', '').isdigit():
if d.is_dir(follow_symlinks=False) and d.name.replace(".", "").isdigit():
# This is a timestamp directory (old structure)
files_in_dir = [f for f in d.rglob('*') if f.is_file()]
files_in_dir = [f for f in d.rglob("*") if f.is_file()]
if files_in_dir:
unmigrated_dirs.append((d.name, len(files_in_dir)))
old_files_remaining.extend(files_in_dir)
@@ -641,30 +674,48 @@ class TestFilesystemMigration08to09(unittest.TestCase):
print(f"[*] Unmigrated directories: {unmigrated_dirs}")
# CRITICAL: Verify files were moved to new structure
self.assertGreater(files_new_count, 0,
"No files found in new structure after update")
self.assertGreater(
files_new_count,
0,
"No files found in new structure after update",
)
# CRITICAL: Verify old structure is cleaned up
self.assertEqual(old_files_count, 0,
f"Old structure not cleaned up: {old_files_count} files still in archive/timestamp/ directories")
self.assertEqual(
old_files_count,
0,
f"Old structure not cleaned up: {old_files_count} files still in archive/timestamp/ directories",
)
# CRITICAL: Verify all files were moved (total count should match)
total_after_update = files_new_count + old_files_count
self.assertEqual(files_before_count, total_after_update,
f"Files lost during reorganization: {files_before_count} before → {total_after_update} after")
self.assertEqual(
files_before_count,
total_after_update,
f"Files lost during reorganization: {files_before_count} before → {total_after_update} after",
)
# CRITICAL: Verify sample files exist in new structure
self.assertGreater(len(new_sample_files), 0,
"Sample files not found in new structure")
self.assertGreater(
len(new_sample_files),
0,
"Sample files not found in new structure",
)
# Verify new path format
for path_key, file_path in new_sample_files.items():
# Path should contain both 'users' and 'snapshots' components: users/<username>/snapshots/YYYYMMDD/domain/snap-uuid/<file>
path_parts = file_path.parts
self.assertIn('snapshots', path_parts,
f"New path should contain 'snapshots': {file_path}")
self.assertIn('users', path_parts,
f"New path should contain 'users': {file_path}")
self.assertIn(
"snapshots",
path_parts,
f"New path should contain 'snapshots': {file_path}",
)
self.assertIn(
"users",
path_parts,
f"New path should contain 'users': {file_path}",
)
print(f"{path_key}{file_path.relative_to(self.work_dir)}")
# Verify Process and Binary records were created
@@ -692,24 +743,33 @@ class TestFilesystemMigration08to09(unittest.TestCase):
# Verify data migration happened correctly
# The 0.7.2 gold database has 44 ArchiveResults
self.assertEqual(archiveresult_count, 44,
f"Expected 44 ArchiveResults from 0.7.2 database, got {archiveresult_count}")
self.assertEqual(
archiveresult_count,
44,
f"Expected 44 ArchiveResults from 0.7.2 database, got {archiveresult_count}",
)
# Each ArchiveResult should create one Process record
self.assertEqual(process_count, 44,
f"Expected 44 Process records (1 per ArchiveResult), got {process_count}")
self.assertEqual(
process_count,
44,
f"Expected 44 Process records (1 per ArchiveResult), got {process_count}",
)
# The 44 ArchiveResults use 7 unique binaries (curl, wget, etc.)
self.assertEqual(binary_count, 7,
f"Expected 7 unique Binary records, got {binary_count}")
self.assertEqual(
binary_count,
7,
f"Expected 7 unique Binary records, got {binary_count}",
)
# ALL ArchiveResults should be linked to Process records
self.assertEqual(linked_count, 44,
f"Expected all 44 ArchiveResults linked to Process, got {linked_count}")
self.assertEqual(
linked_count,
44,
f"Expected all 44 ArchiveResults linked to Process, got {linked_count}",
)
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@@ -22,13 +22,13 @@ class TestFreshInstall(unittest.TestCase):
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
result = run_archivebox(work_dir, ["init"])
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Verify database was created
self.assertTrue((work_dir / 'index.sqlite3').exists(), "Database not created")
self.assertTrue((work_dir / "index.sqlite3").exists(), "Database not created")
# Verify archive directory exists
self.assertTrue((work_dir / 'archive').is_dir(), "Archive dir not created")
self.assertTrue((work_dir / "archive").is_dir(), "Archive dir not created")
finally:
shutil.rmtree(work_dir, ignore_errors=True)
@@ -38,10 +38,10 @@ class TestFreshInstall(unittest.TestCase):
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
result = run_archivebox(work_dir, ["init"])
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
result = run_archivebox(work_dir, ['status'])
result = run_archivebox(work_dir, ["status"])
self.assertEqual(result.returncode, 0, f"Status failed: {result.stderr}")
finally:
@@ -52,14 +52,14 @@ class TestFreshInstall(unittest.TestCase):
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
result = run_archivebox(work_dir, ["init"])
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Add a URL with --index-only for speed
result = run_archivebox(work_dir, ['add', '--index-only', 'https://example.com'])
result = run_archivebox(work_dir, ["add", "--index-only", "https://example.com"])
self.assertEqual(result.returncode, 0, f"Add command failed: {result.stderr}")
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
cursor = conn.cursor()
# Verify a Crawl was created
@@ -82,18 +82,18 @@ class TestFreshInstall(unittest.TestCase):
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
result = run_archivebox(work_dir, ["init"])
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
result = run_archivebox(work_dir, ['add', '--index-only', 'https://example.com'])
result = run_archivebox(work_dir, ["add", "--index-only", "https://example.com"])
self.assertEqual(result.returncode, 0, f"Add failed: {result.stderr}")
result = run_archivebox(work_dir, ['list'])
result = run_archivebox(work_dir, ["list"])
self.assertEqual(result.returncode, 0, f"List failed: {result.stderr}")
# Verify the URL appears in output
output = result.stdout + result.stderr
self.assertIn('example.com', output, f"Added URL not in list output: {output[:500]}")
self.assertIn("example.com", output, f"Added URL not in list output: {output[:500]}")
finally:
shutil.rmtree(work_dir, ignore_errors=True)
@@ -103,10 +103,10 @@ class TestFreshInstall(unittest.TestCase):
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
result = run_archivebox(work_dir, ["init"])
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM django_migrations")
count = cursor.fetchone()[0]
@@ -123,16 +123,16 @@ class TestFreshInstall(unittest.TestCase):
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
result = run_archivebox(work_dir, ["init"])
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
cursor = conn.cursor()
cursor.execute("SELECT name FROM django_migrations WHERE app='core' ORDER BY name")
migrations = [row[0] for row in cursor.fetchall()]
conn.close()
self.assertIn('0001_initial', migrations)
self.assertIn("0001_initial", migrations)
finally:
shutil.rmtree(work_dir, ignore_errors=True)
@@ -146,16 +146,16 @@ class TestSchemaIntegrity(unittest.TestCase):
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
result = run_archivebox(work_dir, ["init"])
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
cursor = conn.cursor()
cursor.execute('PRAGMA table_info(core_snapshot)')
cursor.execute("PRAGMA table_info(core_snapshot)")
columns = {row[1] for row in cursor.fetchall()}
conn.close()
required = {'id', 'url', 'timestamp', 'title', 'status', 'created_at', 'modified_at'}
required = {"id", "url", "timestamp", "title", "status", "created_at", "modified_at"}
for col in required:
self.assertIn(col, columns, f"Missing column: {col}")
@@ -167,16 +167,16 @@ class TestSchemaIntegrity(unittest.TestCase):
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
result = run_archivebox(work_dir, ["init"])
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
cursor = conn.cursor()
cursor.execute('PRAGMA table_info(core_archiveresult)')
cursor.execute("PRAGMA table_info(core_archiveresult)")
columns = {row[1] for row in cursor.fetchall()}
conn.close()
required = {'id', 'snapshot_id', 'plugin', 'status', 'created_at', 'modified_at'}
required = {"id", "snapshot_id", "plugin", "status", "created_at", "modified_at"}
for col in required:
self.assertIn(col, columns, f"Missing column: {col}")
@@ -188,16 +188,16 @@ class TestSchemaIntegrity(unittest.TestCase):
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
result = run_archivebox(work_dir, ["init"])
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
cursor = conn.cursor()
cursor.execute('PRAGMA table_info(core_tag)')
cursor.execute("PRAGMA table_info(core_tag)")
columns = {row[1] for row in cursor.fetchall()}
conn.close()
required = {'id', 'name', 'slug'}
required = {"id", "name", "slug"}
for col in required:
self.assertIn(col, columns, f"Missing column: {col}")
@@ -209,21 +209,21 @@ class TestSchemaIntegrity(unittest.TestCase):
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
result = run_archivebox(work_dir, ["init"])
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
cursor = conn.cursor()
cursor.execute('PRAGMA table_info(crawls_crawl)')
cursor.execute("PRAGMA table_info(crawls_crawl)")
columns = {row[1] for row in cursor.fetchall()}
conn.close()
required = {'id', 'urls', 'status', 'created_at', 'created_by_id'}
required = {"id", "urls", "status", "created_at", "created_by_id"}
for col in required:
self.assertIn(col, columns, f"Missing column: {col}")
# seed_id should NOT exist (removed in 0.9.x)
self.assertNotIn('seed_id', columns, "seed_id column should not exist in 0.9.x")
self.assertNotIn("seed_id", columns, "seed_id column should not exist in 0.9.x")
finally:
shutil.rmtree(work_dir, ignore_errors=True)
@@ -237,17 +237,17 @@ class TestMultipleSnapshots(unittest.TestCase):
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
result = run_archivebox(work_dir, ["init"])
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Add URLs one at a time
result = run_archivebox(work_dir, ['add', '--index-only', 'https://example.com'])
result = run_archivebox(work_dir, ["add", "--index-only", "https://example.com"])
self.assertEqual(result.returncode, 0, f"Add 1 failed: {result.stderr}")
result = run_archivebox(work_dir, ['add', '--index-only', 'https://example.org'])
result = run_archivebox(work_dir, ["add", "--index-only", "https://example.org"])
self.assertEqual(result.returncode, 0, f"Add 2 failed: {result.stderr}")
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
cursor = conn.cursor()
# Verify snapshots were created
@@ -270,13 +270,13 @@ class TestMultipleSnapshots(unittest.TestCase):
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
result = run_archivebox(work_dir, ["init"])
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
result = run_archivebox(work_dir, ['add', '--index-only', 'https://example.com'])
result = run_archivebox(work_dir, ["add", "--index-only", "https://example.com"])
self.assertEqual(result.returncode, 0, f"Add failed: {result.stderr}")
conn = sqlite3.connect(str(work_dir / 'index.sqlite3'))
conn = sqlite3.connect(str(work_dir / "index.sqlite3"))
cursor = conn.cursor()
# Check that snapshot has a crawl_id
@@ -291,5 +291,5 @@ class TestMultipleSnapshots(unittest.TestCase):
shutil.rmtree(work_dir, ignore_errors=True)
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@@ -53,23 +53,23 @@ def test_persona_prepare_runtime_for_crawl_clones_and_cleans_profile(initialized
'template_dir_recorded': (runtime_root / 'template_dir.txt').read_text().strip(),
'chrome_binary_recorded': (runtime_root / 'chrome_binary.txt').read_text().strip(),
}))
"""
""",
)
stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60)
assert code == 0, stderr
payload = json.loads(stdout.strip().splitlines()[-1])
assert payload['runtime_root_exists'] is True
assert payload['runtime_profile_exists'] is True
assert payload['runtime_downloads_exists'] is True
assert payload['preferences_copied'] is True
assert payload['singleton_removed'] is True
assert payload['cache_removed'] is True
assert payload['log_removed'] is True
assert payload['persona_name_recorded'] == 'Default'
assert payload['template_dir_recorded'].endswith('/personas/Default/chrome_user_data')
assert payload['chrome_binary_recorded'] == '/Applications/Chromium.app/Contents/MacOS/Chromium'
assert payload["runtime_root_exists"] is True
assert payload["runtime_profile_exists"] is True
assert payload["runtime_downloads_exists"] is True
assert payload["preferences_copied"] is True
assert payload["singleton_removed"] is True
assert payload["cache_removed"] is True
assert payload["log_removed"] is True
assert payload["persona_name_recorded"] == "Default"
assert payload["template_dir_recorded"].endswith("/personas/Default/chrome_user_data")
assert payload["chrome_binary_recorded"] == "/Applications/Chromium.app/Contents/MacOS/Chromium"
def test_persona_cleanup_runtime_for_crawl_removes_only_runtime_copy(initialized_archive):
@@ -102,15 +102,15 @@ def test_persona_cleanup_runtime_for_crawl_removes_only_runtime_copy(initialized
'runtime_removed': not runtime_root.exists(),
'template_still_exists': (template_dir / 'Default' / 'Preferences').exists(),
}))
"""
""",
)
stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60)
assert code == 0, stderr
payload = json.loads(stdout.strip().splitlines()[-1])
assert payload['runtime_removed'] is True
assert payload['template_still_exists'] is True
assert payload["runtime_removed"] is True
assert payload["template_still_exists"] is True
def test_crawl_resolve_persona_raises_for_missing_persona_id(initialized_archive):
@@ -135,15 +135,15 @@ def test_crawl_resolve_persona_raises_for_missing_persona_id(initialized_archive
print(json.dumps({'raised': True, 'message': str(err)}))
else:
raise SystemExit('resolve_persona unexpectedly succeeded')
"""
""",
)
stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60)
assert code == 0, stderr
payload = json.loads(stdout.strip().splitlines()[-1])
assert payload['raised'] is True
assert 'references missing Persona' in payload['message']
assert payload["raised"] is True
assert "references missing Persona" in payload["message"]
def test_get_config_raises_for_missing_persona_id(initialized_archive):
@@ -169,12 +169,12 @@ def test_get_config_raises_for_missing_persona_id(initialized_archive):
print(json.dumps({'raised': True, 'message': str(err)}))
else:
raise SystemExit('get_config unexpectedly succeeded')
"""
""",
)
stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60)
assert code == 0, stderr
payload = json.loads(stdout.strip().splitlines()[-1])
assert payload['raised'] is True
assert 'references missing Persona' in payload['message']
assert payload["raised"] is True
assert "references missing Persona" in payload["message"]

View File

@@ -3,7 +3,7 @@ import unittest
from pathlib import Path
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "archivebox.settings")
from archivebox.machine.models import Process
@@ -13,26 +13,25 @@ class TestProcessRuntimePaths(unittest.TestCase):
def test_hook_processes_use_isolated_runtime_dir(self):
    """Hook processes keep their runtime files under ``<pwd>/.hooks/<hook script name>/``."""
    process = Process(
        process_type=Process.TypeChoices.HOOK,
        pwd="/tmp/archive/example/chrome",
        cmd=["node", "/plugins/chrome/on_Snapshot__11_chrome_wait.js", "--url=https://example.com"],
    )

    # The runtime dir is expected to be named after the hook script taken from cmd.
    expected_dir = Path("/tmp/archive/example/chrome/.hooks/on_Snapshot__11_chrome_wait.js")
    self.assertEqual(process.runtime_dir, expected_dir)
    self.assertEqual(process.stdout_file, expected_dir / "stdout.log")
    self.assertEqual(process.stderr_file, expected_dir / "stderr.log")
    self.assertEqual(process.pid_file, expected_dir / "process.pid")
def test_non_hook_processes_keep_runtime_files_in_pwd(self):
    """Non-hook (worker) processes keep stdout/stderr/pid files directly in ``pwd``."""
    process = Process(
        process_type=Process.TypeChoices.WORKER,
        pwd="/tmp/archive/example",
        cmd=["archivebox", "run", "--snapshot-id", "123"],
    )

    # For a worker the runtime dir is expected to be pwd itself, with no .hooks/ subdirectory.
    expected_dir = Path("/tmp/archive/example")
    self.assertEqual(process.runtime_dir, expected_dir)
    self.assertEqual(process.stdout_file, expected_dir / "stdout.log")
    self.assertEqual(process.stderr_file, expected_dir / "stderr.log")
    self.assertEqual(process.pid_file, expected_dir / "process.pid")

View File

@@ -11,7 +11,6 @@ from pathlib import Path
import pytest
def wait_for_db_condition(timeout, condition, interval=0.5):
deadline = time.time() + timeout
while time.time() < deadline:
@@ -45,9 +44,7 @@ def run_add_until(args, env, condition, timeout=120):
env=env,
)
assert wait_for_db_condition(timeout=timeout, condition=condition), (
f"Timed out waiting for condition while running: {' '.join(args)}"
)
assert wait_for_db_condition(timeout=timeout, condition=condition), f"Timed out waiting for condition while running: {' '.join(args)}"
return stop_process(proc)
@@ -60,26 +57,28 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs
# Enable only parser extractors and background hooks for this test
env = os.environ.copy()
env.update({
# Disable most extractors
"SAVE_WGET": "false",
"SAVE_SINGLEFILE": "false",
"SAVE_READABILITY": "false",
"SAVE_MERCURY": "false",
"SAVE_HTMLTOTEXT": "false",
"SAVE_PDF": "false",
"SAVE_SCREENSHOT": "false",
"SAVE_DOM": "false",
"SAVE_HEADERS": "false",
"SAVE_GIT": "false",
"SAVE_YTDLP": "false",
"SAVE_ARCHIVEDOTORG": "false",
"SAVE_TITLE": "false",
"SAVE_FAVICON": "true",
})
env.update(
{
# Disable most extractors
"SAVE_WGET": "false",
"SAVE_SINGLEFILE": "false",
"SAVE_READABILITY": "false",
"SAVE_MERCURY": "false",
"SAVE_HTMLTOTEXT": "false",
"SAVE_PDF": "false",
"SAVE_SCREENSHOT": "false",
"SAVE_DOM": "false",
"SAVE_HEADERS": "false",
"SAVE_GIT": "false",
"SAVE_YTDLP": "false",
"SAVE_ARCHIVEDOTORG": "false",
"SAVE_TITLE": "false",
"SAVE_FAVICON": "true",
},
)
proc = subprocess.Popen(
['archivebox', 'add', '--depth=1', '--plugins=favicon,parse_html_urls', recursive_test_site['root_url']],
["archivebox", "add", "--depth=1", "--plugins=favicon,parse_html_urls", recursive_test_site["root_url"]],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
@@ -88,9 +87,12 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs
assert wait_for_db_condition(
timeout=120,
condition=lambda c: c.execute(
"SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')"
).fetchone()[0] > 0,
condition=lambda c: (
c.execute(
"SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')",
).fetchone()[0]
> 0
),
), "Parser extractors never progressed beyond queued status"
stdout, stderr = stop_process(proc)
@@ -99,18 +101,18 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs
if stdout:
print(f"\n=== STDOUT (last 2000 chars) ===\n{stdout[-2000:]}\n=== END STDOUT ===\n")
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshots = c.execute("SELECT url, depth, status FROM core_snapshot").fetchall()
bg_hooks = c.execute(
"SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('favicon', 'consolelog', 'ssl', 'responses', 'redirects', 'staticfile') ORDER BY plugin"
"SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('favicon', 'consolelog', 'ssl', 'responses', 'redirects', 'staticfile') ORDER BY plugin",
).fetchall()
parser_extractors = c.execute(
"SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' ORDER BY plugin"
"SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' ORDER BY plugin",
).fetchall()
all_extractors = c.execute(
"SELECT plugin, status FROM core_archiveresult ORDER BY plugin"
"SELECT plugin, status FROM core_archiveresult ORDER BY plugin",
).fetchall()
conn.close()
@@ -122,14 +124,13 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs
)
assert len(all_extractors) > 0, (
f"Should have extractors created for snapshot. "
f"If this fails, Snapshot.run() may not have started. "
f"Got: {all_extractors}"
f"Should have extractors created for snapshot. If this fails, Snapshot.run() may not have started. Got: {all_extractors}"
)
parser_statuses = [status for _, status in parser_extractors]
assert 'started' in parser_statuses or 'succeeded' in parser_statuses or 'failed' in parser_statuses, \
assert "started" in parser_statuses or "succeeded" in parser_statuses or "failed" in parser_statuses, (
f"Parser extractors should have run, got statuses: {parser_statuses}. Background hooks: {bg_hooks}"
)
def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process, recursive_test_site):
@@ -137,26 +138,28 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process, recursive_test
os.chdir(tmp_path)
env = os.environ.copy()
env.update({
"SAVE_WGET": "false",
"SAVE_SINGLEFILE": "false",
"SAVE_READABILITY": "false",
"SAVE_MERCURY": "false",
"SAVE_HTMLTOTEXT": "false",
"SAVE_PDF": "false",
"SAVE_SCREENSHOT": "false",
"SAVE_DOM": "false",
"SAVE_HEADERS": "false",
"SAVE_GIT": "false",
"SAVE_YTDLP": "false",
"SAVE_ARCHIVEDOTORG": "false",
"SAVE_TITLE": "false",
"SAVE_FAVICON": "false",
"USE_CHROME": "false",
})
env.update(
{
"SAVE_WGET": "false",
"SAVE_SINGLEFILE": "false",
"SAVE_READABILITY": "false",
"SAVE_MERCURY": "false",
"SAVE_HTMLTOTEXT": "false",
"SAVE_PDF": "false",
"SAVE_SCREENSHOT": "false",
"SAVE_DOM": "false",
"SAVE_HEADERS": "false",
"SAVE_GIT": "false",
"SAVE_YTDLP": "false",
"SAVE_ARCHIVEDOTORG": "false",
"SAVE_TITLE": "false",
"SAVE_FAVICON": "false",
"USE_CHROME": "false",
},
)
result = subprocess.run(
['archivebox', 'add', '--depth=0', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
["archivebox", "add", "--depth=0", "--plugins=wget,parse_html_urls", recursive_test_site["root_url"]],
capture_output=True,
text=True,
env=env,
@@ -164,11 +167,11 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process, recursive_test
)
assert result.returncode == 0, result.stderr
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
parse_html = c.execute(
"SELECT id, status, output_str FROM core_archiveresult WHERE plugin LIKE '%parse_html_urls' ORDER BY id LIMIT 1"
"SELECT id, status, output_str FROM core_archiveresult WHERE plugin LIKE '%parse_html_urls' ORDER BY id LIMIT 1",
).fetchone()
conn.close()
@@ -177,11 +180,10 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process, recursive_test
status = parse_html[1]
output = parse_html[2] or ""
assert status in ['started', 'succeeded', 'failed'], \
f"60_parse_html_urls should have run, got status: {status}"
assert status in ["started", "succeeded", "failed"], f"60_parse_html_urls should have run, got status: {status}"
if status == 'succeeded' and output:
assert 'parsed' in output.lower(), "Parser summary should report parsed URLs"
if status == "succeeded" and output:
assert "parsed" in output.lower(), "Parser summary should report parsed URLs"
urls_jsonl_files = list(Path("users/system/snapshots").rglob("parse_html_urls/**/urls.jsonl"))
assert urls_jsonl_files, "parse_html_urls should write urls.jsonl output"
@@ -192,8 +194,7 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process, recursive_test
records.append(json.loads(line))
assert records, "urls.jsonl should contain parsed Snapshot records"
assert all(record.get("type") == "Snapshot" for record in records), \
f"Expected Snapshot JSONL records, got: {records}"
assert all(record.get("type") == "Snapshot" for record in records), f"Expected Snapshot JSONL records, got: {records}"
def test_recursive_crawl_creates_child_snapshots(tmp_path, process, recursive_test_site):
@@ -201,27 +202,29 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process, recursive_te
os.chdir(tmp_path)
env = os.environ.copy()
env.update({
"URL_ALLOWLIST": r"127\.0\.0\.1[:/].*",
"SAVE_READABILITY": "false",
"SAVE_SINGLEFILE": "false",
"SAVE_MERCURY": "false",
"SAVE_SCREENSHOT": "false",
"SAVE_PDF": "false",
"SAVE_HEADERS": "false",
"SAVE_ARCHIVEDOTORG": "false",
"SAVE_GIT": "false",
"SAVE_YTDLP": "false",
"SAVE_TITLE": "false",
})
env.update(
{
"URL_ALLOWLIST": r"127\.0\.0\.1[:/].*",
"SAVE_READABILITY": "false",
"SAVE_SINGLEFILE": "false",
"SAVE_MERCURY": "false",
"SAVE_SCREENSHOT": "false",
"SAVE_PDF": "false",
"SAVE_HEADERS": "false",
"SAVE_ARCHIVEDOTORG": "false",
"SAVE_GIT": "false",
"SAVE_YTDLP": "false",
"SAVE_TITLE": "false",
},
)
stdout, stderr = run_add_until(
['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
["archivebox", "add", "--depth=1", "--plugins=wget,parse_html_urls", recursive_test_site["root_url"]],
env=env,
timeout=120,
condition=lambda c: (
c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 0").fetchone()[0] >= 1
and c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 1").fetchone()[0] >= len(recursive_test_site['child_urls'])
and c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 1").fetchone()[0] >= len(recursive_test_site["child_urls"])
),
)
@@ -230,26 +233,26 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process, recursive_te
if stdout:
print(f"\n=== STDOUT (last 2000 chars) ===\n{stdout[-2000:]}\n=== END STDOUT ===\n")
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
all_snapshots = c.execute("SELECT url, depth FROM core_snapshot").fetchall()
root_snapshot = c.execute(
"SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 0 ORDER BY created_at LIMIT 1"
"SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 0 ORDER BY created_at LIMIT 1",
).fetchone()
child_snapshots = c.execute(
"SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 1"
"SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 1",
).fetchall()
crawl = c.execute(
"SELECT id, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
"SELECT id, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1",
).fetchone()
parser_status = c.execute(
"SELECT plugin, status FROM core_archiveresult WHERE snapshot_id = ? AND plugin LIKE 'parse_%_urls'",
(root_snapshot[0] if root_snapshot else '',)
(root_snapshot[0] if root_snapshot else "",),
).fetchall()
started_extractors = c.execute(
"SELECT plugin, status FROM core_archiveresult WHERE snapshot_id = ? AND status = 'started'",
(root_snapshot[0] if root_snapshot else '',)
(root_snapshot[0] if root_snapshot else "",),
).fetchall()
conn.close()
@@ -260,13 +263,13 @@ def test_recursive_crawl_creates_child_snapshots(tmp_path, process, recursive_te
assert crawl is not None, "Crawl should be created"
assert crawl[1] == 1, f"Crawl max_depth should be 1, got {crawl[1]}"
assert len(child_snapshots) > 0, \
assert len(child_snapshots) > 0, (
f"Child snapshots should be created from monadical.com links. Parser status: {parser_status}. Started extractors blocking: {started_extractors}"
)
for child_id, child_url, child_depth, parent_id in child_snapshots:
assert child_depth == 1, f"Child snapshot should have depth=1, got {child_depth}"
assert parent_id == root_id, \
f"Child snapshot {child_url} should have parent_snapshot_id={root_id}, got {parent_id}"
assert parent_id == root_id, f"Child snapshot {child_url} should have parent_snapshot_id={root_id}, got {parent_id}"
def test_recursive_crawl_respects_depth_limit(tmp_path, process, disable_extractors_dict, recursive_test_site):
@@ -277,45 +280,45 @@ def test_recursive_crawl_respects_depth_limit(tmp_path, process, disable_extract
env["URL_ALLOWLIST"] = r"127\.0\.0\.1[:/].*"
stdout, stderr = run_add_until(
['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
["archivebox", "add", "--depth=1", "--plugins=wget,parse_html_urls", recursive_test_site["root_url"]],
env=env,
timeout=120,
condition=lambda c: (
c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 0").fetchone()[0] >= 1
and c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 1").fetchone()[0] >= len(recursive_test_site['child_urls'])
and c.execute("SELECT COUNT(*) FROM core_snapshot WHERE depth = 1").fetchone()[0] >= len(recursive_test_site["child_urls"])
and c.execute(
"SELECT COUNT(DISTINCT ar.snapshot_id) "
"FROM core_archiveresult ar "
"JOIN core_snapshot s ON s.id = ar.snapshot_id "
"WHERE s.depth = 1 "
"AND ar.plugin LIKE 'parse_%_urls' "
"AND ar.status IN ('started', 'succeeded', 'failed')"
).fetchone()[0] >= len(recursive_test_site['child_urls'])
"AND ar.status IN ('started', 'succeeded', 'failed')",
).fetchone()[0]
>= len(recursive_test_site["child_urls"])
),
)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
max_depth_found = c.execute(
"SELECT MAX(depth) FROM core_snapshot"
"SELECT MAX(depth) FROM core_snapshot",
).fetchone()[0]
depth_counts = c.execute(
"SELECT depth, COUNT(*) FROM core_snapshot GROUP BY depth ORDER BY depth"
"SELECT depth, COUNT(*) FROM core_snapshot GROUP BY depth ORDER BY depth",
).fetchall()
conn.close()
assert max_depth_found is not None, "Should have at least one snapshot"
assert max_depth_found <= 1, \
f"Max depth should not exceed 1, got {max_depth_found}. Depth distribution: {depth_counts}"
assert max_depth_found <= 1, f"Max depth should not exceed 1, got {max_depth_found}. Depth distribution: {depth_counts}"
def test_crawl_snapshot_has_parent_snapshot_field(tmp_path, process, disable_extractors_dict):
"""Test that Snapshot model has parent_snapshot field."""
os.chdir(tmp_path)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
# Check schema for parent_snapshot_id column
@@ -324,15 +327,14 @@ def test_crawl_snapshot_has_parent_snapshot_field(tmp_path, process, disable_ext
column_names = [col[1] for col in schema]
assert 'parent_snapshot_id' in column_names, \
f"Snapshot table should have parent_snapshot_id column. Columns: {column_names}"
assert "parent_snapshot_id" in column_names, f"Snapshot table should have parent_snapshot_id column. Columns: {column_names}"
def test_snapshot_depth_field_exists(tmp_path, process, disable_extractors_dict):
"""Test that Snapshot model has depth field."""
os.chdir(tmp_path)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
# Check schema for depth column
@@ -341,8 +343,7 @@ def test_snapshot_depth_field_exists(tmp_path, process, disable_extractors_dict)
column_names = [col[1] for col in schema]
assert 'depth' in column_names, \
f"Snapshot table should have depth column. Columns: {column_names}"
assert "depth" in column_names, f"Snapshot table should have depth column. Columns: {column_names}"
def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict, recursive_test_site):
@@ -353,21 +354,24 @@ def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict
env["URL_ALLOWLIST"] = r"127\.0\.0\.1[:/].*"
stdout, stderr = run_add_until(
['archivebox', 'add', '--depth=1', '--plugins=wget,parse_html_urls', recursive_test_site['root_url']],
["archivebox", "add", "--depth=1", "--plugins=wget,parse_html_urls", recursive_test_site["root_url"]],
env=env,
timeout=120,
condition=lambda c: c.execute(
"SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
(recursive_test_site['root_url'],),
).fetchone()[0] >= 1,
condition=lambda c: (
c.execute(
"SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
(recursive_test_site["root_url"],),
).fetchone()[0]
>= 1
),
)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot = c.execute(
"SELECT id, depth FROM core_snapshot WHERE url = ? ORDER BY created_at LIMIT 1",
(recursive_test_site['root_url'],)
(recursive_test_site["root_url"],),
).fetchone()
conn.close()
@@ -381,42 +385,47 @@ def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, p
os.chdir(tmp_path)
env = os.environ.copy()
env.update({
"SAVE_WGET": "true",
"SAVE_SINGLEFILE": "false",
"SAVE_PDF": "false",
"SAVE_SCREENSHOT": "false",
"SAVE_FAVICON": "true",
})
stdout, stderr = run_add_until(
['archivebox', 'add', '--plugins=favicon,wget,parse_html_urls', recursive_test_site['root_url']],
env=env,
timeout=120,
condition=lambda c: c.execute(
"SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')"
).fetchone()[0] > 0,
env.update(
{
"SAVE_WGET": "true",
"SAVE_SINGLEFILE": "false",
"SAVE_PDF": "false",
"SAVE_SCREENSHOT": "false",
"SAVE_FAVICON": "true",
},
)
conn = sqlite3.connect('index.sqlite3')
stdout, stderr = run_add_until(
["archivebox", "add", "--plugins=favicon,wget,parse_html_urls", recursive_test_site["root_url"]],
env=env,
timeout=120,
condition=lambda c: (
c.execute(
"SELECT COUNT(*) FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' AND status IN ('started', 'succeeded', 'failed')",
).fetchone()[0]
> 0
),
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
bg_results = c.execute(
"SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('favicon', 'consolelog', 'ssl', 'responses', 'redirects', 'staticfile') AND status IN ('started', 'succeeded', 'failed')"
"SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('favicon', 'consolelog', 'ssl', 'responses', 'redirects', 'staticfile') AND status IN ('started', 'succeeded', 'failed')",
).fetchall()
parser_status = c.execute(
"SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls'"
"SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls'",
).fetchall()
conn.close()
if len(bg_results) > 0:
parser_statuses = [status for _, status in parser_status]
non_queued = [s for s in parser_statuses if s != 'queued']
assert len(non_queued) > 0 or len(parser_status) == 0, \
f"With {len(bg_results)} background hooks started, parser extractors should still run. " \
f"Got statuses: {parser_statuses}"
non_queued = [s for s in parser_statuses if s != "queued"]
assert len(non_queued) > 0 or len(parser_status) == 0, (
f"With {len(bg_results)} background hooks started, parser extractors should still run. Got statuses: {parser_statuses}"
)
if __name__ == '__main__':
pytest.main([__file__, '-v'])
if __name__ == "__main__":
pytest.main([__file__, "-v"])

View File

@@ -1,5 +1,8 @@
import asyncio
import json
import subprocess
import sys
from pathlib import Path
from types import SimpleNamespace
import pytest
@@ -12,6 +15,15 @@ pytestmark = pytest.mark.django_db
class _DummyBus:
    """Minimal in-memory stand-in for an event bus used by the runner tests.

    It records handler registrations made through ``on`` and drops them
    again through ``off``; it never dispatches any events.
    """

    def __init__(self, name: str):
        self.name = name
        # Every registration handed out by ``on`` that has not been removed yet.
        self.registrations = []

    def on(self, event_pattern, handler):
        """Record a handler for ``event_pattern`` and return the registration object."""
        entry = SimpleNamespace(event_pattern=event_pattern, handler=handler)
        self.registrations.append(entry)
        return entry

    def off(self, event_pattern, registration):
        """Forget ``registration`` (matched by identity); ``event_pattern`` is ignored."""
        remaining = []
        for entry in self.registrations:
            if entry is not registration:
                remaining.append(entry)
        # Rebind rather than mutate in place, so earlier references to the
        # old list are left untouched.
        self.registrations = remaining

    async def stop(self):
        """No-op shutdown hook; always returns ``None``."""
        return None
@@ -41,16 +53,16 @@ def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
from archivebox.services import runner as runner_module
crawl = Crawl.objects.create(
urls='https://blog.sweeting.me\nhttps://sweeting.me',
urls="https://blog.sweeting.me\nhttps://sweeting.me",
created_by_id=get_or_create_system_user_pk(),
)
snapshot_a = Snapshot.objects.create(
url='https://blog.sweeting.me',
url="https://blog.sweeting.me",
crawl=crawl,
status=Snapshot.StatusChoices.QUEUED,
)
snapshot_b = Snapshot.objects.create(
url='https://sweeting.me',
url="https://sweeting.me",
crawl=crawl,
status=Snapshot.StatusChoices.QUEUED,
)
@@ -62,64 +74,66 @@ def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
created_buses.append(bus)
return bus
monkeypatch.setattr(runner_module, 'create_bus', fake_create_bus)
monkeypatch.setattr(runner_module, 'discover_plugins', lambda: {})
monkeypatch.setattr(runner_module, 'ProcessService', _DummyService)
monkeypatch.setattr(runner_module, 'MachineService', _DummyService)
monkeypatch.setattr(runner_module, 'BinaryService', _DummyService)
monkeypatch.setattr(runner_module, 'TagService', _DummyService)
monkeypatch.setattr(runner_module, 'CrawlService', _DummyService)
monkeypatch.setattr(runner_module, 'SnapshotService', _DummyService)
monkeypatch.setattr(runner_module, 'ArchiveResultService', _DummyService)
monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
monkeypatch.setattr(runner_module, "create_bus", fake_create_bus)
monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
monkeypatch.setattr(runner_module, "MachineService", _DummyService)
monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
monkeypatch.setattr(runner_module, "TagService", _DummyService)
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
download_calls = []
async def fake_download(*, url, bus, config_overrides, snapshot, **kwargs):
download_calls.append(
{
'url': url,
'bus': bus,
'snapshot_id': config_overrides['SNAPSHOT_ID'],
'source_url': config_overrides['SOURCE_URL'],
'abx_snapshot_id': snapshot.id,
}
"url": url,
"bus": bus,
"snapshot_id": config_overrides["SNAPSHOT_ID"],
"source_url": config_overrides["SOURCE_URL"],
"abx_snapshot_id": snapshot.id,
},
)
await asyncio.sleep(0)
return []
monkeypatch.setattr(runner_module, 'download', fake_download)
monkeypatch.setattr(runner_module, "download", fake_download)
crawl_runner = runner_module.CrawlRunner(crawl)
snapshot_data = {
str(snapshot_a.id): {
'id': str(snapshot_a.id),
'url': snapshot_a.url,
'title': snapshot_a.title,
'timestamp': snapshot_a.timestamp,
'bookmarked_at': snapshot_a.bookmarked_at.isoformat() if snapshot_a.bookmarked_at else "",
'created_at': snapshot_a.created_at.isoformat() if snapshot_a.created_at else "",
'tags': snapshot_a.tags_str(),
'depth': snapshot_a.depth,
'parent_snapshot_id': str(snapshot_a.parent_snapshot_id) if snapshot_a.parent_snapshot_id else None,
'output_dir': str(snapshot_a.output_dir),
'config': crawl_runner._snapshot_config(snapshot_a),
"id": str(snapshot_a.id),
"url": snapshot_a.url,
"status": snapshot_a.status,
"title": snapshot_a.title,
"timestamp": snapshot_a.timestamp,
"bookmarked_at": snapshot_a.bookmarked_at.isoformat() if snapshot_a.bookmarked_at else "",
"created_at": snapshot_a.created_at.isoformat() if snapshot_a.created_at else "",
"tags": snapshot_a.tags_str(),
"depth": snapshot_a.depth,
"parent_snapshot_id": str(snapshot_a.parent_snapshot_id) if snapshot_a.parent_snapshot_id else None,
"output_dir": str(snapshot_a.output_dir),
"config": crawl_runner._snapshot_config(snapshot_a),
},
str(snapshot_b.id): {
'id': str(snapshot_b.id),
'url': snapshot_b.url,
'title': snapshot_b.title,
'timestamp': snapshot_b.timestamp,
'bookmarked_at': snapshot_b.bookmarked_at.isoformat() if snapshot_b.bookmarked_at else "",
'created_at': snapshot_b.created_at.isoformat() if snapshot_b.created_at else "",
'tags': snapshot_b.tags_str(),
'depth': snapshot_b.depth,
'parent_snapshot_id': str(snapshot_b.parent_snapshot_id) if snapshot_b.parent_snapshot_id else None,
'output_dir': str(snapshot_b.output_dir),
'config': crawl_runner._snapshot_config(snapshot_b),
"id": str(snapshot_b.id),
"url": snapshot_b.url,
"status": snapshot_b.status,
"title": snapshot_b.title,
"timestamp": snapshot_b.timestamp,
"bookmarked_at": snapshot_b.bookmarked_at.isoformat() if snapshot_b.bookmarked_at else "",
"created_at": snapshot_b.created_at.isoformat() if snapshot_b.created_at else "",
"tags": snapshot_b.tags_str(),
"depth": snapshot_b.depth,
"parent_snapshot_id": str(snapshot_b.parent_snapshot_id) if snapshot_b.parent_snapshot_id else None,
"output_dir": str(snapshot_b.output_dir),
"config": crawl_runner._snapshot_config(snapshot_b),
},
}
monkeypatch.setattr(crawl_runner, '_load_snapshot_run_data', lambda snapshot_id: snapshot_data[snapshot_id])
monkeypatch.setattr(crawl_runner, "_load_snapshot_run_data", lambda snapshot_id: snapshot_data[snapshot_id])
async def run_both():
await asyncio.gather(
@@ -130,9 +144,9 @@ def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
asyncio.run(run_both())
assert len(download_calls) == 2
assert {call['snapshot_id'] for call in download_calls} == {str(snapshot_a.id), str(snapshot_b.id)}
assert {call['source_url'] for call in download_calls} == {snapshot_a.url, snapshot_b.url}
assert len({id(call['bus']) for call in download_calls}) == 2
assert {call["snapshot_id"] for call in download_calls} == {str(snapshot_a.id), str(snapshot_b.id)}
assert {call["source_url"] for call in download_calls} == {snapshot_a.url, snapshot_b.url}
assert len({id(call["bus"]) for call in download_calls}) == 2
assert len(created_buses) == 3 # 1 crawl bus + 2 isolated snapshot buses
@@ -146,38 +160,40 @@ def test_ensure_background_runner_starts_when_none_running(monkeypatch):
def __init__(self, args, **kwargs):
popen_calls.append((args, kwargs))
monkeypatch.setattr(machine_models.Process, 'cleanup_stale_running', classmethod(lambda cls, machine=None: 0))
monkeypatch.setattr(machine_models.Machine, 'current', classmethod(lambda cls: SimpleNamespace(id='machine-1')))
monkeypatch.setattr(machine_models.Process, "cleanup_stale_running", classmethod(lambda cls, machine=None: 0))
monkeypatch.setattr(machine_models.Process, "cleanup_orphaned_workers", classmethod(lambda cls: 0))
monkeypatch.setattr(machine_models.Machine, "current", classmethod(lambda cls: SimpleNamespace(id="machine-1")))
monkeypatch.setattr(
machine_models.Process.objects,
'filter',
"filter",
lambda **kwargs: SimpleNamespace(exists=lambda: False),
)
monkeypatch.setattr(runner_module.subprocess, 'Popen', DummyPopen)
monkeypatch.setattr(runner_module.subprocess, "Popen", DummyPopen)
started = runner_module.ensure_background_runner(allow_under_pytest=True)
assert started is True
assert len(popen_calls) == 1
assert popen_calls[0][0] == [runner_module.sys.executable, '-m', 'archivebox', 'run', '--daemon']
assert popen_calls[0][1]['stdin'] is subprocess.DEVNULL
assert popen_calls[0][0] == [runner_module.sys.executable, "-m", "archivebox", "run", "--daemon"]
assert popen_calls[0][1]["stdin"] is subprocess.DEVNULL
def test_ensure_background_runner_skips_when_orchestrator_running(monkeypatch):
import archivebox.machine.models as machine_models
from archivebox.services import runner as runner_module
monkeypatch.setattr(machine_models.Process, 'cleanup_stale_running', classmethod(lambda cls, machine=None: 0))
monkeypatch.setattr(machine_models.Machine, 'current', classmethod(lambda cls: SimpleNamespace(id='machine-1')))
monkeypatch.setattr(machine_models.Process, "cleanup_stale_running", classmethod(lambda cls, machine=None: 0))
monkeypatch.setattr(machine_models.Process, "cleanup_orphaned_workers", classmethod(lambda cls: 0))
monkeypatch.setattr(machine_models.Machine, "current", classmethod(lambda cls: SimpleNamespace(id="machine-1")))
monkeypatch.setattr(
machine_models.Process.objects,
'filter',
"filter",
lambda **kwargs: SimpleNamespace(exists=lambda: True),
)
monkeypatch.setattr(
runner_module.subprocess,
'Popen',
lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError('runner should not be spawned')),
"Popen",
lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError("runner should not be spawned")),
)
started = runner_module.ensure_background_runner(allow_under_pytest=True)
@@ -191,20 +207,20 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
from archivebox.services import runner as runner_module
crawl = Crawl.objects.create(
urls='https://example.com',
urls="https://example.com",
created_by_id=get_or_create_system_user_pk(),
)
class _Iface:
id = 'iface-1'
machine = SimpleNamespace(id='machine-1')
machine_id = 'machine-1'
id = "iface-1"
machine = SimpleNamespace(id="machine-1")
machine_id = "machine-1"
saved_updates = []
class _Proc:
iface_id = None
machine_id = 'machine-1'
machine_id = "machine-1"
iface = None
machine = None
@@ -213,23 +229,23 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
proc = _Proc()
monkeypatch.setattr(runner_module, 'discover_plugins', lambda: {})
monkeypatch.setattr(runner_module, 'create_bus', lambda **kwargs: _DummyBus(kwargs['name']))
monkeypatch.setattr(runner_module, 'ProcessService', _DummyService)
monkeypatch.setattr(runner_module, 'MachineService', _DummyService)
monkeypatch.setattr(runner_module, 'BinaryService', _DummyService)
monkeypatch.setattr(runner_module, 'TagService', _DummyService)
monkeypatch.setattr(runner_module, 'CrawlService', _DummyService)
monkeypatch.setattr(runner_module, 'SnapshotService', _DummyService)
monkeypatch.setattr(runner_module, 'ArchiveResultService', _DummyService)
monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
monkeypatch.setattr(runner_module, "create_bus", lambda **kwargs: _DummyBus(kwargs["name"]))
monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
monkeypatch.setattr(runner_module, "MachineService", _DummyService)
monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
monkeypatch.setattr(runner_module, "TagService", _DummyService)
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
from archivebox.machine.models import NetworkInterface, Process
from archivebox.config import configset as configset_module
refresh_calls = []
monkeypatch.setattr(NetworkInterface, 'current', classmethod(lambda cls, refresh=False: refresh_calls.append(refresh) or _Iface()))
monkeypatch.setattr(Process, 'current', classmethod(lambda cls: proc))
monkeypatch.setattr(configset_module, 'get_config', lambda **kwargs: {})
monkeypatch.setattr(NetworkInterface, "current", classmethod(lambda cls, refresh=False: refresh_calls.append(refresh) or _Iface()))
monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {})
crawl_runner = runner_module.CrawlRunner(crawl)
crawl_runner._prepare()
@@ -237,7 +253,182 @@ def test_runner_prepare_refreshes_network_interface_and_attaches_current_process
assert refresh_calls == [True]
assert proc.iface is not None
assert proc.machine == proc.iface.machine
assert saved_updates == [('iface', 'machine', 'modified_at')]
assert saved_updates == [("iface", "machine", "modified_at")]
def test_installed_binary_config_overrides_include_valid_installed_binaries(monkeypatch):
    """Only installed binaries that pass the file/executable checks become overrides.

    Two INSTALLED Binary rows are created: ``postlight-parser`` pointing at
    ``sys.executable`` and ``wget`` pointing at ``/tmp/not-an-executable``.
    ``Path.is_file`` is patched to accept both paths while ``os.access`` is
    patched to accept only ``sys.executable``, so the wget row must be left
    out of the returned overrides.
    """
    from archivebox.machine.models import Binary, Machine
    from archivebox.services import runner as runner_module
    from abx_dl.models import Plugin

    machine_fields = {
        "guid": "test-guid-runner-overrides",
        "hostname": "runner-host",
        "hw_in_docker": False,
        "hw_in_vm": False,
        "hw_manufacturer": "Test",
        "hw_product": "Test Product",
        "hw_uuid": "test-hw-runner-overrides",
        "os_arch": "arm64",
        "os_family": "darwin",
        "os_platform": "macOS",
        "os_release": "14.0",
        "os_kernel": "Darwin",
        "stats": {},
        "config": {},
    }
    machine = Machine.objects.create(**machine_fields)

    usable_binary = Binary.objects.create(
        machine=machine,
        name="postlight-parser",
        abspath=sys.executable,
        version="2.0.0",
        binprovider="pip",
        binproviders="env,pip",
        status=Binary.StatusChoices.INSTALLED,
    )
    unusable_binary = Binary.objects.create(
        machine=machine,
        name="wget",
        abspath="/tmp/not-an-executable",
        version="1.0.0",
        binprovider="env",
        binproviders="env",
        status=Binary.StatusChoices.INSTALLED,
    )

    def fake_is_file(self):
        # Both recorded abspaths "exist" as files, including the bogus wget one.
        return str(self) in {sys.executable, usable_binary.abspath, unusable_binary.abspath}

    def fake_access(path, mode):
        # Only the real interpreter counts as accessible.
        return str(path) == sys.executable

    monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
    monkeypatch.setattr(Path, "is_file", fake_is_file)
    monkeypatch.setattr(runner_module.os, "access", fake_access)

    plugins = {
        "mercury": Plugin(
            name="mercury",
            path=Path("."),
            hooks=[],
            config_schema={"MERCURY_BINARY": {"type": "string", "default": "postlight-parser"}},
        ),
    }
    overrides = runner_module._installed_binary_config_overrides(plugins)

    assert overrides["MERCURY_BINARY"] == sys.executable
    assert overrides["POSTLIGHT_PARSER_BINARY"] == sys.executable
    assert "WGET_BINARY" not in overrides
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch):
    """A snapshot is cancelled, not downloaded, when the limit check reports ``max_size``.

    ``_limit_stop_reason`` is patched to always return ``"max_size"`` and
    ``download`` is patched to fail the test if it is ever called. The test
    then asserts that ``_run_snapshot`` routes the snapshot id to
    ``_cancel_snapshot_due_to_limit``.
    """
    import asgiref.sync

    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.services import runner as runner_module

    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
        max_size=16,
    )

    def fake_create_bus(**kwargs):
        return _DummyBus(kwargs["name"])

    def fake_limit_stop_reason(config):
        return "max_size"

    def fake_sync_to_async(func, thread_sensitive=True):
        # Delegate to the file's _call_sync helper instead of the real asgiref wrapper.
        def call_directly(*args, **kwargs):
            return _call_sync(func, *args, **kwargs)

        return call_directly

    def fail_if_downloaded(*args, **kwargs):
        raise AssertionError("snapshot download should have been skipped")

    monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
    monkeypatch.setattr(runner_module, "create_bus", fake_create_bus)
    for service_name in (
        "ProcessService",
        "MachineService",
        "BinaryService",
        "TagService",
        "CrawlService",
        "SnapshotService",
        "ArchiveResultService",
    ):
        monkeypatch.setattr(runner_module, service_name, _DummyService)
    monkeypatch.setattr(runner_module, "_limit_stop_reason", fake_limit_stop_reason)
    monkeypatch.setattr(asgiref.sync, "sync_to_async", fake_sync_to_async)
    monkeypatch.setattr(runner_module, "download", fail_if_downloaded)

    crawl_runner = runner_module.CrawlRunner(crawl)
    cancelled: list[str] = []

    def fake_load_snapshot_run_data(snapshot_id):
        # Canned run data for a queued depth-1 child snapshot.
        return {
            "id": snapshot_id,
            "url": "https://example.com/child",
            "title": "",
            "timestamp": "",
            "bookmarked_at": "",
            "created_at": "",
            "tags": "",
            "depth": 1,
            "status": "queued",
            "parent_snapshot_id": None,
            "output_dir": "/tmp/child",
            "config": {"CRAWL_DIR": "/tmp/crawl", "MAX_SIZE": 16},
        }

    def record_cancellation(snapshot_id):
        cancelled.append(snapshot_id)

    crawl_runner._load_snapshot_run_data = fake_load_snapshot_run_data
    crawl_runner._cancel_snapshot_due_to_limit = record_cancellation

    asyncio.run(crawl_runner._run_snapshot("child-1"))

    assert cancelled == ["child-1"]
def test_seal_snapshot_cancels_queued_descendants_after_max_size():
    """Sealing the root snapshot also seals a queued child once ``max_size`` was hit.

    A ``limits.json`` file with ``stop_reason`` set to ``"max_size"`` is
    written under the crawl's ``.abx-dl`` directory before the root snapshot
    is sealed. The test then checks that both snapshots end up SEALED and
    that the child has no ``retry_at``.
    """
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services.snapshot_service import SnapshotService
    from abx_dl.orchestrator import create_bus

    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
        max_size=16,
    )
    root = Snapshot.objects.create(
        url="https://example.com",
        crawl=crawl,
        status=Snapshot.StatusChoices.STARTED,
    )
    child = Snapshot.objects.create(
        url="https://example.com/child",
        crawl=crawl,
        depth=1,
        parent_snapshot_id=root.id,
        status=Snapshot.StatusChoices.QUEUED,
    )

    # Persisted limit state: total_size (32) is above the crawl's max_size (16).
    limits_state = {
        "admitted_snapshot_ids": [str(root.id), str(child.id)],
        "counted_process_ids": ["proc-1"],
        "total_size": 32,
        "stop_reason": "max_size",
    }
    state_dir = Path(crawl.output_dir) / ".abx-dl"
    state_dir.mkdir(parents=True, exist_ok=True)
    limits_path = state_dir / "limits.json"
    limits_path.write_text(json.dumps(limits_state), encoding="utf-8")

    bus = create_bus(name="test_snapshot_limit_cancel")
    service = SnapshotService(bus, crawl_id=str(crawl.id), schedule_snapshot=lambda snapshot_id: None)
    try:
        sealed_id = service._seal_snapshot(str(root.id))
    finally:
        # Always stop the bus, even if sealing raised.
        asyncio.run(bus.stop())

    root.refresh_from_db()
    child.refresh_from_db()

    assert sealed_id == str(root.id)
    assert root.status == Snapshot.StatusChoices.SEALED
    assert child.status == Snapshot.StatusChoices.SEALED
    assert child.retry_at is None
def test_create_crawl_api_queues_crawl_without_spawning_runner(monkeypatch):
@@ -245,28 +436,28 @@ def test_create_crawl_api_queues_crawl_without_spawning_runner(monkeypatch):
from archivebox.api.v1_crawls import CrawlCreateSchema, create_crawl
user = get_user_model().objects.create_superuser(
username='runner-api-admin',
email='runner-api-admin@example.com',
password='testpassword',
username="runner-api-admin",
email="runner-api-admin@example.com",
password="testpassword",
)
request = RequestFactory().post('/api/v1/crawls')
request = RequestFactory().post("/api/v1/crawls")
request.user = user
crawl = create_crawl(
request,
CrawlCreateSchema(
urls=['https://example.com'],
urls=["https://example.com"],
max_depth=0,
tags=[],
tags_str='',
label='',
notes='',
tags_str="",
label="",
notes="",
config={},
),
)
assert str(crawl.id)
assert crawl.status == 'queued'
assert crawl.status == "queued"
assert crawl.retry_at is not None
@@ -278,36 +469,36 @@ def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
from archivebox.services import runner as runner_module
crawl = Crawl.objects.create(
urls='https://example.com',
urls="https://example.com",
created_by_id=get_or_create_system_user_pk(),
status=Crawl.StatusChoices.STARTED,
)
snapshot = Snapshot.objects.create(
url='https://example.com',
url="https://example.com",
crawl=crawl,
status=Snapshot.StatusChoices.STARTED,
)
monkeypatch.setattr(runner_module, '_attach_bus_trace', lambda bus: None)
monkeypatch.setattr(runner_module, '_stop_bus_trace', lambda bus: asyncio.sleep(0))
monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None)
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0))
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
monkeypatch.setattr(
asgiref.sync,
'sync_to_async',
lambda func, thread_sensitive=True: (lambda *args, **kwargs: _call_sync(func, *args, **kwargs)),
"sync_to_async",
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
)
monkeypatch.setattr(Crawl.objects, 'get', lambda id: crawl)
monkeypatch.setattr(crawl, 'is_finished', lambda: False)
monkeypatch.setattr(crawl, 'save', lambda *args, **kwargs: None)
monkeypatch.setattr(runner_module.CrawlRunner, '_prepare', lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, '_create_live_ui', lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, '_initial_snapshot_ids', lambda self: [str(snapshot.id)])
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_setup', lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, 'enqueue_snapshot', lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, '_wait_for_snapshot_tasks', lambda self: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_cleanup', lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, '_cleanup_persona', lambda self: None)
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
monkeypatch.setattr(crawl, "is_finished", lambda: False)
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)])
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None)
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
@@ -323,39 +514,39 @@ def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
from archivebox.services import runner as runner_module
crawl = Crawl.objects.create(
urls='https://example.com',
urls="https://example.com",
created_by_id=get_or_create_system_user_pk(),
status=Crawl.StatusChoices.STARTED,
)
snapshot = Snapshot.objects.create(
url='https://example.com',
url="https://example.com",
crawl=crawl,
status=Snapshot.StatusChoices.STARTED,
)
monkeypatch.setattr(runner_module, 'create_bus', lambda *args, **kwargs: _DummyBus('runner'))
monkeypatch.setattr(runner_module, 'discover_plugins', lambda: {})
monkeypatch.setattr(runner_module, 'ProcessService', _DummyService)
monkeypatch.setattr(runner_module, 'MachineService', _DummyService)
monkeypatch.setattr(runner_module, 'BinaryService', _DummyService)
monkeypatch.setattr(runner_module, 'TagService', _DummyService)
monkeypatch.setattr(runner_module, 'CrawlService', _DummyService)
monkeypatch.setattr(runner_module, 'SnapshotService', _DummyService)
monkeypatch.setattr(runner_module, 'ArchiveResultService', _DummyService)
monkeypatch.setattr(runner_module, '_attach_bus_trace', lambda bus: None)
monkeypatch.setattr(runner_module, '_stop_bus_trace', lambda bus: asyncio.sleep(0))
monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
monkeypatch.setattr(Crawl.objects, 'get', lambda id: crawl)
monkeypatch.setattr(crawl, 'save', lambda *args, **kwargs: None)
monkeypatch.setattr(crawl, 'cleanup', lambda: None)
monkeypatch.setattr(runner_module.CrawlRunner, '_prepare', lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, '_create_live_ui', lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, '_initial_snapshot_ids', lambda self: [str(snapshot.id)])
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_setup', lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, 'enqueue_snapshot', lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, '_wait_for_snapshot_tasks', lambda self: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_cleanup', lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, '_cleanup_persona', lambda self: None)
monkeypatch.setattr(runner_module, "create_bus", lambda *args, **kwargs: _DummyBus("runner"))
monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
monkeypatch.setattr(runner_module, "MachineService", _DummyService)
monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
monkeypatch.setattr(runner_module, "TagService", _DummyService)
monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None)
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0))
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
monkeypatch.setattr(crawl, "cleanup", lambda: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)])
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None)
sync_to_async_wrapped: list[str] = []
sync_to_async_active = False
@@ -363,28 +554,29 @@ def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
def fake_sync_to_async(func, thread_sensitive=True):
async def wrapper(*args, **kwargs):
nonlocal sync_to_async_active
sync_to_async_wrapped.append(getattr(func, '__name__', repr(func)))
sync_to_async_wrapped.append(getattr(func, "__name__", repr(func)))
previous = sync_to_async_active
sync_to_async_active = True
try:
return func(*args, **kwargs)
finally:
sync_to_async_active = previous
return wrapper
def guarded_is_finished():
assert sync_to_async_active is True
return False
monkeypatch.setattr(asgiref.sync, 'sync_to_async', fake_sync_to_async)
monkeypatch.setattr(crawl, 'is_finished', guarded_is_finished)
monkeypatch.setattr(asgiref.sync, "sync_to_async", fake_sync_to_async)
monkeypatch.setattr(crawl, "is_finished", guarded_is_finished)
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
crawl.refresh_from_db()
assert crawl.status == Crawl.StatusChoices.STARTED
assert crawl.retry_at is not None
assert 'guarded_is_finished' in sync_to_async_wrapped
assert "guarded_is_finished" in sync_to_async_wrapped
def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
@@ -393,16 +585,16 @@ def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
from archivebox.services import runner as runner_module
crawl = Crawl.objects.create(
urls='https://example.com',
urls="https://example.com",
created_by_id=get_or_create_system_user_pk(),
)
crawl_runner = runner_module.CrawlRunner(crawl)
async def run_test():
task = asyncio.get_running_loop().create_future()
task.set_exception(RuntimeError('snapshot failed'))
crawl_runner.snapshot_tasks['snap-1'] = task
with pytest.raises(RuntimeError, match='snapshot failed'):
task.set_exception(RuntimeError("snapshot failed"))
crawl_runner.snapshot_tasks["snap-1"] = task
with pytest.raises(RuntimeError, match="snapshot failed"):
await crawl_runner._wait_for_snapshot_tasks()
asyncio.run(run_test())
@@ -414,7 +606,7 @@ def test_wait_for_snapshot_tasks_returns_after_completed_tasks_are_pruned():
from archivebox.services import runner as runner_module
crawl = Crawl.objects.create(
urls='https://example.com',
urls="https://example.com",
created_by_id=get_or_create_system_user_pk(),
)
crawl_runner = runner_module.CrawlRunner(crawl)
@@ -424,7 +616,7 @@ def test_wait_for_snapshot_tasks_returns_after_completed_tasks_are_pruned():
async def run_test():
task = asyncio.create_task(finish_snapshot())
crawl_runner.snapshot_tasks['snap-1'] = task
crawl_runner.snapshot_tasks["snap-1"] = task
await asyncio.wait_for(crawl_runner._wait_for_snapshot_tasks(), timeout=0.5)
assert crawl_runner.snapshot_tasks == {}
@@ -439,43 +631,47 @@ def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
from archivebox.services import runner as runner_module
crawl = Crawl.objects.create(
urls='https://example.com',
urls="https://example.com",
created_by_id=get_or_create_system_user_pk(),
status=Crawl.StatusChoices.STARTED,
)
snapshot = Snapshot.objects.create(
url='https://example.com',
url="https://example.com",
crawl=crawl,
status=Snapshot.StatusChoices.STARTED,
)
monkeypatch.setattr(runner_module, '_attach_bus_trace', lambda bus: None)
monkeypatch.setattr(runner_module, '_stop_bus_trace', lambda bus: asyncio.sleep(0))
monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
monkeypatch.setattr(runner_module, "_attach_bus_trace", lambda bus: None)
monkeypatch.setattr(runner_module, "_stop_bus_trace", lambda bus: asyncio.sleep(0))
monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: _DummyAbxServices())
monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
monkeypatch.setattr(
asgiref.sync,
'sync_to_async',
lambda func, thread_sensitive=True: (lambda *args, **kwargs: _call_sync(func, *args, **kwargs)),
"sync_to_async",
lambda func, thread_sensitive=True: lambda *args, **kwargs: _call_sync(func, *args, **kwargs),
)
monkeypatch.setattr(Crawl.objects, 'get', lambda id: crawl)
monkeypatch.setattr(crawl, 'is_finished', lambda: False)
monkeypatch.setattr(crawl, 'save', lambda *args, **kwargs: None)
monkeypatch.setattr(runner_module.CrawlRunner, '_prepare', lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, '_create_live_ui', lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, '_initial_snapshot_ids', lambda self: [str(snapshot.id)])
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_setup', lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, 'enqueue_snapshot', lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, '_wait_for_snapshot_tasks', lambda self: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, '_cleanup_persona', lambda self: None)
monkeypatch.setattr(Crawl.objects, "get", lambda id: crawl)
monkeypatch.setattr(crawl, "is_finished", lambda: False)
monkeypatch.setattr(crawl, "save", lambda *args, **kwargs: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_prepare", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
monkeypatch.setattr(runner_module.CrawlRunner, "_initial_snapshot_ids", lambda self: [str(snapshot.id)])
monkeypatch.setattr(runner_module.CrawlRunner, "_run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
monkeypatch.setattr(runner_module.CrawlRunner, "_cleanup_persona", lambda self: None)
cleanup_calls = []
monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_cleanup', lambda self, snapshot_id: cleanup_calls.append('abx_cleanup') or asyncio.sleep(0))
monkeypatch.setattr(crawl, 'cleanup', lambda: cleanup_calls.append('crawl_cleanup'))
monkeypatch.setattr(
runner_module.CrawlRunner,
"_run_crawl_cleanup",
lambda self, snapshot_id: cleanup_calls.append("abx_cleanup") or asyncio.sleep(0),
)
monkeypatch.setattr(crawl, "cleanup", lambda: cleanup_calls.append("crawl_cleanup"))
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
assert cleanup_calls == ['crawl_cleanup', 'abx_cleanup']
assert cleanup_calls == ["crawl_cleanup", "abx_cleanup"]
def test_abx_process_service_background_monitor_finishes_after_process_exit(monkeypatch, tmp_path):
@@ -497,7 +693,7 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
return ["daemon output\n"]
service._emit_event = fake_emit_event
monkeypatch.setattr(service, '_stream_stdout', fake_stream_stdout)
monkeypatch.setattr(service, "_stream_stdout", fake_stream_stdout)
class FakeAsyncProcess:
def __init__(self):
@@ -509,32 +705,32 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
self.returncode = 0
return 0
plugin_output_dir = tmp_path / 'chrome'
plugin_output_dir = tmp_path / "chrome"
plugin_output_dir.mkdir()
stdout_file = plugin_output_dir / 'on_Crawl__90_chrome_launch.daemon.bg.stdout.log'
stderr_file = plugin_output_dir / 'on_Crawl__90_chrome_launch.daemon.bg.stderr.log'
stderr_file.write_text('')
pid_file = plugin_output_dir / 'on_Crawl__90_chrome_launch.daemon.bg.pid'
pid_file.write_text('12345')
stdout_file = plugin_output_dir / "on_Crawl__90_chrome_launch.daemon.bg.stdout.log"
stderr_file = plugin_output_dir / "on_Crawl__90_chrome_launch.daemon.bg.stderr.log"
stderr_file.write_text("")
pid_file = plugin_output_dir / "on_Crawl__90_chrome_launch.daemon.bg.pid"
pid_file.write_text("12345")
proc = AbxProcess(
cmd=['hook'],
cmd=["hook"],
pwd=str(plugin_output_dir),
timeout=60,
started_at=now_iso(),
plugin='chrome',
hook_name='on_Crawl__90_chrome_launch.daemon.bg',
plugin="chrome",
hook_name="on_Crawl__90_chrome_launch.daemon.bg",
)
process = FakeAsyncProcess()
event = SimpleNamespace(
plugin_name='chrome',
hook_name='on_Crawl__90_chrome_launch.daemon.bg',
hook_path='hook',
hook_args=['--url=https://example.org/'],
plugin_name="chrome",
hook_name="on_Crawl__90_chrome_launch.daemon.bg",
hook_path="hook",
hook_args=["--url=https://example.org/"],
env={},
output_dir=str(plugin_output_dir),
timeout=60,
snapshot_id='snap-1',
snapshot_id="snap-1",
is_background=True,
)
@@ -566,28 +762,29 @@ def test_run_pending_crawls_runs_due_snapshot_in_place(monkeypatch):
from archivebox.services import runner as runner_module
crawl = Crawl.objects.create(
urls='https://example.com',
urls="https://example.com",
created_by_id=get_or_create_system_user_pk(),
status=Crawl.StatusChoices.SEALED,
)
snapshot = Snapshot.objects.create(
url='https://example.com',
url="https://example.com",
crawl=crawl,
status=Snapshot.StatusChoices.QUEUED,
retry_at=runner_module.timezone.now(),
)
monkeypatch.setattr(type(snapshot), 'claim_processing_lock', lambda self, lock_seconds=60: True)
monkeypatch.setattr(type(crawl), 'claim_processing_lock', lambda self, lock_seconds=60: True)
monkeypatch.setattr(type(snapshot), "claim_processing_lock", lambda self, lock_seconds=60: True)
monkeypatch.setattr(type(crawl), "claim_processing_lock", lambda self, lock_seconds=60: True)
run_calls: list[tuple[str, list[str] | None, bool]] = []
def fake_run_crawl(crawl_id, snapshot_ids=None, selected_plugins=None, process_discovered_snapshots_inline=True):
run_calls.append((crawl_id, snapshot_ids, process_discovered_snapshots_inline))
snapshot.status = Snapshot.StatusChoices.SEALED
snapshot.retry_at = None
snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
monkeypatch.setattr(runner_module, 'run_crawl', fake_run_crawl)
monkeypatch.setattr(runner_module, "run_crawl", fake_run_crawl)
result = runner_module.run_pending_crawls(daemon=False)
@@ -602,26 +799,26 @@ def test_run_pending_crawls_prioritizes_new_queued_crawl_before_snapshot_backlog
from archivebox.services import runner as runner_module
older_crawl = Crawl.objects.create(
urls='https://older.example.com',
urls="https://older.example.com",
created_by_id=get_or_create_system_user_pk(),
status=Crawl.StatusChoices.STARTED,
)
older_snapshot = Snapshot.objects.create(
url='https://older.example.com',
url="https://older.example.com",
crawl=older_crawl,
status=Snapshot.StatusChoices.QUEUED,
retry_at=runner_module.timezone.now(),
)
newer_crawl = Crawl.objects.create(
urls='https://newer.example.com',
urls="https://newer.example.com",
created_by_id=get_or_create_system_user_pk(),
status=Crawl.StatusChoices.QUEUED,
retry_at=runner_module.timezone.now(),
)
monkeypatch.setattr(type(older_snapshot), 'claim_processing_lock', lambda self, lock_seconds=60: True)
monkeypatch.setattr(type(older_crawl), 'claim_processing_lock', lambda self, lock_seconds=60: True)
monkeypatch.setattr(type(newer_crawl), 'claim_processing_lock', lambda self, lock_seconds=60: True)
monkeypatch.setattr(type(older_snapshot), "claim_processing_lock", lambda self, lock_seconds=60: True)
monkeypatch.setattr(type(older_crawl), "claim_processing_lock", lambda self, lock_seconds=60: True)
monkeypatch.setattr(type(newer_crawl), "claim_processing_lock", lambda self, lock_seconds=60: True)
run_calls: list[tuple[str, list[str] | None, bool]] = []
@@ -632,7 +829,7 @@ def test_run_pending_crawls_prioritizes_new_queued_crawl_before_snapshot_backlog
run_calls.append((crawl_id, snapshot_ids, process_discovered_snapshots_inline))
raise _StopScheduling
monkeypatch.setattr(runner_module, 'run_crawl', fake_run_crawl)
monkeypatch.setattr(runner_module, "run_crawl", fake_run_crawl)
with pytest.raises(_StopScheduling):
runner_module.run_pending_crawls(daemon=False)

View File

@@ -9,10 +9,18 @@ from pathlib import Path
from archivebox.tests.conftest import create_test_url
ADMIN_HOST = 'admin.archivebox.localhost:8000'
ADMIN_HOST = "admin.archivebox.localhost:8000"
def _run_savepagenow_script(initialized_archive: Path, request_url: str, expected_url: str, *, login: bool, public_add_view: bool, host: str):
def _run_savepagenow_script(
initialized_archive: Path,
request_url: str,
expected_url: str,
*,
login: bool,
public_add_view: bool,
host: str,
):
script = textwrap.dedent(
f"""
import os
@@ -52,34 +60,34 @@ def _run_savepagenow_script(initialized_archive: Path, request_url: str, expecte
assert resp2.status_code == 302, resp2.status_code
assert Snapshot.objects.filter(url={expected_url!r}).count() == 1
assert resp2['Location'] == f"/{{snapshot.url_path}}"
"""
""",
)
env = {
**os.environ,
'DATA_DIR': str(initialized_archive),
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
'PUBLIC_ADD_VIEW': 'True' if public_add_view else 'False',
'SAVE_ARCHIVEDOTORG': 'False',
'SAVE_TITLE': 'False',
'SAVE_FAVICON': 'False',
'SAVE_WGET': 'False',
'SAVE_WARC': 'False',
'SAVE_PDF': 'False',
'SAVE_SCREENSHOT': 'False',
'SAVE_DOM': 'False',
'SAVE_SINGLEFILE': 'False',
'SAVE_READABILITY': 'False',
'SAVE_MERCURY': 'False',
'SAVE_GIT': 'False',
'SAVE_YTDLP': 'False',
'SAVE_HEADERS': 'False',
'SAVE_HTMLTOTEXT': 'False',
"DATA_DIR": str(initialized_archive),
"USE_COLOR": "False",
"SHOW_PROGRESS": "False",
"PUBLIC_ADD_VIEW": "True" if public_add_view else "False",
"SAVE_ARCHIVEDOTORG": "False",
"SAVE_TITLE": "False",
"SAVE_FAVICON": "False",
"SAVE_WGET": "False",
"SAVE_WARC": "False",
"SAVE_PDF": "False",
"SAVE_SCREENSHOT": "False",
"SAVE_DOM": "False",
"SAVE_SINGLEFILE": "False",
"SAVE_READABILITY": "False",
"SAVE_MERCURY": "False",
"SAVE_GIT": "False",
"SAVE_YTDLP": "False",
"SAVE_HEADERS": "False",
"SAVE_HTMLTOTEXT": "False",
}
return subprocess.run(
[sys.executable, '-c', script],
[sys.executable, "-c", script],
cwd=initialized_archive,
env=env,
text=True,
@@ -105,36 +113,104 @@ def _run_savepagenow_not_found_script(initialized_archive: Path, request_url: st
target_url = {request_url!r}
resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
assert resp.status_code == 404, resp.status_code
assert resp.status_code == 302, resp.status_code
assert resp['Location'] == f'http://{ADMIN_HOST}/web/' + target_url
assert Snapshot.objects.count() == 0
"""
""",
)
env = {
**os.environ,
'DATA_DIR': str(initialized_archive),
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
'PUBLIC_ADD_VIEW': 'False',
'SAVE_ARCHIVEDOTORG': 'False',
'SAVE_TITLE': 'False',
'SAVE_FAVICON': 'False',
'SAVE_WGET': 'False',
'SAVE_WARC': 'False',
'SAVE_PDF': 'False',
'SAVE_SCREENSHOT': 'False',
'SAVE_DOM': 'False',
'SAVE_SINGLEFILE': 'False',
'SAVE_READABILITY': 'False',
'SAVE_MERCURY': 'False',
'SAVE_GIT': 'False',
'SAVE_YTDLP': 'False',
'SAVE_HEADERS': 'False',
'SAVE_HTMLTOTEXT': 'False',
"DATA_DIR": str(initialized_archive),
"USE_COLOR": "False",
"SHOW_PROGRESS": "False",
"PUBLIC_ADD_VIEW": "False",
"SAVE_ARCHIVEDOTORG": "False",
"SAVE_TITLE": "False",
"SAVE_FAVICON": "False",
"SAVE_WGET": "False",
"SAVE_WARC": "False",
"SAVE_PDF": "False",
"SAVE_SCREENSHOT": "False",
"SAVE_DOM": "False",
"SAVE_SINGLEFILE": "False",
"SAVE_READABILITY": "False",
"SAVE_MERCURY": "False",
"SAVE_GIT": "False",
"SAVE_YTDLP": "False",
"SAVE_HEADERS": "False",
"SAVE_HTMLTOTEXT": "False",
}
return subprocess.run(
[sys.executable, '-c', script],
[sys.executable, "-c", script],
cwd=initialized_archive,
env=env,
text=True,
capture_output=True,
timeout=60,
)
def _run_savepagenow_via_web_host_redirect_script(initialized_archive: Path, request_url: str, expected_url: str):
script = textwrap.dedent(
f"""
import os
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
from archivebox.config.django import setup_django
setup_django()
from django.test import Client
from django.contrib.auth import get_user_model
from archivebox.core.models import Snapshot
client = Client()
user = get_user_model().objects.create_user(username='tester', password='pw')
client.force_login(user)
target_url = {request_url!r}
resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
assert resp.status_code == 302, resp.status_code
assert resp['Location'] == f'http://{ADMIN_HOST}/web/' + target_url
resp2 = client.get('/web/' + target_url, HTTP_HOST={ADMIN_HOST!r})
assert resp2.status_code == 302, resp2.status_code
snapshot = Snapshot.objects.filter(url={expected_url!r}).order_by('-created_at').first()
assert snapshot is not None
assert resp2['Location'] == f"/{{snapshot.url_path}}"
assert Snapshot.objects.filter(url={expected_url!r}).count() == 1
""",
)
env = {
**os.environ,
"DATA_DIR": str(initialized_archive),
"USE_COLOR": "False",
"SHOW_PROGRESS": "False",
"PUBLIC_ADD_VIEW": "False",
"SAVE_ARCHIVEDOTORG": "False",
"SAVE_TITLE": "False",
"SAVE_FAVICON": "False",
"SAVE_WGET": "False",
"SAVE_WARC": "False",
"SAVE_PDF": "False",
"SAVE_SCREENSHOT": "False",
"SAVE_DOM": "False",
"SAVE_SINGLEFILE": "False",
"SAVE_READABILITY": "False",
"SAVE_MERCURY": "False",
"SAVE_GIT": "False",
"SAVE_YTDLP": "False",
"SAVE_HEADERS": "False",
"SAVE_HTMLTOTEXT": "False",
}
return subprocess.run(
[sys.executable, "-c", script],
cwd=initialized_archive,
env=env,
text=True,
@@ -168,34 +244,34 @@ def _run_savepagenow_existing_snapshot_script(initialized_archive: Path, request
resp = client.get('/web/' + target_url, HTTP_HOST='web.archivebox.localhost:8000')
assert resp.status_code == 302, resp.status_code
assert resp['Location'] == f"/{{snapshot.url_path}}"
"""
""",
)
env = {
**os.environ,
'DATA_DIR': str(initialized_archive),
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
'PUBLIC_ADD_VIEW': 'False',
'SAVE_ARCHIVEDOTORG': 'False',
'SAVE_TITLE': 'False',
'SAVE_FAVICON': 'False',
'SAVE_WGET': 'False',
'SAVE_WARC': 'False',
'SAVE_PDF': 'False',
'SAVE_SCREENSHOT': 'False',
'SAVE_DOM': 'False',
'SAVE_SINGLEFILE': 'False',
'SAVE_READABILITY': 'False',
'SAVE_MERCURY': 'False',
'SAVE_GIT': 'False',
'SAVE_YTDLP': 'False',
'SAVE_HEADERS': 'False',
'SAVE_HTMLTOTEXT': 'False',
"DATA_DIR": str(initialized_archive),
"USE_COLOR": "False",
"SHOW_PROGRESS": "False",
"PUBLIC_ADD_VIEW": "False",
"SAVE_ARCHIVEDOTORG": "False",
"SAVE_TITLE": "False",
"SAVE_FAVICON": "False",
"SAVE_WGET": "False",
"SAVE_WARC": "False",
"SAVE_PDF": "False",
"SAVE_SCREENSHOT": "False",
"SAVE_DOM": "False",
"SAVE_SINGLEFILE": "False",
"SAVE_READABILITY": "False",
"SAVE_MERCURY": "False",
"SAVE_GIT": "False",
"SAVE_YTDLP": "False",
"SAVE_HEADERS": "False",
"SAVE_HTMLTOTEXT": "False",
}
return subprocess.run(
[sys.executable, '-c', script],
[sys.executable, "-c", script],
cwd=initialized_archive,
env=env,
text=True,
@@ -206,47 +282,49 @@ def _run_savepagenow_existing_snapshot_script(initialized_archive: Path, request
def test_web_add_creates_and_reuses_snapshot_logged_in(initialized_archive):
"""/web/https://... should work for authenticated users even when public add is off."""
url = create_test_url(domain='example.com', path='savepagenow-auth')
request_url = url.replace('https://', '')
url = create_test_url(domain="example.com", path="savepagenow-auth")
request_url = url.replace("https://", "")
result = _run_savepagenow_script(initialized_archive, request_url, url, login=True, public_add_view=False, host=ADMIN_HOST)
assert result.returncode == 0, (
"SavePageNow shortcut (logged-in) test failed.\n"
f"stdout:\n{result.stdout}\n"
f"stderr:\n{result.stderr}"
)
assert result.returncode == 0, f"SavePageNow shortcut (logged-in) test failed.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
def test_web_add_creates_and_reuses_snapshot_public(initialized_archive):
"""/web/https://... should work when PUBLIC_ADD_VIEW is enabled without login."""
url = create_test_url(domain='example.com', path='savepagenow-public')
request_url = url.replace('https://', '')
result = _run_savepagenow_script(initialized_archive, request_url, url, login=False, public_add_view=True, host='web.archivebox.localhost:8000')
assert result.returncode == 0, (
"SavePageNow shortcut (public add) test failed.\n"
f"stdout:\n{result.stdout}\n"
f"stderr:\n{result.stderr}"
url = create_test_url(domain="example.com", path="savepagenow-public")
request_url = url
result = _run_savepagenow_script(
initialized_archive,
request_url,
url,
login=False,
public_add_view=True,
host="web.archivebox.localhost:8000",
)
assert result.returncode == 0, f"SavePageNow shortcut (public add) test failed.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
def test_web_add_requires_login_when_public_off(initialized_archive):
"""/web/https://... should 404 for new URLs when PUBLIC_ADD_VIEW is false and not logged in."""
url = create_test_url(domain='example.com', path='savepagenow-404')
request_url = url.replace('https://', '')
"""/web/https://... should bounce to admin when PUBLIC_ADD_VIEW is false and not logged in."""
url = create_test_url(domain="example.com", path="savepagenow-404")
request_url = url
result = _run_savepagenow_not_found_script(initialized_archive, request_url)
assert result.returncode == 0, f"SavePageNow shortcut (no public add) test failed.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
def test_web_add_redirects_to_admin_and_creates_when_logged_in(initialized_archive):
"""/web/https://... on web host should redirect to admin host and create when the user is logged in there."""
url = create_test_url(domain="example.com", path="savepagenow-web-admin")
result = _run_savepagenow_via_web_host_redirect_script(initialized_archive, url, url)
assert result.returncode == 0, (
"SavePageNow shortcut (no public add) test failed.\n"
f"stdout:\n{result.stdout}\n"
f"stderr:\n{result.stderr}"
f"SavePageNow shortcut (web->admin redirect) test failed.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
)
def test_web_add_redirects_existing_snapshot_when_public_off(initialized_archive):
"""/web/https://... should redirect to existing snapshot even when public add is off and not logged in."""
url = create_test_url(domain='example.com', path='savepagenow-existing')
request_url = url.replace('https://', '')
url = create_test_url(domain="example.com", path="savepagenow-existing")
request_url = url.replace("https://", "")
result = _run_savepagenow_existing_snapshot_script(initialized_archive, request_url, url)
assert result.returncode == 0, (
"SavePageNow shortcut (existing snapshot) test failed.\n"
f"stdout:\n{result.stdout}\n"
f"stderr:\n{result.stderr}"
f"SavePageNow shortcut (existing snapshot) test failed.\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
)

View File

@@ -8,7 +8,6 @@ import subprocess
import pytest
def _fetchone(tmp_path, query):
conn = sqlite3.connect(tmp_path / "index.sqlite3")
try:
@@ -21,7 +20,7 @@ def test_schedule_creates_enabled_db_schedule(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'schedule', '--every=daily', '--depth=1', 'https://example.com/feed.xml'],
["archivebox", "schedule", "--every=daily", "--depth=1", "https://example.com/feed.xml"],
capture_output=True,
text=True,
)
@@ -37,50 +36,50 @@ def test_schedule_creates_enabled_db_schedule(tmp_path, process):
"SELECT urls, status, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1",
)
assert schedule_row == ('daily', 1, 'Scheduled import: https://example.com/feed.xml')
assert crawl_row == ('https://example.com/feed.xml', 'sealed', 1)
assert schedule_row == ("daily", 1, "Scheduled import: https://example.com/feed.xml")
assert crawl_row == ("https://example.com/feed.xml", "sealed", 1)
def test_schedule_show_lists_enabled_schedules(tmp_path, process):
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'schedule', '--every=weekly', 'https://example.com/feed.xml'],
["archivebox", "schedule", "--every=weekly", "https://example.com/feed.xml"],
capture_output=True,
text=True,
check=True,
)
result = subprocess.run(
['archivebox', 'schedule', '--show'],
["archivebox", "schedule", "--show"],
capture_output=True,
text=True,
)
assert result.returncode == 0
assert 'Active scheduled crawls' in result.stdout
assert 'https://example.com/feed.xml' in result.stdout
assert 'weekly' in result.stdout
assert "Active scheduled crawls" in result.stdout
assert "https://example.com/feed.xml" in result.stdout
assert "weekly" in result.stdout
def test_schedule_clear_disables_existing_schedules(tmp_path, process):
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'schedule', '--every=daily', 'https://example.com/feed.xml'],
["archivebox", "schedule", "--every=daily", "https://example.com/feed.xml"],
capture_output=True,
text=True,
check=True,
)
result = subprocess.run(
['archivebox', 'schedule', '--clear'],
["archivebox", "schedule", "--clear"],
capture_output=True,
text=True,
)
assert result.returncode == 0
assert 'Disabled 1 scheduled crawl' in result.stdout
assert "Disabled 1 scheduled crawl" in result.stdout
disabled_count = _fetchone(
tmp_path,
@@ -99,13 +98,13 @@ def test_schedule_every_requires_valid_period(tmp_path, process):
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'schedule', '--every=invalid_period', 'https://example.com/feed.xml'],
["archivebox", "schedule", "--every=invalid_period", "https://example.com/feed.xml"],
capture_output=True,
text=True,
)
assert result.returncode != 0
assert 'Invalid schedule' in result.stderr or 'Invalid schedule' in result.stdout
assert "Invalid schedule" in result.stderr or "Invalid schedule" in result.stdout
class TestScheduleCLI:
@@ -113,17 +112,17 @@ class TestScheduleCLI:
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'schedule', '--help'],
["archivebox", "schedule", "--help"],
capture_output=True,
text=True,
)
assert result.returncode == 0
assert '--every' in result.stdout
assert '--show' in result.stdout
assert '--clear' in result.stdout
assert '--run-all' in result.stdout
assert "--every" in result.stdout
assert "--show" in result.stdout
assert "--clear" in result.stdout
assert "--run-all" in result.stdout
if __name__ == '__main__':
pytest.main([__file__, '-v'])
if __name__ == "__main__":
pytest.main([__file__, "-v"])

View File

@@ -21,7 +21,7 @@ REPO_ROOT = Path(__file__).resolve().parents[2]
def init_archive(cwd: Path) -> None:
result = subprocess.run(
[sys.executable, '-m', 'archivebox', 'init', '--quick'],
[sys.executable, "-m", "archivebox", "init", "--quick"],
cwd=cwd,
capture_output=True,
text=True,
@@ -32,46 +32,48 @@ def init_archive(cwd: Path) -> None:
def build_test_env(port: int, **extra: str) -> dict[str, str]:
env = os.environ.copy()
env.pop('DATA_DIR', None)
env.update({
'LISTEN_HOST': f'archivebox.localhost:{port}',
'ALLOWED_HOSTS': '*',
'CSRF_TRUSTED_ORIGINS': f'http://admin.archivebox.localhost:{port}',
'PUBLIC_ADD_VIEW': 'True',
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
'TIMEOUT': '20',
'URL_ALLOWLIST': r'127\.0\.0\.1[:/].*',
'SAVE_ARCHIVEDOTORG': 'False',
'SAVE_TITLE': 'False',
'SAVE_FAVICON': 'False',
'SAVE_WARC': 'False',
'SAVE_PDF': 'False',
'SAVE_SCREENSHOT': 'False',
'SAVE_DOM': 'False',
'SAVE_SINGLEFILE': 'False',
'SAVE_READABILITY': 'False',
'SAVE_MERCURY': 'False',
'SAVE_GIT': 'False',
'SAVE_YTDLP': 'False',
'SAVE_HEADERS': 'False',
'SAVE_HTMLTOTEXT': 'False',
'SAVE_WGET': 'True',
'USE_CHROME': 'False',
})
env.pop("DATA_DIR", None)
env.update(
{
"LISTEN_HOST": f"archivebox.localhost:{port}",
"ALLOWED_HOSTS": "*",
"CSRF_TRUSTED_ORIGINS": f"http://admin.archivebox.localhost:{port}",
"PUBLIC_ADD_VIEW": "True",
"USE_COLOR": "False",
"SHOW_PROGRESS": "False",
"TIMEOUT": "20",
"URL_ALLOWLIST": r"127\.0\.0\.1[:/].*",
"SAVE_ARCHIVEDOTORG": "False",
"SAVE_TITLE": "False",
"SAVE_FAVICON": "False",
"SAVE_WARC": "False",
"SAVE_PDF": "False",
"SAVE_SCREENSHOT": "False",
"SAVE_DOM": "False",
"SAVE_SINGLEFILE": "False",
"SAVE_READABILITY": "False",
"SAVE_MERCURY": "False",
"SAVE_GIT": "False",
"SAVE_YTDLP": "False",
"SAVE_HEADERS": "False",
"SAVE_HTMLTOTEXT": "False",
"SAVE_WGET": "True",
"USE_CHROME": "False",
},
)
env.update(extra)
return env
def get_free_port() -> int:
    """Return a TCP port on 127.0.0.1 that was unused at the time of the call.

    A temporary socket is bound to port 0 so the operating system picks an
    available port. The socket is closed before returning, so the port is not
    reserved: another process could claim it before the caller binds it.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("127.0.0.1", 0))
        # getsockname() returns (host, port) for AF_INET; keep only the port.
        return sock.getsockname()[1]
def start_server(cwd: Path, env: dict[str, str], port: int) -> None:
result = subprocess.run(
[sys.executable, '-m', 'archivebox', 'server', '--daemonize', f'127.0.0.1:{port}'],
[sys.executable, "-m", "archivebox", "server", "--daemonize", f"127.0.0.1:{port}"],
cwd=cwd,
capture_output=True,
text=True,
@@ -91,19 +93,19 @@ def stop_server(cwd: Path) -> None:
from archivebox.workers.supervisord_util import stop_existing_supervisord_process
stop_existing_supervisord_process()
print('stopped')
"""
""",
)
run_python_cwd(script, cwd=cwd, timeout=30)
def wait_for_http(port: int, host: str, path: str = '/', timeout: int = 30) -> requests.Response:
def wait_for_http(port: int, host: str, path: str = "/", timeout: int = 30) -> requests.Response:
deadline = time.time() + timeout
last_exc = None
while time.time() < deadline:
try:
response = requests.get(
f'http://127.0.0.1:{port}{path}',
headers={'Host': host},
f"http://127.0.0.1:{port}{path}",
headers={"Host": host},
timeout=2,
allow_redirects=False,
)
@@ -112,11 +114,11 @@ def wait_for_http(port: int, host: str, path: str = '/', timeout: int = 30) -> r
except requests.RequestException as exc:
last_exc = exc
time.sleep(0.5)
raise AssertionError(f'Timed out waiting for HTTP on {host}: {last_exc}')
raise AssertionError(f"Timed out waiting for HTTP on {host}: {last_exc}")
def make_latest_schedule_due(cwd: Path) -> None:
conn = sqlite3.connect(cwd / 'index.sqlite3')
conn = sqlite3.connect(cwd / "index.sqlite3")
try:
conn.execute(
"""
@@ -129,7 +131,7 @@ def make_latest_schedule_due(cwd: Path) -> None:
ORDER BY created_at DESC
LIMIT 1
)
"""
""",
)
conn.commit()
finally:
@@ -182,7 +184,7 @@ def get_snapshot_file_text(cwd: Path, url: str) -> str:
assert candidates, f'no captured html/txt files found in {{snapshot_dir}}'
print(candidates[0].read_text(errors='ignore'))
"""
""",
)
stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60)
assert code == 0, stderr
@@ -198,11 +200,11 @@ def wait_for_snapshot_capture(cwd: Path, url: str, timeout: int = 180) -> str:
except AssertionError as err:
last_error = err
time.sleep(2)
raise AssertionError(f'timed out waiting for captured content for {url}: {last_error}')
raise AssertionError(f"timed out waiting for captured content for {url}: {last_error}")
def get_counts(cwd: Path, scheduled_url: str, one_shot_url: str) -> tuple[int, int, int]:
conn = sqlite3.connect(cwd / 'index.sqlite3')
conn = sqlite3.connect(cwd / "index.sqlite3")
try:
scheduled_snapshots = conn.execute(
"SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
@@ -259,7 +261,7 @@ def create_admin_and_token(cwd: Path) -> str:
expires=timezone.now() + timedelta(days=1),
)
print(token.token)
"""
""",
)
stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60)
assert code == 0, stderr
@@ -275,7 +277,7 @@ def test_server_processes_due_cli_schedule_and_saves_real_content(tmp_path, recu
env = build_test_env(port)
schedule_result = subprocess.run(
[sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', recursive_test_site['root_url']],
[sys.executable, "-m", "archivebox", "schedule", "--every=daily", "--depth=0", recursive_test_site["root_url"]],
cwd=tmp_path,
capture_output=True,
text=True,
@@ -283,16 +285,16 @@ def test_server_processes_due_cli_schedule_and_saves_real_content(tmp_path, recu
timeout=60,
)
assert schedule_result.returncode == 0, schedule_result.stderr
assert 'Created scheduled crawl' in schedule_result.stdout
assert "Created scheduled crawl" in schedule_result.stdout
make_latest_schedule_due(tmp_path)
try:
start_server(tmp_path, env=env, port=port)
wait_for_http(port, host=f'web.archivebox.localhost:{port}')
captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site['root_url'], timeout=180)
assert 'Root' in captured_text
assert 'About' in captured_text
wait_for_http(port, host=f"web.archivebox.localhost:{port}")
captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site["root_url"], timeout=180)
assert "Root" in captured_text
assert "About" in captured_text
finally:
stop_server(tmp_path)
@@ -304,11 +306,11 @@ def test_archivebox_add_remains_one_shot_even_when_schedule_is_due(tmp_path, rec
port = get_free_port()
env = build_test_env(port)
scheduled_url = recursive_test_site['root_url']
one_shot_url = recursive_test_site['child_urls'][0]
scheduled_url = recursive_test_site["root_url"]
one_shot_url = recursive_test_site["child_urls"][0]
schedule_result = subprocess.run(
[sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', scheduled_url],
[sys.executable, "-m", "archivebox", "schedule", "--every=daily", "--depth=0", scheduled_url],
cwd=tmp_path,
capture_output=True,
text=True,
@@ -320,7 +322,7 @@ def test_archivebox_add_remains_one_shot_even_when_schedule_is_due(tmp_path, rec
make_latest_schedule_due(tmp_path)
add_result = subprocess.run(
[sys.executable, '-m', 'archivebox', 'add', '--depth=0', '--plugins=wget', one_shot_url],
[sys.executable, "-m", "archivebox", "add", "--depth=0", "--plugins=wget", one_shot_url],
cwd=tmp_path,
capture_output=True,
text=True,
@@ -329,7 +331,7 @@ def test_archivebox_add_remains_one_shot_even_when_schedule_is_due(tmp_path, rec
)
assert add_result.returncode == 0, add_result.stderr
captured_text = wait_for_snapshot_capture(tmp_path, one_shot_url, timeout=120)
assert 'Deep About' in captured_text or 'About' in captured_text
assert "Deep About" in captured_text or "About" in captured_text
scheduled_snapshots, one_shot_snapshots, scheduled_crawls = get_counts(tmp_path, scheduled_url, one_shot_url)
assert one_shot_snapshots >= 1
@@ -348,27 +350,27 @@ def test_schedule_rest_api_works_over_running_server(tmp_path, recursive_test_si
try:
start_server(tmp_path, env=env, port=port)
wait_for_http(port, host=f'api.archivebox.localhost:{port}', path='/api/v1/docs')
wait_for_http(port, host=f"api.archivebox.localhost:{port}", path="/api/v1/docs")
response = requests.post(
f'http://127.0.0.1:{port}/api/v1/cli/schedule',
f"http://127.0.0.1:{port}/api/v1/cli/schedule",
headers={
'Host': f'api.archivebox.localhost:{port}',
'X-ArchiveBox-API-Key': api_token,
"Host": f"api.archivebox.localhost:{port}",
"X-ArchiveBox-API-Key": api_token,
},
json={
'every': 'daily',
'import_path': recursive_test_site['root_url'],
'quiet': True,
"every": "daily",
"import_path": recursive_test_site["root_url"],
"quiet": True,
},
timeout=10,
)
assert response.status_code == 200, response.text
payload = response.json()
assert payload['success'] is True
assert payload['result_format'] == 'json'
assert len(payload['result']['created_schedule_ids']) == 1
assert payload["success"] is True
assert payload["result_format"] == "json"
assert len(payload["result"]["created_schedule_ids"]) == 1
finally:
stop_server(tmp_path)
@@ -379,21 +381,21 @@ def test_schedule_web_ui_post_works_over_running_server(tmp_path, recursive_test
init_archive(tmp_path)
port = get_free_port()
env = build_test_env(port, PUBLIC_ADD_VIEW='True')
env = build_test_env(port, PUBLIC_ADD_VIEW="True")
try:
start_server(tmp_path, env=env, port=port)
wait_for_http(port, host=f'web.archivebox.localhost:{port}', path='/add/')
wait_for_http(port, host=f"web.archivebox.localhost:{port}", path="/add/")
response = requests.post(
f'http://127.0.0.1:{port}/add/',
headers={'Host': f'web.archivebox.localhost:{port}'},
f"http://127.0.0.1:{port}/add/",
headers={"Host": f"web.archivebox.localhost:{port}"},
data={
'url': recursive_test_site['root_url'],
'depth': '0',
'schedule': 'daily',
'tag': 'web-ui',
'notes': 'created from web ui',
"url": recursive_test_site["root_url"],
"depth": "0",
"schedule": "daily",
"tag": "web-ui",
"notes": "created from web ui",
},
timeout=10,
allow_redirects=False,
@@ -401,7 +403,7 @@ def test_schedule_web_ui_post_works_over_running_server(tmp_path, recursive_test
assert response.status_code in (302, 303), response.text
conn = sqlite3.connect(tmp_path / 'index.sqlite3')
conn = sqlite3.connect(tmp_path / "index.sqlite3")
try:
row = conn.execute(
"""
@@ -410,11 +412,11 @@ def test_schedule_web_ui_post_works_over_running_server(tmp_path, recursive_test
JOIN crawls_crawl c ON c.schedule_id = cs.id
ORDER BY cs.created_at DESC
LIMIT 1
"""
""",
).fetchone()
finally:
conn.close()
assert row == ('daily', recursive_test_site['root_url'], 'web-ui')
assert row == ("daily", recursive_test_site["root_url"], "web-ui")
finally:
stop_server(tmp_path)

View File

@@ -103,7 +103,10 @@ async function main() {
timeout: 15000,
});
await new Promise((resolve) => setTimeout(resolve, 1500));
await page.waitForFunction(
() => window.__dangerousScriptRan !== true || window.__probeResults !== undefined,
{timeout: 15000},
);
const pageState = await page.evaluate(() => ({
href: location.href,
@@ -297,7 +300,7 @@ def _seed_archive(data_dir: Path) -> dict[str, object]:
"password": "testpassword",
"snapshots": snapshots,
}))
"""
""",
)
stdout, stderr, returncode = run_python_cwd(script, cwd=data_dir, timeout=120)
assert returncode == 0, stderr
@@ -310,10 +313,17 @@ def _get_free_port() -> int:
return sock.getsockname()[1]
def _wait_for_http(port: int, host: str, timeout: float = 30.0) -> None:
def _wait_for_http(
port: int,
host: str,
timeout: float = 30.0,
process: subprocess.Popen[str] | None = None,
) -> None:
deadline = time.time() + timeout
last_error = "server did not answer"
while time.time() < deadline:
if process is not None and process.poll() is not None:
raise AssertionError(f"Server exited before becoming ready with code {process.returncode}")
try:
response = requests.get(
f"http://127.0.0.1:{port}/",
@@ -358,7 +368,7 @@ def _start_server(data_dir: Path, *, mode: str, port: int) -> subprocess.Popen[s
"SAVE_HEADERS": "False",
"SAVE_HTMLTOTEXT": "False",
"USE_CHROME": "False",
}
},
)
process = subprocess.Popen(
[sys.executable, "-m", "archivebox", "server", "--debug", "--nothreading", f"127.0.0.1:{port}"],
@@ -369,7 +379,11 @@ def _start_server(data_dir: Path, *, mode: str, port: int) -> subprocess.Popen[s
text=True,
start_new_session=True,
)
_wait_for_http(port, f"archivebox.localhost:{port}")
try:
_wait_for_http(port, f"archivebox.localhost:{port}", process=process)
except AssertionError as exc:
server_log = _stop_server(process)
raise AssertionError(f"{exc}\n\nSERVER LOG:\n{server_log}") from exc
return process
@@ -414,7 +428,7 @@ def _build_probe_config(mode: str, port: int, fixture: dict[str, object], runtim
"victim": victim_url,
"admin": f"{admin_origin}/admin/",
"api": f"{admin_origin}/api/v1/docs",
}
},
)
return {
@@ -427,7 +441,13 @@ def _build_probe_config(mode: str, port: int, fixture: dict[str, object], runtim
}
def _run_browser_probe(data_dir: Path, runtime: dict[str, Path], mode: str, fixture: dict[str, object], tmp_path: Path) -> dict[str, object]:
def _run_browser_probe(
data_dir: Path,
runtime: dict[str, Path],
mode: str,
fixture: dict[str, object],
tmp_path: Path,
) -> dict[str, object]:
port = _get_free_port()
process = _start_server(data_dir, mode=mode, port=port)
probe_path = tmp_path / "server_security_probe.js"
@@ -517,7 +537,13 @@ def _run_browser_probe(data_dir: Path, runtime: dict[str, Path], mode: str, fixt
),
],
)
def test_server_security_modes_in_chrome(initialized_archive: Path, browser_runtime, tmp_path: Path, mode: str, expected: dict[str, object]) -> None:
def test_server_security_modes_in_chrome(
initialized_archive: Path,
browser_runtime,
tmp_path: Path,
mode: str,
expected: dict[str, object],
) -> None:
fixture = _seed_archive(initialized_archive)
result = _run_browser_probe(initialized_archive, browser_runtime, mode, fixture, tmp_path)

View File

@@ -12,32 +12,31 @@ import uuid
import pytest
def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_extractors_dict):
"""Test that snapshot stores the exact URL in the database."""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'snapshot', 'create', 'https://example.com'],
["archivebox", "snapshot", "create", "https://example.com"],
capture_output=True,
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot_row = c.execute(
"SELECT id, created_at, url, crawl_id FROM core_snapshot WHERE url = ?",
('https://example.com',)
("https://example.com",),
).fetchone()
assert snapshot_row is not None
crawl_row = c.execute(
"SELECT id, created_at, urls, created_by_id FROM crawls_crawl WHERE id = ?",
(snapshot_row[3],)
(snapshot_row[3],),
).fetchone()
assert crawl_row is not None
user_row = c.execute(
"SELECT username FROM auth_user WHERE id = ?",
(crawl_row[3],)
(crawl_row[3],),
).fetchone()
assert user_row is not None
conn.close()
@@ -45,15 +44,12 @@ def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_e
snapshot_id_raw, snapshot_created_at, snapshot_url, crawl_id = snapshot_row
snapshot_id = str(uuid.UUID(snapshot_id_raw))
username = user_row[0]
snapshot_date_str = datetime.fromisoformat(snapshot_created_at).strftime('%Y%m%d')
domain = urlparse(snapshot_url).hostname or 'unknown'
snapshot_date_str = datetime.fromisoformat(snapshot_created_at).strftime("%Y%m%d")
domain = urlparse(snapshot_url).hostname or "unknown"
# Verify crawl symlink exists and is relative
target_path = tmp_path / 'users' / username / 'snapshots' / snapshot_date_str / domain / snapshot_id
symlinks = [
p for p in tmp_path.rglob(str(snapshot_id))
if p.is_symlink()
]
target_path = tmp_path / "users" / username / "snapshots" / snapshot_date_str / domain / snapshot_id
symlinks = [p for p in tmp_path.rglob(str(snapshot_id)) if p.is_symlink()]
assert symlinks, "Snapshot symlink should exist under crawl dir"
link_path = symlinks[0]
@@ -68,21 +64,25 @@ def test_snapshot_multiple_urls_creates_multiple_records(tmp_path, process, disa
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'snapshot', 'create',
'https://example.com',
'https://iana.org'],
[
"archivebox",
"snapshot",
"create",
"https://example.com",
"https://iana.org",
],
capture_output=True,
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
urls = c.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall()
conn.close()
urls = [u[0] for u in urls]
assert 'https://example.com' in urls
assert 'https://iana.org' in urls
assert "https://example.com" in urls
assert "https://iana.org" in urls
assert len(urls) >= 2
@@ -91,31 +91,41 @@ def test_snapshot_tag_creates_tag_and_links_to_snapshot(tmp_path, process, disab
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'snapshot', 'create', '--tag=mytesttag',
'https://example.com'],
[
"archivebox",
"snapshot",
"create",
"--tag=mytesttag",
"https://example.com",
],
capture_output=True,
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
# Verify tag was created
tag = c.execute("SELECT id, name FROM core_tag WHERE name = ?", ('mytesttag',)).fetchone()
tag = c.execute("SELECT id, name FROM core_tag WHERE name = ?", ("mytesttag",)).fetchone()
assert tag is not None, "Tag 'mytesttag' should exist in core_tag"
tag_id = tag[0]
# Verify snapshot exists
snapshot = c.execute("SELECT id FROM core_snapshot WHERE url = ?",
('https://example.com',)).fetchone()
snapshot = c.execute(
"SELECT id FROM core_snapshot WHERE url = ?",
("https://example.com",),
).fetchone()
assert snapshot is not None
snapshot_id = snapshot[0]
# Verify tag is linked to snapshot via join table
link = c.execute("""
link = c.execute(
"""
SELECT * FROM core_snapshot_tags
WHERE snapshot_id = ? AND tag_id = ?
""", (snapshot_id, tag_id)).fetchone()
""",
(snapshot_id, tag_id),
).fetchone()
conn.close()
assert link is not None, "Tag should be linked to snapshot via core_snapshot_tags"
@@ -127,23 +137,23 @@ def test_snapshot_jsonl_output_has_correct_structure(tmp_path, process, disable_
# Pass URL as argument instead of stdin for more reliable behavior
result = subprocess.run(
['archivebox', 'snapshot', 'create', 'https://example.com'],
["archivebox", "snapshot", "create", "https://example.com"],
capture_output=True,
text=True,
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
)
# Parse JSONL output lines
records = Process.parse_records_from_text(result.stdout)
snapshot_records = [r for r in records if r.get('type') == 'Snapshot']
snapshot_records = [r for r in records if r.get("type") == "Snapshot"]
assert len(snapshot_records) >= 1, "Should output at least one Snapshot JSONL record"
record = snapshot_records[0]
assert record.get('type') == 'Snapshot'
assert 'id' in record, "Snapshot record should have 'id' field"
assert 'url' in record, "Snapshot record should have 'url' field"
assert record['url'] == 'https://example.com'
assert record.get("type") == "Snapshot"
assert "id" in record, "Snapshot record should have 'id' field"
assert "url" in record, "Snapshot record should have 'url' field"
assert record["url"] == "https://example.com"
def test_snapshot_with_tag_stores_tag_name(tmp_path, process, disable_extractors_dict):
@@ -152,22 +162,24 @@ def test_snapshot_with_tag_stores_tag_name(tmp_path, process, disable_extractors
# Use command line args instead of stdin
subprocess.run(
['archivebox', 'snapshot', 'create', '--tag=customtag', 'https://example.com'],
["archivebox", "snapshot", "create", "--tag=customtag", "https://example.com"],
capture_output=True,
text=True,
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
# Verify tag was created with correct name
tag = c.execute("SELECT name FROM core_tag WHERE name = ?",
('customtag',)).fetchone()
tag = c.execute(
"SELECT name FROM core_tag WHERE name = ?",
("customtag",),
).fetchone()
conn.close()
assert tag is not None
assert tag[0] == 'customtag'
assert tag[0] == "customtag"
def test_snapshot_with_depth_sets_snapshot_depth(tmp_path, process, disable_extractors_dict):
@@ -175,13 +187,18 @@ def test_snapshot_with_depth_sets_snapshot_depth(tmp_path, process, disable_extr
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'snapshot', 'create', '--depth=1',
'https://example.com'],
[
"archivebox",
"snapshot",
"create",
"--depth=1",
"https://example.com",
],
capture_output=True,
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot = c.execute("SELECT depth FROM core_snapshot ORDER BY created_at DESC LIMIT 1").fetchone()
conn.close()
@@ -196,24 +213,26 @@ def test_snapshot_allows_duplicate_urls_across_crawls(tmp_path, process, disable
# Add same URL twice
subprocess.run(
['archivebox', 'snapshot', 'create', 'https://example.com'],
["archivebox", "snapshot", "create", "https://example.com"],
capture_output=True,
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
)
subprocess.run(
['archivebox', 'snapshot', 'create', 'https://example.com'],
["archivebox", "snapshot", "create", "https://example.com"],
capture_output=True,
env={**disable_extractors_dict, 'DATA_DIR': str(tmp_path)},
env={**disable_extractors_dict, "DATA_DIR": str(tmp_path)},
)
conn = sqlite3.connect('index.sqlite3')
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count = c.execute("SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
('https://example.com',)).fetchone()[0]
count = c.execute(
"SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
("https://example.com",),
).fetchone()[0]
conn.close()
assert count == 2, "Same URL should create separate snapshots across different crawls"
# Allow running this test module directly (outside the pytest CLI) in verbose mode.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])

View File

@@ -13,15 +13,15 @@ pytestmark = pytest.mark.django_db
User = get_user_model()
ADMIN_HOST = 'admin.archivebox.localhost:8000'
ADMIN_HOST = "admin.archivebox.localhost:8000"
@pytest.fixture
def admin_user(db):
    """Create and return the ``tagadmin`` superuser used by the tag admin/API tests.

    The ``cast`` only narrows ``User.objects`` to ``UserManager`` for type
    checkers so that ``create_superuser`` resolves; it has no runtime effect.
    """
    return cast(UserManager, User.objects).create_superuser(
        username="tagadmin",
        email="tagadmin@test.com",
        password="testpassword",
    )
@@ -39,7 +39,7 @@ def crawl(admin_user):
from archivebox.crawls.models import Crawl
return Crawl.objects.create(
urls='https://example.com',
urls="https://example.com",
created_by=admin_user,
)
@@ -48,15 +48,15 @@ def crawl(admin_user):
def tagged_data(crawl, admin_user):
from archivebox.core.models import Snapshot, Tag
tag = Tag.objects.create(name='Alpha Research', created_by=admin_user)
tag = Tag.objects.create(name="Alpha Research", created_by=admin_user)
first = Snapshot.objects.create(
url='https://example.com/one',
title='Example One',
url="https://example.com/one",
title="Example One",
crawl=crawl,
)
second = Snapshot.objects.create(
url='https://example.com/two',
title='Example Two',
url="https://example.com/two",
title="Example Two",
crawl=crawl,
)
first.tags.add(tag)
@@ -65,27 +65,26 @@ def tagged_data(crawl, admin_user):
def test_tag_admin_changelist_renders_custom_ui(client, admin_user, tagged_data):
    """The tag changelist page renders the custom search/filter controls and tag cards."""
    client.login(username="tagadmin", password="testpassword")

    response = client.get(reverse("admin:core_tag_changelist"), HTTP_HOST=ADMIN_HOST)

    assert response.status_code == 200
    # Each custom control is identified by its element id in the rendered HTML.
    assert b'id="tag-live-search"' in response.content
    assert b'id="tag-sort-select"' in response.content
    assert b'id="tag-created-by-select"' in response.content
    assert b'id="tag-year-select"' in response.content
    assert b'id="tag-has-snapshots-select"' in response.content
    # The tag created by the ``tagged_data`` fixture must be listed as a card.
    assert b"Alpha Research" in response.content
    assert b'class="tag-card"' in response.content
def test_tag_admin_add_view_renders_similar_tag_reference(client, admin_user):
    """The tag "add" page renders the "Similar Tags" section and the name-input hook."""
    client.login(username="tagadmin", password="testpassword")

    response = client.get(reverse("admin:core_tag_add"), HTTP_HOST=ADMIN_HOST)

    assert response.status_code == 200
    assert b"Similar Tags" in response.content
    assert b'data-tag-name-input="1"' in response.content
@@ -93,40 +92,40 @@ def test_tag_search_api_returns_card_payload(client, api_token, tagged_data):
tag, snapshots = tagged_data
response = client.get(
reverse('api-1:search_tags'),
{'q': 'Alpha', 'api_key': api_token},
reverse("api-1:search_tags"),
{"q": "Alpha", "api_key": api_token},
HTTP_HOST=ADMIN_HOST,
)
assert response.status_code == 200
payload = response.json()
assert payload['sort'] == 'created_desc'
assert payload['created_by'] == ''
assert payload['year'] == ''
assert payload['has_snapshots'] == 'all'
assert payload['tags'][0]['id'] == tag.id
assert payload['tags'][0]['name'] == 'Alpha Research'
assert payload['tags'][0]['num_snapshots'] == 2
assert payload['tags'][0]['snapshots'][0]['title'] in {'Example One', 'Example Two'}
assert payload['tags'][0]['export_jsonl_url'].endswith(f'/api/v1/core/tag/{tag.id}/snapshots.jsonl')
assert payload['tags'][0]['filter_url'].endswith(f'/admin/core/snapshot/?tags__id__exact={tag.id}')
assert {snapshot['url'] for snapshot in payload['tags'][0]['snapshots']} == {snap.url for snap in snapshots}
assert payload["sort"] == "created_desc"
assert payload["created_by"] == ""
assert payload["year"] == ""
assert payload["has_snapshots"] == "all"
assert payload["tags"][0]["id"] == tag.id
assert payload["tags"][0]["name"] == "Alpha Research"
assert payload["tags"][0]["num_snapshots"] == 2
assert payload["tags"][0]["snapshots"][0]["title"] in {"Example One", "Example Two"}
assert payload["tags"][0]["export_jsonl_url"].endswith(f"/api/v1/core/tag/{tag.id}/snapshots.jsonl")
assert payload["tags"][0]["filter_url"].endswith(f"/admin/core/snapshot/?tags__id__exact={tag.id}")
assert {snapshot["url"] for snapshot in payload["tags"][0]["snapshots"]} == {snap.url for snap in snapshots}
def test_tag_search_api_respects_sort_and_filters(client, api_token, admin_user, crawl, tagged_data):
from archivebox.core.models import Snapshot, Tag
other_user = cast(UserManager, User.objects).create_user(
username='tagother',
email='tagother@test.com',
password='unused',
username="tagother",
email="tagother@test.com",
password="unused",
)
tag_with_snapshots = tagged_data[0]
empty_tag = Tag.objects.create(name='Zulu Empty', created_by=other_user)
alpha_tag = Tag.objects.create(name='Alpha Empty', created_by=other_user)
empty_tag = Tag.objects.create(name="Zulu Empty", created_by=other_user)
alpha_tag = Tag.objects.create(name="Alpha Empty", created_by=other_user)
Snapshot.objects.create(
url='https://example.com/three',
title='Example Three',
url="https://example.com/three",
title="Example Three",
crawl=crawl,
).tags.add(alpha_tag)
@@ -135,24 +134,24 @@ def test_tag_search_api_respects_sort_and_filters(client, api_token, admin_user,
Tag.objects.filter(pk=tag_with_snapshots.pk).update(created_at=timezone.make_aware(datetime(2026, 1, 1, 12, 0, 0)))
response = client.get(
reverse('api-1:search_tags'),
reverse("api-1:search_tags"),
{
'sort': 'name_desc',
'created_by': str(other_user.pk),
'year': '2024',
'has_snapshots': 'no',
'api_key': api_token,
"sort": "name_desc",
"created_by": str(other_user.pk),
"year": "2024",
"has_snapshots": "no",
"api_key": api_token,
},
HTTP_HOST=ADMIN_HOST,
)
assert response.status_code == 200
payload = response.json()
assert payload['sort'] == 'name_desc'
assert payload['created_by'] == str(other_user.pk)
assert payload['year'] == '2024'
assert payload['has_snapshots'] == 'no'
assert [tag['name'] for tag in payload['tags']] == ['Zulu Empty']
assert payload["sort"] == "name_desc"
assert payload["created_by"] == str(other_user.pk)
assert payload["year"] == "2024"
assert payload["has_snapshots"] == "no"
assert [tag["name"] for tag in payload["tags"]] == ["Zulu Empty"]
def test_tag_rename_api_updates_slug(client, api_token, tagged_data):
@@ -160,30 +159,30 @@ def test_tag_rename_api_updates_slug(client, api_token, tagged_data):
response = client.post(
f"{reverse('api-1:rename_tag', args=[tag.id])}?api_key={api_token}",
data=json.dumps({'name': 'Alpha Archive'}),
content_type='application/json',
data=json.dumps({"name": "Alpha Archive"}),
content_type="application/json",
HTTP_HOST=ADMIN_HOST,
)
assert response.status_code == 200
tag.refresh_from_db()
assert tag.name == 'Alpha Archive'
assert tag.slug == 'alpha-archive'
assert tag.name == "Alpha Archive"
assert tag.slug == "alpha-archive"
def test_tag_snapshots_export_returns_jsonl(client, api_token, tagged_data):
tag, _ = tagged_data
response = client.get(
reverse('api-1:tag_snapshots_export', args=[tag.id]),
{'api_key': api_token},
reverse("api-1:tag_snapshots_export", args=[tag.id]),
{"api_key": api_token},
HTTP_HOST=ADMIN_HOST,
)
assert response.status_code == 200
assert response['Content-Type'].startswith('application/x-ndjson')
assert f'tag-{tag.slug}-snapshots.jsonl' in response['Content-Disposition']
assert response["Content-Type"].startswith("application/x-ndjson")
assert f"tag-{tag.slug}-snapshots.jsonl" in response["Content-Disposition"]
body = response.content.decode()
assert '"type": "Snapshot"' in body
assert '"tags": "Alpha Research"' in body
@@ -193,13 +192,13 @@ def test_tag_urls_export_returns_plain_text_urls(client, api_token, tagged_data)
tag, snapshots = tagged_data
response = client.get(
reverse('api-1:tag_urls_export', args=[tag.id]),
{'api_key': api_token},
reverse("api-1:tag_urls_export", args=[tag.id]),
{"api_key": api_token},
HTTP_HOST=ADMIN_HOST,
)
assert response.status_code == 200
assert response['Content-Type'].startswith('text/plain')
assert f'tag-{tag.slug}-urls.txt' in response['Content-Disposition']
assert response["Content-Type"].startswith("text/plain")
assert f"tag-{tag.slug}-urls.txt" in response["Content-Disposition"]
exported_urls = set(filter(None, response.content.decode().splitlines()))
assert exported_urls == {snapshot.url for snapshot in snapshots}

View File

@@ -6,11 +6,12 @@ from .fixtures import disable_extractors_dict, process
FIXTURES = (disable_extractors_dict, process)
def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
"""Test that title is extracted from the page."""
disable_extractors_dict.update({"SAVE_TITLE": "true"})
add_process = subprocess.run(
['archivebox', 'add', '--plugins=title', 'https://example.com'],
["archivebox", "add", "--plugins=title", "https://example.com"],
capture_output=True,
text=True,
env=disable_extractors_dict,
@@ -28,6 +29,7 @@ def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
assert snapshot[0] is not None
assert "Example" in snapshot[0]
def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict):
"""
https://github.com/ArchiveBox/ArchiveBox/issues/330
@@ -36,7 +38,7 @@ def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractor
"""
disable_extractors_dict.update({"SAVE_TITLE": "true"})
add_process = subprocess.run(
['archivebox', 'add', '--plugins=title', 'https://example.com'],
["archivebox", "add", "--plugins=title", "https://example.com"],
capture_output=True,
text=True,
env=disable_extractors_dict,

View File

@@ -1,28 +1,37 @@
import json
import sqlite3
import subprocess
from datetime import datetime, timedelta
import pytest
from django.utils import timezone
from .fixtures import disable_extractors_dict, process
FIXTURES = (disable_extractors_dict, process)
def test_update_imports_orphaned_snapshots(tmp_path, process, disable_extractors_dict):
"""Test that archivebox update imports real legacy archive directories."""
legacy_timestamp = '1710000000'
legacy_dir = tmp_path / 'archive' / legacy_timestamp
legacy_timestamp = "1710000000"
legacy_dir = tmp_path / "archive" / legacy_timestamp
legacy_dir.mkdir(parents=True, exist_ok=True)
(legacy_dir / 'singlefile.html').write_text('<html>example</html>')
(legacy_dir / 'index.json').write_text(json.dumps({
'url': 'https://example.com',
'timestamp': legacy_timestamp,
'title': 'Example Domain',
'fs_version': '0.8.0',
'archive_results': [],
}))
(legacy_dir / "singlefile.html").write_text("<html>example</html>")
(legacy_dir / "index.json").write_text(
json.dumps(
{
"url": "https://example.com",
"timestamp": legacy_timestamp,
"title": "Example Domain",
"fs_version": "0.8.0",
"archive_results": [],
},
),
)
# Run update without filters - should import and migrate the legacy directory.
update_process = subprocess.run(
['archivebox', 'update'],
["archivebox", "update"],
capture_output=True,
text=True,
env=disable_extractors_dict,
@@ -36,10 +45,151 @@ def test_update_imports_orphaned_snapshots(tmp_path, process, disable_extractors
conn.commit()
conn.close()
assert row == ('https://example.com', '0.9.0')
assert row == ("https://example.com", "0.9.0")
assert legacy_dir.is_symlink()
migrated_dir = legacy_dir.resolve()
assert migrated_dir.exists()
assert (migrated_dir / 'index.jsonl').exists()
assert (migrated_dir / 'singlefile.html').exists()
assert (migrated_dir / "index.jsonl").exists()
assert (migrated_dir / "singlefile.html").exists()
@pytest.mark.django_db
def test_reindex_snapshots_resets_existing_search_results_and_reruns_requested_plugins(monkeypatch):
    """``reindex_snapshots`` resets an existing search result and re-runs its plugin.

    A sealed snapshot is given a succeeded ``search_backend_sqlite`` result
    with populated output fields. ``run_plugins`` in
    ``archivebox.cli.archivebox_extract`` is replaced by a recorder so no real
    plugin executes. After the call, the result must be back in ``QUEUED``
    with cleared outputs, and ``run_plugins`` must have been invoked with one
    ``ArchiveResult`` record for that snapshot/plugin pair.
    """
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.cli.archivebox_update import reindex_snapshots
    from archivebox.core.models import ArchiveResult, Snapshot
    from archivebox.crawls.models import Crawl
    import archivebox.cli.archivebox_extract as extract_mod

    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
    )
    snapshot = Snapshot.objects.create(
        url="https://example.com",
        crawl=crawl,
        status=Snapshot.StatusChoices.SEALED,
    )
    # Pre-existing search-index result whose outputs the reindex should clear.
    result = ArchiveResult.objects.create(
        snapshot=snapshot,
        plugin="search_backend_sqlite",
        hook_name="on_Snapshot__90_index_sqlite.py",
        status=ArchiveResult.StatusChoices.SUCCEEDED,
        output_str="old index hit",
        output_json={"indexed": True},
        output_files={"search.sqlite3": {"size": 123}},
        output_size=123,
    )

    captured: dict[str, object] = {}

    def fake_run_plugins(*, args, records, wait, emit_results, plugins=""):
        # Record every keyword argument so the exact call can be asserted below.
        captured["args"] = args
        captured["records"] = records
        captured["wait"] = wait
        captured["emit_results"] = emit_results
        captured["plugins"] = plugins
        return 0

    monkeypatch.setattr(extract_mod, "run_plugins", fake_run_plugins)

    stats = reindex_snapshots(
        Snapshot.objects.filter(id=snapshot.id),
        search_plugins=["search_backend_sqlite"],
        batch_size=10,
    )

    result.refresh_from_db()

    assert stats["processed"] == 1
    assert stats["queued"] == 1
    assert stats["reindexed"] == 1
    assert result.status == ArchiveResult.StatusChoices.QUEUED
    assert result.output_str == ""
    assert result.output_json is None
    assert result.output_files == {}
    assert captured == {
        "args": (),
        "records": [{"type": "ArchiveResult", "snapshot_id": str(snapshot.id), "plugin": "search_backend_sqlite"}],
        "wait": True,
        "emit_results": False,
        "plugins": "",
    }
@pytest.mark.django_db
def test_build_filtered_snapshots_queryset_respects_resume_cutoff():
    """``resume=<timestamp>`` keeps that snapshot and older ones, dropping newer ones.

    Three snapshots are bookmarked one hour apart. Resuming from the middle
    snapshot's timestamp must yield exactly the middle and older snapshots.
    """
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.cli.archivebox_update import _build_filtered_snapshots_queryset
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl

    shared_crawl = Crawl.objects.create(
        urls="https://example.com\nhttps://example.org\nhttps://example.net",
        created_by_id=get_or_create_system_user_pk(),
    )
    reference_time = timezone.make_aware(datetime(2026, 3, 23, 12, 0, 0))

    def bookmark(url, hours_before):
        # All snapshots share one crawl; only URL and bookmark time vary.
        return Snapshot.objects.create(
            url=url,
            crawl=shared_crawl,
            bookmarked_at=reference_time - timedelta(hours=hours_before),
        )

    # Creation order (oldest first) matches the original test.
    older = bookmark("https://example.net", 2)
    middle = bookmark("https://example.org", 1)
    newer = Snapshot.objects.create(
        url="https://example.com",
        crawl=shared_crawl,
        bookmarked_at=reference_time,
    )

    resumed_queryset = _build_filtered_snapshots_queryset(
        filter_patterns=(),
        filter_type="exact",
        before=None,
        after=None,
        resume=middle.timestamp,
    )
    matched_ids = {str(pk) for pk in resumed_queryset.values_list("id", flat=True)}

    assert str(newer.id) not in matched_ids
    assert matched_ids == {str(middle.id), str(older.id)}
@pytest.mark.django_db
def test_reconcile_with_index_json_tolerates_null_title(tmp_path):
    """A ``"title": null`` in index.json must not overwrite the stored title.

    Writes an index.json with a null title into the snapshot's output
    directory, runs ``reconcile_with_index_json()``, and checks the title
    stored in the database is still "Example Domain".
    """
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl

    owner_pk = get_or_create_system_user_pk()
    parent_crawl = Crawl.objects.create(urls="https://example.com", created_by_id=owner_pk)
    snapshot = Snapshot.objects.create(
        url="https://example.com",
        crawl=parent_crawl,
        title="Example Domain",
        status=Snapshot.StatusChoices.SEALED,
    )

    snapshot_dir = snapshot.output_dir
    snapshot_dir.mkdir(parents=True, exist_ok=True)

    # The on-disk index deliberately carries a null title.
    index_payload = {
        "url": snapshot.url,
        "timestamp": snapshot.timestamp,
        "title": None,
        "archive_results": [],
    }
    index_path = snapshot_dir / "index.json"
    index_path.write_text(json.dumps(index_payload))

    snapshot.reconcile_with_index_json()
    snapshot.refresh_from_db()

    assert snapshot.title == "Example Domain"

View File

@@ -49,19 +49,22 @@ def _build_script(body: str) -> str:
from django.contrib.auth import get_user_model
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.crawls.models import Crawl
from archivebox.config.common import SERVER_CONFIG
from archivebox.core.host_utils import (
get_admin_host,
get_admin_base_url,
get_api_host,
get_web_host,
get_web_base_url,
get_public_host,
get_snapshot_subdomain,
get_snapshot_host,
get_original_host,
get_listen_subdomain,
split_host_port,
host_matches,
is_snapshot_subdomain,
build_admin_url,
build_snapshot_url,
)
@@ -82,45 +85,12 @@ def _build_script(body: str) -> str:
def get_snapshot():
snapshot = Snapshot.objects.order_by("-created_at").first()
if snapshot is None:
admin = ensure_admin_user()
crawl = Crawl.objects.create(
urls="https://example.com",
created_by=admin,
)
snapshot = Snapshot.objects.create(
url="https://example.com",
title="Example Domain",
crawl=crawl,
status=Snapshot.StatusChoices.SEALED,
)
snapshot_dir = Path(snapshot.output_dir)
snapshot_dir.mkdir(parents=True, exist_ok=True)
(snapshot_dir / "index.json").write_text('{"url": "https://example.com"}', encoding="utf-8")
(snapshot_dir / "favicon.ico").write_bytes(b"ico")
screenshot_dir = snapshot_dir / "screenshot"
screenshot_dir.mkdir(parents=True, exist_ok=True)
(screenshot_dir / "screenshot.png").write_bytes(b"png")
responses_root = snapshot_dir / "responses" / snapshot.domain
responses_root.mkdir(parents=True, exist_ok=True)
(responses_root / "index.html").write_text(
"<!doctype html><html><body><h1>Example Domain</h1></body></html>",
encoding="utf-8",
)
ArchiveResult.objects.get_or_create(
snapshot=snapshot,
plugin="screenshot",
defaults={"status": "succeeded", "output_size": 1, "output_str": "."},
)
ArchiveResult.objects.get_or_create(
snapshot=snapshot,
plugin="responses",
defaults={"status": "succeeded", "output_size": 1, "output_str": "."},
)
assert snapshot is not None, "Expected real_archive_with_example to seed a snapshot"
return snapshot
def get_snapshot_files(snapshot):
output_rel = None
reserved_snapshot_paths = {"index.html"}
for output in snapshot.discover_outputs():
candidate = output.get("path")
if not candidate:
@@ -144,10 +114,22 @@ def _build_script(body: str) -> str:
if not candidate.is_file():
continue
rel = candidate.relative_to(responses_root)
if str(rel) in reserved_snapshot_paths:
continue
if not (Path(snapshot.output_dir) / rel).exists():
response_file = candidate
response_rel = str(rel)
break
if response_file is None:
for candidate in responses_root.rglob("*"):
if not candidate.is_file():
continue
rel = candidate.relative_to(responses_root)
if str(rel) in reserved_snapshot_paths:
continue
response_file = candidate
response_rel = str(rel)
break
if response_file is None:
response_file = next(p for p in responses_root.rglob("*") if p.is_file())
response_rel = str(response_file.relative_to(responses_root))
@@ -170,7 +152,7 @@ def _build_script(body: str) -> str:
encoding="utf-8",
)
return "dangerous.html", "safe.json", "dangerous-response"
"""
""",
)
return prelude + "\n" + textwrap.dedent(body)
@@ -179,13 +161,26 @@ class TestUrlRouting:
data_dir: Path
@pytest.fixture(autouse=True)
def _setup_data_dir(self, initialized_archive: Path) -> None:
self.data_dir = initialized_archive
def _setup_data_dir(self, real_archive_with_example: Path) -> None:
self.data_dir = real_archive_with_example
def _run(self, body: str, timeout: int = 120, mode: str | None = None) -> None:
def _run(
self,
body: str,
timeout: int = 120,
mode: str | None = None,
env_overrides: dict[str, str] | None = None,
) -> None:
script = _build_script(body)
env_overrides = {"SERVER_SECURITY_MODE": mode} if mode else None
result = _run_python(script, cwd=self.data_dir, timeout=timeout, env_overrides=env_overrides)
merged_env = dict(env_overrides or {})
if mode:
merged_env["SERVER_SECURITY_MODE"] = mode
result = _run_python(
script,
cwd=self.data_dir,
timeout=timeout,
env_overrides=merged_env or None,
)
assert result.returncode == 0, result.stderr
assert "OK" in result.stdout
@@ -200,6 +195,7 @@ class TestUrlRouting:
admin_host = get_admin_host()
api_host = get_api_host()
public_host = get_public_host()
snapshot_subdomain = get_snapshot_subdomain(snapshot_id)
snapshot_host = get_snapshot_host(snapshot_id)
original_host = get_original_host(domain)
base_host = SERVER_CONFIG.LISTEN_HOST
@@ -211,15 +207,17 @@ class TestUrlRouting:
assert admin_host == "admin.archivebox.localhost:8000"
assert api_host == "api.archivebox.localhost:8000"
assert public_host == "public.archivebox.localhost:8000"
assert snapshot_host == f"{snapshot_id}.archivebox.localhost:8000"
assert snapshot_subdomain == f"snap-{snapshot_id[-12:].lower()}"
assert snapshot_host == f"{snapshot_subdomain}.archivebox.localhost:8000"
assert original_host == f"{domain}.archivebox.localhost:8000"
assert get_listen_subdomain(web_host) == "web"
assert get_listen_subdomain(admin_host) == "admin"
assert get_listen_subdomain(api_host) == "api"
assert get_listen_subdomain(snapshot_host) == snapshot_id
assert get_listen_subdomain(snapshot_host) == snapshot_subdomain
assert get_listen_subdomain(original_host) == domain
assert get_listen_subdomain(base_host) == ""
assert host_matches(web_host, get_web_host())
assert is_snapshot_subdomain(snapshot_subdomain)
assert is_snapshot_subdomain(snapshot_id)
client = Client()
@@ -236,37 +234,77 @@ class TestUrlRouting:
assert resp["Location"].startswith("/api/")
print("OK")
"""
""",
)
def test_web_admin_routing(self) -> None:
"""Check how /admin/ and snapshot paths are routed across the listen hosts.

The embedded script asserts that:
- ``/admin/login/`` requested on the web, public, snapshot and
  original-domain hosts redirects (301/302) to the admin host, keeping
  the ``?next=/admin/`` query string where one was supplied;
- ``/admin/login/`` on the admin host itself returns 200;
- the snapshot's ``url_path`` (with or without ``/index.html``) requested
  on the admin host redirects to ``http://<snapshot_host>``;
- ``/static/jquery.min.js`` is served with a javascript content type on
  both the snapshot host and the original-domain host.
"""
self._run(
"""
ensure_admin_user()
snapshot = get_snapshot()
client = Client()
web_host = get_web_host()
public_host = get_public_host()
admin_host = get_admin_host()
snapshot_host = get_snapshot_host(str(snapshot.id))
original_host = get_original_host(snapshot.domain)
resp = client.get("/admin/login/", HTTP_HOST=web_host)
assert resp.status_code in (301, 302)
assert admin_host in resp["Location"]
resp = client.get("/admin/login/?next=/admin/", HTTP_HOST=public_host)
assert resp.status_code in (301, 302)
assert resp["Location"] == f"http://{admin_host}/admin/login/?next=/admin/"
resp = client.get("/admin/login/?next=/admin/", HTTP_HOST=snapshot_host)
assert resp.status_code in (301, 302)
assert resp["Location"] == f"http://{admin_host}/admin/login/?next=/admin/"
resp = client.get("/admin/login/?next=/admin/", HTTP_HOST=original_host)
assert resp.status_code in (301, 302)
assert resp["Location"] == f"http://{admin_host}/admin/login/?next=/admin/"
resp = client.get("/admin/login/", HTTP_HOST=admin_host)
assert resp.status_code == 200
resp = client.get(f"/{snapshot.url_path}", HTTP_HOST=admin_host)
assert resp.status_code in (301, 302)
assert resp["Location"] == f"http://{snapshot_host}"
resp = client.get(f"/{snapshot.url_path}/index.html", HTTP_HOST=admin_host)
assert resp.status_code in (301, 302)
assert resp["Location"] == f"http://{snapshot_host}"
resp = client.get("/static/jquery.min.js", HTTP_HOST=snapshot_host)
assert resp.status_code == 200
assert "javascript" in (resp.headers.get("Content-Type") or "")
resp = client.get("/static/jquery.min.js", HTTP_HOST=original_host)
assert resp.status_code == 200
assert "javascript" in (resp.headers.get("Content-Type") or "")
print("OK")
"""
""",
)
def test_snapshot_routing_and_hosts(self) -> None:
self._run(
"""
import io
import zipfile
snapshot = get_snapshot()
output_rel, response_file, response_rel, response_output_path = get_snapshot_files(snapshot)
snapshot_id = str(snapshot.id)
snapshot_subdomain = get_snapshot_subdomain(snapshot_id)
snapshot_host = get_snapshot_host(snapshot_id)
original_host = get_original_host(snapshot.domain)
web_host = get_web_host()
host_only, port = split_host_port(SERVER_CONFIG.LISTEN_HOST)
legacy_snapshot_host = f"{snapshot_id}.{host_only}"
if port:
legacy_snapshot_host = f"{legacy_snapshot_host}:{port}"
client = Client()
@@ -289,6 +327,11 @@ class TestUrlRouting:
assert resp.status_code in (301, 302)
assert snapshot_host in resp["Location"]
resp = client.get("/", HTTP_HOST=legacy_snapshot_host)
assert resp.status_code in (301, 302)
assert resp["Location"].startswith(f"http://{snapshot_host}")
assert snapshot_subdomain in resp["Location"]
resp = client.get(f"/{output_rel}", HTTP_HOST=snapshot_host)
assert resp.status_code == 200
assert response_body(resp) == Path(snapshot.output_dir, output_rel).read_bytes()
@@ -296,7 +339,10 @@ class TestUrlRouting:
resp = client.get(f"/{response_rel}", HTTP_HOST=snapshot_host)
assert resp.status_code == 200
snapshot_body = response_body(resp)
if response_output_path.exists():
if response_rel == "index.html":
assert f"http://{snapshot_host}/".encode() in snapshot_body
assert b"See all files..." in snapshot_body
elif response_output_path.exists():
assert snapshot_body == response_output_path.read_bytes()
else:
assert snapshot_body == response_file.read_bytes()
@@ -319,8 +365,149 @@ class TestUrlRouting:
files_html = response_body(resp).decode("utf-8", "ignore")
assert output_rel.split("/", 1)[0] in files_html
resp = client.get("/?files=1&download=zip", HTTP_HOST=snapshot_host)
assert resp.status_code == 200
assert resp["Content-Type"] == "application/zip"
assert ".zip" in resp["Content-Disposition"]
assert resp.streaming
with zipfile.ZipFile(io.BytesIO(response_body(resp))) as zip_file:
assert any(name.endswith(f"/{output_rel}") for name in zip_file.namelist())
output_dir = next((output.get("path", "").split("/", 1)[0] for output in snapshot.discover_outputs() if "/" in (output.get("path") or "")), None)
assert output_dir is not None
resp = client.get(f"/{output_dir}/", HTTP_HOST=snapshot_host)
assert resp.status_code == 200
dir_html = response_body(resp).decode("utf-8", "ignore")
assert f"Index of {output_dir}/" in dir_html
print("OK")
""",
)
def test_safe_subdomains_original_domain_host_uses_latest_matching_response(self) -> None:
"""Original-domain host should serve the newest saved response for a path.

The embedded script creates four extra snapshots (two for
``https://example.com`` and two for ``https://example.com/about.html``),
stamps them one minute apart, and writes a distinct body into each
snapshot's ``responses/<domain>/`` directory. It then requests ``/`` and
``/about.html`` on the original-domain host and asserts that only the
"new ..." body is returned, never the "old ..." one.

No ``mode`` is passed to ``_run``; the script itself asserts that
``SERVER_SECURITY_MODE`` is ``"safe-subdomains-fullreplay"``. The created
snapshot output directories and crawls are removed in the script's
``finally`` block.
"""
self._run(
"""
from datetime import timedelta
import shutil
from django.utils import timezone
from archivebox.crawls.models import Crawl
snapshot = get_snapshot()
original_host = get_original_host(snapshot.domain)
client = Client()
assert SERVER_CONFIG.SERVER_SECURITY_MODE == "safe-subdomains-fullreplay"
now = timezone.now()
created_by_id = snapshot.crawl.created_by_id
created_snapshots = []
created_crawls = []
def make_snapshot(url):
crawl = Crawl.objects.create(urls=url, created_by_id=created_by_id)
created_crawls.append(crawl)
snap = Snapshot.objects.create(url=url, crawl=crawl, status=Snapshot.StatusChoices.STARTED)
created_snapshots.append(snap)
return snap
try:
fixtures = (
(make_snapshot("https://example.com"), now + timedelta(minutes=1), "old root"),
(make_snapshot("https://example.com"), now + timedelta(minutes=2), "new root"),
(make_snapshot("https://example.com/about.html"), now + timedelta(minutes=3), "old about"),
(make_snapshot("https://example.com/about.html"), now + timedelta(minutes=4), "new about"),
)
for snap, stamp, content in fixtures:
snap.created_at = stamp
snap.bookmarked_at = stamp
snap.downloaded_at = stamp
snap.save(update_fields=["created_at", "bookmarked_at", "downloaded_at", "modified_at"])
responses_root = Path(snap.output_dir) / "responses" / snap.domain
responses_root.mkdir(parents=True, exist_ok=True)
rel_path = "about.html" if snap.url.endswith("/about.html") else "index.html"
(responses_root / rel_path).write_text(content, encoding="utf-8")
resp = client.get("/", HTTP_HOST=original_host)
assert resp.status_code == 200
root_html = response_body(resp).decode("utf-8", "ignore")
assert "new root" in root_html
assert "old root" not in root_html
resp = client.get("/about.html", HTTP_HOST=original_host)
assert resp.status_code == 200
about_html = response_body(resp).decode("utf-8", "ignore")
assert "new about" in about_html
assert "old about" not in about_html
finally:
for snap in created_snapshots:
shutil.rmtree(snap.output_dir, ignore_errors=True)
for crawl in created_crawls:
crawl.delete()
print("OK")
""",
)
def test_safe_subdomains_original_domain_host_falls_back_to_latest_snapshot_live_page(self) -> None:
"""Original-domain host should fall back to a snapshot page when no response file exists.

The embedded script creates a snapshot for
``fallback-original-host.example``, removes its ``responses`` directory,
and requests ``/`` on that domain's original host. It expects a 200 whose
HTML contains both the snapshot URL and a link to the snapshot's own host.

No ``mode`` is passed to ``_run``; the script asserts that
``SERVER_SECURITY_MODE`` is ``"safe-subdomains-fullreplay"``. The snapshot
output directory and crawl are removed in the script's ``finally`` block.
"""
self._run(
"""
import shutil
from django.utils import timezone
from archivebox.crawls.models import Crawl
snapshot = get_snapshot()
fallback_domain = "fallback-original-host.example"
original_host = get_original_host(fallback_domain)
client = Client()
assert SERVER_CONFIG.SERVER_SECURITY_MODE == "safe-subdomains-fullreplay"
crawl = Crawl.objects.create(urls=f"https://{fallback_domain}", created_by_id=snapshot.crawl.created_by_id)
latest_snapshot = Snapshot.objects.create(
url=f"https://{fallback_domain}",
crawl=crawl,
status=Snapshot.StatusChoices.STARTED,
)
stamp = timezone.now()
latest_snapshot.created_at = stamp
latest_snapshot.bookmarked_at = stamp
latest_snapshot.downloaded_at = stamp
latest_snapshot.save(update_fields=["created_at", "bookmarked_at", "downloaded_at", "modified_at"])
try:
shutil.rmtree(Path(latest_snapshot.output_dir) / "responses", ignore_errors=True)
resp = client.get("/", HTTP_HOST=original_host)
assert resp.status_code == 200
html = response_body(resp).decode("utf-8", "ignore")
assert latest_snapshot.url in html
assert f"http://{get_snapshot_host(str(latest_snapshot.id))}/" in html
finally:
shutil.rmtree(latest_snapshot.output_dir, ignore_errors=True)
crawl.delete()
print("OK")
""",
)
def test_safe_subdomains_original_domain_host_redirects_to_save_page_now_when_missing_and_authenticated(self) -> None:
"""A logged-in user hitting an unarchived original-domain host is redirected to /web/<url>.

The embedded script logs in as ``testadmin`` and requests ``/`` on the
original host for ``missing-original-host.example``. It expects a 301/302
whose ``Location`` is ``http://<web_host>/web/https://<missing_domain>``.
"""
self._run(
"""
ensure_admin_user()
client = Client()
client.login(username="testadmin", password="testpassword")
missing_domain = "missing-original-host.example"
original_host = get_original_host(missing_domain)
resp = client.get("/", HTTP_HOST=original_host)
assert resp.status_code in (301, 302)
assert resp["Location"] == f"http://{get_web_host()}/web/https://{missing_domain}"
print("OK")
""",
)
def test_safe_subdomains_fullreplay_leaves_risky_replay_unrestricted(self) -> None:
@@ -346,7 +533,7 @@ class TestUrlRouting:
assert resp.headers.get("Content-Security-Policy") is None
print("OK")
"""
""",
)
def test_safe_onedomain_nojsreplay_routes_and_neuters_risky_documents(self) -> None:
@@ -396,6 +583,9 @@ class TestUrlRouting:
assert resp.headers.get("Content-Security-Policy") is None
assert resp.headers.get("X-Content-Type-Options") == "nosniff"
resp = client.get("/snapshot/{}/singlefile/".format(snapshot_id), HTTP_HOST=base_host)
assert resp.status_code == 404
resp = client.get(f"/snapshot/{snapshot_id}/{sniffed_rel}", HTTP_HOST=base_host)
assert resp.status_code == 200
csp = resp.headers.get("Content-Security-Policy") or ""
@@ -486,6 +676,33 @@ class TestUrlRouting:
mode="danger-onedomain-fullreplay",
)
def test_onedomain_base_url_overrides_are_preserved_for_external_links(self) -> None:
"""In one-domain mode, base-URL env overrides drive generated links but not host routing.

Runs the script with ``mode="safe-onedomain-nojsreplay"`` plus the
``ADMIN_BASE_URL`` and ``ARCHIVE_BASE_URL`` environment overrides. The
script asserts that the admin and web hosts both equal
``SERVER_CONFIG.LISTEN_HOST``, while ``get_admin_base_url()``,
``get_web_base_url()``, ``build_admin_url()`` and ``build_snapshot_url()``
all return URLs rooted at the overridden base URLs.
"""
self._run(
"""
snapshot = get_snapshot()
snapshot_id = str(snapshot.id)
base_host = SERVER_CONFIG.LISTEN_HOST
assert SERVER_CONFIG.SERVER_SECURITY_MODE == "safe-onedomain-nojsreplay"
assert get_admin_host() == base_host
assert get_web_host() == base_host
assert get_admin_base_url() == "https://admin.archivebox.example"
assert get_web_base_url() == "https://archivebox.example"
assert build_admin_url("/admin/login/") == "https://admin.archivebox.example/admin/login/"
assert build_snapshot_url(snapshot_id, "index.jsonl") == (
f"https://archivebox.example/snapshot/{snapshot_id}/index.jsonl"
)
print("OK")
""",
mode="safe-onedomain-nojsreplay",
env_overrides={
"ADMIN_BASE_URL": "https://admin.archivebox.example",
"ARCHIVE_BASE_URL": "https://archivebox.example",
},
)
def test_template_and_admin_links(self) -> None:
self._run(
"""
@@ -510,6 +727,25 @@ class TestUrlRouting:
live_html = response_body(resp).decode("utf-8", "ignore")
assert f"http://{snapshot_host}/" in live_html
assert f"http://{public_host}/static/archive.png" in live_html
assert "?preview=1" in live_html
assert "function createMainFrame(previousFrame)" in live_html
assert "function activateCardPreview(card, link)" in live_html
assert "ensureMainFrame(true)" in live_html
assert "previousFrame.parentNode.replaceChild(frame, previousFrame)" in live_html
assert "previousFrame.src = 'about:blank'" in live_html
assert "event.stopImmediatePropagation()" in live_html
assert "const matchingLink = [...document.querySelectorAll('a[target=preview]')].find" in live_html
assert "jQuery(link).click()" not in live_html
assert "searchParams.delete('preview')" in live_html
assert "doc.body.style.flexDirection = 'column'" in live_html
assert "doc.body.style.alignItems = 'center'" in live_html
assert "img.style.margin = '0 auto'" in live_html
assert "window.location.hash = getPreviewHashValue(link)" in live_html
assert "const selectedPreviewHash = decodeURIComponent(window.location.hash.slice(1)).toLowerCase()" in live_html
assert "pointer-events: none;" in live_html
assert "pointer-events: auto;" in live_html
assert 'class="thumbnail-click-overlay"' in live_html
assert "window.location.hash = getPreviewTypeFromPath(link)" not in live_html
assert ">WARC<" not in live_html
assert ">Media<" not in live_html
assert ">Git<" not in live_html
@@ -517,6 +753,25 @@ class TestUrlRouting:
static_html = Path(snapshot.output_dir, "index.html").read_text(encoding="utf-8", errors="ignore")
assert f"http://{snapshot_host}/" in static_html
assert f"http://{public_host}/static/archive.png" in static_html
assert "?preview=1" in static_html
assert "function createMainFrame(previousFrame)" in static_html
assert "function activateCardPreview(card, link)" in static_html
assert "ensureMainFrame(true)" in static_html
assert "previousFrame.parentNode.replaceChild(frame, previousFrame)" in static_html
assert "previousFrame.src = 'about:blank'" in static_html
assert "e.stopImmediatePropagation()" in static_html
assert "const matchingLink = [...document.querySelectorAll('a[target=preview]')].find" in static_html
assert "jQuery(link).click()" not in static_html
assert "searchParams.delete('preview')" in static_html
assert "doc.body.style.flexDirection = 'column'" in static_html
assert "doc.body.style.alignItems = 'center'" in static_html
assert "img.style.margin = '0 auto'" in static_html
assert "window.location.hash = getPreviewHashValue(link)" in static_html
assert "const selectedPreviewHash = decodeURIComponent(window.location.hash.slice(1)).toLowerCase()" in static_html
assert "pointer-events: none;" in static_html
assert "pointer-events: auto;" in static_html
assert 'class="thumbnail-click-overlay"' in static_html
assert "window.location.hash = getPreviewTypeFromPath(link)" not in static_html
assert ">WARC<" not in static_html
assert ">Media<" not in static_html
assert ">Git<" not in static_html
@@ -536,7 +791,53 @@ class TestUrlRouting:
assert f"http://{snapshot_host}/" in ar_html
print("OK")
""",
)
def test_snapshot_pages_preview_filesystem_text_outputs(self) -> None:
"""Snapshot pages should preview text outputs found on disk, and ``?preview=1`` should return HTML.

The embedded script:
- writes a two-line ``consolelog/console.jsonl`` into the snapshot
  directory and checks that the snapshot index page on the web host
  contains a compact ``consolelog`` card showing the first log line;
- requests ``/consolelog/console.jsonl?preview=1`` on the snapshot host
  and expects a ``text/html`` response containing the
  ``archivebox-text-preview`` marker and the log text;
- writes a minimal PNG (decoded from the hex literal) to
  ``screenshot/screenshot.png`` and expects its ``?preview=1`` response to
  be ``text/html`` as well.
"""
self._run(
"""
snapshot = get_snapshot()
web_host = get_web_host()
consolelog_dir = Path(snapshot.output_dir) / "consolelog"
consolelog_dir.mkdir(parents=True, exist_ok=True)
(consolelog_dir / "console.jsonl").write_text(
'{"level":"log","text":"console preview works"}\\n'
'{"level":"warn","text":"second line"}\\n',
encoding="utf-8",
)
client = Client()
resp = client.get(f"/{snapshot.url_path}/index.html", HTTP_HOST=web_host)
assert resp.status_code == 200
live_html = response_body(resp).decode("utf-8", "ignore")
assert 'data-plugin="consolelog" data-compact="1"' in live_html
assert "console preview works" in live_html
snapshot_host = get_snapshot_host(str(snapshot.id))
resp = client.get("/consolelog/console.jsonl?preview=1", HTTP_HOST=snapshot_host)
assert resp.status_code == 200
assert resp["Content-Type"].startswith("text/html")
preview_html = response_body(resp).decode("utf-8", "ignore")
assert "archivebox-text-preview" in preview_html
assert "console preview works" in preview_html
screenshot_dir = Path(snapshot.output_dir) / "screenshot"
screenshot_dir.mkdir(parents=True, exist_ok=True)
(screenshot_dir / "screenshot.png").write_bytes(
bytes.fromhex(
"89504e470d0a1a0a"
"0000000d49484452000000010000000108060000001f15c489"
"0000000d49444154789c63f8ffffff7f0009fb03fd2a86e38a"
"0000000049454e44ae426082",
),
)
resp = client.get("/screenshot/screenshot.png?preview=1", HTTP_HOST=snapshot_host)
assert resp.status_code == 200
assert resp["Content-Type"].startswith("text/html")
print("OK")
""",
)
def test_api_available_on_admin_and_api_hosts(self) -> None:
@@ -553,7 +854,7 @@ class TestUrlRouting:
assert resp.status_code == 200
print("OK")
"""
""",
)
def test_api_auth_token_endpoint_available_on_admin_and_api_hosts(self) -> None:
@@ -587,7 +888,7 @@ class TestUrlRouting:
assert data.get("token")
print("OK")
"""
""",
)
def test_api_post_with_token_on_admin_and_api_hosts(self) -> None:
@@ -631,5 +932,5 @@ class TestUrlRouting:
assert data.get("tag_name") == "apitest-tag"
print("OK")
"""
""",
)

View File

@@ -16,6 +16,7 @@ class _ExampleHandler(BaseHTTPRequestHandler):
def log_message(self, format, *args):
return
def test_download_url_downloads_content():
server = ThreadingHTTPServer(("127.0.0.1", 0), _ExampleHandler)
thread = Thread(target=server.serve_forever, daemon=True)