mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
462 lines
15 KiB
Python
462 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Comprehensive tests for archivebox add command.
|
|
Verify add creates snapshots in DB, crawls, source files, and archive directories.
|
|
"""
|
|
|
|
import os
|
|
import sqlite3
|
|
import subprocess
|
|
from pathlib import Path
|
|
|
|
|
|
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
|
|
candidates = {snapshot_id}
|
|
if len(snapshot_id) == 32:
|
|
candidates.add(f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}")
|
|
elif len(snapshot_id) == 36 and "-" in snapshot_id:
|
|
candidates.add(snapshot_id.replace("-", ""))
|
|
|
|
for needle in candidates:
|
|
for path in data_dir.rglob(needle):
|
|
if path.is_dir():
|
|
return path
|
|
return None
|
|
|
|
|
|
def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extractors_dict):
|
|
"""Test that adding a single URL creates a snapshot in the database."""
|
|
os.chdir(tmp_path)
|
|
result = subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
assert result.returncode == 0
|
|
|
|
conn = sqlite3.connect("index.sqlite3")
|
|
c = conn.cursor()
|
|
snapshots = c.execute("SELECT url FROM core_snapshot").fetchall()
|
|
conn.close()
|
|
|
|
assert len(snapshots) == 1
|
|
assert snapshots[0][0] == "https://example.com"
|
|
|
|
|
|
def test_add_bg_creates_root_snapshot_rows_immediately(tmp_path, process, disable_extractors_dict):
|
|
"""Background add should create root snapshots immediately so the queue is visible in the DB."""
|
|
os.chdir(tmp_path)
|
|
result = subprocess.run(
|
|
["archivebox", "add", "--bg", "--depth=0", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
assert result.returncode == 0
|
|
|
|
conn = sqlite3.connect("index.sqlite3")
|
|
c = conn.cursor()
|
|
snapshots = c.execute("SELECT url, status FROM core_snapshot").fetchall()
|
|
conn.close()
|
|
|
|
assert len(snapshots) == 1
|
|
assert snapshots[0][0] == "https://example.com"
|
|
assert snapshots[0][1] == "queued"
|
|
|
|
|
|
def test_add_creates_crawl_record(tmp_path, process, disable_extractors_dict):
|
|
"""Test that add command creates a Crawl record in the database."""
|
|
os.chdir(tmp_path)
|
|
subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
conn = sqlite3.connect("index.sqlite3")
|
|
c = conn.cursor()
|
|
crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
|
|
conn.close()
|
|
|
|
assert crawl_count == 1
|
|
|
|
|
|
def test_add_creates_source_file(tmp_path, process, disable_extractors_dict):
|
|
"""Test that add creates a source file with the URL."""
|
|
os.chdir(tmp_path)
|
|
subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
sources_dir = tmp_path / "sources"
|
|
assert sources_dir.exists()
|
|
|
|
source_files = list(sources_dir.glob("*cli_add.txt"))
|
|
assert len(source_files) >= 1
|
|
|
|
source_content = source_files[0].read_text()
|
|
assert "https://example.com" in source_content
|
|
|
|
|
|
def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_dict):
|
|
"""Test adding multiple URLs in a single command."""
|
|
os.chdir(tmp_path)
|
|
result = subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=0", "https://example.com", "https://example.org"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
assert result.returncode == 0
|
|
|
|
conn = sqlite3.connect("index.sqlite3")
|
|
c = conn.cursor()
|
|
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
|
|
urls = c.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall()
|
|
conn.close()
|
|
|
|
assert snapshot_count == 2
|
|
assert urls[0][0] == "https://example.com"
|
|
assert urls[1][0] == "https://example.org"
|
|
|
|
|
|
def test_add_from_file(tmp_path, process, disable_extractors_dict):
|
|
"""Test adding URLs from a file.
|
|
|
|
The add command should treat a file argument as URL input and create snapshots
|
|
for each URL it contains.
|
|
"""
|
|
os.chdir(tmp_path)
|
|
|
|
# Create a file with URLs
|
|
urls_file = tmp_path / "urls.txt"
|
|
urls_file.write_text("https://example.com\nhttps://example.org\n")
|
|
|
|
result = subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=0", str(urls_file)],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
assert result.returncode == 0
|
|
|
|
conn = sqlite3.connect("index.sqlite3")
|
|
c = conn.cursor()
|
|
crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
|
|
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
|
|
conn.close()
|
|
|
|
# The file is parsed into two input URLs.
|
|
assert crawl_count == 1
|
|
assert snapshot_count == 2
|
|
|
|
|
|
def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):
|
|
"""Test that --depth=0 flag is accepted and works."""
|
|
os.chdir(tmp_path)
|
|
result = subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
assert result.returncode == 0
|
|
assert "unrecognized arguments: --depth" not in result.stderr.decode("utf-8")
|
|
|
|
|
|
def test_add_with_depth_1_flag(tmp_path, process, disable_extractors_dict):
|
|
"""Test that --depth=1 flag is accepted."""
|
|
os.chdir(tmp_path)
|
|
result = subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=1", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
assert result.returncode == 0
|
|
assert "unrecognized arguments: --depth" not in result.stderr.decode("utf-8")
|
|
|
|
|
|
def test_add_rejects_invalid_depth_values(tmp_path, process, disable_extractors_dict):
|
|
"""Test that add rejects depth values outside the supported range."""
|
|
os.chdir(tmp_path)
|
|
|
|
for depth in ("5", "-1"):
|
|
result = subprocess.run(
|
|
["archivebox", "add", "--index-only", f"--depth={depth}", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
stderr = result.stderr.decode("utf-8").lower()
|
|
assert result.returncode != 0
|
|
assert "invalid" in stderr or "not one of" in stderr
|
|
|
|
|
|
def test_add_with_tags(tmp_path, process, disable_extractors_dict):
|
|
"""Test adding URL with tags stores tags_str in crawl.
|
|
|
|
With --index-only, Tag objects are not created until archiving happens.
|
|
Tags are stored as a string in the Crawl.tags_str field.
|
|
"""
|
|
os.chdir(tmp_path)
|
|
subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=0", "--tag=test,example", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
conn = sqlite3.connect("index.sqlite3")
|
|
c = conn.cursor()
|
|
tags_str = c.execute("SELECT tags_str FROM crawls_crawl").fetchone()[0]
|
|
conn.close()
|
|
|
|
# Tags are stored as a comma-separated string in crawl
|
|
assert "test" in tags_str or "example" in tags_str
|
|
|
|
|
|
def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extractors_dict):
|
|
"""Test add persists the selected persona so browser config derives from it later."""
|
|
os.chdir(tmp_path)
|
|
result = subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=0", "--persona=Default", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
assert result.returncode == 0
|
|
|
|
conn = sqlite3.connect("index.sqlite3")
|
|
c = conn.cursor()
|
|
persona_id, default_persona = c.execute(
|
|
"SELECT persona_id, json_extract(config, '$.DEFAULT_PERSONA') FROM crawls_crawl LIMIT 1",
|
|
).fetchone()
|
|
conn.close()
|
|
|
|
assert persona_id
|
|
assert default_persona == "Default"
|
|
assert (tmp_path / "personas" / "Default" / "chrome_user_data").is_dir()
|
|
|
|
|
|
def test_add_records_url_filter_overrides_on_crawl(tmp_path, process, disable_extractors_dict):
|
|
os.chdir(tmp_path)
|
|
result = subprocess.run(
|
|
[
|
|
"archivebox",
|
|
"add",
|
|
"--index-only",
|
|
"--depth=0",
|
|
"--domain-allowlist=example.com,*.example.com",
|
|
"--domain-denylist=static.example.com",
|
|
"https://example.com",
|
|
],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
assert result.returncode == 0
|
|
|
|
conn = sqlite3.connect("index.sqlite3")
|
|
c = conn.cursor()
|
|
allowlist, denylist = c.execute(
|
|
"SELECT json_extract(config, '$.URL_ALLOWLIST'), json_extract(config, '$.URL_DENYLIST') FROM crawls_crawl LIMIT 1",
|
|
).fetchone()
|
|
conn.close()
|
|
|
|
assert allowlist == "example.com,*.example.com"
|
|
assert denylist == "static.example.com"
|
|
assert (tmp_path / "personas" / "Default" / "chrome_extensions").is_dir()
|
|
|
|
|
|
def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict):
|
|
"""Test that adding the same URL twice creates separate crawls and snapshots.
|
|
|
|
Each 'add' command creates a new Crawl. Multiple crawls can archive the same URL.
|
|
This allows re-archiving URLs at different times.
|
|
"""
|
|
os.chdir(tmp_path)
|
|
|
|
# Add URL first time
|
|
subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
# Add same URL second time
|
|
subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
conn = sqlite3.connect("index.sqlite3")
|
|
c = conn.cursor()
|
|
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot WHERE url='https://example.com'").fetchone()[0]
|
|
crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
|
|
conn.close()
|
|
|
|
# Each add creates a new crawl with its own snapshot
|
|
assert crawl_count == 2
|
|
assert snapshot_count == 2
|
|
|
|
|
|
def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
|
|
"""Test that --overwrite flag forces re-archiving."""
|
|
os.chdir(tmp_path)
|
|
|
|
# Add URL first time
|
|
subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
# Add with overwrite
|
|
result = subprocess.run(
|
|
["archivebox", "add", "--index-only", "--overwrite", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
assert result.returncode == 0
|
|
assert "unrecognized arguments: --overwrite" not in result.stderr.decode("utf-8")
|
|
|
|
|
|
def test_add_creates_snapshot_output_directory(tmp_path, process, disable_extractors_dict):
|
|
"""Test that add creates the current snapshot output directory on disk."""
|
|
os.chdir(tmp_path)
|
|
subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
conn = sqlite3.connect("index.sqlite3")
|
|
c = conn.cursor()
|
|
snapshot_id = str(c.execute("SELECT id FROM core_snapshot").fetchone()[0])
|
|
conn.close()
|
|
|
|
snapshot_dir = _find_snapshot_dir(tmp_path, snapshot_id)
|
|
assert snapshot_dir is not None, f"Snapshot output directory not found for {snapshot_id}"
|
|
assert snapshot_dir.is_dir()
|
|
|
|
|
|
def test_add_help_shows_depth_and_tag_options(tmp_path, process):
|
|
"""Test that add --help documents the main filter and crawl options."""
|
|
os.chdir(tmp_path)
|
|
|
|
result = subprocess.run(
|
|
["archivebox", "add", "--help"],
|
|
capture_output=True,
|
|
text=True,
|
|
)
|
|
|
|
assert result.returncode == 0
|
|
assert "--depth" in result.stdout
|
|
assert "--max-urls" in result.stdout
|
|
assert "--max-size" in result.stdout
|
|
assert "--tag" in result.stdout
|
|
|
|
|
|
def test_add_records_max_url_and_size_limits_on_crawl(tmp_path, process, disable_extractors_dict):
|
|
os.chdir(tmp_path)
|
|
result = subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=1", "--max-urls=3", "--max-size=45mb", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
assert result.returncode == 0
|
|
|
|
conn = sqlite3.connect("index.sqlite3")
|
|
c = conn.cursor()
|
|
max_urls, max_size, config_max_urls, config_max_size = c.execute(
|
|
"SELECT max_urls, max_size, json_extract(config, '$.MAX_URLS'), json_extract(config, '$.MAX_SIZE') FROM crawls_crawl LIMIT 1",
|
|
).fetchone()
|
|
conn.close()
|
|
|
|
assert max_urls == 3
|
|
assert max_size == 45 * 1024 * 1024
|
|
assert config_max_urls == 3
|
|
assert config_max_size == 45 * 1024 * 1024
|
|
|
|
|
|
def test_add_without_args_shows_usage(tmp_path, process):
|
|
"""Test that add without URLs fails with a usage hint instead of crashing."""
|
|
os.chdir(tmp_path)
|
|
|
|
result = subprocess.run(
|
|
["archivebox", "add"],
|
|
capture_output=True,
|
|
text=True,
|
|
)
|
|
|
|
combined = result.stdout + result.stderr
|
|
assert result.returncode != 0
|
|
assert "usage" in combined.lower() or "url" in combined.lower()
|
|
|
|
|
|
def test_add_index_only_skips_extraction(tmp_path, process, disable_extractors_dict):
|
|
"""Test that --index-only flag skips extraction (fast)."""
|
|
os.chdir(tmp_path)
|
|
result = subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
timeout=30, # Should be fast
|
|
)
|
|
|
|
assert result.returncode == 0
|
|
|
|
# Snapshot should exist but archive results should be minimal
|
|
conn = sqlite3.connect("index.sqlite3")
|
|
c = conn.cursor()
|
|
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
|
|
conn.close()
|
|
|
|
assert snapshot_count == 1
|
|
|
|
|
|
def test_add_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dict):
|
|
"""Test that add links the snapshot to the crawl via crawl_id."""
|
|
os.chdir(tmp_path)
|
|
subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
conn = sqlite3.connect("index.sqlite3")
|
|
c = conn.cursor()
|
|
|
|
# Get crawl id
|
|
crawl_id = c.execute("SELECT id FROM crawls_crawl").fetchone()[0]
|
|
|
|
# Get snapshot's crawl_id
|
|
snapshot_crawl = c.execute("SELECT crawl_id FROM core_snapshot").fetchone()[0]
|
|
|
|
conn.close()
|
|
|
|
assert snapshot_crawl == crawl_id
|
|
|
|
|
|
def test_add_sets_snapshot_timestamp(tmp_path, process, disable_extractors_dict):
|
|
"""Test that add sets a timestamp on the snapshot."""
|
|
os.chdir(tmp_path)
|
|
subprocess.run(
|
|
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
|
capture_output=True,
|
|
env=disable_extractors_dict,
|
|
)
|
|
|
|
conn = sqlite3.connect("index.sqlite3")
|
|
c = conn.cursor()
|
|
timestamp = c.execute("SELECT timestamp FROM core_snapshot").fetchone()[0]
|
|
conn.close()
|
|
|
|
assert timestamp is not None
|
|
assert len(str(timestamp)) > 0
|