This commit is contained in:
Nick Sweeting
2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions

290
tests/test_cli_add.py Normal file
View File

@@ -0,0 +1,290 @@
#!/usr/bin/env python3
"""
Comprehensive tests for archivebox add command.
Verify add creates snapshots in DB, crawls, source files, and archive directories.
"""
import os
import subprocess
import sqlite3
from pathlib import Path
from .fixtures import *
def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extractors_dict):
    """Adding one URL with --index-only must leave exactly one matching row in core_snapshot."""
    os.chdir(tmp_path)
    add_cmd = ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com']
    completed = subprocess.run(add_cmd, env=disable_extractors_dict, capture_output=True)
    assert completed.returncode == 0

    # Read the stored URLs straight from the SQLite index, then release the handle.
    db = sqlite3.connect("index.sqlite3")
    rows = db.cursor().execute("SELECT url FROM core_snapshot").fetchall()
    db.close()

    assert len(rows) == 1
    first_row = rows[0]
    assert first_row[0] == 'https://example.com'
def test_add_creates_crawl_record(tmp_path, process, disable_extractors_dict):
    """A single `archivebox add` run must leave exactly one row in crawls_crawl."""
    os.chdir(tmp_path)
    add_cmd = ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com']
    subprocess.run(add_cmd, env=disable_extractors_dict, capture_output=True)

    db = sqlite3.connect("index.sqlite3")
    count_row = db.cursor().execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()
    num_crawls = count_row[0]
    db.close()

    assert num_crawls == 1
def test_add_creates_source_file(tmp_path, process, disable_extractors_dict):
    """`archivebox add` must write the submitted URL into a sources/*cli_add.txt file."""
    os.chdir(tmp_path)
    add_cmd = ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com']
    subprocess.run(add_cmd, env=disable_extractors_dict, capture_output=True)

    sources_root = tmp_path / "sources"
    assert sources_root.exists()

    # Any file ending in cli_add.txt counts; only the first match is inspected.
    matches = list(sources_root.glob("*cli_add.txt"))
    assert len(matches) >= 1
    first_source_text = matches[0].read_text()
    assert "https://example.com" in first_source_text
def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_dict):
    """Two URLs passed to one `archivebox add` call must yield two snapshot rows."""
    os.chdir(tmp_path)
    completed = subprocess.run(
        [
            'archivebox', 'add', '--index-only', '--depth=0',
            'https://example.com',
            'https://example.org',
        ],
        env=disable_extractors_dict,
        capture_output=True,
    )
    assert completed.returncode == 0

    db = sqlite3.connect("index.sqlite3")
    cur = db.cursor()
    total = cur.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    ordered_rows = cur.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall()
    db.close()

    assert total == 2
    # Rows are sorted by URL, so .com sorts ahead of .org.
    assert ordered_rows[0][0] == 'https://example.com'
    assert ordered_rows[1][0] == 'https://example.org'
def test_add_from_file(tmp_path, process, disable_extractors_dict):
    """Passing a path to a two-line URL list must produce two snapshot rows."""
    os.chdir(tmp_path)

    # Write the URL list that is handed to the CLI as its only positional argument.
    url_list_path = tmp_path / "urls.txt"
    url_list_path.write_text("https://example.com\nhttps://example.org\n")

    add_cmd = ['archivebox', 'add', '--index-only', '--depth=0', str(url_list_path)]
    completed = subprocess.run(add_cmd, env=disable_extractors_dict, capture_output=True)
    assert completed.returncode == 0

    db = sqlite3.connect("index.sqlite3")
    total = db.cursor().execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    db.close()

    assert total == 2
def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):
    """`--depth=0` must exit 0 and must not be reported as an unrecognized argument."""
    os.chdir(tmp_path)
    add_cmd = ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com']
    completed = subprocess.run(add_cmd, env=disable_extractors_dict, capture_output=True)

    assert completed.returncode == 0
    # stderr is captured as bytes, so decode before searching for the parser error.
    stderr_text = completed.stderr.decode('utf-8')
    assert 'unrecognized arguments: --depth' not in stderr_text
def test_add_with_depth_1_flag(tmp_path, process, disable_extractors_dict):
    """`--depth=1` must exit 0 and must not be reported as an unrecognized argument."""
    os.chdir(tmp_path)
    add_cmd = ['archivebox', 'add', '--index-only', '--depth=1', 'https://example.com']
    completed = subprocess.run(add_cmd, env=disable_extractors_dict, capture_output=True)

    assert completed.returncode == 0
    # stderr is captured as bytes, so decode before searching for the parser error.
    stderr_text = completed.stderr.decode('utf-8')
    assert 'unrecognized arguments: --depth' not in stderr_text
def test_add_with_tags(tmp_path, process, disable_extractors_dict):
    """Adding with `--tag=test,example` must store at least one of those names in core_tag.

    NOTE(review): only one of the two tags has to be present for this test to
    pass — confirm whether both should be required.
    """
    os.chdir(tmp_path)
    add_cmd = [
        'archivebox', 'add', '--index-only', '--depth=0',
        '--tag=test,example',
        'https://example.com',
    ]
    subprocess.run(add_cmd, env=disable_extractors_dict, capture_output=True)

    db = sqlite3.connect("index.sqlite3")
    tag_rows = db.cursor().execute("SELECT name FROM core_tag").fetchall()
    db.close()

    stored_names = [row[0] for row in tag_rows]
    assert any(expected in stored_names for expected in ('test', 'example'))
def test_add_duplicate_url_updates_existing(tmp_path, process, disable_extractors_dict):
    """Submitting the same URL twice must still leave a single snapshot row for it."""
    os.chdir(tmp_path)
    add_cmd = ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com']

    # Run the identical command back to back: first add, then the duplicate.
    for _attempt in range(2):
        subprocess.run(add_cmd, env=disable_extractors_dict, capture_output=True)

    db = sqlite3.connect("index.sqlite3")
    matching = db.cursor().execute(
        "SELECT COUNT(*) FROM core_snapshot WHERE url='https://example.com'"
    ).fetchone()[0]
    db.close()

    # The duplicate add must not have created a second row.
    assert matching == 1
def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
    """Re-adding a URL with `--overwrite` must exit 0 and the flag must be recognized."""
    os.chdir(tmp_path)

    # Seed the collection with the URL.
    initial_cmd = ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com']
    subprocess.run(initial_cmd, env=disable_extractors_dict, capture_output=True)

    # Add it again, this time asking for an overwrite.
    overwrite_cmd = ['archivebox', 'add', '--index-only', '--overwrite', 'https://example.com']
    completed = subprocess.run(overwrite_cmd, env=disable_extractors_dict, capture_output=True)

    assert completed.returncode == 0
    stderr_text = completed.stderr.decode('utf-8')
    assert 'unrecognized arguments: --overwrite' not in stderr_text
def test_add_creates_archive_subdirectory(tmp_path, process, disable_extractors_dict):
    """After an add, archive/<snapshot id> must exist as a directory."""
    os.chdir(tmp_path)
    add_cmd = ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com']
    subprocess.run(add_cmd, env=disable_extractors_dict, capture_output=True)

    # Look up the id of the snapshot row returned first by the query.
    db = sqlite3.connect("index.sqlite3")
    id_row = db.cursor().execute("SELECT id FROM core_snapshot").fetchone()
    snapshot_id = id_row[0]
    db.close()

    # The directory under archive/ is named after that id.
    snapshot_dir = tmp_path.joinpath("archive", snapshot_id)
    assert snapshot_dir.exists()
    assert snapshot_dir.is_dir()
def test_add_index_only_skips_extraction(tmp_path, process, disable_extractors_dict):
    """`--index-only` must finish inside the 30 second timeout and record one snapshot."""
    os.chdir(tmp_path)
    add_cmd = ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com']
    completed = subprocess.run(
        add_cmd,
        env=disable_extractors_dict,
        capture_output=True,
        timeout=30,  # subprocess.run raises TimeoutExpired if the command runs longer
    )
    assert completed.returncode == 0

    # Only the existence of the snapshot row is verified here.
    db = sqlite3.connect("index.sqlite3")
    total = db.cursor().execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    db.close()

    assert total == 1
def test_add_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dict):
    """The snapshot's crawl_id column must equal the id of the crawl row."""
    os.chdir(tmp_path)
    add_cmd = ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com']
    subprocess.run(add_cmd, env=disable_extractors_dict, capture_output=True)

    db = sqlite3.connect("index.sqlite3")
    cur = db.cursor()
    # Primary key of the crawl row.
    crawl_pk = cur.execute("SELECT id FROM crawls_crawl").fetchone()[0]
    # Foreign key stored on the snapshot row.
    linked_crawl = cur.execute("SELECT crawl_id FROM core_snapshot").fetchone()[0]
    db.close()

    assert linked_crawl == crawl_pk
def test_add_sets_snapshot_timestamp(tmp_path, process, disable_extractors_dict):
    """The snapshot row must carry a non-null, non-empty timestamp value."""
    os.chdir(tmp_path)
    add_cmd = ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com']
    subprocess.run(add_cmd, env=disable_extractors_dict, capture_output=True)

    db = sqlite3.connect("index.sqlite3")
    recorded = db.cursor().execute("SELECT timestamp FROM core_snapshot").fetchone()[0]
    db.close()

    assert recorded is not None
    assert str(recorded) != ''

32
tests/test_cli_help.py Normal file
View File

@@ -0,0 +1,32 @@
#!/usr/bin/env python3
"""
Tests for archivebox help command.
Verify command runs successfully and produces output.
"""
import os
import subprocess
from .fixtures import *
def test_help_runs_successfully(tmp_path):
    """`archivebox help` must exit 0 and print more than 100 characters mentioning archivebox."""
    os.chdir(tmp_path)
    completed = subprocess.run(['archivebox', 'help'], text=True, capture_output=True)
    assert completed.returncode == 0

    # Help text may land on either stream, so inspect both together.
    all_output = completed.stdout + completed.stderr
    assert len(all_output) > 100
    assert 'archivebox' in all_output.lower()
def test_help_in_initialized_dir(tmp_path, process):
    """`archivebox help` must exit 0 and mention both the init and add subcommands.

    NOTE(review): presumably the `process` fixture initializes the collection
    in tmp_path — confirm in fixtures.
    """
    os.chdir(tmp_path)
    completed = subprocess.run(['archivebox', 'help'], text=True, capture_output=True)
    assert completed.returncode == 0

    all_output = completed.stdout + completed.stderr
    assert 'init' in all_output
    assert 'add' in all_output

250
tests/test_cli_init.py Normal file
View File

@@ -0,0 +1,250 @@
#!/usr/bin/env python3
"""
Comprehensive tests for archivebox init command.
Verify init creates correct database schema, filesystem structure, and config.
"""
import os
import subprocess
import sqlite3
from pathlib import Path
from archivebox.config.common import STORAGE_CONFIG
from .fixtures import *
# Variant of OUTPUT_PERMISSIONS with every '6' turned into '7' and every '4' into '5'.
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.translate(str.maketrans('64', '75'))
def test_init_creates_database_file(tmp_path):
    """`archivebox init` must exit 0 and leave a regular file named index.sqlite3."""
    os.chdir(tmp_path)
    completed = subprocess.run(['archivebox', 'init'], capture_output=True)
    assert completed.returncode == 0

    index_db = tmp_path.joinpath("index.sqlite3")
    assert index_db.exists()
    assert index_db.is_file()
def test_init_creates_archive_directory(tmp_path):
    """`archivebox init` must create an archive/ directory in the collection root."""
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'init'], capture_output=True)

    created = tmp_path.joinpath("archive")
    assert created.exists()
    assert created.is_dir()
def test_init_creates_sources_directory(tmp_path):
    """`archivebox init` must create a sources/ directory in the collection root."""
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'init'], capture_output=True)

    created = tmp_path.joinpath("sources")
    assert created.exists()
    assert created.is_dir()
def test_init_creates_logs_directory(tmp_path):
    """`archivebox init` must create a logs/ directory in the collection root."""
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'init'], capture_output=True)

    created = tmp_path.joinpath("logs")
    assert created.exists()
    assert created.is_dir()
def test_init_creates_config_file(tmp_path):
    """`archivebox init` must write a regular file named ArchiveBox.conf."""
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'init'], capture_output=True)

    conf_path = tmp_path.joinpath("ArchiveBox.conf")
    assert conf_path.exists()
    assert conf_path.is_file()
def test_init_runs_migrations(tmp_path):
    """Test that init creates the django_migrations table and records applied migrations.

    The connection is closed in a ``finally`` block so a failing assertion or
    query does not leak the SQLite handle.
    """
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'init'], capture_output=True)

    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()

        # Check django_migrations table exists
        migrations = c.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='django_migrations'"
        ).fetchall()
        assert len(migrations) == 1

        # Check that some migrations were applied
        migration_count = c.execute("SELECT COUNT(*) FROM django_migrations").fetchone()[0]
        assert migration_count > 0
    finally:
        conn.close()
def test_init_creates_core_snapshot_table(tmp_path):
    """Test that init creates the core_snapshot table.

    The connection is closed in a ``finally`` block so a failing assertion
    does not leak the SQLite handle.
    """
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'init'], capture_output=True)

    conn = sqlite3.connect("index.sqlite3")
    try:
        # sqlite_master lists every table; exactly one row means the table exists.
        tables = conn.cursor().execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot'"
        ).fetchall()
        assert len(tables) == 1
    finally:
        conn.close()
def test_init_creates_crawls_crawl_table(tmp_path):
    """Test that init creates the crawls_crawl table.

    The connection is closed in a ``finally`` block so a failing assertion
    does not leak the SQLite handle.
    """
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'init'], capture_output=True)

    conn = sqlite3.connect("index.sqlite3")
    try:
        # sqlite_master lists every table; exactly one row means the table exists.
        tables = conn.cursor().execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_crawl'"
        ).fetchall()
        assert len(tables) == 1
    finally:
        conn.close()
def test_init_creates_core_archiveresult_table(tmp_path):
    """Test that init creates the core_archiveresult table.

    The connection is closed in a ``finally`` block so a failing assertion
    does not leak the SQLite handle.
    """
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'init'], capture_output=True)

    conn = sqlite3.connect("index.sqlite3")
    try:
        # sqlite_master lists every table; exactly one row means the table exists.
        tables = conn.cursor().execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'"
        ).fetchall()
        assert len(tables) == 1
    finally:
        conn.close()
def test_init_sets_correct_file_permissions(tmp_path):
    """The index database and archive/ must carry one of the two accepted permission strings."""
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'init'], capture_output=True)

    # Either the configured output permissions or the derived DIR_PERMISSIONS is accepted.
    accepted_modes = (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)

    # Last three octal digits of st_mode are the permission bits.
    db_mode = oct((tmp_path / "index.sqlite3").stat().st_mode)[-3:]
    assert db_mode in accepted_modes

    archive_mode = oct((tmp_path / "archive").stat().st_mode)[-3:]
    assert archive_mode in accepted_modes
def test_init_is_idempotent(tmp_path):
    """Test that running init twice succeeds both times and leaves a usable database.

    The first run must announce a new collection; the second must report an
    update or an up-to-date state. The connection is closed in a ``finally``
    block so a failing assertion or query does not leak the SQLite handle.
    """
    os.chdir(tmp_path)

    # First init
    result1 = subprocess.run(['archivebox', 'init'], capture_output=True, text=True)
    assert result1.returncode == 0
    assert "Initializing a new ArchiveBox" in result1.stdout

    # Second init should update, not fail
    result2 = subprocess.run(['archivebox', 'init'], capture_output=True, text=True)
    assert result2.returncode == 0
    assert "updating existing ArchiveBox" in result2.stdout or "up-to-date" in result2.stdout.lower()

    # Database should still be valid
    conn = sqlite3.connect("index.sqlite3")
    try:
        count = conn.cursor().execute("SELECT COUNT(*) FROM django_migrations").fetchone()[0]
        assert count > 0
    finally:
        conn.close()
def test_init_with_existing_data_preserves_snapshots(tmp_path, process, disable_extractors_dict):
    """Test that re-running init leaves the existing snapshot row count unchanged.

    Each count is read through a helper that always closes its connection, so
    a failing query or assertion does not leak the SQLite handle.
    """
    os.chdir(tmp_path)

    def count_snapshots():
        # Open a fresh connection per read and close it even if the query fails.
        conn = sqlite3.connect("index.sqlite3")
        try:
            return conn.cursor().execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
        finally:
            conn.close()

    # Add a snapshot
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Check snapshot was created
    count_before = count_snapshots()
    assert count_before == 1

    # Run init again
    result = subprocess.run(['archivebox', 'init'], capture_output=True)
    assert result.returncode == 0

    # Snapshot should still exist
    count_after = count_snapshots()
    assert count_after == count_before
def test_init_quick_flag_skips_checks(tmp_path):
    """`archivebox init --quick` must exit 0 and still create index.sqlite3."""
    os.chdir(tmp_path)
    quick_cmd = ['archivebox', 'init', '--quick']
    completed = subprocess.run(quick_cmd, text=True, capture_output=True)
    assert completed.returncode == 0

    # The quick path must still leave the index database behind.
    assert tmp_path.joinpath("index.sqlite3").exists()
def test_init_creates_machine_record(tmp_path):
    """Test that init creates the machine_machine table with at least one row.

    The connection is closed in a ``finally`` block so a failing assertion or
    query does not leak the SQLite handle.
    """
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'init'], capture_output=True)

    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()

        # Check machine_machine table exists
        tables = c.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='machine_machine'"
        ).fetchall()
        assert len(tables) == 1

        # Check that a machine record was created
        machine_count = c.execute("SELECT COUNT(*) FROM machine_machine").fetchone()[0]
        assert machine_count >= 1
    finally:
        conn.close()
def test_init_output_shows_collection_info(tmp_path):
    """stdout of `archivebox init` must mention ArchiveBox, a collection, or Initializing."""
    os.chdir(tmp_path)
    completed = subprocess.run(['archivebox', 'init'], text=True, capture_output=True)
    stdout_text = completed.stdout

    # Any one of these markers is enough to count as helpful output.
    mentions_name = 'ArchiveBox' in stdout_text
    mentions_collection = 'collection' in stdout_text.lower()
    mentions_init = 'Initializing' in stdout_text
    assert mentions_name or mentions_collection or mentions_init

68
tests/test_cli_version.py Normal file
View File

@@ -0,0 +1,68 @@
#!/usr/bin/env python3
"""
Tests for archivebox version command.
Verify version output and system information reporting.
"""
import os
import subprocess
import sqlite3
from .fixtures import *
def test_version_quiet_outputs_version_number(tmp_path):
    """`archivebox version --quiet` must exit 0 and print a dotted version string."""
    os.chdir(tmp_path)
    quiet_cmd = ['archivebox', 'version', '--quiet']
    completed = subprocess.run(quiet_cmd, text=True, capture_output=True)
    assert completed.returncode == 0

    printed = completed.stdout.strip()
    assert printed

    # Expect at least two dot-separated components (e.g., 0.8.0).
    components = printed.split('.')
    assert len(components) >= 2
def test_version_shows_system_info_in_initialized_dir(tmp_path, process):
    """`archivebox version` stdout must name ArchiveBox and show an ARCH=, OS= or PYTHON= field."""
    os.chdir(tmp_path)
    completed = subprocess.run(['archivebox', 'version'], text=True, capture_output=True)
    stdout_text = completed.stdout

    assert 'ArchiveBox' in stdout_text
    # One system-metadata marker is sufficient.
    assert 'ARCH=' in stdout_text or 'OS=' in stdout_text or 'PYTHON=' in stdout_text
def test_version_shows_binaries_after_init(tmp_path, process):
    """`archivebox version` stdout must contain a Binary or Dependencies heading."""
    os.chdir(tmp_path)
    completed = subprocess.run(['archivebox', 'version'], text=True, capture_output=True)
    stdout_text = completed.stdout

    # Either heading word counts as the binary section being present.
    assert any(heading in stdout_text for heading in ('Binary', 'Dependencies'))
def test_version_shows_data_locations(tmp_path, process):
    """`archivebox version` stdout must contain Data, Code, or location."""
    os.chdir(tmp_path)
    completed = subprocess.run(['archivebox', 'version'], text=True, capture_output=True)
    stdout_text = completed.stdout

    # Any one of these words is taken as evidence that paths are listed.
    assert 'Data' in stdout_text or 'Code' in stdout_text or 'location' in stdout_text
def test_version_in_uninitialized_dir_still_works(tmp_path):
    """`archivebox version --quiet` must exit 0 and print something in a freshly created empty dir."""
    blank_dir = tmp_path / "empty"
    blank_dir.mkdir()
    os.chdir(blank_dir)

    quiet_cmd = ['archivebox', 'version', '--quiet']
    completed = subprocess.run(quiet_cmd, text=True, capture_output=True)

    # A version string is expected even though init was never run here.
    assert completed.returncode == 0
    assert len(completed.stdout.strip()) > 0

View File

@@ -12,7 +12,7 @@ def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
# Verify snapshot exists
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count_before = c.execute("SELECT COUNT() from core_snapshot").fetchone()[0]
count_before = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
conn.close()
assert count_before >= 1
@@ -24,7 +24,7 @@ def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count = c.execute("SELECT COUNT() from core_snapshot").fetchone()[0]
count = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
conn.close()
assert count == 0
@@ -59,7 +59,7 @@ def test_remove_regex(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count_before = c.execute("SELECT COUNT() from core_snapshot").fetchone()[0]
count_before = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
conn.close()
assert count_before >= 2
@@ -67,7 +67,7 @@ def test_remove_regex(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count_after = c.execute("SELECT COUNT() from core_snapshot").fetchone()[0]
count_after = c.execute("SELECT COUNT() from archivebox.core.snapshot").fetchone()[0]
conn.close()
assert count_after == 0
@@ -80,7 +80,7 @@ def test_add_creates_crawls(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
crawl_count = c.execute("SELECT COUNT() from crawls_crawl").fetchone()[0]
crawl_count = c.execute("SELECT COUNT() from archivebox.crawls.crawl").fetchone()[0]
conn.close()
assert crawl_count == 2

View File

@@ -13,7 +13,7 @@ def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
conn.row_factory = sqlite3.Row
c = conn.cursor()
c.execute("SELECT title from core_snapshot")
c.execute("SELECT title from archivebox.core.snapshot")
snapshot = c.fetchone()
conn.close()