Files
ArchiveBox/archivebox/tests/test_cli_add.py
Nick Sweeting 934e02695b fix lint
2026-03-15 18:45:29 -07:00

332 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Comprehensive tests for archivebox add command.
Verify add creates snapshots in DB, crawls, source files, and archive directories.
"""
import os
import sqlite3
import subprocess
def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extractors_dict):
    """Adding one URL must insert exactly one snapshot row with that URL."""
    os.chdir(tmp_path)
    proc = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert proc.returncode == 0
    db = sqlite3.connect("index.sqlite3")
    try:
        rows = db.execute("SELECT url FROM core_snapshot").fetchall()
    finally:
        db.close()
    assert len(rows) == 1
    assert rows[0][0] == 'https://example.com'
def test_add_creates_crawl_record(tmp_path, process, disable_extractors_dict):
    """Test that add command creates a Crawl record in the database.

    Fix: the subprocess result was previously discarded, so a failing
    'archivebox add' only surfaced later as a confusing crawl-count
    assertion. Assert success first so failures point at the command.
    """
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    # Fail fast with the command's stderr if add itself errored out.
    assert result.returncode == 0, result.stderr.decode('utf-8')
    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()
        crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
    finally:
        conn.close()
    assert crawl_count == 1
def test_add_creates_source_file(tmp_path, process, disable_extractors_dict):
    """Test that add creates a source file with the URL.

    Fix: the subprocess result was previously discarded; if the add
    command failed, the test died on a missing sources/ directory
    instead of reporting the real error. Assert success first.
    """
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert result.returncode == 0, result.stderr.decode('utf-8')
    sources_dir = tmp_path / "sources"
    assert sources_dir.exists()
    # Source files created by CLI adds are suffixed with 'cli_add.txt'.
    source_files = list(sources_dir.glob("*cli_add.txt"))
    assert len(source_files) >= 1
    source_content = source_files[0].read_text()
    assert "https://example.com" in source_content
def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_dict):
    """Passing several URLs to one add invocation snapshots each of them."""
    os.chdir(tmp_path)
    proc = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com', 'https://example.org'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert proc.returncode == 0
    db = sqlite3.connect("index.sqlite3")
    try:
        total = db.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
        rows = db.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall()
    finally:
        db.close()
    assert total == 2
    assert rows[0][0] == 'https://example.com'
    assert rows[1][0] == 'https://example.org'
def test_add_from_file(tmp_path, process, disable_extractors_dict):
    """Adding a local file path with --index-only snapshots the file itself.

    The URLs listed inside the file are NOT expanded into snapshots here:
    that requires a run without --index-only so the parsers execute.
    """
    os.chdir(tmp_path)
    urls_file = tmp_path / "urls.txt"
    urls_file.write_text("https://example.com\nhttps://example.org\n")
    proc = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', str(urls_file)],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert proc.returncode == 0
    db = sqlite3.connect("index.sqlite3")
    try:
        crawls = db.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
        snapshots = db.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    finally:
        db.close()
    # One crawl, and one snapshot representing the file itself.
    assert crawls == 1
    assert snapshots == 1
def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):
    """The --depth=0 flag must be recognized and the command must succeed."""
    os.chdir(tmp_path)
    proc = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    stderr_text = proc.stderr.decode('utf-8')
    assert proc.returncode == 0
    assert 'unrecognized arguments: --depth' not in stderr_text
def test_add_with_depth_1_flag(tmp_path, process, disable_extractors_dict):
    """The --depth=1 flag must be recognized and the command must succeed."""
    os.chdir(tmp_path)
    proc = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=1', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    stderr_text = proc.stderr.decode('utf-8')
    assert proc.returncode == 0
    assert 'unrecognized arguments: --depth' not in stderr_text
def test_add_with_tags(tmp_path, process, disable_extractors_dict):
    """Test adding URL with tags stores tags_str in crawl.

    With --index-only, Tag objects are not created until archiving happens.
    Tags are stored as a string in the Crawl.tags_str field.

    Fixes: assert the subprocess succeeded (the result was previously
    discarded), and guard fetchone() so a missing crawl row fails as a
    readable assertion instead of a TypeError on None.
    """
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', '--tag=test,example', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert result.returncode == 0, result.stderr.decode('utf-8')
    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()
        row = c.execute("SELECT tags_str FROM crawls_crawl").fetchone()
    finally:
        conn.close()
    assert row is not None, "expected a crawls_crawl row to exist"
    tags_str = row[0]
    # Tags are stored as a comma-separated string in crawl
    assert 'test' in tags_str or 'example' in tags_str
def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extractors_dict):
    """The --persona flag is persisted on the crawl and its dirs are created."""
    os.chdir(tmp_path)
    proc = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', '--persona=Default', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert proc.returncode == 0
    db = sqlite3.connect("index.sqlite3")
    try:
        row = db.execute(
            "SELECT persona_id, json_extract(config, '$.DEFAULT_PERSONA') FROM crawls_crawl LIMIT 1"
        ).fetchone()
    finally:
        db.close()
    persona_id, default_persona = row
    assert persona_id
    assert default_persona == 'Default'
    # Persona browser-profile directories should exist on disk as well.
    persona_root = tmp_path / "personas" / "Default"
    assert (persona_root / "chrome_user_data").is_dir()
    assert (persona_root / "chrome_extensions").is_dir()
def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict):
    """Test that adding the same URL twice creates separate crawls and snapshots.

    Each 'add' command creates a new Crawl. Multiple crawls can archive the same URL.
    This allows re-archiving URLs at different times.

    Fix: both subprocess results were previously discarded; a failure in
    either run only surfaced as a wrong-count assertion. Assert success
    after each run so the failing command is identified directly.
    """
    os.chdir(tmp_path)
    # Run the identical add command twice; each run must succeed.
    for _ in range(2):
        result = subprocess.run(
            ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
            capture_output=True,
            env=disable_extractors_dict,
        )
        assert result.returncode == 0, result.stderr.decode('utf-8')
    conn = sqlite3.connect("index.sqlite3")
    try:
        c = conn.cursor()
        snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot WHERE url='https://example.com'").fetchone()[0]
        crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
    finally:
        conn.close()
    # Each add creates a new crawl with its own snapshot
    assert crawl_count == 2
    assert snapshot_count == 2
def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
    """Re-adding with --overwrite is accepted and exits successfully."""
    os.chdir(tmp_path)
    # Seed the index with an initial add of the same URL.
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    # Re-add with --overwrite; this run is the one under test.
    proc = subprocess.run(
        ['archivebox', 'add', '--index-only', '--overwrite', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    stderr_text = proc.stderr.decode('utf-8')
    assert proc.returncode == 0
    assert 'unrecognized arguments: --overwrite' not in stderr_text
def test_add_creates_archive_subdirectory(tmp_path, process, disable_extractors_dict):
    """An archive/<timestamp>/ directory is created for the new snapshot.

    Archive subdirectories are named by the snapshot's timestamp, not its ID.
    """
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    # Look up the snapshot's timestamp, which names its archive directory.
    db = sqlite3.connect("index.sqlite3")
    try:
        timestamp = db.execute("SELECT timestamp FROM core_snapshot").fetchone()[0]
    finally:
        db.close()
    archive_dir = tmp_path / "archive" / str(timestamp)
    assert archive_dir.exists()
    assert archive_dir.is_dir()
def test_add_index_only_skips_extraction(tmp_path, process, disable_extractors_dict):
    """--index-only finishes quickly (extraction skipped) yet still indexes."""
    os.chdir(tmp_path)
    proc = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,  # generous bound: index-only should never take this long
    )
    assert proc.returncode == 0
    # The snapshot row should still be created even without extraction.
    db = sqlite3.connect("index.sqlite3")
    try:
        count = db.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    finally:
        db.close()
    assert count == 1
def test_add_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dict):
    """The snapshot's crawl_id foreign key must point at the created crawl."""
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    db = sqlite3.connect("index.sqlite3")
    try:
        crawl_id = db.execute("SELECT id FROM crawls_crawl").fetchone()[0]
        snapshot_crawl = db.execute("SELECT crawl_id FROM core_snapshot").fetchone()[0]
    finally:
        db.close()
    assert snapshot_crawl == crawl_id
def test_add_sets_snapshot_timestamp(tmp_path, process, disable_extractors_dict):
    """A non-empty timestamp is recorded on the newly created snapshot."""
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    db = sqlite3.connect("index.sqlite3")
    try:
        timestamp = db.execute("SELECT timestamp FROM core_snapshot").fetchone()[0]
    finally:
        db.close()
    assert timestamp is not None
    assert len(str(timestamp)) > 0