#!/usr/bin/env python3
"""
Comprehensive tests for archivebox add command.
Verify add creates snapshots in DB, crawls, source files, and archive directories.

Each test changes into ``tmp_path`` before doing anything else: the
``archivebox`` subprocess inherits that working directory, and the index
database is opened through the relative path ``index.sqlite3``.

The ``process`` and ``disable_extractors_dict`` arguments are pytest fixtures
supplied by ``.fixtures`` (not visible in this file). ``process`` is requested
but never read here -- presumably it prepares the data directory in
``tmp_path``; confirm against the fixtures module.
"""

import os
import subprocess
import sqlite3
from pathlib import Path

from .fixtures import *

# Relative on purpose: resolved against the cwd each test sets via os.chdir().
_INDEX_DB = "index.sqlite3"


def _run_add(env, *args, timeout=None):
    """Run ``archivebox add <args>`` and return the CompletedProcess.

    Args:
        env: Environment mapping passed to the subprocess.
        *args: Command-line arguments placed after ``archivebox add``.
        timeout: Optional timeout in seconds forwarded to ``subprocess.run``.

    Returns:
        The ``subprocess.CompletedProcess`` with stdout/stderr captured as bytes.
    """
    return subprocess.run(
        ['archivebox', 'add', *args],
        capture_output=True,
        env=env,
        timeout=timeout,
    )


def _query(sql):
    """Execute *sql* against the index database and return all rows.

    The connection is closed even when the query raises, so a failing test
    does not leave an open handle on the database file.
    """
    conn = sqlite3.connect(_INDEX_DB)
    try:
        return conn.execute(sql).fetchall()
    finally:
        conn.close()


def _query_scalar(sql):
    """Return the first column of the first row produced by *sql*.

    Fails with a descriptive assertion (instead of an opaque ``TypeError``
    from subscripting ``None``) when the query yields no rows.
    """
    conn = sqlite3.connect(_INDEX_DB)
    try:
        row = conn.execute(sql).fetchone()
    finally:
        conn.close()
    assert row is not None, f"query returned no rows: {sql}"
    return row[0]


def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extractors_dict):
    """Test that adding a single URL creates a snapshot in the database."""
    os.chdir(tmp_path)
    result = _run_add(
        disable_extractors_dict,
        '--index-only', '--depth=0', 'https://example.com',
    )

    assert result.returncode == 0

    snapshots = _query("SELECT url FROM core_snapshot")

    assert len(snapshots) == 1
    assert snapshots[0][0] == 'https://example.com'


def test_add_creates_crawl_record(tmp_path, process, disable_extractors_dict):
    """Test that add command creates a Crawl record in the database."""
    os.chdir(tmp_path)
    _run_add(
        disable_extractors_dict,
        '--index-only', '--depth=0', 'https://example.com',
    )

    crawl_count = _query_scalar("SELECT COUNT(*) FROM crawls_crawl")

    assert crawl_count == 1


def test_add_creates_source_file(tmp_path, process, disable_extractors_dict):
    """Test that add creates a source file with the URL."""
    os.chdir(tmp_path)
    _run_add(
        disable_extractors_dict,
        '--index-only', '--depth=0', 'https://example.com',
    )

    sources_dir = tmp_path / "sources"
    assert sources_dir.exists()

    source_files = list(sources_dir.glob("*cli_add.txt"))
    assert len(source_files) >= 1

    source_content = source_files[0].read_text()
    assert "https://example.com" in source_content


def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_dict):
    """Test adding multiple URLs in a single command."""
    os.chdir(tmp_path)
    result = _run_add(
        disable_extractors_dict,
        '--index-only', '--depth=0', 'https://example.com', 'https://example.org',
    )

    assert result.returncode == 0

    snapshot_count = _query_scalar("SELECT COUNT(*) FROM core_snapshot")
    urls = _query("SELECT url FROM core_snapshot ORDER BY url")

    assert snapshot_count == 2
    assert urls[0][0] == 'https://example.com'
    assert urls[1][0] == 'https://example.org'


def test_add_from_file(tmp_path, process, disable_extractors_dict):
    """Test adding URLs from a file."""
    os.chdir(tmp_path)

    # Create a file with URLs
    urls_file = tmp_path / "urls.txt"
    urls_file.write_text("https://example.com\nhttps://example.org\n")

    result = _run_add(
        disable_extractors_dict,
        '--index-only', '--depth=0', str(urls_file),
    )

    assert result.returncode == 0

    snapshot_count = _query_scalar("SELECT COUNT(*) FROM core_snapshot")

    assert snapshot_count == 2


def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):
    """Test that --depth=0 flag is accepted and works."""
    os.chdir(tmp_path)
    result = _run_add(
        disable_extractors_dict,
        '--index-only', '--depth=0', 'https://example.com',
    )

    assert result.returncode == 0
    assert 'unrecognized arguments: --depth' not in result.stderr.decode('utf-8')


def test_add_with_depth_1_flag(tmp_path, process, disable_extractors_dict):
    """Test that --depth=1 flag is accepted."""
    os.chdir(tmp_path)
    result = _run_add(
        disable_extractors_dict,
        '--index-only', '--depth=1', 'https://example.com',
    )

    assert result.returncode == 0
    assert 'unrecognized arguments: --depth' not in result.stderr.decode('utf-8')


def test_add_with_tags(tmp_path, process, disable_extractors_dict):
    """Test adding URL with tags creates tag records."""
    os.chdir(tmp_path)
    _run_add(
        disable_extractors_dict,
        '--index-only', '--depth=0', '--tag=test,example', 'https://example.com',
    )

    tag_names = [row[0] for row in _query("SELECT name FROM core_tag")]

    # NOTE(review): passes when only ONE of the two requested tags exists.
    # Kept as-is to preserve the test's contract; tighten to `and` once it is
    # confirmed that both tags are always persisted.
    assert 'test' in tag_names or 'example' in tag_names


def test_add_duplicate_url_updates_existing(tmp_path, process, disable_extractors_dict):
    """Test that adding the same URL twice updates rather than duplicates."""
    os.chdir(tmp_path)

    # Add URL first time
    _run_add(
        disable_extractors_dict,
        '--index-only', '--depth=0', 'https://example.com',
    )

    # Add same URL second time
    _run_add(
        disable_extractors_dict,
        '--index-only', '--depth=0', 'https://example.com',
    )

    snapshot_count = _query_scalar(
        "SELECT COUNT(*) FROM core_snapshot WHERE url='https://example.com'"
    )

    # Should still only have one snapshot for this URL
    assert snapshot_count == 1


def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
    """Test that --overwrite flag forces re-archiving."""
    os.chdir(tmp_path)

    # Add URL first time
    _run_add(
        disable_extractors_dict,
        '--index-only', '--depth=0', 'https://example.com',
    )

    # Add with overwrite
    result = _run_add(
        disable_extractors_dict,
        '--index-only', '--overwrite', 'https://example.com',
    )

    assert result.returncode == 0
    assert 'unrecognized arguments: --overwrite' not in result.stderr.decode('utf-8')


def test_add_creates_archive_subdirectory(tmp_path, process, disable_extractors_dict):
    """Test that add creates archive subdirectory for the snapshot."""
    os.chdir(tmp_path)
    _run_add(
        disable_extractors_dict,
        '--index-only', '--depth=0', 'https://example.com',
    )

    # Get the snapshot ID from the database
    snapshot_id = _query_scalar("SELECT id FROM core_snapshot")

    # Check that archive subdirectory was created.
    # str() keeps the path join valid whatever type SQLite returns for the id.
    archive_dir = tmp_path / "archive" / str(snapshot_id)
    assert archive_dir.exists()
    assert archive_dir.is_dir()


def test_add_index_only_skips_extraction(tmp_path, process, disable_extractors_dict):
    """Test that --index-only flag skips extraction (fast)."""
    os.chdir(tmp_path)
    result = _run_add(
        disable_extractors_dict,
        '--index-only', '--depth=0', 'https://example.com',
        timeout=30,  # Should be fast
    )

    assert result.returncode == 0

    # Snapshot should exist but archive results should be minimal
    snapshot_count = _query_scalar("SELECT COUNT(*) FROM core_snapshot")

    assert snapshot_count == 1


def test_add_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dict):
    """Test that add links the snapshot to the crawl via crawl_id."""
    os.chdir(tmp_path)
    _run_add(
        disable_extractors_dict,
        '--index-only', '--depth=0', 'https://example.com',
    )

    # Get crawl id
    crawl_id = _query_scalar("SELECT id FROM crawls_crawl")

    # Get snapshot's crawl_id
    snapshot_crawl = _query_scalar("SELECT crawl_id FROM core_snapshot")

    assert snapshot_crawl == crawl_id


def test_add_sets_snapshot_timestamp(tmp_path, process, disable_extractors_dict):
    """Test that add sets a timestamp on the snapshot."""
    os.chdir(tmp_path)
    _run_add(
        disable_extractors_dict,
        '--index-only', '--depth=0', 'https://example.com',
    )

    timestamp = _query_scalar("SELECT timestamp FROM core_snapshot")

    assert timestamp is not None
    assert len(str(timestamp)) > 0