#!/usr/bin/env python3

"""
Comprehensive tests for the archivebox add command.

Verify that add creates snapshots in the database, along with crawl records,
source files, and archive directories.
"""

import os
import subprocess
import sqlite3
from pathlib import Path

from .fixtures import *
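

# Every test below opens index.sqlite3, runs one query, and closes the
# connection. A helper like this sketch could centralize that boilerplate
# (a hypothetical convenience; the tests below inline it instead):
def _fetch_all(sql: str) -> list:
    """Run a single read-only query against the collection index and return all rows."""
    conn = sqlite3.connect("index.sqlite3")
    try:
        return conn.cursor().execute(sql).fetchall()
    finally:
        conn.close()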


def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extractors_dict):
    """Test that adding a single URL creates a snapshot in the database."""
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    assert result.returncode == 0

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    snapshots = c.execute("SELECT url FROM core_snapshot").fetchall()
    conn.close()

    assert len(snapshots) == 1
    assert snapshots[0][0] == 'https://example.com'


def test_add_creates_crawl_record(tmp_path, process, disable_extractors_dict):
    """Test that the add command creates a Crawl record in the database."""
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
    conn.close()

    assert crawl_count == 1


def test_add_creates_source_file(tmp_path, process, disable_extractors_dict):
    """Test that add creates a source file with the URL."""
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    sources_dir = tmp_path / "sources"
    assert sources_dir.exists()

    source_files = list(sources_dir.glob("*cli_add.txt"))
    assert len(source_files) >= 1

    source_content = source_files[0].read_text()
    assert "https://example.com" in source_content


def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_dict):
    """Test adding multiple URLs in a single command."""
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com', 'https://example.org'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    assert result.returncode == 0

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    urls = c.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall()
    conn.close()

    assert snapshot_count == 2
    assert urls[0][0] == 'https://example.com'
    assert urls[1][0] == 'https://example.org'


def test_add_from_file(tmp_path, process, disable_extractors_dict):
    """Test adding URLs from a file.

    With --index-only, this creates a snapshot for the file itself, not the URLs
    inside it. To get snapshots for those URLs, run without --index-only so the
    parsers execute (see the note after this test).
    """
    os.chdir(tmp_path)

    # Create a file with URLs
    urls_file = tmp_path / "urls.txt"
    urls_file.write_text("https://example.com\nhttps://example.org\n")

    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', str(urls_file)],
        capture_output=True,
        env=disable_extractors_dict,
    )

    assert result.returncode == 0

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
    snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    conn.close()

    # With --index-only, creates 1 snapshot for the file itself
    assert crawl_count == 1
    assert snapshot_count == 1
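

# Per the docstring above: a hypothetical follow-up (not asserted here) would
# re-run the import without --index-only so the parsers extract the URLs
# inside the file, e.g.:
#
#   subprocess.run(['archivebox', 'add', '--depth=0', str(urls_file)],
#                  capture_output=True, env=disable_extractors_dict)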


def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):
    """Test that the --depth=0 flag is accepted and the command succeeds."""
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    assert result.returncode == 0
    assert 'unrecognized arguments: --depth' not in result.stderr.decode('utf-8')


def test_add_with_depth_1_flag(tmp_path, process, disable_extractors_dict):
    """Test that the --depth=1 flag is accepted."""
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=1', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    assert result.returncode == 0
    assert 'unrecognized arguments: --depth' not in result.stderr.decode('utf-8')


def test_add_with_tags(tmp_path, process, disable_extractors_dict):
    """Test that adding a URL with tags stores tags_str on the crawl.

    With --index-only, Tag objects are not created until archiving happens.
    Tags are stored as a string in the Crawl.tags_str field.
    """
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', '--tag=test,example', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    tags_str = c.execute("SELECT tags_str FROM crawls_crawl").fetchone()[0]
    conn.close()

    # Tags are stored as a comma-separated string in the crawl
    assert 'test' in tags_str or 'example' in tags_str
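

# tags_str above is a plain comma-separated string (from --tag=test,example),
# so a stricter hypothetical assertion could split it first:
#
#   assert {'test', 'example'} <= set(tags_str.split(','))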


def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict):
    """Test that adding the same URL twice creates separate crawls and snapshots.

    Each 'add' command creates a new Crawl. Multiple crawls can archive the same URL.
    This allows re-archiving URLs at different times.
    """
    os.chdir(tmp_path)

    # Add URL first time
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Add same URL second time
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot WHERE url='https://example.com'").fetchone()[0]
    crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
    conn.close()

    # Each add creates a new crawl with its own snapshot
    assert crawl_count == 2
    assert snapshot_count == 2


def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
    """Test that the --overwrite flag is accepted when re-adding a URL."""
    os.chdir(tmp_path)

    # Add URL first time
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Add with overwrite
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--overwrite', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    assert result.returncode == 0
    assert 'unrecognized arguments: --overwrite' not in result.stderr.decode('utf-8')


def test_add_creates_archive_subdirectory(tmp_path, process, disable_extractors_dict):
    """Test that add creates an archive subdirectory for the snapshot.

    Archive subdirectories are named by timestamp, not by snapshot ID.
    """
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Get the snapshot timestamp from the database
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    timestamp = c.execute("SELECT timestamp FROM core_snapshot").fetchone()[0]
    conn.close()

    # Check that the archive subdirectory was created using the timestamp
    archive_dir = tmp_path / "archive" / str(timestamp)
    assert archive_dir.exists()
    assert archive_dir.is_dir()


def test_add_index_only_skips_extraction(tmp_path, process, disable_extractors_dict):
    """Test that --index-only flag skips extraction (fast)."""
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,  # Should be fast
    )

    assert result.returncode == 0

    # Snapshot should exist but archive results should be minimal
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    conn.close()

    assert snapshot_count == 1
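

# A stronger hypothetical check for the test above could also assert that no
# extractor results were recorded. The table name here is an assumption and
# may differ across ArchiveBox schema versions:
#
#   assert conn.execute("SELECT COUNT(*) FROM core_archiveresult").fetchone()[0] == 0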


def test_add_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dict):
    """Test that add links the snapshot to the crawl via crawl_id."""
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()

    # Get crawl id
    crawl_id = c.execute("SELECT id FROM crawls_crawl").fetchone()[0]

    # Get snapshot's crawl_id
    snapshot_crawl = c.execute("SELECT crawl_id FROM core_snapshot").fetchone()[0]

    conn.close()

    assert snapshot_crawl == crawl_id


def test_add_sets_snapshot_timestamp(tmp_path, process, disable_extractors_dict):
    """Test that add sets a timestamp on the snapshot."""
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    timestamp = c.execute("SELECT timestamp FROM core_snapshot").fetchone()[0]
    conn.close()

    assert timestamp is not None
    assert len(str(timestamp)) > 0