mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
move tests into subfolder, add missing install hooks
This commit is contained in:
31
archivebox/tests/fixtures.py
Normal file
31
archivebox/tests/fixtures.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import os
import subprocess

import pytest


# Extractor toggles that must all be switched off during CLI tests so that
# `archivebox add` runs quickly and without network/extractor side effects.
_EXTRACTOR_FLAGS = (
    "USE_WGET",
    "USE_SINGLEFILE",
    "USE_READABILITY",
    "USE_MERCURY",
    "SAVE_HTMLTOTEXT",
    "SAVE_PDF",
    "SAVE_SCREENSHOT",
    "SAVE_DOM",
    "SAVE_HEADERS",
    "USE_GIT",
    "SAVE_YTDLP",
    "SAVE_ARCHIVEDOTORG",
    "SAVE_TITLE",
    "SAVE_FAVICON",
)


@pytest.fixture
def process(tmp_path):
    """Chdir into a fresh tmp dir, run `archivebox init`, and return the CompletedProcess."""
    os.chdir(tmp_path)
    return subprocess.run(['archivebox', 'init'], capture_output=True)


@pytest.fixture
def disable_extractors_dict():
    """Return a copy of the current environment with every extractor disabled."""
    env = os.environ.copy()
    env.update({flag: "false" for flag in _EXTRACTOR_FLAGS})
    return env
|
||||
169
archivebox/tests/test_add.py
Normal file
169
archivebox/tests/test_add.py
Normal file
@@ -0,0 +1,169 @@
|
||||
import subprocess
import json
import sqlite3
import os

from .fixtures import *


def _run(argv, env=None, **kwargs):
    """Run a CLI command with stdout/stderr captured."""
    return subprocess.run(argv, capture_output=True, env=env, **kwargs)


def _query(sql):
    """Run a read-only query against index.sqlite3 in the current directory."""
    conn = sqlite3.connect("index.sqlite3")
    try:
        return conn.execute(sql).fetchall()
    finally:
        conn.close()


def test_depth_flag_is_accepted(process, disable_extractors_dict):
    proc = _run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                env=disable_extractors_dict)
    assert 'unrecognized arguments: --depth' not in proc.stderr.decode("utf-8")


def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict):
    # Both an out-of-range depth and a negative depth must be rejected.
    for bad_depth in ("--depth=5", "--depth=-1"):
        proc = _run(["archivebox", "add", "--index-only", bad_depth, "https://example.com"],
                    env=disable_extractors_dict)
        # Error message may say "invalid choice" or "is not one of"
        stderr = proc.stderr.decode("utf-8")
        assert 'invalid' in stderr.lower() or 'not one of' in stderr.lower()


def test_depth_flag_0_creates_source_file(tmp_path, process, disable_extractors_dict):
    os.chdir(tmp_path)
    _run(["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
         env=disable_extractors_dict)

    # Check that source file was created with the URL
    sources_dir = tmp_path / "sources"
    assert sources_dir.exists()
    source_files = list(sources_dir.glob("*cli_add.txt"))
    assert len(source_files) >= 1
    assert "example.com" in source_files[0].read_text()


def test_overwrite_flag_is_accepted(process, disable_extractors_dict):
    _run(["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
         env=disable_extractors_dict)
    proc = _run(["archivebox", "add", "--index-only", "--overwrite", "https://example.com"],
                env=disable_extractors_dict)
    assert 'unrecognized arguments: --overwrite' not in proc.stderr.decode("utf-8")


def test_add_creates_crawl_in_database(tmp_path, process, disable_extractors_dict):
    os.chdir(tmp_path)
    _run(["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
         env=disable_extractors_dict)

    # Check that a Crawl was created in database
    count = _query("SELECT COUNT(*) FROM crawls_crawl")[0][0]
    assert count >= 1


def test_add_with_tags(tmp_path, process, disable_extractors_dict):
    """Test adding URL with tags."""
    os.chdir(tmp_path)
    _run(["archivebox", "add", "--index-only", "--depth=0", "--tag=test,example", "https://example.com"],
         env=disable_extractors_dict)

    # Check that tags were created in database
    tag_names = [row[0] for row in _query("SELECT name FROM core_tag")]
    assert 'test' in tag_names or 'example' in tag_names


def test_add_multiple_urls_single_call(tmp_path, process, disable_extractors_dict):
    """Test adding multiple URLs in a single call creates multiple snapshots."""
    os.chdir(tmp_path)
    _run(["archivebox", "add", "--index-only", "--depth=0",
          "https://example.com", "https://example.org"],
         env=disable_extractors_dict)

    # Check both URLs are in the source file
    source_files = list((tmp_path / "sources").glob("*cli_add.txt"))
    assert len(source_files) >= 1
    content = source_files[0].read_text()
    assert "example.com" in content
    assert "example.org" in content


def test_add_from_file(tmp_path, process, disable_extractors_dict):
    """Test adding URLs from a file."""
    os.chdir(tmp_path)

    # Create a file with URLs
    urls_file = tmp_path / "urls.txt"
    urls_file.write_text("https://example.com\nhttps://example.org\n")

    _run(["archivebox", "add", "--index-only", "--depth=0", str(urls_file)],
         env=disable_extractors_dict)

    # Check that a Crawl was created
    count = _query("SELECT COUNT(*) FROM crawls_crawl")[0][0]
    assert count >= 1


class TestAddCLI:
    """Test the CLI interface for add command."""

    def test_add_help(self, tmp_path, process):
        """Test that --help works for add command."""
        os.chdir(tmp_path)

        result = _run(["archivebox", "add", "--help"], text=True)

        assert result.returncode == 0
        assert '--depth' in result.stdout or 'depth' in result.stdout
        assert '--tag' in result.stdout or 'tag' in result.stdout

    def test_add_no_args_shows_help(self, tmp_path, process):
        """Test that add with no args shows help or usage."""
        os.chdir(tmp_path)

        result = _run(["archivebox", "add"], text=True)

        # Should either show help or error about missing URL
        combined = result.stdout + result.stderr
        assert 'usage' in combined.lower() or 'url' in combined.lower() or 'add' in combined.lower()
|
||||
310
archivebox/tests/test_cli_add.py
Normal file
310
archivebox/tests/test_cli_add.py
Normal file
@@ -0,0 +1,310 @@
|
||||
#!/usr/bin/env python3
"""
Comprehensive tests for archivebox add command.
Verify add creates snapshots in DB, crawls, source files, and archive directories.
"""

import os
import subprocess
import sqlite3
from pathlib import Path

from .fixtures import *


def _add(env, *cli_args, **run_kwargs):
    """Invoke `archivebox add --index-only` with extra CLI args and capture output."""
    return subprocess.run(
        ['archivebox', 'add', '--index-only', *cli_args],
        capture_output=True,
        env=env,
        **run_kwargs,
    )


def _query(sql):
    """Run a read-only query against index.sqlite3 in the current directory."""
    conn = sqlite3.connect("index.sqlite3")
    try:
        return conn.execute(sql).fetchall()
    finally:
        conn.close()


def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extractors_dict):
    """Test that adding a single URL creates a snapshot in the database."""
    os.chdir(tmp_path)
    result = _add(disable_extractors_dict, '--depth=0', 'https://example.com')

    assert result.returncode == 0

    snapshots = _query("SELECT url FROM core_snapshot")
    assert len(snapshots) == 1
    assert snapshots[0][0] == 'https://example.com'


def test_add_creates_crawl_record(tmp_path, process, disable_extractors_dict):
    """Test that add command creates a Crawl record in the database."""
    os.chdir(tmp_path)
    _add(disable_extractors_dict, '--depth=0', 'https://example.com')

    crawl_count = _query("SELECT COUNT(*) FROM crawls_crawl")[0][0]
    assert crawl_count == 1


def test_add_creates_source_file(tmp_path, process, disable_extractors_dict):
    """Test that add creates a source file with the URL."""
    os.chdir(tmp_path)
    _add(disable_extractors_dict, '--depth=0', 'https://example.com')

    sources_dir = tmp_path / "sources"
    assert sources_dir.exists()

    source_files = list(sources_dir.glob("*cli_add.txt"))
    assert len(source_files) >= 1

    assert "https://example.com" in source_files[0].read_text()


def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_dict):
    """Test adding multiple URLs in a single command."""
    os.chdir(tmp_path)
    result = _add(disable_extractors_dict, '--depth=0', 'https://example.com', 'https://example.org')

    assert result.returncode == 0

    snapshot_count = _query("SELECT COUNT(*) FROM core_snapshot")[0][0]
    urls = _query("SELECT url FROM core_snapshot ORDER BY url")

    assert snapshot_count == 2
    assert urls[0][0] == 'https://example.com'
    assert urls[1][0] == 'https://example.org'


def test_add_from_file(tmp_path, process, disable_extractors_dict):
    """Test adding URLs from a file.

    With --index-only, this creates a snapshot for the file itself, not the URLs inside.
    To get snapshots for the URLs inside, you need to run without --index-only so parsers run.
    """
    os.chdir(tmp_path)

    # Create a file with URLs
    urls_file = tmp_path / "urls.txt"
    urls_file.write_text("https://example.com\nhttps://example.org\n")

    result = _add(disable_extractors_dict, '--depth=0', str(urls_file))
    assert result.returncode == 0

    crawl_count = _query("SELECT COUNT(*) FROM crawls_crawl")[0][0]
    snapshot_count = _query("SELECT COUNT(*) FROM core_snapshot")[0][0]

    # With --index-only, creates 1 snapshot for the file itself
    assert crawl_count == 1
    assert snapshot_count == 1


def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):
    """Test that --depth=0 flag is accepted and works."""
    os.chdir(tmp_path)
    result = _add(disable_extractors_dict, '--depth=0', 'https://example.com')

    assert result.returncode == 0
    assert 'unrecognized arguments: --depth' not in result.stderr.decode('utf-8')


def test_add_with_depth_1_flag(tmp_path, process, disable_extractors_dict):
    """Test that --depth=1 flag is accepted."""
    os.chdir(tmp_path)
    result = _add(disable_extractors_dict, '--depth=1', 'https://example.com')

    assert result.returncode == 0
    assert 'unrecognized arguments: --depth' not in result.stderr.decode('utf-8')


def test_add_with_tags(tmp_path, process, disable_extractors_dict):
    """Test adding URL with tags stores tags_str in crawl.

    With --index-only, Tag objects are not created until archiving happens.
    Tags are stored as a string in the Crawl.tags_str field.
    """
    os.chdir(tmp_path)
    _add(disable_extractors_dict, '--depth=0', '--tag=test,example', 'https://example.com')

    tags_str = _query("SELECT tags_str FROM crawls_crawl")[0][0]

    # Tags are stored as a comma-separated string in crawl
    assert 'test' in tags_str or 'example' in tags_str


def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict):
    """Test that adding the same URL twice creates separate crawls and snapshots.

    Each 'add' command creates a new Crawl. Multiple crawls can archive the same URL.
    This allows re-archiving URLs at different times.
    """
    os.chdir(tmp_path)

    # Add the same URL twice; each invocation should get its own crawl
    _add(disable_extractors_dict, '--depth=0', 'https://example.com')
    _add(disable_extractors_dict, '--depth=0', 'https://example.com')

    snapshot_count = _query("SELECT COUNT(*) FROM core_snapshot WHERE url='https://example.com'")[0][0]
    crawl_count = _query("SELECT COUNT(*) FROM crawls_crawl")[0][0]

    # Each add creates a new crawl with its own snapshot
    assert crawl_count == 2
    assert snapshot_count == 2


def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
    """Test that --overwrite flag forces re-archiving."""
    os.chdir(tmp_path)

    # Add URL first time
    _add(disable_extractors_dict, '--depth=0', 'https://example.com')

    # Add with overwrite
    result = _add(disable_extractors_dict, '--overwrite', 'https://example.com')

    assert result.returncode == 0
    assert 'unrecognized arguments: --overwrite' not in result.stderr.decode('utf-8')


def test_add_creates_archive_subdirectory(tmp_path, process, disable_extractors_dict):
    """Test that add creates archive subdirectory for the snapshot.

    Archive subdirectories are named by timestamp, not by snapshot ID.
    """
    os.chdir(tmp_path)
    _add(disable_extractors_dict, '--depth=0', 'https://example.com')

    # Get the snapshot timestamp from the database
    timestamp = _query("SELECT timestamp FROM core_snapshot")[0][0]

    # Check that archive subdirectory was created using timestamp
    archive_dir = tmp_path / "archive" / str(timestamp)
    assert archive_dir.exists()
    assert archive_dir.is_dir()


def test_add_index_only_skips_extraction(tmp_path, process, disable_extractors_dict):
    """Test that --index-only flag skips extraction (fast)."""
    os.chdir(tmp_path)
    result = _add(disable_extractors_dict, '--depth=0', 'https://example.com',
                  timeout=30)  # Should be fast

    assert result.returncode == 0

    # Snapshot should exist but archive results should be minimal
    snapshot_count = _query("SELECT COUNT(*) FROM core_snapshot")[0][0]
    assert snapshot_count == 1


def test_add_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dict):
    """Test that add links the snapshot to the crawl via crawl_id."""
    os.chdir(tmp_path)
    _add(disable_extractors_dict, '--depth=0', 'https://example.com')

    # Get crawl id and the snapshot's crawl_id; they must match
    crawl_id = _query("SELECT id FROM crawls_crawl")[0][0]
    snapshot_crawl = _query("SELECT crawl_id FROM core_snapshot")[0][0]

    assert snapshot_crawl == crawl_id


def test_add_sets_snapshot_timestamp(tmp_path, process, disable_extractors_dict):
    """Test that add sets a timestamp on the snapshot."""
    os.chdir(tmp_path)
    _add(disable_extractors_dict, '--depth=0', 'https://example.com')

    timestamp = _query("SELECT timestamp FROM core_snapshot")[0][0]

    assert timestamp is not None
    assert len(str(timestamp)) > 0
|
||||
203
archivebox/tests/test_cli_config.py
Normal file
203
archivebox/tests/test_cli_config.py
Normal file
@@ -0,0 +1,203 @@
|
||||
#!/usr/bin/env python3
"""
Comprehensive tests for archivebox config command.
Verify config reads/writes ArchiveBox.conf file correctly.
"""

import os
import subprocess
from pathlib import Path

from .fixtures import *


def _config(*cli_args):
    """Invoke `archivebox config` with the given args, capturing text output."""
    return subprocess.run(
        ['archivebox', 'config', *cli_args],
        capture_output=True,
        text=True,
    )


def test_config_displays_all_config(tmp_path, process):
    """Test that config without args displays all configuration."""
    os.chdir(tmp_path)
    result = _config()

    assert result.returncode == 0
    output = result.stdout
    # Should show config sections
    assert len(output) > 100
    # Should show at least some standard config keys
    assert 'TIMEOUT' in output or 'OUTPUT_PERMISSIONS' in output


def test_config_get_specific_key(tmp_path, process):
    """Test that config --get KEY retrieves specific value."""
    os.chdir(tmp_path)
    result = _config('--get', 'TIMEOUT')

    assert result.returncode == 0
    assert 'TIMEOUT' in result.stdout


def test_config_set_writes_to_file(tmp_path, process):
    """Test that config --set KEY=VALUE writes to ArchiveBox.conf."""
    os.chdir(tmp_path)

    result = _config('--set', 'TIMEOUT=120')
    assert result.returncode == 0

    # Verify config file was updated
    config_file = tmp_path / 'ArchiveBox.conf'
    assert config_file.exists()

    content = config_file.read_text()
    assert 'TIMEOUT' in content or '120' in content


def test_config_set_and_get_roundtrip(tmp_path, process):
    """Test that set value can be retrieved with get."""
    os.chdir(tmp_path)

    # Set a unique value, then read it back
    _config('--set', 'TIMEOUT=987')
    result = _config('--get', 'TIMEOUT')

    assert '987' in result.stdout


def test_config_set_multiple_values(tmp_path, process):
    """Test setting multiple config values at once."""
    os.chdir(tmp_path)

    result = _config('--set', 'TIMEOUT=111', 'YTDLP_TIMEOUT=222')
    assert result.returncode == 0

    # Verify both were written
    content = (tmp_path / 'ArchiveBox.conf').read_text()
    assert '111' in content
    assert '222' in content


def test_config_set_invalid_key_fails(tmp_path, process):
    """Test that setting invalid config key fails."""
    os.chdir(tmp_path)

    result = _config('--set', 'TOTALLY_INVALID_KEY_XYZ=value')

    assert result.returncode != 0


def test_config_set_requires_equals_sign(tmp_path, process):
    """Test that set requires KEY=VALUE format."""
    os.chdir(tmp_path)

    result = _config('--set', 'TIMEOUT')

    assert result.returncode != 0


def test_config_search_finds_keys(tmp_path, process):
    """Test that config --search finds matching keys."""
    os.chdir(tmp_path)

    result = _config('--search', 'TIMEOUT')

    # Should find timeout-related config
    assert 'TIMEOUT' in result.stdout


def test_config_preserves_existing_values(tmp_path, process):
    """Test that setting new values preserves existing ones."""
    os.chdir(tmp_path)

    # Set two values in separate invocations
    _config('--set', 'TIMEOUT=100')
    _config('--set', 'YTDLP_TIMEOUT=200')

    # Verify both are in config file
    content = (tmp_path / 'ArchiveBox.conf').read_text()
    assert 'TIMEOUT' in content
    assert 'YTDLP_TIMEOUT' in content


def test_config_file_is_valid_toml(tmp_path, process):
    """Test that config file remains valid TOML after set."""
    os.chdir(tmp_path)

    _config('--set', 'TIMEOUT=150')

    content = (tmp_path / 'ArchiveBox.conf').read_text()

    # Basic TOML validation - should have sections and key=value pairs
    assert '[' in content or '=' in content


def test_config_updates_existing_value(tmp_path, process):
    """Test that setting same key twice updates the value."""
    os.chdir(tmp_path)

    # Set an initial value, then overwrite it
    _config('--set', 'TIMEOUT=100')
    _config('--set', 'TIMEOUT=200')

    result = _config('--get', 'TIMEOUT')

    # Should show updated value
    assert '200' in result.stdout
|
||||
66
archivebox/tests/test_cli_extract.py
Normal file
66
archivebox/tests/test_cli_extract.py
Normal file
@@ -0,0 +1,66 @@
|
||||
#!/usr/bin/env python3
"""
Tests for archivebox extract command.
Verify extract re-runs extractors on existing snapshots.
"""

import os
import subprocess
import sqlite3

from .fixtures import *


def _snapshot_count():
    """Return the number of snapshots in index.sqlite3 in the current directory."""
    conn = sqlite3.connect("index.sqlite3")
    try:
        return conn.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    finally:
        conn.close()


def test_extract_runs_on_existing_snapshots(tmp_path, process, disable_extractors_dict):
    """Test that extract command runs on existing snapshots."""
    os.chdir(tmp_path)

    # Add a snapshot first
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Run extract
    outcome = subprocess.run(
        ['archivebox', 'extract'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,
    )

    # Should complete
    assert outcome.returncode in [0, 1]


def test_extract_preserves_snapshot_count(tmp_path, process, disable_extractors_dict):
    """Test that extract doesn't change snapshot count."""
    os.chdir(tmp_path)

    # Add snapshot
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    before = _snapshot_count()

    # Run extract
    subprocess.run(
        ['archivebox', 'extract', '--overwrite'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,
    )

    assert _snapshot_count() == before
|
||||
32
archivebox/tests/test_cli_help.py
Normal file
32
archivebox/tests/test_cli_help.py
Normal file
@@ -0,0 +1,32 @@
|
||||
#!/usr/bin/env python3
"""
Tests for archivebox help command.
Verify command runs successfully and produces output.
"""

import os
import subprocess

from .fixtures import *


def test_help_runs_successfully(tmp_path):
    """Test that help command runs and produces output."""
    os.chdir(tmp_path)
    outcome = subprocess.run(['archivebox', 'help'], capture_output=True, text=True)

    assert outcome.returncode == 0
    everything = outcome.stdout + outcome.stderr
    # Help text should be substantial and mention the tool name
    assert len(everything) > 100
    assert 'archivebox' in everything.lower()


def test_help_in_initialized_dir(tmp_path, process):
    """Test help command in initialized data directory."""
    os.chdir(tmp_path)
    outcome = subprocess.run(['archivebox', 'help'], capture_output=True, text=True)

    assert outcome.returncode == 0
    everything = outcome.stdout + outcome.stderr
    # Core subcommands should be listed
    assert 'init' in everything
    assert 'add' in everything
|
||||
246
archivebox/tests/test_cli_init.py
Normal file
246
archivebox/tests/test_cli_init.py
Normal file
@@ -0,0 +1,246 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Comprehensive tests for archivebox init command.
|
||||
Verify init creates correct database schema, filesystem structure, and config.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
|
||||
|
||||
|
||||
def test_init_creates_database_file(tmp_path):
|
||||
"""Test that init creates index.sqlite3 database file."""
|
||||
os.chdir(tmp_path)
|
||||
result = subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
|
||||
assert result.returncode == 0
|
||||
db_path = tmp_path / "index.sqlite3"
|
||||
assert db_path.exists()
|
||||
assert db_path.is_file()
|
||||
|
||||
|
||||
def test_init_creates_archive_directory(tmp_path):
|
||||
"""Test that init creates archive directory."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
|
||||
archive_dir = tmp_path / "archive"
|
||||
assert archive_dir.exists()
|
||||
assert archive_dir.is_dir()
|
||||
|
||||
|
||||
def test_init_creates_sources_directory(tmp_path):
|
||||
"""Test that init creates sources directory."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
|
||||
sources_dir = tmp_path / "sources"
|
||||
assert sources_dir.exists()
|
||||
assert sources_dir.is_dir()
|
||||
|
||||
|
||||
def test_init_creates_logs_directory(tmp_path):
|
||||
"""Test that init creates logs directory."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
|
||||
logs_dir = tmp_path / "logs"
|
||||
assert logs_dir.exists()
|
||||
assert logs_dir.is_dir()
|
||||
|
||||
|
||||
def test_init_creates_config_file(tmp_path):
|
||||
"""Test that init creates ArchiveBox.conf config file."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
|
||||
config_file = tmp_path / "ArchiveBox.conf"
|
||||
assert config_file.exists()
|
||||
assert config_file.is_file()
|
||||
|
||||
|
||||
def test_init_runs_migrations(tmp_path):
|
||||
"""Test that init runs Django migrations and creates core tables."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
|
||||
# Check that migrations were applied
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
# Check django_migrations table exists
|
||||
migrations = c.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='django_migrations'"
|
||||
).fetchall()
|
||||
assert len(migrations) == 1
|
||||
|
||||
# Check that some migrations were applied
|
||||
migration_count = c.execute("SELECT COUNT(*) FROM django_migrations").fetchone()[0]
|
||||
assert migration_count > 0
|
||||
|
||||
conn.close()
|
||||
|
||||
|
||||
def test_init_creates_core_snapshot_table(tmp_path):
|
||||
"""Test that init creates core_snapshot table."""
|
||||
os.chdir(tmp_path)
|
||||
subprocess.run(['archivebox', 'init'], capture_output=True)
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
|
||||
# Check core_snapshot table exists
|
||||
tables = c.execute(
|
||||
"SELECT name FROM sqlite_master WHERE type='table' AND name='core_snapshot'"
|
||||
).fetchall()
|
||||
assert len(tables) == 1
|
||||
|
||||
conn.close()
|
||||
|
||||
|
||||
def test_init_creates_crawls_crawl_table(tmp_path):
    """`archivebox init` must create the crawls_crawl table."""
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'init'], capture_output=True)

    db = sqlite3.connect("index.sqlite3")
    try:
        matches = db.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='crawls_crawl'"
        ).fetchall()
    finally:
        db.close()

    assert len(matches) == 1
|
||||
|
||||
|
||||
def test_init_creates_core_archiveresult_table(tmp_path):
    """`archivebox init` must create the core_archiveresult table."""
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'init'], capture_output=True)

    db = sqlite3.connect("index.sqlite3")
    try:
        matches = db.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'"
        ).fetchall()
    finally:
        db.close()

    assert len(matches) == 1
|
||||
|
||||
|
||||
def test_init_sets_correct_file_permissions(tmp_path):
    """Verify init applies the configured permission bits to its outputs."""
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'init'], capture_output=True)

    # NOTE(review): STORAGE_CONFIG and DIR_PERMISSIONS are not imported in
    # this chunk — presumably brought in by an earlier import in this file;
    # verify, otherwise this test raises NameError.
    expected = (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)

    # Compare the last three octal digits (the rwx bits) of each mode.
    assert oct((tmp_path / "index.sqlite3").stat().st_mode)[-3:] in expected
    assert oct((tmp_path / "archive").stat().st_mode)[-3:] in expected
|
||||
|
||||
|
||||
def test_init_is_idempotent(tmp_path):
    """`archivebox init` can be run repeatedly without corrupting anything."""
    os.chdir(tmp_path)

    # First run bootstraps a brand-new collection.
    first = subprocess.run(['archivebox', 'init'], capture_output=True, text=True)
    assert first.returncode == 0
    assert "Initializing a new ArchiveBox" in first.stdout

    # A second run must update in place instead of failing.
    second = subprocess.run(['archivebox', 'init'], capture_output=True, text=True)
    assert second.returncode == 0
    assert "updating existing ArchiveBox" in second.stdout or "up-to-date" in second.stdout.lower()

    # The database must still be intact afterwards.
    db = sqlite3.connect("index.sqlite3")
    try:
        applied = db.execute("SELECT COUNT(*) FROM django_migrations").fetchone()[0]
    finally:
        db.close()
    assert applied > 0
|
||||
|
||||
|
||||
def test_init_with_existing_data_preserves_snapshots(tmp_path, process, disable_extractors_dict):
    """Re-running init must not drop snapshots already in the DB."""
    os.chdir(tmp_path)

    def snapshot_count():
        # One-shot read of how many snapshot rows the index DB holds.
        db = sqlite3.connect("index.sqlite3")
        try:
            return db.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
        finally:
            db.close()

    # Seed the collection with one (index-only) snapshot.
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    count_before = snapshot_count()
    assert count_before == 1

    # init again over the existing data...
    rerun = subprocess.run(['archivebox', 'init'], capture_output=True)
    assert rerun.returncode == 0

    # ...and the snapshot must survive.
    assert snapshot_count() == count_before
|
||||
|
||||
|
||||
def test_init_quick_flag_skips_checks(tmp_path):
    """`archivebox init --quick` still produces a working collection."""
    os.chdir(tmp_path)

    completed = subprocess.run(['archivebox', 'init', '--quick'], capture_output=True, text=True)
    assert completed.returncode == 0

    # Even the quick path must create the index database.
    assert (tmp_path / "index.sqlite3").exists()
|
||||
|
||||
|
||||
def test_init_creates_machine_table(tmp_path):
    """`archivebox init` must create the machine_machine table."""
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'init'], capture_output=True)

    db = sqlite3.connect("index.sqlite3")
    try:
        matches = db.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='machine_machine'"
        ).fetchall()
    finally:
        db.close()

    assert len(matches) == 1
|
||||
|
||||
|
||||
def test_init_output_shows_collection_info(tmp_path):
    """init output should mention the collection being set up."""
    os.chdir(tmp_path)
    completed = subprocess.run(['archivebox', 'init'], capture_output=True, text=True)

    stdout = completed.stdout
    assert 'ArchiveBox' in stdout or 'collection' in stdout.lower() or 'Initializing' in stdout
|
||||
117
archivebox/tests/test_cli_install.py
Normal file
117
archivebox/tests/test_cli_install.py
Normal file
@@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Comprehensive tests for archivebox install command.
|
||||
Verify install detects and records binary dependencies in DB.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_install_runs_successfully(tmp_path, process):
    """`archivebox install --dry-run` completes without crashing."""
    os.chdir(tmp_path)
    completed = subprocess.run(
        ['archivebox', 'install', '--dry-run'],
        capture_output=True,
        text=True,
        timeout=60,
    )

    # Exit code 1 is tolerated: some binaries may be missing on the host.
    assert completed.returncode in (0, 1)
|
||||
|
||||
|
||||
def test_install_creates_binary_records_in_db(tmp_path, process):
    """After install --dry-run, the machine_binary table must exist."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'install', '--dry-run'],
        capture_output=True,
        timeout=60,
    )

    db = sqlite3.connect("index.sqlite3")
    try:
        matches = db.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='machine_binary'"
        ).fetchall()
    finally:
        db.close()

    assert len(matches) == 1
|
||||
|
||||
|
||||
def test_install_dry_run_does_not_install(tmp_path, process):
    """--dry-run must complete without performing a real installation."""
    os.chdir(tmp_path)

    completed = subprocess.run(
        ['archivebox', 'install', '--dry-run'],
        capture_output=True,
        text=True,
        timeout=60,
    )

    # Either the output acknowledges the dry run, or it exits cleanly (0/1).
    assert 'dry' in completed.stdout.lower() or completed.returncode in (0, 1)
|
||||
|
||||
|
||||
def test_install_detects_system_binaries(tmp_path, process):
    """install --dry-run probes for binaries already on the system."""
    os.chdir(tmp_path)

    completed = subprocess.run(
        ['archivebox', 'install', '--dry-run'],
        capture_output=True,
        text=True,
        timeout=60,
    )

    # Smoke check only: the probe itself must not crash (1 = some missing).
    assert completed.returncode in (0, 1)
|
||||
|
||||
|
||||
def test_install_shows_binary_status(tmp_path, process):
    """install --dry-run reports per-binary status on stdout/stderr."""
    os.chdir(tmp_path)

    completed = subprocess.run(
        ['archivebox', 'install', '--dry-run'],
        capture_output=True,
        text=True,
        timeout=60,
    )

    combined = completed.stdout + completed.stderr
    # A real status report is substantially longer than 50 characters.
    assert len(combined) > 50
|
||||
|
||||
|
||||
def test_install_updates_binary_table(tmp_path, process, disable_extractors_dict):
    """A full `archivebox install` either succeeds or times out while running.

    Binary records are created lazily on first use, and install starts a
    long-running orchestrator, so a timeout here is an acceptable outcome.
    """
    os.chdir(tmp_path)

    try:
        completed = subprocess.run(
            ['archivebox', 'install'],
            capture_output=True,
            env=disable_extractors_dict,
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        # Expected: the orchestrator runs continuously and may never exit.
        return

    # If it did finish within the timeout, it must have succeeded.
    assert completed.returncode == 0
|
||||
73
archivebox/tests/test_cli_manage.py
Normal file
73
archivebox/tests/test_cli_manage.py
Normal file
@@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for archivebox manage command.
|
||||
Verify manage command runs Django management commands.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_manage_help_works(tmp_path, process):
    """`archivebox manage help` prints the Django command listing."""
    os.chdir(tmp_path)

    completed = subprocess.run(
        ['archivebox', 'manage', 'help'],
        capture_output=True,
        text=True,
        timeout=30,
    )

    assert completed.returncode == 0
    # The command listing is long; very short output means something broke.
    assert len(completed.stdout) > 100
|
||||
|
||||
|
||||
def test_manage_showmigrations_works(tmp_path, process):
    """`archivebox manage showmigrations` lists migration status."""
    os.chdir(tmp_path)

    completed = subprocess.run(
        ['archivebox', 'manage', 'showmigrations'],
        capture_output=True,
        text=True,
        timeout=30,
    )

    assert completed.returncode == 0
    # Output lists apps ('core') and per-migration '[X]'/'[ ]' markers.
    assert 'core' in completed.stdout or '[' in completed.stdout
|
||||
|
||||
|
||||
def test_manage_dbshell_command_exists(tmp_path, process):
    """`archivebox manage help dbshell` shows help for the dbshell command."""
    os.chdir(tmp_path)

    completed = subprocess.run(
        ['archivebox', 'manage', 'help', 'dbshell'],
        capture_output=True,
        text=True,
        timeout=30,
    )

    assert completed.returncode == 0
    assert 'dbshell' in completed.stdout or 'database' in completed.stdout.lower()
|
||||
|
||||
|
||||
def test_manage_check_works(tmp_path, process):
    """`archivebox manage check` runs Django's system checks to completion."""
    os.chdir(tmp_path)

    completed = subprocess.run(
        ['archivebox', 'manage', 'check'],
        capture_output=True,
        text=True,
        timeout=30,
    )

    # 0 = all checks pass; 1 = checks ran but reported issues.
    assert completed.returncode in (0, 1)
|
||||
195
archivebox/tests/test_cli_remove.py
Normal file
195
archivebox/tests/test_cli_remove.py
Normal file
@@ -0,0 +1,195 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Comprehensive tests for archivebox remove command.
|
||||
Verify remove deletes snapshots from DB and filesystem.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_remove_deletes_snapshot_from_db(tmp_path, process, disable_extractors_dict):
    """`archivebox remove <url> --yes` drops the snapshot row from the DB."""
    os.chdir(tmp_path)

    def snapshot_count():
        # One-shot read of how many snapshot rows the index DB holds.
        db = sqlite3.connect("index.sqlite3")
        try:
            return db.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
        finally:
            db.close()

    # Seed one snapshot.
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert snapshot_count() == 1

    # Remove it without interactive confirmation.
    subprocess.run(
        ['archivebox', 'remove', 'https://example.com', '--yes'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    assert snapshot_count() == 0
|
||||
|
||||
|
||||
def test_remove_deletes_archive_directory(tmp_path, process, disable_extractors_dict):
    """`remove --delete` erases the snapshot's archive/<timestamp>/ directory.

    Archive directories are named by snapshot timestamp, not snapshot ID.
    """
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Look up the snapshot's timestamp to locate its directory on disk.
    db = sqlite3.connect("index.sqlite3")
    try:
        timestamp = db.execute("SELECT timestamp FROM core_snapshot").fetchone()[0]
    finally:
        db.close()

    snapshot_dir = tmp_path / "archive" / str(timestamp)
    assert snapshot_dir.exists()

    # --delete removes both the DB row and the on-disk directory.
    subprocess.run(
        ['archivebox', 'remove', 'https://example.com', '--yes', '--delete'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    assert not snapshot_dir.exists()
|
||||
|
||||
|
||||
def test_remove_yes_flag_skips_confirmation(tmp_path, process, disable_extractors_dict):
    """--yes must let remove finish without waiting for interactive input."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # If remove blocked on a confirmation prompt, the 30s timeout would fire.
    completed = subprocess.run(
        ['archivebox', 'remove', 'https://example.com', '--yes'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,
    )

    assert completed.returncode == 0
|
||||
|
||||
|
||||
def test_remove_multiple_snapshots(tmp_path, process, disable_extractors_dict):
    """remove accepts several URLs and deletes all of them at once."""
    os.chdir(tmp_path)

    urls = ['https://example.com', 'https://example.org']

    def snapshot_count():
        db = sqlite3.connect("index.sqlite3")
        try:
            return db.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
        finally:
            db.close()

    # Seed both snapshots.
    for url in urls:
        subprocess.run(
            ['archivebox', 'add', '--index-only', '--depth=0', url],
            capture_output=True,
            env=disable_extractors_dict,
        )
    assert snapshot_count() == 2

    # Remove both in a single invocation.
    subprocess.run(
        ['archivebox', 'remove', *urls, '--yes'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    assert snapshot_count() == 0
|
||||
|
||||
|
||||
def test_remove_with_filter(tmp_path, process, disable_extractors_dict):
    """remove supports --filter-type/--filter instead of explicit URLs."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    completed = subprocess.run(
        ['archivebox', 'remove', '--filter-type=search', '--filter=example.com', '--yes'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,
    )

    # Smoke check only: exact exit code depends on the implementation.
    assert completed.returncode in (0, 1, 2)
|
||||
|
||||
|
||||
def test_remove_nonexistent_url_fails_gracefully(tmp_path, process, disable_extractors_dict):
    """Removing a URL that was never added should fail gracefully.

    The command must either exit non-zero or print a "not found" /
    "no matches" message rather than crashing.
    """
    os.chdir(tmp_path)

    result = subprocess.run(
        ['archivebox', 'remove', 'https://nonexistent-url-12345.com', '--yes'],
        capture_output=True,
        text=True,  # decode stdout so the substring checks compare str to str
        env=disable_extractors_dict,
    )

    # Bug fix: without text=True, result.stdout was bytes, so
    # `'not found' in result.stdout.lower()` raised TypeError (str in bytes)
    # whenever the command exited 0 and the disjunct was evaluated.
    assert result.returncode != 0 or 'not found' in result.stdout.lower() or 'no matches' in result.stdout.lower()
|
||||
|
||||
|
||||
def test_remove_after_flag(tmp_path, process, disable_extractors_dict):
    """remove recognizes the --after=<date> selector."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Either removes matching snapshots or prints usage, but must complete.
    completed = subprocess.run(
        ['archivebox', 'remove', '--after=2020-01-01', '--yes'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,
    )

    assert completed.returncode in (0, 1, 2)
|
||||
56
archivebox/tests/test_cli_schedule.py
Normal file
56
archivebox/tests/test_cli_schedule.py
Normal file
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for archivebox schedule command.
|
||||
Verify schedule creates scheduled crawl records.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_schedule_creates_scheduled_crawl(tmp_path, process, disable_extractors_dict):
    """`archivebox schedule --every=day` completes (creates a schedule or prints usage)."""
    os.chdir(tmp_path)

    completed = subprocess.run(
        ['archivebox', 'schedule', '--every=day', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,
    )

    assert completed.returncode in (0, 1, 2)
|
||||
|
||||
|
||||
def test_schedule_with_every_flag(tmp_path, process, disable_extractors_dict):
    """schedule accepts the --every=week interval."""
    os.chdir(tmp_path)

    completed = subprocess.run(
        ['archivebox', 'schedule', '--every=week', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,
    )

    assert completed.returncode in (0, 1, 2)
|
||||
|
||||
|
||||
def test_schedule_list_shows_schedules(tmp_path, process):
    """`archivebox schedule --list` completes (shows schedules or an empty list)."""
    os.chdir(tmp_path)

    completed = subprocess.run(
        ['archivebox', 'schedule', '--list'],
        capture_output=True,
        text=True,
        timeout=30,
    )

    assert completed.returncode in (0, 1, 2)
|
||||
70
archivebox/tests/test_cli_search.py
Normal file
70
archivebox/tests/test_cli_search.py
Normal file
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for archivebox search command.
|
||||
Verify search queries snapshots from DB.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_search_finds_snapshots(tmp_path, process, disable_extractors_dict):
    """`archivebox search` locates a previously-added snapshot by term."""
    os.chdir(tmp_path)

    # Seed one snapshot for the search to find.
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    completed = subprocess.run(
        ['archivebox', 'search', 'example'],
        capture_output=True,
        text=True,
        timeout=30,
    )

    assert completed.returncode == 0
    assert 'example' in completed.stdout
|
||||
|
||||
|
||||
def test_search_returns_no_results_for_missing_term(tmp_path, process, disable_extractors_dict):
    """Searching for a term with no matches completes without crashing."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    completed = subprocess.run(
        ['archivebox', 'search', 'nonexistentterm12345'],
        capture_output=True,
        text=True,
        timeout=30,
    )

    # Either exit code is acceptable for "no results"; crashing is not.
    assert completed.returncode in (0, 1)
|
||||
|
||||
|
||||
def test_search_on_empty_archive(tmp_path, process):
    """search on a collection with no snapshots must not error out."""
    os.chdir(tmp_path)

    completed = subprocess.run(
        ['archivebox', 'search', 'anything'],
        capture_output=True,
        text=True,
        timeout=30,
    )

    assert completed.returncode in (0, 1)
|
||||
45
archivebox/tests/test_cli_server.py
Normal file
45
archivebox/tests/test_cli_server.py
Normal file
@@ -0,0 +1,45 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for archivebox server command.
|
||||
Verify server can start (basic smoke tests only, no full server testing).
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import signal
|
||||
import time
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_server_shows_usage_info(tmp_path, process):
    """`archivebox server --help` is recognized and documents the server.

    Smoke test only: we never boot a real HTTP server in the test suite.
    """
    os.chdir(tmp_path)

    completed = subprocess.run(
        ['archivebox', 'server', '--help'],
        capture_output=True,
        text=True,
        timeout=10,
    )

    assert completed.returncode == 0
    lowered = completed.stdout.lower()
    assert 'server' in lowered or 'http' in lowered
|
||||
|
||||
|
||||
def test_server_init_flag(tmp_path, process):
    """server --help documents an --init flag for bootstrapping first."""
    os.chdir(tmp_path)

    completed = subprocess.run(
        ['archivebox', 'server', '--help'],
        capture_output=True,
        text=True,
        timeout=10,
    )

    assert completed.returncode == 0
    assert '--init' in completed.stdout or 'init' in completed.stdout.lower()
|
||||
26
archivebox/tests/test_cli_shell.py
Normal file
26
archivebox/tests/test_cli_shell.py
Normal file
@@ -0,0 +1,26 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for archivebox shell command.
|
||||
Verify shell command starts Django shell (basic smoke tests only).
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_shell_command_exists(tmp_path, process):
    """`archivebox shell --help` is recognized as a command."""
    os.chdir(tmp_path)

    # The shell needs interactive input, so only check the command exists
    # via --help rather than actually starting it.
    completed = subprocess.run(
        ['archivebox', 'shell', '--help'],
        capture_output=True,
        text=True,
        timeout=10,
    )

    # Accept help output or a usage error; both prove the command is known.
    assert completed.returncode in (0, 1, 2)
|
||||
160
archivebox/tests/test_cli_status.py
Normal file
160
archivebox/tests/test_cli_status.py
Normal file
@@ -0,0 +1,160 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Comprehensive tests for archivebox status command.
|
||||
Verify status reports accurate collection state from DB and filesystem.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_status_runs_successfully(tmp_path, process):
    """`archivebox status` exits 0 and prints a substantial report."""
    os.chdir(tmp_path)
    completed = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)

    assert completed.returncode == 0
    assert len(completed.stdout) > 100
|
||||
|
||||
|
||||
def test_status_shows_zero_snapshots_in_empty_archive(tmp_path, process):
    """status on a fresh collection reports a zero count somewhere."""
    os.chdir(tmp_path)
    completed = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)

    # Weak check: an empty collection's report should contain a '0'.
    assert '0' in completed.stdout
|
||||
|
||||
|
||||
def test_status_shows_correct_snapshot_count(tmp_path, process, disable_extractors_dict):
    """status reflects the exact snapshot count stored in the DB."""
    os.chdir(tmp_path)

    # Seed three index-only snapshots.
    for url in ('https://example.com', 'https://example.org', 'https://example.net'):
        subprocess.run(
            ['archivebox', 'add', '--index-only', '--depth=0', url],
            capture_output=True,
            env=disable_extractors_dict,
        )

    completed = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)

    # Confirm the DB really holds 3 snapshots first...
    db = sqlite3.connect("index.sqlite3")
    try:
        db_count = db.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    finally:
        db.close()
    assert db_count == 3

    # ...then check the report mentions that number.
    assert '3' in completed.stdout
|
||||
|
||||
|
||||
def test_status_shows_archived_count(tmp_path, process, disable_extractors_dict):
    """status breaks snapshots down into archived/queued categories."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    completed = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)

    lowered = completed.stdout.lower()
    assert 'archived' in lowered or 'queued' in lowered
|
||||
|
||||
|
||||
def test_status_shows_archive_directory_size(tmp_path, process):
    """status includes a size figure for the archive directory."""
    os.chdir(tmp_path)
    completed = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)

    assert 'Size' in completed.stdout or 'size' in completed.stdout
|
||||
|
||||
|
||||
def test_status_counts_archive_directories(tmp_path, process, disable_extractors_dict):
    """status reports how many snapshot directories are present on disk."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    completed = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)

    assert 'present' in completed.stdout.lower() or 'directories' in completed.stdout
|
||||
|
||||
|
||||
def test_status_detects_orphaned_directories(tmp_path, process, disable_extractors_dict):
    """status notices archive/ directories that have no DB record."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Plant a directory that no snapshot row points at.
    (tmp_path / "archive" / "fake_orphaned_dir").mkdir(parents=True, exist_ok=True)

    completed = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)

    assert 'orphan' in completed.stdout.lower() or '1' in completed.stdout
|
||||
|
||||
|
||||
def test_status_shows_user_info(tmp_path, process):
    """status includes a user/login section."""
    os.chdir(tmp_path)
    completed = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)

    lowered = completed.stdout.lower()
    assert 'user' in lowered or 'login' in lowered
|
||||
|
||||
|
||||
def test_status_reads_from_db_not_filesystem(tmp_path, process, disable_extractors_dict):
    """status counts come from the DB (source of truth), not the filesystem."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # The DB is the authority: confirm it holds exactly one snapshot.
    db = sqlite3.connect("index.sqlite3")
    try:
        db_count = db.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    finally:
        db.close()
    assert db_count == 1

    # The report must reflect that count.
    completed = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
    assert '1' in completed.stdout
|
||||
|
||||
|
||||
def test_status_shows_index_file_info(tmp_path, process):
    """status mentions the index file."""
    os.chdir(tmp_path)
    completed = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)

    # The case-insensitive check subsumes the original's extra 'Index' test.
    assert 'index' in completed.stdout.lower()
|
||||
145
archivebox/tests/test_cli_update.py
Normal file
145
archivebox/tests/test_cli_update.py
Normal file
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Comprehensive tests for archivebox update command.
|
||||
Verify update drains old dirs, reconciles DB, and queues snapshots.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_update_runs_successfully_on_empty_archive(tmp_path, process):
    """`archivebox update` succeeds even when there is nothing to update."""
    os.chdir(tmp_path)

    completed = subprocess.run(
        ['archivebox', 'update'],
        capture_output=True,
        text=True,
        timeout=30,
    )

    assert completed.returncode == 0
|
||||
|
||||
|
||||
def test_update_reconciles_existing_snapshots(tmp_path, process, disable_extractors_dict):
    """update reconciles snapshots that already exist in the collection."""
    os.chdir(tmp_path)

    # Seed one index-only snapshot (fast: no extractors run).
    subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    completed = subprocess.run(
        ['archivebox', 'update'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,
    )

    assert completed.returncode == 0
|
||||
|
||||
|
||||
def test_update_specific_snapshot_by_filter(tmp_path, process, disable_extractors_dict):
    """update accepts --filter-type plus a pattern to target snapshots."""
    os.chdir(tmp_path)

    # Seed two snapshots so the filter has something to discriminate between.
    for url in ('https://example.com', 'https://example.org'):
        subprocess.run(
            ['archivebox', 'add', '--depth=0', url],
            capture_output=True,
            env=disable_extractors_dict,
            timeout=90,
        )

    completed = subprocess.run(
        ['archivebox', 'update', '--filter-type=substring', 'example.com'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,
    )

    assert completed.returncode == 0
|
||||
|
||||
|
||||
def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_dict):
    """`archivebox update` must not create or delete snapshots, only re-queue them."""
    os.chdir(tmp_path)

    # Seed the archive with a single snapshot.
    subprocess.run(
        ['archivebox', 'add', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=90,
    )

    def snapshot_count():
        # Read the row count straight out of the on-disk SQLite index.
        db = sqlite3.connect("index.sqlite3")
        try:
            return db.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
        finally:
            db.close()

    before = snapshot_count()
    assert before == 1

    # update should reconcile + queue existing snapshots, never add new ones.
    subprocess.run(
        ['archivebox', 'update'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,
    )

    assert snapshot_count() == before
|
||||
|
||||
|
||||
def test_update_queues_snapshots_for_archiving(tmp_path, process, disable_extractors_dict):
    """After `archivebox update`, existing snapshots should be marked 'queued'."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=90,
    )

    update_result = subprocess.run(
        ['archivebox', 'update'],
        capture_output=True,
        env=disable_extractors_dict,
        timeout=30,
    )
    assert update_result.returncode == 0

    # The snapshot's status column in the index should now read 'queued'.
    db = sqlite3.connect("index.sqlite3")
    try:
        snapshot_status = db.execute("SELECT status FROM core_snapshot").fetchone()[0]
    finally:
        db.close()

    assert snapshot_status == 'queued'
|
||||
68
archivebox/tests/test_cli_version.py
Normal file
68
archivebox/tests/test_cli_version.py
Normal file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for archivebox version command.
|
||||
Verify version output and system information reporting.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_version_quiet_outputs_version_number(tmp_path):
    """`archivebox version --quiet` should print only a dotted version string."""
    os.chdir(tmp_path)
    proc = subprocess.run(['archivebox', 'version', '--quiet'], capture_output=True, text=True)

    assert proc.returncode == 0
    printed_version = proc.stdout.strip()
    assert printed_version
    # Expect at least MAJOR.MINOR, e.g. "0.8.0".
    assert len(printed_version.split('.')) >= 2
|
||||
|
||||
|
||||
def test_version_shows_system_info_in_initialized_dir(tmp_path, process):
    """Inside an initialized data dir, `archivebox version` reports system metadata."""
    os.chdir(tmp_path)
    proc = subprocess.run(['archivebox', 'version'], capture_output=True, text=True)

    stdout = proc.stdout
    assert 'ArchiveBox' in stdout
    # At least one of the system-info fields should be present in the report.
    system_markers = ('ARCH=', 'OS=', 'PYTHON=')
    assert any(marker in stdout for marker in system_markers)
|
||||
|
||||
|
||||
def test_version_shows_binaries_after_init(tmp_path, process):
    """`archivebox version` in an initialized dir should list binary dependencies."""
    os.chdir(tmp_path)
    proc = subprocess.run(['archivebox', 'version'], capture_output=True, text=True)

    # The report should include a binaries/dependencies section heading.
    stdout = proc.stdout
    assert 'Binary' in stdout or 'Dependencies' in stdout
|
||||
|
||||
|
||||
def test_version_shows_data_locations(tmp_path, process):
    """`archivebox version` should report data/code directory locations."""
    os.chdir(tmp_path)
    proc = subprocess.run(['archivebox', 'version'], capture_output=True, text=True)

    stdout = proc.stdout
    # Path-related section markers expected somewhere in the output.
    assert any(marker in stdout for marker in ('Data', 'Code', 'location'))
|
||||
|
||||
|
||||
def test_version_in_uninitialized_dir_still_works(tmp_path):
    """`archivebox version --quiet` must work even outside an initialized data dir."""
    uninitialized = tmp_path / "empty"
    uninitialized.mkdir()
    os.chdir(uninitialized)

    proc = subprocess.run(['archivebox', 'version', '--quiet'], capture_output=True, text=True)

    # The version number should be printed regardless of data-dir state.
    assert proc.returncode == 0
    assert proc.stdout.strip() != ''
|
||||
143
archivebox/tests/test_config.py
Normal file
143
archivebox/tests/test_config.py
Normal file
@@ -0,0 +1,143 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Integration tests for archivebox config command."""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_config_shows_all_config_values(tmp_path, process):
    """Running `archivebox config` with no args should dump the full config."""
    os.chdir(tmp_path)

    proc = subprocess.run(
        ['archivebox', 'config'],
        capture_output=True,
        text=True,
    )

    # TIMEOUT is a well-known key that should appear in the dump.
    assert 'TIMEOUT' in proc.stdout or 'timeout' in proc.stdout.lower()
    # A full config dump is substantially longer than 100 characters.
    assert len(proc.stdout) > 100
|
||||
|
||||
|
||||
def test_config_get_specific_key(tmp_path, process):
    """`archivebox config --get TIMEOUT` should retrieve that single key."""
    os.chdir(tmp_path)

    proc = subprocess.run(
        ['archivebox', 'config', '--get', 'TIMEOUT'],
        capture_output=True,
        text=True,
    )

    # Either the key name is echoed back or the command simply succeeds.
    assert 'TIMEOUT' in proc.stdout or proc.returncode == 0
|
||||
|
||||
|
||||
def test_config_set_value_writes_to_config_file(tmp_path, process):
    """`archivebox config --set KEY=VALUE` should persist the value to ArchiveBox.conf.

    The subprocess result is intentionally discarded: the assertion is made
    against the on-disk config file, not the command output.
    """
    os.chdir(tmp_path)

    # Set a config value (return value unused; we verify the file instead).
    subprocess.run(
        ['archivebox', 'config', '--set', 'TIMEOUT=120'],
        capture_output=True,
        text=True,
    )

    # Read the config file directly to verify it was written.
    config_file = tmp_path / 'ArchiveBox.conf'
    if config_file.exists():
        config_content = config_file.read_text()
        # Config should contain the set key (case may vary by config backend).
        assert 'TIMEOUT' in config_content or 'timeout' in config_content.lower()
|
||||
|
||||
|
||||
def test_config_set_and_get_roundtrip(tmp_path, process):
    """A value written with `config --set` should be visible afterwards."""
    os.chdir(tmp_path)

    # Write TIMEOUT=999 via the CLI.
    set_proc = subprocess.run(
        ['archivebox', 'config', '--set', 'TIMEOUT=999'],
        capture_output=True,
        text=True,
    )

    # The set either exits cleanly or echoes the new value.
    assert set_proc.returncode == 0 or '999' in set_proc.stdout

    # Cross-check against the config file on disk, if one was created.
    conf_path = tmp_path / 'ArchiveBox.conf'
    if conf_path.exists():
        conf_text = conf_path.read_text()
        assert '999' in conf_text or 'TIMEOUT' in conf_text
|
||||
|
||||
|
||||
def test_config_search_finds_matching_keys(tmp_path, process):
    """`archivebox config --search PATTERN` should surface matching keys."""
    os.chdir(tmp_path)

    proc = subprocess.run(
        ['archivebox', 'config', '--search', 'TIMEOUT'],
        capture_output=True,
        text=True,
    )

    # Either the matching key is printed or the command at least succeeds.
    assert 'TIMEOUT' in proc.stdout or proc.returncode == 0
|
||||
|
||||
|
||||
def test_config_invalid_key_fails(tmp_path, process):
    """Setting a key that does not exist in the config schema should fail."""
    os.chdir(tmp_path)

    proc = subprocess.run(
        ['archivebox', 'config', '--set', 'INVALID_KEY_THAT_DOES_NOT_EXIST=value'],
        capture_output=True,
        text=True,
    )

    # Expect a nonzero exit code or an explicit failure message.
    assert proc.returncode != 0 or 'failed' in proc.stdout.lower()
|
||||
|
||||
|
||||
def test_config_set_requires_equals_sign(tmp_path, process):
    """`config --set` without KEY=VALUE syntax must be rejected."""
    os.chdir(tmp_path)

    proc = subprocess.run(
        ['archivebox', 'config', '--set', 'TIMEOUT'],
        capture_output=True,
        text=True,
    )

    # No '=' in the argument means the command should exit with an error.
    assert proc.returncode != 0
|
||||
|
||||
|
||||
class TestConfigCLI:
    """Smoke tests for the `archivebox config` CLI surface."""

    def test_cli_help(self, tmp_path, process):
        """`archivebox config --help` should exit 0 and document --get/--set."""
        os.chdir(tmp_path)

        help_proc = subprocess.run(
            ['archivebox', 'config', '--help'],
            capture_output=True,
            text=True,
        )

        assert help_proc.returncode == 0
        # Both primary flags must appear in the usage text.
        assert '--get' in help_proc.stdout
        assert '--set' in help_proc.stdout
|
||||
|
||||
|
||||
# Allow running this test module directly (without invoking pytest yourself).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
185
archivebox/tests/test_crawl.py
Normal file
185
archivebox/tests/test_crawl.py
Normal file
@@ -0,0 +1,185 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Integration tests for archivebox crawl command."""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_crawl_creates_crawl_object(tmp_path, process, disable_extractors_dict):
    """`archivebox crawl URL` should insert a row into crawls_crawl."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'crawl', '--no-wait', 'https://example.com'],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    # Look up the most recently created crawl in the SQLite index.
    db = sqlite3.connect('index.sqlite3')
    try:
        latest_crawl = db.execute(
            "SELECT id, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
        ).fetchone()
    finally:
        db.close()

    assert latest_crawl is not None, "Crawl object should be created"
|
||||
|
||||
|
||||
def test_crawl_depth_sets_max_depth_in_crawl(tmp_path, process, disable_extractors_dict):
    """The --depth CLI option should be stored as Crawl.max_depth."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'crawl', '--depth=2', '--no-wait', 'https://example.com'],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    db = sqlite3.connect('index.sqlite3')
    try:
        row = db.execute(
            "SELECT max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
        ).fetchone()
    finally:
        db.close()

    assert row is not None
    # max_depth must reflect the --depth=2 flag verbatim.
    assert row[0] == 2, "Crawl max_depth should match --depth=2"
|
||||
|
||||
|
||||
def test_crawl_creates_snapshot_for_url(tmp_path, process, disable_extractors_dict):
    """Crawling a URL should create a Snapshot row for that exact URL."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'crawl', '--no-wait', 'https://example.com'],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    db = sqlite3.connect('index.sqlite3')
    try:
        row = db.execute(
            "SELECT url FROM core_snapshot WHERE url = ?",
            ('https://example.com',),
        ).fetchone()
    finally:
        db.close()

    assert row is not None, "Snapshot should be created for input URL"
|
||||
|
||||
|
||||
def test_crawl_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dict):
    """The Snapshot created by a crawl must carry that crawl's id as crawl_id."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'crawl', '--no-wait', 'https://example.com'],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    db = sqlite3.connect('index.sqlite3')
    try:
        # Most recent crawl id.
        crawl_row = db.execute(
            "SELECT id FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
        ).fetchone()
        assert crawl_row is not None
        expected_crawl_id = crawl_row[0]

        # The snapshot for the crawled URL should reference that crawl.
        snapshot_row = db.execute(
            "SELECT crawl_id FROM core_snapshot WHERE url = ?",
            ('https://example.com',),
        ).fetchone()
    finally:
        db.close()

    assert snapshot_row is not None
    assert snapshot_row[0] == expected_crawl_id, "Snapshot should be linked to Crawl"
|
||||
|
||||
|
||||
def test_crawl_multiple_urls_creates_multiple_snapshots(tmp_path, process, disable_extractors_dict):
    """Passing several URLs to `crawl` should produce a snapshot for each."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'crawl', '--no-wait',
         'https://example.com',
         'https://iana.org'],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    db = sqlite3.connect('index.sqlite3')
    try:
        rows = db.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall()
    finally:
        db.close()

    snapshot_urls = [row[0] for row in rows]
    assert 'https://example.com' in snapshot_urls
    assert 'https://iana.org' in snapshot_urls
|
||||
|
||||
|
||||
def test_crawl_from_file_creates_snapshot(tmp_path, process, disable_extractors_dict):
    """`crawl` should accept a local file of URLs and create snapshot(s) from it."""
    os.chdir(tmp_path)

    # Prepare a one-URL input file for the crawl.
    urls_file = tmp_path / 'urls.txt'
    urls_file.write_text('https://example.com\n')

    subprocess.run(
        ['archivebox', 'crawl', '--no-wait', str(urls_file)],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    db = sqlite3.connect('index.sqlite3')
    try:
        any_snapshot = db.execute("SELECT url FROM core_snapshot").fetchone()
    finally:
        db.close()

    # Either the source file itself or the contained URL becomes a snapshot.
    assert any_snapshot is not None, "Should create at least one snapshot"
|
||||
|
||||
|
||||
def test_crawl_creates_seed_for_input(tmp_path, process, disable_extractors_dict):
    """Every crawl input should be recorded as a Seed row in crawls_seed."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'crawl', '--no-wait', 'https://example.com'],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    db = sqlite3.connect('index.sqlite3')
    try:
        seed_row = db.execute("SELECT id FROM crawls_seed").fetchone()
    finally:
        db.close()

    assert seed_row is not None, "Seed should be created for crawl input"
|
||||
|
||||
|
||||
class TestCrawlCLI:
    """Smoke tests for the `archivebox crawl` CLI surface."""

    def test_cli_help(self, tmp_path, process):
        """`archivebox crawl --help` should exit 0 and mention the depth flag."""
        os.chdir(tmp_path)

        help_proc = subprocess.run(
            ['archivebox', 'crawl', '--help'],
            capture_output=True,
            text=True,
        )

        assert help_proc.returncode == 0
        # The depth option may be documented as --depth or its short form -d.
        assert '--depth' in help_proc.stdout or '-d' in help_proc.stdout
|
||||
|
||||
|
||||
# Allow running this test module directly (without invoking pytest yourself).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
277
archivebox/tests/test_extract.py
Normal file
277
archivebox/tests/test_extract.py
Normal file
@@ -0,0 +1,277 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Integration tests for archivebox extract command."""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_extract_runs_on_snapshot_id(tmp_path, process, disable_extractors_dict):
    """`archivebox extract <id>` should accept an existing snapshot id."""
    os.chdir(tmp_path)

    # Create a snapshot to extract from (index-only keeps the test fast).
    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Grab its id out of the SQLite index.
    db = sqlite3.connect('index.sqlite3')
    try:
        snapshot_id = db.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0]
    finally:
        db.close()

    extract_proc = subprocess.run(
        ['archivebox', 'extract', '--no-wait', str(snapshot_id)],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    # A valid id must not trigger a "not found" complaint.
    assert 'not found' not in extract_proc.stderr.lower()
|
||||
|
||||
|
||||
def test_extract_with_enabled_extractor_creates_archiveresult(tmp_path, process, disable_extractors_dict):
    """Running extract with one extractor enabled may queue ArchiveResults."""
    os.chdir(tmp_path)

    # Seed a snapshot to run the extractor against.
    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    db = sqlite3.connect('index.sqlite3')
    try:
        snapshot_id = db.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0]
    finally:
        db.close()

    # Enable only the title extractor on top of the all-disabled baseline.
    env = disable_extractors_dict.copy()
    env['SAVE_TITLE'] = 'true'

    subprocess.run(
        ['archivebox', 'extract', '--no-wait', str(snapshot_id)],
        capture_output=True,
        text=True,
        env=env,
    )

    # With --no-wait the result may only be queued, so count can be zero.
    db = sqlite3.connect('index.sqlite3')
    try:
        result_count = db.execute(
            "SELECT COUNT(*) FROM core_archiveresult WHERE snapshot_id = ?",
            (snapshot_id,),
        ).fetchone()[0]
    finally:
        db.close()

    # Timing-dependent: assert only that the query succeeded.
    assert result_count >= 0
|
||||
|
||||
|
||||
def test_extract_plugin_option_accepted(tmp_path, process, disable_extractors_dict):
    """The extract command's --plugin flag must be recognized by argparse."""
    os.chdir(tmp_path)

    # Seed one snapshot so extract has a valid target.
    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    db = sqlite3.connect('index.sqlite3')
    try:
        snapshot_id = db.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0]
    finally:
        db.close()

    extract_proc = subprocess.run(
        ['archivebox', 'extract', '--plugin=title', '--no-wait', str(snapshot_id)],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    assert 'unrecognized arguments: --plugin' not in extract_proc.stderr
|
||||
|
||||
|
||||
def test_extract_stdin_snapshot_id(tmp_path, process, disable_extractors_dict):
    """extract should accept a bare snapshot id piped in on stdin."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    db = sqlite3.connect('index.sqlite3')
    try:
        snapshot_id = db.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0]
    finally:
        db.close()

    # Feed the id via stdin instead of as an argv argument.
    extract_proc = subprocess.run(
        ['archivebox', 'extract', '--no-wait'],
        input=f'{snapshot_id}\n',
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    assert 'not found' not in extract_proc.stderr.lower() or extract_proc.returncode == 0
|
||||
|
||||
|
||||
def test_extract_stdin_jsonl_input(tmp_path, process, disable_extractors_dict):
    """extract should accept JSONL Snapshot records piped in on stdin."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    db = sqlite3.connect('index.sqlite3')
    try:
        snapshot_id = db.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()[0]
    finally:
        db.close()

    # One JSONL record identifying the snapshot by type + id.
    record_line = json.dumps({"type": "Snapshot", "id": str(snapshot_id)}) + '\n'

    extract_proc = subprocess.run(
        ['archivebox', 'extract', '--no-wait'],
        input=record_line,
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    assert 'not found' not in extract_proc.stderr.lower() or extract_proc.returncode == 0
|
||||
|
||||
|
||||
def test_extract_pipeline_from_snapshot(tmp_path, process, disable_extractors_dict):
    """`archivebox snapshot URL | archivebox extract` should work as a pipeline."""
    os.chdir(tmp_path)

    # Producer: emits the new snapshot on stdout.
    producer = subprocess.Popen(
        ['archivebox', 'snapshot', 'https://example.com'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        env=disable_extractors_dict,
    )

    # Consumer: reads the producer's output directly as its stdin.
    subprocess.run(
        ['archivebox', 'extract', '--no-wait'],
        stdin=producer.stdout,
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    producer.wait()

    # The snapshot created upstream must exist in the index afterwards.
    db = sqlite3.connect('index.sqlite3')
    try:
        row = db.execute(
            "SELECT id, url FROM core_snapshot WHERE url = ?",
            ('https://example.com',),
        ).fetchone()
    finally:
        db.close()

    assert row is not None, "Snapshot should be created by pipeline"
|
||||
|
||||
|
||||
def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict):
    """extract should handle multiple snapshot ids piped on stdin at once.

    Fix: the extract subprocess result was bound to an unused `result` local;
    it is now discarded since the assertions are made against the database.
    """
    os.chdir(tmp_path)

    # Add URLs one at a time to avoid deduplication issues.
    for url in ('https://example.com', 'https://iana.org'):
        subprocess.run(
            ['archivebox', 'add', '--index-only', url],
            capture_output=True,
            env=disable_extractors_dict,
        )

    # Collect all snapshot ids from the index.
    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()
    snapshot_ids = c.execute("SELECT id FROM core_snapshot").fetchall()
    conn.close()

    assert len(snapshot_ids) >= 2, "Should have at least 2 snapshots"

    # Feed every id to extract via stdin, one per line.
    ids_input = '\n'.join(str(s[0]) for s in snapshot_ids) + '\n'
    subprocess.run(
        ['archivebox', 'extract', '--no-wait'],
        input=ids_input,
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    # Extraction must not delete or merge the snapshots.
    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()
    count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    conn.close()

    assert count >= 2, "Both snapshots should still exist after extraction"
|
||||
|
||||
|
||||
class TestExtractCLI:
    """Smoke tests for the `archivebox extract` CLI surface."""

    def test_cli_help(self, tmp_path, process):
        """`archivebox extract --help` should exit 0 and document its flags."""
        os.chdir(tmp_path)

        help_proc = subprocess.run(
            ['archivebox', 'extract', '--help'],
            capture_output=True,
            text=True,
        )

        assert help_proc.returncode == 0
        # Plugin selection and wait-mode flags must appear in the usage text.
        assert '--plugin' in help_proc.stdout or '-p' in help_proc.stdout
        assert '--wait' in help_proc.stdout or '--no-wait' in help_proc.stdout

    def test_cli_no_snapshots_shows_warning(self, tmp_path, process):
        """With empty stdin, extract should warn or exit cleanly."""
        os.chdir(tmp_path)

        empty_proc = subprocess.run(
            ['archivebox', 'extract', '--no-wait'],
            input='',
            capture_output=True,
            text=True,
        )

        # Empty input: either a clean exit or a "No ..." warning on stderr.
        assert empty_proc.returncode == 0 or 'No' in empty_proc.stderr
|
||||
|
||||
|
||||
# Allow running this test module directly (without invoking pytest yourself).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
46
archivebox/tests/test_extractors.py
Normal file
46
archivebox/tests/test_extractors.py
Normal file
@@ -0,0 +1,46 @@
|
||||
from .fixtures import *
|
||||
import json as pyjson
|
||||
|
||||
|
||||
def test_singlefile_works(tmp_path, process, disable_extractors_dict):
    """Enable the singlefile extractor and verify singlefile.html is produced.

    Fix: the subprocess result was bound to an unused `add_process` local;
    it is now discarded since the assertion is on the on-disk output.
    """
    disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
    subprocess.run(['archivebox', 'add', 'https://example.com'],
                   capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
    output_file = archived_item_path / "singlefile.html"
    assert output_file.exists()
|
||||
|
||||
def test_readability_works(tmp_path, process, disable_extractors_dict):
    """Enable the readability extractor and verify readability/content.html exists.

    Fix: the subprocess result was bound to an unused `add_process` local;
    it is now discarded since the assertion is on the on-disk output.
    """
    disable_extractors_dict.update({"USE_READABILITY": "true"})
    subprocess.run(['archivebox', 'add', 'https://example.com'],
                   capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
    output_file = archived_item_path / "readability" / "content.html"
    assert output_file.exists()
|
||||
|
||||
def test_htmltotext_works(tmp_path, process, disable_extractors_dict):
    """Enable the htmltotext extractor and verify htmltotext.txt is produced.

    Fix: the subprocess result was bound to an unused `add_process` local;
    it is now discarded since the assertion is on the on-disk output.
    """
    disable_extractors_dict.update({"SAVE_HTMLTOTEXT": "true"})
    subprocess.run(['archivebox', 'add', 'https://example.com'],
                   capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
    output_file = archived_item_path / "htmltotext.txt"
    assert output_file.exists()
|
||||
|
||||
def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict):
    """USE_NODE=false must suppress the node-based extractors even when enabled."""
    # Enable node-based extractors but globally disable node itself.
    disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true", "USE_SINGLEFILE": "true", "USE_NODE": "false"})
    add_process = subprocess.run(['archivebox', 'add', 'https://example.com'],
                                 capture_output=True, env=disable_extractors_dict)
    stdout_text = add_process.stdout.decode("utf-8")
    # Neither node-dependent extractor should have been invoked.
    assert "> singlefile" not in stdout_text
    assert "> readability" not in stdout_text
|
||||
|
||||
def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
    """Enable the headers extractor and verify headers.json contains Content-Type.

    Fix: the subprocess result was bound to an unused `add_process` local;
    it is now discarded since the assertions are on the on-disk output.
    """
    disable_extractors_dict.update({"SAVE_HEADERS": "true"})
    subprocess.run(['archivebox', 'add', 'https://example.com'],
                   capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
    output_file = archived_item_path / "headers.json"
    assert output_file.exists()
    with open(output_file, 'r', encoding='utf-8') as f:
        headers = pyjson.load(f)
    # Header casing varies by server/normalization, so accept either form.
    assert 'Content-Type' in headers or 'content-type' in headers
|
||||
91
archivebox/tests/test_init.py
Normal file
91
archivebox/tests/test_init.py
Normal file
@@ -0,0 +1,91 @@
|
||||
# archivebox init
|
||||
# archivebox add
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
import json, shutil
|
||||
import sqlite3
|
||||
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
# Expected directory permission string, derived from the configured file
# permissions by adding the execute bit per octal digit (6 -> 7, 4 -> 5),
# e.g. '644' becomes '755'.
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
|
||||
|
||||
def test_init(tmp_path, process):
    """The `process` fixture's `archivebox init` run should announce a new archive."""
    init_output = process.stdout.decode("utf-8")
    assert "Initializing a new ArchiveBox" in init_output
|
||||
|
||||
def test_update(tmp_path, process):
    """Re-running `archivebox init` in an existing archive should say 'updating'."""
    os.chdir(tmp_path)
    rerun = subprocess.run(['archivebox', 'init'], capture_output=True)
    assert "updating existing ArchiveBox" in rerun.stdout.decode("utf-8")
|
||||
|
||||
def test_add_link(tmp_path, process, disable_extractors_dict):
    """`archivebox add URL` should record the URL in a sources/*cli_add.txt file.

    Fix: the subprocess result was bound to an unused `add_process` local;
    it is now discarded since the assertions are on the on-disk source file.
    """
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'],
                   capture_output=True, env=disable_extractors_dict)

    # In the new architecture, URLs are saved to source files.
    # Check that a source file was created with the URL.
    sources_dir = tmp_path / "sources"
    assert sources_dir.exists(), "Sources directory should be created"
    source_files = list(sources_dir.glob("*cli_add.txt"))
    assert len(source_files) >= 1, "Source file should be created"
    source_content = source_files[0].read_text()
    assert "https://example.com" in source_content
|
||||
|
||||
|
||||
def test_add_multiple_urls(tmp_path, process, disable_extractors_dict):
    """Adding multiple URLs in one command should write all of them to the source file.

    Fix: the subprocess result was bound to an unused `add_process` local;
    it is now discarded since the assertions are on the on-disk source file.
    """
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com', 'https://iana.org'],
                   capture_output=True, env=disable_extractors_dict)

    # Check that a source file was created with both URLs.
    sources_dir = tmp_path / "sources"
    assert sources_dir.exists(), "Sources directory should be created"
    source_files = list(sources_dir.glob("*cli_add.txt"))
    assert len(source_files) >= 1, "Source file should be created"
    source_content = source_files[-1].read_text()
    assert "https://example.com" in source_content
    assert "https://iana.org" in source_content
|
||||
|
||||
def test_correct_permissions_output_folder(tmp_path, process):
    """Files/dirs created by init should carry the configured permission bits."""
    allowed_modes = (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
    for entry in ('index.sqlite3', 'archive'):
        # Last three octal digits of st_mode are the unix permission bits.
        mode_str = oct((tmp_path / entry).stat().st_mode)[-3:]
        assert mode_str in allowed_modes
|
||||
|
||||
def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict):
    """The index database should keep correct permissions after an add.

    Fix: the subprocess result was bound to an unused `add_process` local;
    it is now discarded since the assertion is on the file's permission bits.
    """
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'], capture_output=True,
                   env=disable_extractors_dict)

    # Check database permissions (last three octal digits of st_mode).
    assert oct((tmp_path / "index.sqlite3").stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
|
||||
|
||||
def test_collision_urls_different_timestamps(tmp_path, process, disable_extractors_dict):
    """Adding two distinct URLs should yield exactly two snapshot rows."""
    os.chdir(tmp_path)
    for url in ('https://example.com', 'https://iana.org'):
        subprocess.run(['archivebox', 'add', '--index-only', url], capture_output=True,
                       env=disable_extractors_dict)

    # Both URLs must have landed in the SQLite index.
    db = sqlite3.connect("index.sqlite3")
    try:
        snapshot_total = db.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    finally:
        db.close()

    assert snapshot_total == 2
|
||||
|
||||
def test_unrecognized_folders(tmp_path, process, disable_extractors_dict):
    """init should tolerate stray unknown folders inside archive/."""
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'], capture_output=True,
                   env=disable_extractors_dict)

    # Plant a folder that doesn't correspond to any snapshot.
    stray = tmp_path / "archive" / "some_random_folder"
    stray.mkdir(parents=True, exist_ok=True)

    reinit = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
    # init should still complete successfully despite the unknown folder.
    assert reinit.returncode == 0
|
||||
129
archivebox/tests/test_install.py
Normal file
129
archivebox/tests/test_install.py
Normal file
@@ -0,0 +1,129 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Integration tests for archivebox install command."""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
class TestInstallDryRun:
    """Tests for `archivebox install --dry-run`."""

    def test_dry_run_prints_message(self, tmp_path, process):
        """Dry-run mode should exit cleanly and announce itself."""
        os.chdir(tmp_path)
        result = subprocess.run(
            ['archivebox', 'install', '--dry-run'],
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        assert 'Dry run' in result.stdout

    def test_dry_run_does_not_create_crawl(self, tmp_path, process):
        """Dry-run mode must not add any rows to crawls_crawl."""
        os.chdir(tmp_path)

        def count_crawls():
            # Open a fresh connection each time so we always read committed state.
            conn = sqlite3.connect('index.sqlite3')
            try:
                return conn.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
            finally:
                conn.close()

        crawls_before = count_crawls()

        result = subprocess.run(
            ['archivebox', 'install', '--dry-run'],
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0

        # Crawl count must be unchanged after a dry run
        assert count_crawls() == crawls_before
||||
class TestInstallOutput:
    """Tests for the messages printed by the install command."""

    def test_install_prints_detecting_message(self, tmp_path, process, disable_extractors_dict):
        """Install should mention dependency detection (or at least the dry-run banner)."""
        os.chdir(tmp_path)
        result = subprocess.run(
            ['archivebox', 'install', '--dry-run'],
            capture_output=True,
            text=True,
            env=disable_extractors_dict,
        )
        assert result.returncode == 0
        lowered = result.stdout.lower()
        assert any(token in lowered for token in ('detect', 'dependenc', 'dry run'))
||||
class TestInstallCLI:
    """Tests for the install command's argument parsing."""

    def test_cli_help(self, tmp_path):
        """`archivebox install --help` should succeed and document --dry-run."""
        os.chdir(tmp_path)
        result = subprocess.run(
            ['archivebox', 'install', '--help'],
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        assert '--dry-run' in result.stdout or '-d' in result.stdout

    def test_cli_invalid_option(self, tmp_path):
        """Unknown flags should produce a non-zero exit code."""
        os.chdir(tmp_path)
        result = subprocess.run(
            ['archivebox', 'install', '--invalid-option'],
            capture_output=True,
            text=True,
        )
        # Argument parsing errors must not exit 0
        assert result.returncode != 0
||||
class TestInstallInitialization:
    """Tests that install bootstraps an empty data directory if needed."""

    def test_install_from_empty_dir(self, tmp_path):
        """Install in an empty dir should initialize first (or show the dry-run banner)."""
        os.chdir(tmp_path)

        # Deliberately skip the `process` fixture so the directory starts empty.
        result = subprocess.run(
            ['archivebox', 'install', '--dry-run'],
            capture_output=True,
            text=True,
        )

        out = result.stdout
        assert 'Initializing' in out or 'Dry run' in out or 'init' in out.lower()
||||
# Allow running this test module directly (e.g. `python test_install.py`)
# instead of via the pytest CLI.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
||||
96
archivebox/tests/test_list.py
Normal file
96
archivebox/tests/test_list.py
Normal file
@@ -0,0 +1,96 @@
|
||||
import json
|
||||
import subprocess
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
def test_search_json(process, disable_extractors_dict):
    """`archivebox search --json` should emit parseable JSON listing snapshots."""
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--json"], capture_output=True)
    raw = search_process.stdout.decode("utf-8").strip()
    try:
        entries = json.loads(raw)
    except json.JSONDecodeError:
        # Fall back to stripping ANSI escapes and control characters before parsing
        import re
        cleaned = re.sub(r'\x1b\[[0-9;]*m', '', raw)
        cleaned = re.sub(r'[\x00-\x1f\x7f]', lambda m: ' ' if m.group(0) in '\t\n\r' else '', cleaned)
        entries = json.loads(cleaned)
    # With --index-only, only source file snapshots are created (file:// URLs),
    # so we expect at least one entry pointing into sources/.
    assert len(entries) >= 1
    assert any("sources" in entry.get("url", "") for entry in entries)
||||
def test_search_json_headers(process, disable_extractors_dict):
    """`archivebox search --json --with-headers` should wrap results with metadata."""
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--json", "--with-headers"], capture_output=True)
    raw = search_process.stdout.decode("utf-8").strip()
    try:
        payload = json.loads(raw)
    except json.JSONDecodeError:
        # Fall back to stripping ANSI escapes and control characters before parsing
        import re
        cleaned = re.sub(r'\x1b\[[0-9;]*m', '', raw)
        cleaned = re.sub(r'[\x00-\x1f\x7f]', lambda m: ' ' if m.group(0) in '\t\n\r' else '', cleaned)
        payload = json.loads(cleaned)
    # In headers mode the response should carry the entries under a "links" key
    links = payload.get("links", payload)
    assert len(links) >= 1
|
||||
def test_search_html(process, disable_extractors_dict):
    """`archivebox search --html` should produce HTML referencing the snapshot."""
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--html"], capture_output=True)
    html = search_process.stdout.decode("utf-8")
    # Output should mention the source snapshot or at least contain markup
    assert "sources" in html or "cli_add" in html or "<" in html
||||
def test_search_html_headers(process, disable_extractors_dict):
    """`archivebox search --html --with-headers` should still produce HTML output."""
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--html", "--with-headers"], capture_output=True)
    html = search_process.stdout.decode("utf-8")
    # Any markup at all is sufficient here
    assert "<" in html
|
||||
def test_search_csv(process, disable_extractors_dict):
    """`archivebox search --csv url` should list the source-file snapshot URL."""
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--csv", "url"], capture_output=True)
    csv_out = search_process.stdout.decode("utf-8")
    # --index-only snapshots are file:// URLs pointing into sources/
    assert "file://" in csv_out or "sources" in csv_out
|
||||
def test_search_csv_headers(process, disable_extractors_dict):
    """`archivebox search --csv url --with-headers` should include a header row."""
    subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
                   capture_output=True, env=disable_extractors_dict)
    search_process = subprocess.run(["archivebox", "search", "--csv", "url", "--with-headers"], capture_output=True)
    csv_out = search_process.stdout.decode("utf-8")
    # The "url" column header should appear in the output
    assert "url" in csv_out
|
||||
def test_search_with_headers_requires_format(process):
    """--with-headers without an output format should be rejected with an explanation."""
    search_process = subprocess.run(["archivebox", "search", "--with-headers"], capture_output=True)
    stderr = search_process.stderr.decode("utf-8")
    # The error text should name the flag and explain the constraint
    assert "--with-headers" in stderr and ("requires" in stderr or "can only be used" in stderr)
|
||||
def test_sort_by_url(process, disable_extractors_dict):
    """Sorting search output by URL should still return all snapshots."""
    # Each add creates its own source-file snapshot
    for url in ("https://iana.org", "https://example.com"):
        subprocess.run(["archivebox", "add", "--index-only", url, "--depth=0"],
                       capture_output=True, env=disable_extractors_dict)

    # Search with --sort should return results (even if they're file:// URLs)
    search_process = subprocess.run(["archivebox", "search", "--csv", "url", "--sort=url"], capture_output=True)
    rows = [line for line in search_process.stdout.decode("utf-8").strip().split("\n") if line]
    # Two adds -> at least two source-file snapshot rows
    assert len(rows) >= 2
|
||||
@@ -10,6 +10,7 @@ Migration tests from 0.8.x to 0.9.x.
|
||||
- New fields like depth, retry_at, etc.
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import sqlite3
|
||||
import subprocess
|
||||
@@ -78,29 +79,43 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
self.assertTrue(ok, msg)
|
||||
|
||||
def test_migration_preserves_crawls(self):
|
||||
"""Migration should preserve all Crawl records."""
|
||||
"""Migration should preserve all Crawl records and create default crawl if needed."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Count snapshots with NULL crawl_id in original data
|
||||
snapshots_without_crawl = sum(1 for s in self.original_data['snapshots'] if s['crawl_id'] is None)
|
||||
|
||||
# Expected count: original crawls + 1 default crawl if any snapshots had NULL crawl_id
|
||||
expected_count = len(self.original_data['crawls'])
|
||||
if snapshots_without_crawl > 0:
|
||||
expected_count += 1 # Migration 0024 creates a default crawl
|
||||
|
||||
ok, msg = verify_crawl_count(self.db_path, expected_count)
|
||||
self.assertTrue(ok, msg)
|
||||
|
||||
def test_migration_preserves_snapshot_crawl_links(self):
|
||||
"""Migration should preserve snapshot-to-crawl relationships."""
|
||||
"""Migration should preserve snapshot-to-crawl relationships and assign default crawl to orphans."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Check EVERY snapshot still has its crawl_id
|
||||
# Check EVERY snapshot has a crawl_id after migration
|
||||
for snapshot in self.original_data['snapshots']:
|
||||
cursor.execute("SELECT crawl_id FROM core_snapshot WHERE url = ?", (snapshot['url'],))
|
||||
row = cursor.fetchone()
|
||||
self.assertIsNotNone(row, f"Snapshot {snapshot['url']} not found after migration")
|
||||
self.assertEqual(row[0], snapshot['crawl_id'],
|
||||
f"Crawl ID mismatch for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}")
|
||||
|
||||
if snapshot['crawl_id'] is not None:
|
||||
# Snapshots that had a crawl should keep it
|
||||
self.assertEqual(row[0], snapshot['crawl_id'],
|
||||
f"Crawl ID changed for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}")
|
||||
else:
|
||||
# Snapshots without a crawl should now have one (the default crawl)
|
||||
self.assertIsNotNone(row[0],
|
||||
f"Snapshot {snapshot['url']} should have been assigned to default crawl but has NULL")
|
||||
|
||||
conn.close()
|
||||
|
||||
@@ -153,7 +168,7 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(self.work_dir, ['list'])
|
||||
result = run_archivebox(self.work_dir, ['snapshot', 'list'])
|
||||
self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
|
||||
|
||||
# Verify ALL snapshots appear in output
|
||||
@@ -475,357 +490,227 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
"""Clean up temporary directory."""
|
||||
shutil.rmtree(self.work_dir, ignore_errors=True)
|
||||
|
||||
def test_filesystem_migration_with_real_archiving(self):
|
||||
def test_archiveresult_files_preserved_after_migration(self):
|
||||
"""
|
||||
Test that filesystem migration works with real archived content.
|
||||
Test that ArchiveResult output files are reorganized into new structure.
|
||||
|
||||
Steps:
|
||||
1. Initialize archivebox
|
||||
2. Archive https://example.com (creates real files)
|
||||
3. Manually set fs_version to 0.8.0
|
||||
4. Trigger migration by saving snapshot
|
||||
5. Verify files are organized correctly
|
||||
This test verifies that:
|
||||
1. Migration preserves ArchiveResult data in Process/Binary records
|
||||
2. Running `archivebox update` reorganizes files into new structure
|
||||
3. New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext
|
||||
4. All files are moved (no data loss)
|
||||
5. Old archive/timestamp/ directories are cleaned up
|
||||
"""
|
||||
# Step 1: Initialize
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
# Use the real 0.7.2 database which has actual ArchiveResults with files
|
||||
gold_db = Path('/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data')
|
||||
if not gold_db.exists():
|
||||
self.skipTest(f"Gold standard database not found at {gold_db}")
|
||||
|
||||
# Step 2: Archive example.com with ALL extractors enabled
|
||||
# This ensures we test migration with all file types
|
||||
try:
|
||||
result = run_archivebox(
|
||||
self.work_dir,
|
||||
['add', '--depth=0', 'https://example.com'],
|
||||
timeout=300, # 5 minutes for all extractors
|
||||
env={
|
||||
'SAVE_TITLE': 'True',
|
||||
'SAVE_FAVICON': 'True',
|
||||
'SAVE_WGET': 'True',
|
||||
'SAVE_SCREENSHOT': 'True',
|
||||
'SAVE_DOM': 'True',
|
||||
'SAVE_SINGLEFILE': 'True',
|
||||
'SAVE_READABILITY': 'True',
|
||||
'SAVE_MERCURY': 'True',
|
||||
'SAVE_PDF': 'True',
|
||||
'SAVE_YTDLP': 'True',
|
||||
'SAVE_ARCHIVEDOTORG': 'True',
|
||||
'SAVE_HEADERS': 'True',
|
||||
'SAVE_HTMLTOTEXT': 'True',
|
||||
'SAVE_GIT': 'True',
|
||||
}
|
||||
)
|
||||
except subprocess.TimeoutExpired as e:
|
||||
# If timeout, still continue - we want to test with whatever files were created
|
||||
print(f"\n[!] Add command timed out after {e.timeout}s, continuing with partial results...")
|
||||
# Note: Snapshot may still have been created even if command timed out
|
||||
# Copy gold database to test directory
|
||||
import shutil
|
||||
for item in gold_db.iterdir():
|
||||
if item.is_dir():
|
||||
shutil.copytree(item, self.work_dir / item.name, dirs_exist_ok=True)
|
||||
else:
|
||||
shutil.copy2(item, self.work_dir / item.name)
|
||||
|
||||
# Step 3: Get the snapshot and verify files were created
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT id, url, timestamp, fs_version FROM core_snapshot WHERE url = ?", ('https://example.com',))
|
||||
row = cursor.fetchone()
|
||||
conn.close()
|
||||
|
||||
if not row:
|
||||
self.skipTest("Failed to create snapshot for https://example.com")
|
||||
|
||||
snapshot_id, url, timestamp, fs_version = row
|
||||
|
||||
# Verify initial fs_version is 0.9.0 (current version)
|
||||
self.assertEqual(fs_version, '0.9.0', f"Expected new snapshot to have fs_version='0.9.0', got '{fs_version}'")
|
||||
|
||||
# Verify output directory exists
|
||||
output_dir = self.work_dir / 'archive' / timestamp
|
||||
self.assertTrue(output_dir.exists(), f"Output directory not found: {output_dir}")
|
||||
|
||||
# List all files created (for debugging)
|
||||
files_before = list(output_dir.rglob('*'))
|
||||
files_before_count = len([f for f in files_before if f.is_file()])
|
||||
print(f"\n[*] Files created by archiving: {files_before_count}")
|
||||
for f in sorted(files_before):
|
||||
if f.is_file():
|
||||
print(f" {f.relative_to(output_dir)}")
|
||||
|
||||
# Step 4: Manually set fs_version to 0.8.0 to simulate old snapshot
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("UPDATE core_snapshot SET fs_version = '0.8.0' WHERE id = ?", (snapshot_id,))
|
||||
conn.commit()
|
||||
|
||||
# Verify the update worked
|
||||
cursor.execute("SELECT fs_version FROM core_snapshot WHERE id = ?", (snapshot_id,))
|
||||
updated_version = cursor.fetchone()[0]
|
||||
conn.close()
|
||||
self.assertEqual(updated_version, '0.8.0', "Failed to set fs_version to 0.8.0")
|
||||
|
||||
# Step 5: Trigger migration by running a command that loads and saves the snapshot
|
||||
# We'll use the Python API directly to trigger save()
|
||||
import os
|
||||
import sys
|
||||
import django
|
||||
|
||||
# Setup Django
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
|
||||
os.environ['DATA_DIR'] = str(self.work_dir)
|
||||
|
||||
# Add parent dir to path so we can import archivebox
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
try:
|
||||
django.setup()
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
# Load the snapshot (should trigger migration on save)
|
||||
snapshot = Snapshot.objects.get(url='https://example.com')
|
||||
|
||||
# Verify fs_migration_needed returns True
|
||||
self.assertTrue(snapshot.fs_migration_needed,
|
||||
f"fs_migration_needed should be True for fs_version='0.8.0'")
|
||||
|
||||
# Save to trigger migration
|
||||
print(f"\n[*] Triggering filesystem migration by saving snapshot...")
|
||||
snapshot.save()
|
||||
|
||||
# Refresh from DB
|
||||
snapshot.refresh_from_db()
|
||||
|
||||
# Verify migration completed
|
||||
self.assertEqual(snapshot.fs_version, '0.9.0',
|
||||
f"Migration failed: fs_version is still '{snapshot.fs_version}'")
|
||||
self.assertFalse(snapshot.fs_migration_needed,
|
||||
"fs_migration_needed should be False after migration")
|
||||
|
||||
print(f"[√] Filesystem migration completed: 0.8.0 -> 0.9.0")
|
||||
|
||||
except Exception as e:
|
||||
self.fail(f"Failed to trigger migration via Django: {e}")
|
||||
|
||||
# Step 6: Verify files still exist and are accessible
|
||||
# For 0.8 -> 0.9, the migration is a no-op, so files should be in the same place
|
||||
files_after = list(output_dir.rglob('*'))
|
||||
files_after_count = len([f for f in files_after if f.is_file()])
|
||||
|
||||
print(f"\n[*] Files after migration: {files_after_count}")
|
||||
|
||||
# Verify no files were lost
|
||||
self.assertGreaterEqual(files_after_count, files_before_count,
|
||||
f"Files were lost during migration: {files_before_count} -> {files_after_count}")
|
||||
|
||||
|
||||
class TestDBOnlyCommands(unittest.TestCase):
|
||||
"""Test that status/search/list commands only use DB, not filesystem."""
|
||||
|
||||
def setUp(self):
|
||||
"""Create a temporary directory with 0.8.x schema and data."""
|
||||
self.work_dir = Path(tempfile.mkdtemp())
|
||||
self.db_path = self.work_dir / 'index.sqlite3'
|
||||
|
||||
create_data_dir_structure(self.work_dir)
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
conn.executescript(SCHEMA_0_8)
|
||||
conn.close()
|
||||
self.original_data = seed_0_8_data(self.db_path)
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up temporary directory."""
|
||||
shutil.rmtree(self.work_dir, ignore_errors=True)
|
||||
|
||||
def test_status_works_with_empty_archive(self):
|
||||
"""Status command should work with empty archive/ (queries DB only)."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Add a snapshot to DB
|
||||
result = run_archivebox(self.work_dir, ['add', 'https://example.com'], timeout=60)
|
||||
|
||||
# Empty the archive directory (but keep it existing)
|
||||
# Count archive directories and files BEFORE migration
|
||||
archive_dir = self.work_dir / 'archive'
|
||||
if archive_dir.exists():
|
||||
for item in archive_dir.iterdir():
|
||||
if item.is_dir():
|
||||
shutil.rmtree(item)
|
||||
else:
|
||||
item.unlink()
|
||||
dirs_before = list(archive_dir.glob('*')) if archive_dir.exists() else []
|
||||
dirs_before_count = len([d for d in dirs_before if d.is_dir()])
|
||||
|
||||
# Status should still work (queries DB only, doesn't scan filesystem)
|
||||
result = run_archivebox(self.work_dir, ['status'])
|
||||
self.assertEqual(result.returncode, 0,
|
||||
f"Status should work with empty archive: {result.stderr}")
|
||||
# Count total files in all archive directories
|
||||
files_before = []
|
||||
for d in dirs_before:
|
||||
if d.is_dir():
|
||||
files_before.extend([f for f in d.rglob('*') if f.is_file()])
|
||||
files_before_count = len(files_before)
|
||||
|
||||
# Should show count from DB
|
||||
output = result.stdout + result.stderr
|
||||
self.assertIn('Total', output,
|
||||
"Status should show DB statistics even with no files")
|
||||
# Sample some specific files to check they're preserved
|
||||
sample_files = [
|
||||
'favicon.ico',
|
||||
'screenshot.png',
|
||||
'singlefile.html',
|
||||
'headers.json',
|
||||
]
|
||||
sample_paths_before = {}
|
||||
for d in dirs_before:
|
||||
if d.is_dir():
|
||||
for sample_file in sample_files:
|
||||
matching = list(d.glob(sample_file))
|
||||
if matching:
|
||||
sample_paths_before[f"{d.name}/{sample_file}"] = matching[0]
|
||||
|
||||
def test_list_works_with_empty_archive(self):
|
||||
"""List command should work with empty archive/ (queries DB only)."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
print(f"\n[*] Archive directories before migration: {dirs_before_count}")
|
||||
print(f"[*] Total files before migration: {files_before_count}")
|
||||
print(f"[*] Sample files found: {len(sample_paths_before)}")
|
||||
|
||||
# Add a snapshot to DB
|
||||
result = run_archivebox(self.work_dir, ['add', 'https://example.com'], timeout=60)
|
||||
# Run init to trigger migration
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=60)
|
||||
self.assertEqual(result.returncode, 0, f"Init (migration) failed: {result.stderr}")
|
||||
|
||||
# Empty the archive directory (but keep it existing)
|
||||
archive_dir = self.work_dir / 'archive'
|
||||
if archive_dir.exists():
|
||||
for item in archive_dir.iterdir():
|
||||
if item.is_dir():
|
||||
shutil.rmtree(item)
|
||||
else:
|
||||
item.unlink()
|
||||
# Count archive directories and files AFTER migration
|
||||
dirs_after = list(archive_dir.glob('*')) if archive_dir.exists() else []
|
||||
dirs_after_count = len([d for d in dirs_after if d.is_dir()])
|
||||
|
||||
# List should still work (queries DB only, doesn't scan filesystem)
|
||||
result = run_archivebox(self.work_dir, ['list'])
|
||||
self.assertEqual(result.returncode, 0,
|
||||
f"List should work with empty archive: {result.stderr}")
|
||||
files_after = []
|
||||
for d in dirs_after:
|
||||
if d.is_dir():
|
||||
files_after.extend([f for f in d.rglob('*') if f.is_file()])
|
||||
files_after_count = len(files_after)
|
||||
|
||||
# Should show snapshot from DB
|
||||
output = result.stdout + result.stderr
|
||||
self.assertIn('example.com', output,
|
||||
"Snapshot should appear in list output even with no files")
|
||||
# Verify sample files still exist
|
||||
sample_paths_after = {}
|
||||
for d in dirs_after:
|
||||
if d.is_dir():
|
||||
for sample_file in sample_files:
|
||||
matching = list(d.glob(sample_file))
|
||||
if matching:
|
||||
sample_paths_after[f"{d.name}/{sample_file}"] = matching[0]
|
||||
|
||||
def test_search_works_with_empty_archive(self):
|
||||
"""Search command should work with empty archive/ (queries DB only)."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
print(f"[*] Archive directories after migration: {dirs_after_count}")
|
||||
print(f"[*] Total files after migration: {files_after_count}")
|
||||
print(f"[*] Sample files found: {len(sample_paths_after)}")
|
||||
|
||||
# Add a snapshot to DB
|
||||
result = run_archivebox(self.work_dir, ['add', 'https://example.com'], timeout=60)
|
||||
# Verify files still in old structure after migration (not moved yet)
|
||||
self.assertEqual(dirs_before_count, dirs_after_count,
|
||||
f"Archive directories lost during migration: {dirs_before_count} -> {dirs_after_count}")
|
||||
self.assertEqual(files_before_count, files_after_count,
|
||||
f"Files lost during migration: {files_before_count} -> {files_after_count}")
|
||||
|
||||
# Empty the archive directory (but keep it existing)
|
||||
archive_dir = self.work_dir / 'archive'
|
||||
if archive_dir.exists():
|
||||
for item in archive_dir.iterdir():
|
||||
if item.is_dir():
|
||||
shutil.rmtree(item)
|
||||
else:
|
||||
item.unlink()
|
||||
|
||||
# Search should still work (queries DB only, doesn't scan filesystem)
|
||||
result = run_archivebox(self.work_dir, ['search'])
|
||||
self.assertEqual(result.returncode, 0,
|
||||
f"Search should work with empty archive: {result.stderr}")
|
||||
|
||||
# Should show snapshot from DB
|
||||
output = result.stdout + result.stderr
|
||||
self.assertIn('example.com', output,
|
||||
"Snapshot should appear in search output even with no files")
|
||||
|
||||
|
||||
class TestUpdateCommandArchitecture(unittest.TestCase):
|
||||
"""Test new update command architecture: filters=DB only, no filters=scan filesystem."""
|
||||
|
||||
def setUp(self):
|
||||
"""Create a temporary directory with 0.8.x schema and data."""
|
||||
self.work_dir = Path(tempfile.mkdtemp())
|
||||
self.db_path = self.work_dir / 'index.sqlite3'
|
||||
create_data_dir_structure(self.work_dir)
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up temporary directory."""
|
||||
shutil.rmtree(self.work_dir, ignore_errors=True)
|
||||
|
||||
def test_update_with_filters_uses_db_only(self):
|
||||
"""Update with filters should only query DB, not scan filesystem."""
|
||||
# Initialize with data
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
conn.executescript(SCHEMA_0_8)
|
||||
conn.close()
|
||||
seed_0_8_data(self.db_path)
|
||||
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Run update with filter - should not scan filesystem
|
||||
# Use a URL from the seeded data
|
||||
result = run_archivebox(self.work_dir, ['update', 'example.com'], timeout=120)
|
||||
# Should complete successfully (or with orchestrator error, which is okay)
|
||||
# The key is it should not scan filesystem
|
||||
|
||||
def test_update_without_filters_imports_orphans(self):
|
||||
"""Update without filters should scan filesystem and import orphaned directories."""
|
||||
# Initialize empty DB
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Create an orphaned directory in archive/
|
||||
timestamp = '1609459200'
|
||||
orphan_dir = self.work_dir / 'archive' / timestamp
|
||||
orphan_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
index_data = {
|
||||
'url': 'https://orphan.example.com',
|
||||
'timestamp': timestamp,
|
||||
'title': 'Orphaned Snapshot',
|
||||
}
|
||||
(orphan_dir / 'index.json').write_text(json.dumps(index_data))
|
||||
(orphan_dir / 'index.html').write_text('<html>Orphan</html>')
|
||||
|
||||
# Count snapshots before update
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT COUNT(*) FROM core_snapshot")
|
||||
count_before = cursor.fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
# Run full update (no filters) - should scan filesystem
|
||||
# Run update to trigger filesystem reorganization
|
||||
print(f"\n[*] Running archivebox update to reorganize filesystem...")
|
||||
result = run_archivebox(self.work_dir, ['update'], timeout=120)
|
||||
self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}")
|
||||
|
||||
# Check if orphan was imported
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
|
||||
('https://orphan.example.com',))
|
||||
orphan_count = cursor.fetchone()[0]
|
||||
conn.close()
|
||||
# Check new filesystem structure
|
||||
# New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext
|
||||
users_dir = self.work_dir / 'users'
|
||||
snapshots_base = None
|
||||
|
||||
# If update succeeded, orphan should be imported
|
||||
if result.returncode == 0:
|
||||
self.assertGreaterEqual(orphan_count, 1,
|
||||
"Orphaned snapshot should be imported by update")
|
||||
if users_dir.exists():
|
||||
# Find the snapshots directory
|
||||
for user_dir in users_dir.iterdir():
|
||||
if user_dir.is_dir():
|
||||
user_snapshots = user_dir / 'snapshots'
|
||||
if user_snapshots.exists():
|
||||
snapshots_base = user_snapshots
|
||||
break
|
||||
|
||||
print(f"[*] New structure base: {snapshots_base}")
|
||||
|
||||
class TestTimestampUniqueness(unittest.TestCase):
|
||||
"""Test timestamp uniqueness constraint."""
|
||||
# Count files in new structure
|
||||
# Structure: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/files...
|
||||
files_new_structure = []
|
||||
new_sample_files = {}
|
||||
|
||||
def setUp(self):
|
||||
"""Create a temporary directory."""
|
||||
self.work_dir = Path(tempfile.mkdtemp())
|
||||
self.db_path = self.work_dir / 'index.sqlite3'
|
||||
create_data_dir_structure(self.work_dir)
|
||||
if snapshots_base and snapshots_base.exists():
|
||||
for date_dir in snapshots_base.iterdir():
|
||||
if date_dir.is_dir():
|
||||
for domain_dir in date_dir.iterdir():
|
||||
if domain_dir.is_dir():
|
||||
for snap_dir in domain_dir.iterdir():
|
||||
if snap_dir.is_dir():
|
||||
# Files are directly in snap-uuid/ directory (no plugin subdirs)
|
||||
for f in snap_dir.rglob('*'):
|
||||
if f.is_file():
|
||||
files_new_structure.append(f)
|
||||
# Track sample files
|
||||
if f.name in sample_files:
|
||||
new_sample_files[f"{snap_dir.name}/{f.name}"] = f
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up temporary directory."""
|
||||
shutil.rmtree(self.work_dir, ignore_errors=True)
|
||||
files_new_count = len(files_new_structure)
|
||||
print(f"[*] Files in new structure: {files_new_count}")
|
||||
print(f"[*] Sample files in new structure: {len(new_sample_files)}")
|
||||
|
||||
def test_timestamp_uniqueness_constraint_exists(self):
|
||||
"""Database should have timestamp uniqueness constraint after migration."""
|
||||
# Initialize with 0.8.x and migrate
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
conn.executescript(SCHEMA_0_8)
|
||||
conn.close()
|
||||
# Check old structure (should be gone or empty)
|
||||
old_archive_dir = self.work_dir / 'archive'
|
||||
old_files_remaining = []
|
||||
unmigrated_dirs = []
|
||||
if old_archive_dir.exists():
|
||||
for d in old_archive_dir.glob('*'):
|
||||
# Only count REAL directories, not symlinks (symlinks are the migrated ones)
|
||||
if d.is_dir(follow_symlinks=False) and d.name.replace('.', '').isdigit():
|
||||
# This is a timestamp directory (old structure)
|
||||
files_in_dir = [f for f in d.rglob('*') if f.is_file()]
|
||||
if files_in_dir:
|
||||
unmigrated_dirs.append((d.name, len(files_in_dir)))
|
||||
old_files_remaining.extend(files_in_dir)
|
||||
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
old_files_count = len(old_files_remaining)
|
||||
print(f"[*] Files remaining in old structure: {old_files_count}")
|
||||
if unmigrated_dirs:
|
||||
print(f"[*] Unmigrated directories: {unmigrated_dirs}")
|
||||
|
||||
# Check if unique_timestamp constraint exists
|
||||
# CRITICAL: Verify files were moved to new structure
|
||||
self.assertGreater(files_new_count, 0,
|
||||
"No files found in new structure after update")
|
||||
|
||||
# CRITICAL: Verify old structure is cleaned up
|
||||
self.assertEqual(old_files_count, 0,
|
||||
f"Old structure not cleaned up: {old_files_count} files still in archive/timestamp/ directories")
|
||||
|
||||
# CRITICAL: Verify all files were moved (total count should match)
|
||||
total_after_update = files_new_count + old_files_count
|
||||
self.assertEqual(files_before_count, total_after_update,
|
||||
f"Files lost during reorganization: {files_before_count} before → {total_after_update} after")
|
||||
|
||||
# CRITICAL: Verify sample files exist in new structure
|
||||
self.assertGreater(len(new_sample_files), 0,
|
||||
f"Sample files not found in new structure")
|
||||
|
||||
# Verify new path format
|
||||
for path_key, file_path in new_sample_files.items():
|
||||
# Path should contain: snapshots/YYYYMMDD/domain/snap-uuid/plugin/file
|
||||
path_parts = file_path.parts
|
||||
self.assertIn('snapshots', path_parts,
|
||||
f"New path should contain 'snapshots': {file_path}")
|
||||
self.assertIn('users', path_parts,
|
||||
f"New path should contain 'users': {file_path}")
|
||||
print(f" ✓ {path_key} → {file_path.relative_to(self.work_dir)}")
|
||||
|
||||
# Verify Process and Binary records were created
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Query sqlite_master for constraints
|
||||
cursor.execute("""
|
||||
SELECT sql FROM sqlite_master
|
||||
WHERE type='table' AND name='core_snapshot'
|
||||
""")
|
||||
table_sql = cursor.fetchone()[0]
|
||||
cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
|
||||
archiveresult_count = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM machine_process")
|
||||
process_count = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM machine_binary")
|
||||
binary_count = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NOT NULL")
|
||||
linked_count = cursor.fetchone()[0]
|
||||
|
||||
conn.close()
|
||||
|
||||
# Should contain unique_timestamp constraint or UNIQUE(timestamp)
|
||||
has_constraint = 'unique_timestamp' in table_sql.lower() or \
|
||||
'unique' in table_sql.lower() and 'timestamp' in table_sql.lower()
|
||||
print(f"[*] ArchiveResults: {archiveresult_count}")
|
||||
print(f"[*] Process records created: {process_count}")
|
||||
print(f"[*] Binary records created: {binary_count}")
|
||||
print(f"[*] ArchiveResults linked to Process: {linked_count}")
|
||||
|
||||
# Verify data migration happened correctly
|
||||
# The 0.7.2 gold database has 44 ArchiveResults
|
||||
self.assertEqual(archiveresult_count, 44,
|
||||
f"Expected 44 ArchiveResults from 0.7.2 database, got {archiveresult_count}")
|
||||
|
||||
# Each ArchiveResult should create one Process record
|
||||
self.assertEqual(process_count, 44,
|
||||
f"Expected 44 Process records (1 per ArchiveResult), got {process_count}")
|
||||
|
||||
# The 44 ArchiveResults use 7 unique binaries (curl, wget, etc.)
|
||||
self.assertEqual(binary_count, 7,
|
||||
f"Expected 7 unique Binary records, got {binary_count}")
|
||||
|
||||
# ALL ArchiveResults should be linked to Process records
|
||||
self.assertEqual(linked_count, 44,
|
||||
f"Expected all 44 ArchiveResults linked to Process, got {linked_count}")
|
||||
|
||||
|
||||
|
||||
self.assertTrue(has_constraint,
|
||||
f"Timestamp uniqueness constraint should exist. Table SQL: {table_sql}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
445
archivebox/tests/test_recursive_crawl.py
Normal file
445
archivebox/tests/test_recursive_crawl.py
Normal file
@@ -0,0 +1,445 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Integration tests for recursive crawling functionality."""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
    """Background (.bg.) hooks must not prevent other extractors from running.

    Spawns ``archivebox add --depth=1``, waits long enough for the orchestrator
    to create a snapshot and dispatch extractors, then inspects the SQLite DB
    to confirm parser extractors progressed past 'queued' even while background
    hooks are active.
    """
    os.chdir(tmp_path)

    # The `process` fixture already ran `archivebox init`; bail out early if it failed.
    assert process.returncode == 0, f"archivebox init failed: {process.stderr}"

    # Build an env with everything disabled except chrome (required for the
    # background hooks to start); parser extractors stay enabled by default.
    test_env = os.environ.copy()
    test_env.update({
        "USE_WGET": "false",
        "USE_SINGLEFILE": "false",
        "USE_READABILITY": "false",
        "USE_MERCURY": "false",
        "SAVE_HTMLTOTEXT": "false",
        "SAVE_PDF": "false",
        "SAVE_SCREENSHOT": "false",
        "SAVE_DOM": "false",
        "SAVE_HEADERS": "false",
        "USE_GIT": "false",
        "SAVE_YTDLP": "false",
        "SAVE_ARCHIVEDOTORG": "false",
        "SAVE_TITLE": "false",
        "SAVE_FAVICON": "false",
        "USE_CHROME": "true",
    })

    # Kick off a depth=1 crawl in the background.
    child = subprocess.Popen(
        ['archivebox', 'add', '--depth=1', 'https://monadical.com'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=test_env,
    )

    # First crawl in a fresh data dir is slow (~10-20s: install hooks do full
    # binary lookups; later crawls hit the cached Machine config), so give the
    # orchestrator a generous head start before sampling the DB.
    time.sleep(25)

    child.kill()
    stdout, stderr = child.communicate()

    # Dump captured output to aid debugging when the assertions below fail.
    if stderr:
        print(f"\n=== STDERR ===\n{stderr}\n=== END STDERR ===\n")
    if stdout:
        print(f"\n=== STDOUT (last 2000 chars) ===\n{stdout[-2000:]}\n=== END STDOUT ===\n")

    conn = sqlite3.connect('index.sqlite3')
    cursor = conn.cursor()

    snapshots = cursor.execute("SELECT url, depth, status FROM core_snapshot").fetchall()

    # Background hooks: consolelog, ssl, responses, redirects, staticfile
    bg_hooks = cursor.execute(
        "SELECT plugin, status FROM core_archiveresult WHERE plugin IN ('consolelog', 'ssl', 'responses', 'redirects', 'staticfile') ORDER BY plugin"
    ).fetchall()

    parser_extractors = cursor.execute(
        "SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls' ORDER BY plugin"
    ).fetchall()

    all_extractors = cursor.execute(
        "SELECT plugin, status FROM core_archiveresult ORDER BY plugin"
    ).fetchall()

    conn.close()

    assert len(snapshots) > 0, (
        f"Should have created snapshot after Crawl hooks finished. "
        f"If this fails, Crawl hooks may be taking too long. "
        f"Snapshots: {snapshots}"
    )

    assert len(all_extractors) > 0, (
        f"Should have extractors created for snapshot. "
        f"If this fails, Snapshot.run() may not have started. "
        f"Got: {all_extractors}"
    )
    # Background hooks are optional (bg_hooks is queried but deliberately not
    # asserted on) -- the real requirement is that parser extractors are not
    # stuck behind them in 'queued' forever.
    parser_statuses = [state for _, state in parser_extractors]
    assert 'started' in parser_statuses or 'succeeded' in parser_statuses or 'failed' in parser_statuses, \
        f"Parser extractors should have run, got statuses: {parser_statuses}"
||||
def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
    """Parser extractors should emit Snapshot JSONL records on stdout.

    Runs a depth=0 add with every other extractor (and chrome) disabled, then
    checks the recorded output of the '60_parse_html_urls' ArchiveResult.
    """
    os.chdir(tmp_path)

    # Disable everything so only the parser extractors can run.
    test_env = os.environ.copy()
    test_env.update({
        "USE_WGET": "false",
        "USE_SINGLEFILE": "false",
        "USE_READABILITY": "false",
        "USE_MERCURY": "false",
        "SAVE_HTMLTOTEXT": "false",
        "SAVE_PDF": "false",
        "SAVE_SCREENSHOT": "false",
        "SAVE_DOM": "false",
        "SAVE_HEADERS": "false",
        "USE_GIT": "false",
        "SAVE_YTDLP": "false",
        "SAVE_ARCHIVEDOTORG": "false",
        "SAVE_TITLE": "false",
        "SAVE_FAVICON": "false",
        "USE_CHROME": "false",
    })

    # Add a single URL with no recursion (depth=0).
    child = subprocess.Popen(
        ['archivebox', 'add', '--depth=0', 'https://monadical.com'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=test_env,
    )

    # Give the extractors a moment to run, then stop the process.
    time.sleep(5)
    child.kill()
    child.wait()

    conn = sqlite3.connect('index.sqlite3')
    row = conn.cursor().execute(
        "SELECT id, status, output_str FROM core_archiveresult WHERE plugin = '60_parse_html_urls'"
    ).fetchone()
    conn.close()

    if row:
        status = row[1]
        output = row[2] or ""

        # The parser should at least have been dispatched.
        assert status in ['started', 'succeeded', 'failed'], \
            f"60_parse_html_urls should have run, got status: {status}"

        # On success with links found, stdout should be JSONL: one JSON object
        # per line, each shaped like {"type": "Snapshot", ...}.
        if status == 'succeeded' and output:
            assert 'Snapshot' in output or output == '', \
                "Parser output should contain Snapshot JSONL or be empty"
||||
def test_recursive_crawl_creates_child_snapshots(tmp_path, process):
    """Recursive crawling must create child snapshots with correct depth and parent link.

    Archives a local file:// page containing three links at --depth=1, then
    verifies the root snapshot, the crawl's max_depth, and that every child
    snapshot has depth=1 and points back at the root via parent_snapshot_id.
    """
    os.chdir(tmp_path)

    # A local HTML fixture with outbound links (file:// avoids network fetches).
    test_html = tmp_path / 'test.html'
    test_html.write_text('''
    <html>
    <body>
        <h1>Test Page</h1>
        <a href="https://monadical.com/about">About</a>
        <a href="https://monadical.com/blog">Blog</a>
        <a href="https://monadical.com/contact">Contact</a>
    </body>
    </html>
    ''')

    # Minimal env for fast testing; restrict crawling to one domain.
    test_env = os.environ.copy()
    test_env.update({
        "URL_ALLOWLIST": r"monadical\.com/.*",  # Only crawl same domain
    })

    # One hop of recursion is enough to exercise the parent/child plumbing.
    child_proc = subprocess.Popen(
        ['archivebox', 'add', '--depth=1', f'file://{test_html}'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=test_env,
    )

    # file:// is fast; 20s should be plenty for the orchestrator to finish.
    time.sleep(20)

    child_proc.kill()
    stdout, stderr = child_proc.communicate()

    # Dump captured output to aid debugging on failure.
    if stderr:
        print(f"\n=== STDERR ===\n{stderr}\n=== END STDERR ===\n")
    if stdout:
        print(f"\n=== STDOUT (last 2000 chars) ===\n{stdout[-2000:]}\n=== END STDOUT ===\n")

    conn = sqlite3.connect('index.sqlite3')
    cursor = conn.cursor()

    all_snapshots = cursor.execute("SELECT url, depth FROM core_snapshot").fetchall()

    root_snapshot = cursor.execute(
        "SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 0 ORDER BY created_at LIMIT 1"
    ).fetchone()

    child_snapshots = cursor.execute(
        "SELECT id, url, depth, parent_snapshot_id FROM core_snapshot WHERE depth = 1"
    ).fetchall()

    crawl = cursor.execute(
        "SELECT id, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
    ).fetchone()

    # Diagnostic queries used only in failure messages below.
    parser_status = cursor.execute(
        "SELECT plugin, status FROM core_archiveresult WHERE snapshot_id = ? AND plugin LIKE 'parse_%_urls'",
        (root_snapshot[0] if root_snapshot else '',)
    ).fetchall()

    started_extractors = cursor.execute(
        "SELECT plugin, status FROM core_archiveresult WHERE snapshot_id = ? AND status = 'started'",
        (root_snapshot[0] if root_snapshot else '',)
    ).fetchall()

    conn.close()

    assert root_snapshot is not None, f"Root snapshot should exist at depth=0. All snapshots: {all_snapshots}"
    root_id = root_snapshot[0]

    assert crawl is not None, "Crawl should be created"
    assert crawl[1] == 1, f"Crawl max_depth should be 1, got {crawl[1]}"

    assert len(child_snapshots) > 0, \
        f"Child snapshots should be created from monadical.com links. Parser status: {parser_status}. Started extractors blocking: {started_extractors}"

    # Every child must sit at depth=1 and point back at the root snapshot.
    for child_id, child_url, child_depth, parent_id in child_snapshots:
        assert child_depth == 1, f"Child snapshot should have depth=1, got {child_depth}"
        assert parent_id == root_id, \
            f"Child snapshot {child_url} should have parent_snapshot_id={root_id}, got {parent_id}"
||||
def test_recursive_crawl_respects_depth_limit(tmp_path, process, disable_extractors_dict):
    """A --depth=1 crawl must never create snapshots deeper than depth 1."""
    os.chdir(tmp_path)

    # Start a depth-limited crawl with all extractors disabled.
    child = subprocess.Popen(
        ['archivebox', 'add', '--depth=1', 'https://monadical.com'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=disable_extractors_dict,
    )

    # Let the orchestrator process for a while, then stop it.
    time.sleep(10)
    child.kill()
    child.wait()

    conn = sqlite3.connect('index.sqlite3')
    cursor = conn.cursor()

    max_depth_found = cursor.execute(
        "SELECT MAX(depth) FROM core_snapshot"
    ).fetchone()[0]

    # Depth histogram, used only in the failure message.
    depth_counts = cursor.execute(
        "SELECT depth, COUNT(*) FROM core_snapshot GROUP BY depth ORDER BY depth"
    ).fetchall()

    conn.close()

    assert max_depth_found is not None, "Should have at least one snapshot"
    assert max_depth_found <= 1, \
        f"Max depth should not exceed 1, got {max_depth_found}. Depth distribution: {depth_counts}"
||||
def test_crawl_snapshot_has_parent_snapshot_field(tmp_path, process, disable_extractors_dict):
    """The core_snapshot table must carry a parent_snapshot_id column."""
    os.chdir(tmp_path)

    # Inspect the migrated schema directly via PRAGMA.
    conn = sqlite3.connect('index.sqlite3')
    schema = conn.cursor().execute("PRAGMA table_info(core_snapshot)").fetchall()
    conn.close()

    column_names = [col[1] for col in schema]  # PRAGMA row[1] is the column name

    assert 'parent_snapshot_id' in column_names, \
        f"Snapshot table should have parent_snapshot_id column. Columns: {column_names}"
||||
def test_snapshot_depth_field_exists(tmp_path, process, disable_extractors_dict):
    """The core_snapshot table must carry a depth column."""
    os.chdir(tmp_path)

    # Inspect the migrated schema directly via PRAGMA.
    conn = sqlite3.connect('index.sqlite3')
    schema = conn.cursor().execute("PRAGMA table_info(core_snapshot)").fetchall()
    conn.close()

    column_names = [col[1] for col in schema]  # PRAGMA row[1] is the column name

    assert 'depth' in column_names, \
        f"Snapshot table should have depth column. Columns: {column_names}"
||||
def test_root_snapshot_has_depth_zero(tmp_path, process, disable_extractors_dict):
    """The snapshot created directly for an added URL must have depth=0."""
    os.chdir(tmp_path)

    # Run the add synchronously so the snapshot exists before we query.
    subprocess.run(
        ['archivebox', 'add', '--depth=1', 'https://monadical.com'],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
        timeout=90,
    )

    conn = sqlite3.connect('index.sqlite3')
    row = conn.cursor().execute(
        "SELECT id, depth FROM core_snapshot WHERE url = ? ORDER BY created_at LIMIT 1",
        ('https://monadical.com',)
    ).fetchone()
    conn.close()

    assert row is not None, "Root snapshot should be created"
    assert row[1] == 0, f"Root snapshot should have depth=0, got {row[1]}"
||||
def test_archiveresult_worker_queue_filters_by_foreground_extractors(tmp_path, process):
    """ArchiveResultWorker.get_queue() should only block on foreground extractors.

    Regression test for the orchestrator bug where long-running background
    hooks kept parser extractors stuck in 'queued'.
    """
    os.chdir(tmp_path)

    # Chrome on (so background hooks start), heavyweight extractors off.
    test_env = os.environ.copy()
    test_env.update({
        "USE_WGET": "false",
        "USE_SINGLEFILE": "false",
        "SAVE_PDF": "false",
        "SAVE_SCREENSHOT": "false",
        "USE_CHROME": "true",  # Enables background hooks
    })

    child = subprocess.Popen(
        ['archivebox', 'add', 'https://monadical.com'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        env=test_env,
    )

    # Give the background hooks time to reach 'started', then stop.
    time.sleep(10)
    child.kill()
    child.wait()

    conn = sqlite3.connect('index.sqlite3')
    cursor = conn.cursor()

    bg_started = cursor.execute(
        "SELECT plugin FROM core_archiveresult WHERE plugin IN ('consolelog', 'ssl', 'responses', 'redirects', 'staticfile') AND status = 'started'"
    ).fetchall()

    parser_status = cursor.execute(
        "SELECT plugin, status FROM core_archiveresult WHERE plugin LIKE 'parse_%_urls'"
    ).fetchall()

    conn.close()

    # With background hooks mid-flight, parser extractors must still make
    # progress (i.e. not be permanently stuck in 'queued').
    if len(bg_started) > 0:
        parser_statuses = [state for _, state in parser_status]
        non_queued = [s for s in parser_statuses if s != 'queued']
        assert len(non_queued) > 0 or len(parser_status) == 0, \
            f"With {len(bg_started)} background hooks started, parser extractors should still run. " \
            f"Got statuses: {parser_statuses}"
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
86
archivebox/tests/test_remove.py
Normal file
86
archivebox/tests/test_remove.py
Normal file
@@ -0,0 +1,86 @@
|
||||
import os
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
    """Test removing a snapshot by URL pattern.

    Adds one URL with --index-only, confirms a Snapshot row exists, removes
    everything via a catch-all regex filter, then confirms the table is empty.
    """
    os.chdir(tmp_path)
    # Add a URL - creates source file snapshot
    subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'], capture_output=True, env=disable_extractors_dict)

    # Verify snapshot exists.
    # FIX: the original query `SELECT COUNT() from archivebox.core.snapshot` is
    # invalid SQLite: COUNT() requires an argument (use COUNT(*)) and the
    # Django table name is core_snapshot, not a dotted module path.
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    count_before = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    conn.close()
    assert count_before >= 1

    # Remove all snapshots (including source file snapshots)
    remove_process = subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes'], capture_output=True)
    # Check that it ran successfully (either output indicates success or return code 0)
    output = remove_process.stdout.decode("utf-8") + remove_process.stderr.decode("utf-8")
    assert remove_process.returncode == 0 or "removed" in output.lower() or "Found" in output

    # All snapshots should now be gone.
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    conn.close()

    assert count == 0
|
||||
def test_remove_with_delete_flag(tmp_path, process, disable_extractors_dict):
    """Test removing snapshot with --delete also removes archive folder."""
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'], capture_output=True, env=disable_extractors_dict)

    # Collect the archive subfolders that exist before the delete.
    archive_dir = tmp_path / "archive"
    archives_before = list(archive_dir.iterdir()) if archive_dir.exists() else []

    if not archives_before:
        # With --index-only, archive folders may not be created immediately.
        # Just verify that remove command doesn't error.
        remove_result = subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete'], capture_output=True)
        assert remove_result.returncode in (0, 1)  # 0 = success, 1 = no matches
        return

    # Folders existed, so --delete must actually shrink the archive dir.
    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete'], capture_output=True)
    archives_after = list(archive_dir.iterdir()) if archive_dir.exists() else []
    assert len(archives_after) < len(archives_before)
|
||||
def test_remove_regex(tmp_path, process, disable_extractors_dict):
    """Test removing snapshots by regex pattern.

    Adds two URLs, confirms both Snapshot rows exist, removes everything with
    a catch-all regex, then confirms the table is empty.
    """
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'], capture_output=True, env=disable_extractors_dict)
    subprocess.run(['archivebox', 'add', '--index-only', 'https://iana.org'], capture_output=True, env=disable_extractors_dict)

    # FIX: the original query `SELECT COUNT() from archivebox.core.snapshot` is
    # invalid SQLite: COUNT() requires an argument (use COUNT(*)) and the
    # Django table name is core_snapshot, not a dotted module path.
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    count_before = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    conn.close()
    assert count_before >= 2

    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete'], capture_output=True)

    # Both snapshots should now be gone.
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    count_after = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    conn.close()
    assert count_after == 0
|
||||
def test_add_creates_crawls(tmp_path, process, disable_extractors_dict):
    """Test that adding URLs creates crawls in database.

    Each `archivebox add` invocation should create exactly one Crawl record.
    """
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'], capture_output=True, env=disable_extractors_dict)
    subprocess.run(['archivebox', 'add', '--index-only', 'https://iana.org'], capture_output=True, env=disable_extractors_dict)

    # FIX: the original query `SELECT COUNT() from archivebox.crawls.crawl` is
    # invalid SQLite: COUNT() requires an argument (use COUNT(*)) and the
    # Django table name is crawls_crawl, not a dotted module path.
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
    conn.close()

    assert crawl_count == 2
75
archivebox/tests/test_schedule.py
Normal file
75
archivebox/tests/test_schedule.py
Normal file
@@ -0,0 +1,75 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Integration tests for archivebox schedule command."""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_schedule_show_lists_jobs(tmp_path, process):
    """`archivebox schedule --show` should list (or report the absence of) jobs."""
    os.chdir(tmp_path)

    res = subprocess.run(
        ['archivebox', 'schedule', '--show'],
        capture_output=True,
        text=True,
    )

    # Accept any of: a "no jobs" message, job lines mentioning archivebox,
    # or simply a clean exit.
    out = res.stdout.lower()
    assert 'no' in out or 'archivebox' in out or res.returncode == 0
||||
|
||||
def test_schedule_clear_removes_jobs(tmp_path, process):
    """`archivebox schedule --clear` should succeed even with nothing to clear."""
    os.chdir(tmp_path)

    res = subprocess.run(
        ['archivebox', 'schedule', '--clear'],
        capture_output=True,
        text=True,
    )

    # Clearing an empty schedule is still a success.
    assert res.returncode == 0
||||
|
||||
def test_schedule_every_requires_valid_period(tmp_path, process):
    """`archivebox schedule --every=<bogus>` must be rejected."""
    os.chdir(tmp_path)

    res = subprocess.run(
        ['archivebox', 'schedule', '--every=invalid_period', 'https://example.com/feed.xml'],
        capture_output=True,
        text=True,
    )

    # Either a non-zero exit or an explicit 'invalid' message counts as rejection.
    assert res.returncode != 0 or 'invalid' in res.stdout.lower()
|
||||
class TestScheduleCLI:
    """Test the CLI interface for schedule command."""

    def test_cli_help(self, tmp_path, process):
        """`archivebox schedule --help` should document all major flags."""
        os.chdir(tmp_path)

        res = subprocess.run(
            ['archivebox', 'schedule', '--help'],
            capture_output=True,
            text=True,
        )

        assert res.returncode == 0
        # Every documented flag must appear in the usage text.
        for flag in ('--every', '--show', '--clear', '--depth'):
            assert flag in res.stdout
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
145
archivebox/tests/test_search.py
Normal file
145
archivebox/tests/test_search.py
Normal file
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Integration tests for archivebox search command."""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_search_returns_snapshots(tmp_path, process, disable_extractors_dict):
    """A bare `archivebox search` should run and produce output for an archive with snapshots."""
    os.chdir(tmp_path)

    # Seed the archive with one snapshot.
    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    res = subprocess.run(
        ['archivebox', 'search'],
        capture_output=True,
        text=True,
    )

    # Either some output (path/URL info) or a clean exit is acceptable.
    assert res.stdout.strip() != '' or res.returncode == 0
|
||||
def test_search_filter_by_substring(tmp_path, process, disable_extractors_dict):
    """`archivebox search --filter-type=substring` should run without crashing."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # The filter may legitimately match nothing depending on how URLs are stored.
    res = subprocess.run(
        ['archivebox', 'search', '--filter-type=substring', 'example'],
        capture_output=True,
        text=True,
    )

    assert res.returncode == 0 or 'No Snapshots' in res.stderr
|
||||
def test_search_sort_option(tmp_path, process, disable_extractors_dict):
    """`archivebox search --sort=url` should be accepted and exit cleanly."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    res = subprocess.run(
        ['archivebox', 'search', '--sort=url'],
        capture_output=True,
        text=True,
    )

    assert res.returncode == 0
|
||||
def test_search_with_headers_requires_format(tmp_path, process):
    """--with-headers without --json/--html/--csv must be rejected with a helpful error."""
    os.chdir(tmp_path)

    res = subprocess.run(
        ['archivebox', 'search', '--with-headers'],
        capture_output=True,
        text=True,
    )

    assert res.returncode != 0
    # Error text should point at the missing output-format flag.
    assert 'requires' in res.stderr.lower() or 'json' in res.stderr.lower()
|
||||
def test_search_status_option(tmp_path, process, disable_extractors_dict):
    """`archivebox search --status=indexed` should be accepted and exit cleanly."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    res = subprocess.run(
        ['archivebox', 'search', '--status=indexed'],
        capture_output=True,
        text=True,
    )

    assert res.returncode == 0
|
||||
def test_search_no_snapshots_message(tmp_path, process):
    """Searching a freshly-initialized (empty) archive should still exit 0."""
    os.chdir(tmp_path)

    res = subprocess.run(
        ['archivebox', 'search'],
        capture_output=True,
        text=True,
    )

    # Empty result sets are fine; only a crash would be a failure.
    assert res.returncode == 0
|
||||
class TestSearchCLI:
    """CLI-surface tests for the search subcommand."""

    def test_cli_help(self, tmp_path, process):
        """--help should exit 0 and document the main search options."""
        os.chdir(tmp_path)

        help_proc = subprocess.run(
            ['archivebox', 'search', '--help'],
            capture_output=True,
            text=True,
        )

        assert help_proc.returncode == 0
        usage = help_proc.stdout
        # The filter flag may be shown in long or short form.
        assert '--filter-type' in usage or '-f' in usage
        assert '--status' in usage
        assert '--sort' in usage
|
||||
|
||||
|
||||
# Allow running this test module directly (outside the pytest runner).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
194
archivebox/tests/test_snapshot.py
Normal file
194
archivebox/tests/test_snapshot.py
Normal file
@@ -0,0 +1,194 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Integration tests for archivebox snapshot command."""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_extractors_dict):
    """The snapshot command must persist the exact URL it was given."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'snapshot', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect('index.sqlite3')
    try:
        row = conn.execute(
            "SELECT url FROM core_snapshot WHERE url = ?",
            ('https://example.com',),
        ).fetchone()
    finally:
        conn.close()

    assert row is not None
    assert row[0] == 'https://example.com'
|
||||
|
||||
|
||||
def test_snapshot_multiple_urls_creates_multiple_records(tmp_path, process, disable_extractors_dict):
    """Each URL passed to snapshot should get its own database record."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'snapshot', 'https://example.com', 'https://iana.org'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect('index.sqlite3')
    try:
        rows = conn.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall()
    finally:
        conn.close()

    saved_urls = [row[0] for row in rows]
    assert 'https://example.com' in saved_urls
    assert 'https://iana.org' in saved_urls
    assert len(saved_urls) >= 2
|
||||
|
||||
|
||||
def test_snapshot_tag_creates_tag_and_links_to_snapshot(tmp_path, process, disable_extractors_dict):
    """--tag should create a Tag row and attach it to the new snapshot."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'snapshot', '--tag=mytesttag', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect('index.sqlite3')
    try:
        tag_row = conn.execute(
            "SELECT id, name FROM core_tag WHERE name = ?", ('mytesttag',)
        ).fetchone()
        assert tag_row is not None, "Tag 'mytesttag' should exist in core_tag"

        snap_row = conn.execute(
            "SELECT id FROM core_snapshot WHERE url = ?", ('https://example.com',)
        ).fetchone()
        assert snap_row is not None

        # The m2m join table is what actually associates tags with snapshots.
        link_row = conn.execute(
            "SELECT * FROM core_snapshot_tags WHERE snapshot_id = ? AND tag_id = ?",
            (snap_row[0], tag_row[0]),
        ).fetchone()
    finally:
        conn.close()

    assert link_row is not None, "Tag should be linked to snapshot via core_snapshot_tags"
|
||||
|
||||
|
||||
def test_snapshot_jsonl_output_has_correct_structure(tmp_path, process, disable_extractors_dict):
    """Snapshot JSONL output must include typed records with id and url."""
    os.chdir(tmp_path)

    # Pass the URL as an argument (more reliable than piping via stdin).
    proc = subprocess.run(
        ['archivebox', 'snapshot', 'https://example.com'],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    # Collect every parseable stdout line whose type marks it as a Snapshot.
    snapshots = []
    for raw_line in proc.stdout.strip().split('\n'):
        if not raw_line:
            continue
        try:
            parsed = json.loads(raw_line)
        except json.JSONDecodeError:
            continue
        if parsed.get('type') == 'Snapshot':
            snapshots.append(parsed)

    assert len(snapshots) >= 1, "Should output at least one Snapshot JSONL record"

    first = snapshots[0]
    assert first.get('type') == 'Snapshot'
    assert 'id' in first, "Snapshot record should have 'id' field"
    assert 'url' in first, "Snapshot record should have 'url' field"
    assert first['url'] == 'https://example.com'
|
||||
|
||||
|
||||
def test_snapshot_with_tag_stores_tag_name(tmp_path, process, disable_extractors_dict):
    """A custom --tag value should be saved verbatim in core_tag."""
    os.chdir(tmp_path)

    # Use command line args instead of stdin.
    subprocess.run(
        ['archivebox', 'snapshot', '--tag=customtag', 'https://example.com'],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect('index.sqlite3')
    try:
        tag_row = conn.execute(
            "SELECT name FROM core_tag WHERE name = ?", ('customtag',)
        ).fetchone()
    finally:
        conn.close()

    assert tag_row is not None
    assert tag_row[0] == 'customtag'
|
||||
|
||||
|
||||
def test_snapshot_with_depth_creates_crawl_object(tmp_path, process, disable_extractors_dict):
    """Using --depth=1 should record a Crawl row with max_depth == 1."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'snapshot', '--depth=1', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    conn = sqlite3.connect('index.sqlite3')
    try:
        crawl_row = conn.execute(
            "SELECT max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
        ).fetchone()
    finally:
        conn.close()

    assert crawl_row is not None, "Crawl object should be created when depth > 0"
    assert crawl_row[0] == 1, "Crawl max_depth should match --depth value"
|
||||
|
||||
|
||||
def test_snapshot_deduplicates_urls(tmp_path, process, disable_extractors_dict):
    """Snapshotting the same URL twice must not create a duplicate record."""
    os.chdir(tmp_path)

    # Run the identical snapshot command twice in a row.
    snapshot_cmd = ['archivebox', 'snapshot', 'https://example.com']
    for _ in range(2):
        subprocess.run(snapshot_cmd, capture_output=True, env=disable_extractors_dict)

    conn = sqlite3.connect('index.sqlite3')
    try:
        (count,) = conn.execute(
            "SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
            ('https://example.com',),
        ).fetchone()
    finally:
        conn.close()

    assert count == 1, "Same URL should not create duplicate snapshots"
|
||||
|
||||
|
||||
# Allow running this test module directly (outside the pytest runner).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
197
archivebox/tests/test_status.py
Normal file
197
archivebox/tests/test_status.py
Normal file
@@ -0,0 +1,197 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Integration tests for archivebox status command."""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_status_shows_index_info(tmp_path, process):
    """The status report should mention the index it scanned."""
    os.chdir(tmp_path)

    proc = subprocess.run(
        ['archivebox', 'status'],
        capture_output=True,
        text=True,
    )

    out = proc.stdout
    assert 'index' in out.lower() or 'Index' in out
|
||||
|
||||
|
||||
def test_status_shows_snapshot_count(tmp_path, process, disable_extractors_dict):
    """The status report should include a link/snapshot count."""
    os.chdir(tmp_path)

    # Seed the archive with two index-only snapshots.
    for url in ('https://example.com', 'https://iana.org'):
        subprocess.run(
            ['archivebox', 'add', '--index-only', url],
            capture_output=True,
            env=disable_extractors_dict,
        )

    proc = subprocess.run(
        ['archivebox', 'status'],
        capture_output=True,
        text=True,
    )

    assert '2' in proc.stdout or 'links' in proc.stdout.lower()
|
||||
|
||||
|
||||
def test_status_shows_archive_size(tmp_path, process, disable_extractors_dict):
    """The status report should include some size information."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    proc = subprocess.run(
        ['archivebox', 'status'],
        capture_output=True,
        text=True,
    )

    # Size may appear as "Size"/"size" or as a byte-unit suffix (B/KB/MB).
    out = proc.stdout
    assert 'Size' in out or 'size' in out or 'B' in out
|
||||
|
||||
|
||||
def test_status_shows_indexed_count(tmp_path, process, disable_extractors_dict):
    """The status report should include an indexed-folder count."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    proc = subprocess.run(
        ['archivebox', 'status'],
        capture_output=True,
        text=True,
    )

    assert 'indexed' in proc.stdout.lower()
|
||||
|
||||
|
||||
def test_status_shows_archived_vs_unarchived(tmp_path, process, disable_extractors_dict):
    """The status report should break snapshots into archived/unarchived."""
    os.chdir(tmp_path)

    # An index-only add produces an unarchived snapshot.
    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    proc = subprocess.run(
        ['archivebox', 'status'],
        capture_output=True,
        text=True,
    )

    report = proc.stdout.lower()
    assert 'archived' in report or 'unarchived' in report
|
||||
|
||||
|
||||
def test_status_shows_data_directory_info(tmp_path, process):
    """The status report should mention the data/archive directory."""
    os.chdir(tmp_path)

    proc = subprocess.run(
        ['archivebox', 'status'],
        capture_output=True,
        text=True,
    )

    assert 'archive' in proc.stdout.lower() or str(tmp_path) in proc.stdout
|
||||
|
||||
|
||||
def test_status_shows_user_info(tmp_path, process):
    """The status report should include a user/login section."""
    os.chdir(tmp_path)

    proc = subprocess.run(
        ['archivebox', 'status'],
        capture_output=True,
        text=True,
    )

    report = proc.stdout.lower()
    assert 'user' in report or 'login' in report
|
||||
|
||||
|
||||
def test_status_empty_archive(tmp_path, process):
    """Running status on a fresh archive must work and show zero links."""
    os.chdir(tmp_path)

    proc = subprocess.run(
        ['archivebox', 'status'],
        capture_output=True,
        text=True,
    )

    out = proc.stdout
    # A fresh archive should not crash the command...
    assert proc.returncode == 0 or 'index' in out.lower()
    # ...and should report an empty link count.
    assert '0' in out or 'links' in out.lower()
|
||||
|
||||
|
||||
def test_status_shows_valid_vs_invalid(tmp_path, process, disable_extractors_dict):
    """The status report should categorize folders as valid/present."""
    os.chdir(tmp_path)

    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    proc = subprocess.run(
        ['archivebox', 'status'],
        capture_output=True,
        text=True,
    )

    report = proc.stdout.lower()
    assert 'valid' in report or 'present' in report
|
||||
|
||||
|
||||
class TestStatusCLI:
    """CLI-surface tests for the status subcommand."""

    def test_cli_help(self, tmp_path, process):
        """--help should exit 0 and describe the command."""
        os.chdir(tmp_path)

        help_proc = subprocess.run(
            ['archivebox', 'status', '--help'],
            capture_output=True,
            text=True,
        )

        assert help_proc.returncode == 0
        help_text = help_proc.stdout.lower()
        assert 'status' in help_text or 'statistic' in help_text
|
||||
|
||||
|
||||
# Allow running this test module directly (outside the pytest runner).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
36
archivebox/tests/test_title.py
Normal file
36
archivebox/tests/test_title.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import os
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
    """Test that the page title is extracted and stored on the snapshot.

    Only the title extractor is enabled; everything else stays off so the
    add command finishes quickly.
    """
    disable_extractors_dict.update({"SAVE_TITLE": "true"})
    subprocess.run(['archivebox', 'add', 'https://example.com'],
                   capture_output=True, env=disable_extractors_dict)

    os.chdir(tmp_path)
    conn = sqlite3.connect("index.sqlite3")
    conn.row_factory = sqlite3.Row
    try:
        c = conn.cursor()
        # BUG FIX: the Django table is named core_snapshot — the previous
        # dotted module path ("archivebox.core.snapshot") is not a valid
        # SQLite table name and raised OperationalError.
        c.execute("SELECT title FROM core_snapshot")
        snapshot = c.fetchone()
    finally:
        conn.close()

    assert snapshot[0] is not None
    # example.com serves a page titled "Example Domain".
    assert "Example" in snapshot[0]
|
||||
|
||||
def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict):
    """
    Regression guard for https://github.com/ArchiveBox/ArchiveBox/issues/330 —
    unencoded titles must not be rendered raw, since that enables XSS
    injection and breaks the page layout.
    """
    disable_extractors_dict.update({"SAVE_TITLE": "true"})
    subprocess.run(['archivebox', 'add', 'https://example.com'],
                   capture_output=True, env=disable_extractors_dict)

    html_listing = subprocess.run(["archivebox", "list", "--html"], capture_output=True)

    # The archived URL should appear in the rendered HTML listing.
    rendered = html_listing.stdout.decode("utf-8")
    assert "https://example.com" in rendered
|
||||
33
archivebox/tests/test_update.py
Normal file
33
archivebox/tests/test_update.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
def test_update_imports_orphaned_snapshots(tmp_path, process, disable_extractors_dict):
    """Test that archivebox update imports orphaned snapshot directories.

    Flow: add a snapshot, delete its DB row (leaving the on-disk archive
    folder orphaned), then run `update` and verify it re-imports the folder.
    """
    # Add a snapshot so an archive/<timestamp>/ directory exists on disk.
    subprocess.run(['archivebox', 'add', 'https://example.com'], capture_output=True, env=disable_extractors_dict)
    assert list((tmp_path / "archive").iterdir()) != []

    # Remove from DB but leave the directory intact.
    subprocess.run(['archivebox', 'remove', 'https://example.com', '--yes'], capture_output=True)

    # Verify the snapshot row is gone.  (try/finally so the connection is
    # closed even when the assertion fails; no commit needed for SELECTs.)
    conn = sqlite3.connect(str(tmp_path / "index.sqlite3"))
    try:
        link = conn.execute("SELECT * FROM core_snapshot").fetchone()
    finally:
        conn.close()
    assert link is None

    # Run update without filters — it should scan the filesystem and
    # import the orphaned directory back into the index.
    subprocess.run(['archivebox', 'update'], capture_output=True, env=disable_extractors_dict)

    # Verify the snapshot was re-imported from the orphaned directory.
    conn = sqlite3.connect(str(tmp_path / "index.sqlite3"))
    try:
        url = conn.execute("SELECT url FROM core_snapshot").fetchone()[0]
    finally:
        conn.close()

    assert url == 'https://example.com'
|
||||
5
archivebox/tests/test_util.py
Normal file
5
archivebox/tests/test_util.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from archivebox.misc.util import download_url
|
||||
|
||||
def test_download_url_downloads_content():
    """download_url should fetch the page body over HTTP."""
    page_text = download_url("https://example.com")
    assert "Example Domain" in page_text
|
||||
160
archivebox/tests/test_version.py
Normal file
160
archivebox/tests/test_version.py
Normal file
@@ -0,0 +1,160 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Integration tests for archivebox version command."""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
class TestVersionQuiet:
    """Tests for the minimal (quiet) version output."""

    def test_version_prints_version_number(self, tmp_path):
        """`archivebox version --quiet` prints only a version string."""
        os.chdir(tmp_path)

        proc = subprocess.run(
            ['archivebox', 'version', '--quiet'],
            capture_output=True,
            text=True,
        )

        assert proc.returncode == 0
        printed = proc.stdout.strip()
        assert printed
        # Expect a dotted semver-ish value, e.g. "0.8.0": at least major.minor.
        assert len(printed.split('.')) >= 2

    def test_version_flag_prints_version_number(self, tmp_path):
        """`archivebox --version` prints the same dotted version string."""
        os.chdir(tmp_path)

        proc = subprocess.run(
            ['archivebox', '--version'],
            capture_output=True,
            text=True,
        )

        assert proc.returncode == 0
        printed = proc.stdout.strip()
        assert printed
        assert len(printed.split('.')) >= 2
|
||||
|
||||
|
||||
class TestVersionFull:
    """Tests for the full `archivebox version` report."""

    def _version_output(self):
        # Helper: run the full version command and return its stdout.
        return subprocess.run(
            ['archivebox', 'version'],
            capture_output=True,
            text=True,
        ).stdout

    def test_version_shows_system_info(self, tmp_path, process):
        """The report should always name the project."""
        os.chdir(tmp_path)

        # Exit code may be nonzero when optional binaries are missing,
        # so only the output text is checked.
        assert 'ArchiveBox' in self._version_output()

    def test_version_shows_binary_section(self, tmp_path, process):
        """The report should contain a binary dependencies section."""
        os.chdir(tmp_path)

        report = self._version_output()
        assert 'Binary' in report or 'Dependenc' in report

    def test_version_shows_data_locations(self, tmp_path, process):
        """The report should list data/code locations."""
        os.chdir(tmp_path)

        report = self._version_output()
        assert 'Data' in report or 'location' in report.lower() or 'DIR' in report or 'Code' in report
|
||||
|
||||
|
||||
class TestVersionWithBinaries:
    """Version output after the install step has been attempted."""

    def test_version_shows_binary_status(self, tmp_path, process, disable_extractors_dict):
        """version should report binaries as installed or missing."""
        os.chdir(tmp_path)

        # A dry-run install is enough to populate binary metadata quickly.
        subprocess.run(
            ['archivebox', 'install', '--dry-run'],
            capture_output=True,
            text=True,
            env=disable_extractors_dict,
        )

        version_proc = subprocess.run(
            ['archivebox', 'version'],
            capture_output=True,
            text=True,
            env=disable_extractors_dict,
        )

        report = version_proc.stdout
        assert 'installed' in report.lower() or 'Binary' in report
|
||||
|
||||
|
||||
class TestVersionCLI:
    """CLI-surface tests for the version subcommand."""

    def test_cli_help(self, tmp_path):
        """--help should exit 0 and mention the quiet flag."""
        os.chdir(tmp_path)

        help_proc = subprocess.run(
            ['archivebox', 'version', '--help'],
            capture_output=True,
            text=True,
        )

        assert help_proc.returncode == 0
        assert '--quiet' in help_proc.stdout or '-q' in help_proc.stdout

    def test_cli_invalid_option(self, tmp_path):
        """Unknown flags should produce a nonzero exit code."""
        os.chdir(tmp_path)

        bad_proc = subprocess.run(
            ['archivebox', 'version', '--invalid-option'],
            capture_output=True,
            text=True,
        )

        assert bad_proc.returncode != 0
|
||||
|
||||
|
||||
# Allow running this test module directly (outside the pytest runner).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
Reference in New Issue
Block a user