wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

143
tests/test_config.py Normal file
View File

@@ -0,0 +1,143 @@
#!/usr/bin/env python3
"""Integration tests for archivebox config command."""
import os
import subprocess
import pytest
from .fixtures import process, disable_extractors_dict
def test_config_shows_all_config_values(tmp_path, process):
    """Running `archivebox config` with no arguments prints the config dump."""
    os.chdir(tmp_path)
    cmd = ['archivebox', 'config']
    completed = subprocess.run(cmd, capture_output=True, text=True)
    stdout = completed.stdout

    # A case-insensitive match also covers the upper-case 'TIMEOUT' spelling.
    assert 'timeout' in stdout.lower()
    # The dump is expected to be more than a line or two of text.
    assert len(stdout) > 100
def test_config_get_specific_key(tmp_path, process):
    """Test that --get retrieves a specific config value."""
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'config', '--get', 'TIMEOUT'],
        capture_output=True,
        text=True,
    )
    # Require BOTH a clean exit and the key in the output. The previous
    # `or` let the test pass on a silent success, or on a failure whose
    # error text merely mentioned the key.
    assert result.returncode == 0, result.stderr
    assert 'TIMEOUT' in result.stdout
def test_config_set_value_writes_to_config_file(tmp_path, process):
    """Test that --set writes config value to ArchiveBox.conf file."""
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'config', '--set', 'TIMEOUT=120'],
        capture_output=True,
        text=True,
    )

    # Read the config file directly to verify it was written. A missing file
    # is a failure: the old `if config_file.exists():` guard silently skipped
    # every assertion in that case.
    config_file = tmp_path / 'ArchiveBox.conf'
    assert config_file.exists(), "ArchiveBox.conf should exist after --set"

    config_content = config_file.read_text()
    # Both the key (any letter case) and the value that was set must appear.
    assert 'timeout' in config_content.lower()
    assert '120' in config_content
def test_config_set_and_get_roundtrip(tmp_path, process):
    """Test that a value set with --set can be retrieved with --get."""
    os.chdir(tmp_path)

    # Set a value; the command itself must succeed.
    set_result = subprocess.run(
        ['archivebox', 'config', '--set', 'TIMEOUT=999'],
        capture_output=True,
        text=True,
    )
    assert set_result.returncode == 0, set_result.stderr

    # The value must have been persisted to the config file.
    config_file = tmp_path / 'ArchiveBox.conf'
    assert config_file.exists(), "ArchiveBox.conf should exist after --set"
    assert '999' in config_file.read_text()

    # Complete the round trip: read the value back through the CLI.
    # (The original test never invoked --get at all.)
    get_result = subprocess.run(
        ['archivebox', 'config', '--get', 'TIMEOUT'],
        capture_output=True,
        text=True,
    )
    assert get_result.returncode == 0, get_result.stderr
    assert '999' in get_result.stdout
def test_config_search_finds_matching_keys(tmp_path, process):
    """Test that --search finds config keys matching a pattern."""
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'config', '--search', 'TIMEOUT'],
        capture_output=True,
        text=True,
    )
    # Both conditions are required: with `or`, a successful run that found
    # nothing (or a failed run that echoed the pattern) would still pass.
    assert result.returncode == 0, result.stderr
    assert 'TIMEOUT' in result.stdout
def test_config_invalid_key_fails(tmp_path, process):
    """Setting a key that is not a known config option is reported as a failure."""
    os.chdir(tmp_path)
    bogus_assignment = 'INVALID_KEY_THAT_DOES_NOT_EXIST=value'
    proc = subprocess.run(
        ['archivebox', 'config', '--set', bogus_assignment],
        capture_output=True,
        text=True,
    )

    # Either signal counts as a failure: a non-zero exit code, or the word
    # "failed" somewhere in stdout.
    exited_with_error = proc.returncode != 0
    reported_failure = 'failed' in proc.stdout.lower()
    assert exited_with_error or reported_failure
def test_config_set_requires_equals_sign(tmp_path, process):
    """`--set` must be given KEY=VALUE; a bare key is rejected."""
    os.chdir(tmp_path)
    # 'TIMEOUT' has no '=' sign, so there is no value to assign.
    cmd = ['archivebox', 'config', '--set', 'TIMEOUT']
    proc = subprocess.run(cmd, capture_output=True, text=True)
    assert proc.returncode != 0
class TestConfigCLI:
    """CLI-level checks for the `archivebox config` command."""

    def test_cli_help(self, tmp_path, process):
        """`config --help` exits cleanly and mentions --get and --set."""
        os.chdir(tmp_path)
        proc = subprocess.run(
            ['archivebox', 'config', '--help'],
            capture_output=True,
            text=True,
        )
        assert proc.returncode == 0

        help_text = proc.stdout
        for flag in ('--get', '--set'):
            assert flag in help_text
if __name__ == '__main__':
    # Propagate pytest's exit status so a failing run is visible to the
    # caller (shell, CI) instead of the script always exiting 0.
    raise SystemExit(pytest.main([__file__, '-v']))

185
tests/test_crawl.py Normal file
View File

@@ -0,0 +1,185 @@
#!/usr/bin/env python3
"""Integration tests for archivebox crawl command."""
import os
import subprocess
import sqlite3
import json
import pytest
from .fixtures import process, disable_extractors_dict
def test_crawl_creates_crawl_object(tmp_path, process, disable_extractors_dict):
    """A `crawl` invocation should leave a row in the crawls_crawl table."""
    os.chdir(tmp_path)
    cmd = ['archivebox', 'crawl', '--no-wait', 'https://example.com']
    subprocess.run(cmd, capture_output=True, text=True, env=disable_extractors_dict)

    # Look up the most recently created crawl row directly in the index DB.
    db = sqlite3.connect('index.sqlite3')
    latest_crawl = db.execute(
        "SELECT id, max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
    ).fetchone()
    db.close()

    assert latest_crawl is not None, "Crawl object should be created"
def test_crawl_depth_sets_max_depth_in_crawl(tmp_path, process, disable_extractors_dict):
    """The value given via --depth is stored as max_depth on the newest crawl row."""
    os.chdir(tmp_path)
    cmd = ['archivebox', 'crawl', '--depth=2', '--no-wait', 'https://example.com']
    subprocess.run(cmd, capture_output=True, text=True, env=disable_extractors_dict)

    db = sqlite3.connect('index.sqlite3')
    row = db.execute(
        "SELECT max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
    ).fetchone()
    db.close()

    assert row is not None
    max_depth = row[0]
    assert max_depth == 2, "Crawl max_depth should match --depth=2"
def test_crawl_creates_snapshot_for_url(tmp_path, process, disable_extractors_dict):
    """Crawling a URL should produce a core_snapshot row for that URL."""
    os.chdir(tmp_path)
    target_url = 'https://example.com'
    subprocess.run(
        ['archivebox', 'crawl', '--no-wait', target_url],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    db = sqlite3.connect('index.sqlite3')
    row = db.execute(
        "SELECT url FROM core_snapshot WHERE url = ?", (target_url,)
    ).fetchone()
    db.close()

    assert row is not None, "Snapshot should be created for input URL"
def test_crawl_links_snapshot_to_crawl(tmp_path, process, disable_extractors_dict):
    """Test that Snapshot is linked to Crawl via crawl_id."""
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'crawl', '--no-wait', 'https://example.com'],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    # Run both queries first and close the connection in `finally`, so a
    # failing assertion can no longer leak the open SQLite handle.
    conn = sqlite3.connect('index.sqlite3')
    try:
        c = conn.cursor()
        # Get the most recently created crawl's ID
        crawl = c.execute(
            "SELECT id FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
        ).fetchone()
        # Get the crawl_id recorded on the snapshot for the input URL
        snapshot = c.execute(
            "SELECT crawl_id FROM core_snapshot WHERE url = ?",
            ('https://example.com',),
        ).fetchone()
    finally:
        conn.close()

    assert crawl is not None
    assert snapshot is not None
    assert snapshot[0] == crawl[0], "Snapshot should be linked to Crawl"
def test_crawl_multiple_urls_creates_multiple_snapshots(tmp_path, process, disable_extractors_dict):
    """Each URL passed to `crawl` should end up as its own snapshot row."""
    os.chdir(tmp_path)
    cmd = ['archivebox', 'crawl', '--no-wait', 'https://example.com', 'https://iana.org']
    subprocess.run(cmd, capture_output=True, text=True, env=disable_extractors_dict)

    db = sqlite3.connect('index.sqlite3')
    rows = db.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall()
    db.close()

    # Flatten the single-column rows into a plain list of URL strings.
    stored_urls = [row[0] for row in rows]
    assert 'https://example.com' in stored_urls
    assert 'https://iana.org' in stored_urls
def test_crawl_from_file_creates_snapshot(tmp_path, process, disable_extractors_dict):
    """Passing a file of URLs to `crawl` should create at least one snapshot."""
    os.chdir(tmp_path)

    # Prepare a one-line URL list inside the data directory.
    url_list = tmp_path / 'urls.txt'
    url_list.write_text('https://example.com\n')

    cmd = ['archivebox', 'crawl', '--no-wait', str(url_list)]
    subprocess.run(cmd, capture_output=True, text=True, env=disable_extractors_dict)

    db = sqlite3.connect('index.sqlite3')
    first_snapshot = db.execute("SELECT url FROM core_snapshot").fetchone()
    db.close()

    # Any snapshot row is accepted here (the source file or the URL).
    assert first_snapshot is not None, "Should create at least one snapshot"
def test_crawl_creates_seed_for_input(tmp_path, process, disable_extractors_dict):
    """A crawl should leave at least one row in the crawls_seed table."""
    os.chdir(tmp_path)
    cmd = ['archivebox', 'crawl', '--no-wait', 'https://example.com']
    subprocess.run(cmd, capture_output=True, text=True, env=disable_extractors_dict)

    db = sqlite3.connect('index.sqlite3')
    seed_row = db.execute("SELECT id FROM crawls_seed").fetchone()
    db.close()

    assert seed_row is not None, "Seed should be created for crawl input"
class TestCrawlCLI:
    """CLI-level checks for the `archivebox crawl` command."""

    def test_cli_help(self, tmp_path, process):
        """`crawl --help` exits cleanly and mentions the depth option."""
        os.chdir(tmp_path)
        proc = subprocess.run(
            ['archivebox', 'crawl', '--help'],
            capture_output=True,
            text=True,
        )
        assert proc.returncode == 0
        # Accept either the long or the short spelling of the option.
        assert any(flag in proc.stdout for flag in ('--depth', '-d'))
if __name__ == '__main__':
    # Propagate pytest's exit status so a failing run is visible to the
    # caller (shell, CI) instead of the script always exiting 0.
    raise SystemExit(pytest.main([__file__, '-v']))

277
tests/test_extract.py Normal file
View File

@@ -0,0 +1,277 @@
#!/usr/bin/env python3
"""Integration tests for archivebox extract command."""
import os
import subprocess
import sqlite3
import json
import pytest
from .fixtures import process, disable_extractors_dict
def test_extract_runs_on_snapshot_id(tmp_path, process, disable_extractors_dict):
    """Test that extract command accepts a snapshot ID."""
    os.chdir(tmp_path)

    # First create a snapshot
    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Get the snapshot ID
    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()
    row = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()
    conn.close()
    # Fail with a clear message (not a TypeError from `None[0]`) when the
    # setup step did not create a snapshot.
    assert row is not None, "Setup failed: 'archivebox add' created no snapshot"
    snapshot_id = row[0]

    # Run extract on the snapshot
    result = subprocess.run(
        ['archivebox', 'extract', '--no-wait', str(snapshot_id)],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    # Should not error about invalid snapshot ID
    assert 'not found' not in result.stderr.lower()
def test_extract_with_enabled_extractor_creates_archiveresult(tmp_path, process, disable_extractors_dict):
    """Test that extract creates ArchiveResult when extractor is enabled."""
    os.chdir(tmp_path)

    # First create a snapshot
    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Get the snapshot ID
    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()
    row = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()
    conn.close()
    # Fail with a clear message (not a TypeError from `None[0]`) when the
    # setup step did not create a snapshot.
    assert row is not None, "Setup failed: 'archivebox add' created no snapshot"
    snapshot_id = row[0]

    # Run extract with title extractor enabled (copy so the shared fixture
    # dict is not mutated)
    env = disable_extractors_dict.copy()
    env['SAVE_TITLE'] = 'true'
    subprocess.run(
        ['archivebox', 'extract', '--no-wait', str(snapshot_id)],
        capture_output=True,
        text=True,
        env=env,
    )

    # Check for archiveresults (may be queued, not completed with --no-wait)
    conn = sqlite3.connect('index.sqlite3')
    try:
        count = conn.execute(
            "SELECT COUNT(*) FROM core_archiveresult WHERE snapshot_id = ?",
            (snapshot_id,),
        ).fetchone()[0]
    finally:
        conn.close()

    # NOTE(review): COUNT(*) can never be negative, so this assertion only
    # proves the core_archiveresult table exists and is queryable. With
    # --no-wait the result rows may not be written yet; tighten to
    # `count >= 1` once the test waits for the extractor to finish.
    assert count >= 0
def test_extract_plugin_option_accepted(tmp_path, process, disable_extractors_dict):
    """Test that --plugin option is accepted."""
    os.chdir(tmp_path)

    # First create a snapshot
    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Get the snapshot ID
    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()
    row = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()
    conn.close()
    # Fail with a clear message (not a TypeError from `None[0]`) when the
    # setup step did not create a snapshot.
    assert row is not None, "Setup failed: 'archivebox add' created no snapshot"
    snapshot_id = row[0]

    result = subprocess.run(
        ['archivebox', 'extract', '--plugin=title', '--no-wait', str(snapshot_id)],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    # The argument parser must not reject the option.
    assert 'unrecognized arguments: --plugin' not in result.stderr
def test_extract_stdin_snapshot_id(tmp_path, process, disable_extractors_dict):
    """Test that extract reads snapshot IDs from stdin."""
    os.chdir(tmp_path)

    # First create a snapshot
    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Get the snapshot ID
    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()
    row = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()
    conn.close()
    # Fail with a clear message (not a TypeError from `None[0]`) when the
    # setup step did not create a snapshot.
    assert row is not None, "Setup failed: 'archivebox add' created no snapshot"
    snapshot_id = row[0]

    # Feed the ID on stdin, one ID per line
    result = subprocess.run(
        ['archivebox', 'extract', '--no-wait'],
        input=f'{snapshot_id}\n',
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    # Should not show "not found" error (tolerated if the command still
    # exited successfully)
    assert 'not found' not in result.stderr.lower() or result.returncode == 0
def test_extract_stdin_jsonl_input(tmp_path, process, disable_extractors_dict):
    """Test that extract reads JSONL records from stdin."""
    os.chdir(tmp_path)

    # First create a snapshot
    subprocess.run(
        ['archivebox', 'add', '--index-only', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Get the snapshot ID
    conn = sqlite3.connect('index.sqlite3')
    c = conn.cursor()
    row = c.execute("SELECT id FROM core_snapshot LIMIT 1").fetchone()
    conn.close()
    # Fail with a clear message (not a TypeError from `None[0]`) when the
    # setup step did not create a snapshot.
    assert row is not None, "Setup failed: 'archivebox add' created no snapshot"
    snapshot_id = row[0]

    # One JSON object per line, identifying the snapshot by type and id
    jsonl_input = json.dumps({"type": "Snapshot", "id": str(snapshot_id)}) + '\n'
    result = subprocess.run(
        ['archivebox', 'extract', '--no-wait'],
        input=jsonl_input,
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    # Should not show "not found" error (tolerated if the command still
    # exited successfully)
    assert 'not found' not in result.stderr.lower() or result.returncode == 0
def test_extract_pipeline_from_snapshot(tmp_path, process, disable_extractors_dict):
    """Test piping snapshot output to extract."""
    os.chdir(tmp_path)

    # Create snapshot and pipe to extract.
    # stderr goes to DEVNULL: it was never read, and an undrained PIPE can
    # block the child once the OS pipe buffer fills.
    snapshot_proc = subprocess.Popen(
        ['archivebox', 'snapshot', 'https://example.com'],
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        env=disable_extractors_dict,
    )
    try:
        subprocess.run(
            ['archivebox', 'extract', '--no-wait'],
            stdin=snapshot_proc.stdout,
            capture_output=True,
            text=True,
            env=disable_extractors_dict,
        )
    finally:
        # Close our copy of the pipe BEFORE waiting: if `extract` exited
        # without consuming everything, the producer then gets a broken pipe
        # instead of blocking forever on a full pipe nobody reads.
        snapshot_proc.stdout.close()
        snapshot_proc.wait()

    # Check database for snapshot
    conn = sqlite3.connect('index.sqlite3')
    try:
        snapshot = conn.execute(
            "SELECT id, url FROM core_snapshot WHERE url = ?",
            ('https://example.com',),
        ).fetchone()
    finally:
        conn.close()

    assert snapshot is not None, "Snapshot should be created by pipeline"
def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict):
    """Feeding several snapshot IDs to `extract` leaves all snapshots in place."""
    os.chdir(tmp_path)

    # Add the URLs one command at a time to avoid deduplication issues.
    for url in ('https://example.com', 'https://iana.org'):
        subprocess.run(
            ['archivebox', 'add', '--index-only', url],
            capture_output=True,
            env=disable_extractors_dict,
        )

    # Collect every snapshot ID currently in the index.
    db = sqlite3.connect('index.sqlite3')
    id_rows = db.execute("SELECT id FROM core_snapshot").fetchall()
    db.close()
    assert len(id_rows) >= 2, "Should have at least 2 snapshots"

    # Send all IDs on stdin, one per line. The command's own output and exit
    # code are not inspected; only the database state afterwards is checked.
    stdin_payload = '\n'.join(str(row[0]) for row in id_rows) + '\n'
    subprocess.run(
        ['archivebox', 'extract', '--no-wait'],
        input=stdin_payload,
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    db = sqlite3.connect('index.sqlite3')
    remaining = db.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
    db.close()
    assert remaining >= 2, "Both snapshots should still exist after extraction"
class TestExtractCLI:
    """CLI-level checks for the `archivebox extract` command."""

    def test_cli_help(self, tmp_path, process):
        """`extract --help` exits cleanly and mentions the plugin and wait options."""
        os.chdir(tmp_path)
        proc = subprocess.run(
            ['archivebox', 'extract', '--help'],
            capture_output=True,
            text=True,
        )
        help_text = proc.stdout

        assert proc.returncode == 0
        # For each option, either of the listed spellings is acceptable.
        assert any(flag in help_text for flag in ('--plugin', '-p'))
        assert any(flag in help_text for flag in ('--wait', '--no-wait'))

    def test_cli_no_snapshots_shows_warning(self, tmp_path, process):
        """With empty stdin, `extract` either exits 0 or prints a 'No ...' message."""
        os.chdir(tmp_path)
        proc = subprocess.run(
            ['archivebox', 'extract', '--no-wait'],
            input='',
            capture_output=True,
            text=True,
        )
        # A clean exit on empty input is fine; otherwise stderr must contain 'No'.
        exited_cleanly = proc.returncode == 0
        warned = 'No' in proc.stderr
        assert exited_cleanly or warned
if __name__ == '__main__':
    # Propagate pytest's exit status so a failing run is visible to the
    # caller (shell, CI) instead of the script always exiting 0.
    raise SystemExit(pytest.main([__file__, '-v']))

129
tests/test_install.py Normal file
View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""Integration tests for archivebox install command."""
import os
import subprocess
import sqlite3
import pytest
from .fixtures import process, disable_extractors_dict
class TestInstallDryRun:
    """Checks for `archivebox install --dry-run`."""

    @staticmethod
    def _crawl_count():
        """Return the number of rows in crawls_crawl in ./index.sqlite3."""
        db = sqlite3.connect('index.sqlite3')
        cur = db.cursor()
        cur.execute("SELECT COUNT(*) FROM crawls_crawl")
        total = cur.fetchone()[0]
        db.close()
        return total

    def test_dry_run_prints_message(self, tmp_path, process):
        """A dry run exits 0 and prints 'Dry run' on stdout."""
        os.chdir(tmp_path)
        proc = subprocess.run(
            ['archivebox', 'install', '--dry-run'],
            capture_output=True,
            text=True,
        )
        assert proc.returncode == 0
        assert 'Dry run' in proc.stdout

    def test_dry_run_does_not_create_crawl(self, tmp_path, process):
        """A dry run must leave the number of crawl rows unchanged."""
        os.chdir(tmp_path)

        crawls_before = self._crawl_count()

        proc = subprocess.run(
            ['archivebox', 'install', '--dry-run'],
            capture_output=True,
            text=True,
        )
        assert proc.returncode == 0

        crawls_after = self._crawl_count()
        assert crawls_after == crawls_before
class TestInstallOutput:
    """Checks on the messages printed by `archivebox install`."""

    def test_install_prints_detecting_message(self, tmp_path, process, disable_extractors_dict):
        """A dry-run install exits 0 and mentions detection, dependencies or the dry run."""
        os.chdir(tmp_path)
        proc = subprocess.run(
            ['archivebox', 'install', '--dry-run'],
            capture_output=True,
            text=True,
            env=disable_extractors_dict,
        )
        assert proc.returncode == 0

        # Any one of these lower-cased fragments in stdout is sufficient.
        lowered = proc.stdout.lower()
        assert any(fragment in lowered for fragment in ('detect', 'dependenc', 'dry run'))
class TestInstallCLI:
    """CLI-level checks for the `archivebox install` command."""

    def test_cli_help(self, tmp_path):
        """`install --help` exits cleanly and mentions the dry-run option."""
        os.chdir(tmp_path)
        proc = subprocess.run(
            ['archivebox', 'install', '--help'],
            capture_output=True,
            text=True,
        )
        assert proc.returncode == 0
        # Accept either the long or the short spelling of the option.
        assert any(flag in proc.stdout for flag in ('--dry-run', '-d'))

    def test_cli_invalid_option(self, tmp_path):
        """An unknown option makes the command exit with a non-zero status."""
        os.chdir(tmp_path)
        proc = subprocess.run(
            ['archivebox', 'install', '--invalid-option'],
            capture_output=True,
            text=True,
        )
        assert proc.returncode != 0
class TestInstallInitialization:
    """Checks for running `install` in a directory that was not initialised."""

    def test_install_from_empty_dir(self, tmp_path):
        """Without the `process` fixture, a dry-run install reports init or dry-run."""
        os.chdir(tmp_path)
        # The `process` fixture is deliberately not requested, so no
        # `archivebox init` has been run in tmp_path by this test.
        proc = subprocess.run(
            ['archivebox', 'install', '--dry-run'],
            capture_output=True,
            text=True,
        )
        stdout = proc.stdout
        # 'init' matched case-insensitively also covers the word 'Initializing'.
        assert 'Dry run' in stdout or 'init' in stdout.lower()
if __name__ == '__main__':
    # Propagate pytest's exit status so a failing run is visible to the
    # caller (shell, CI) instead of the script always exiting 0.
    raise SystemExit(pytest.main([__file__, '-v']))

75
tests/test_schedule.py Normal file
View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python3
"""Integration tests for archivebox schedule command."""
import os
import subprocess
import pytest
from .fixtures import process, disable_extractors_dict
def test_schedule_show_lists_jobs(tmp_path, process):
    """`schedule --show` lists jobs, reports none, or at least exits cleanly."""
    os.chdir(tmp_path)
    proc = subprocess.run(
        ['archivebox', 'schedule', '--show'],
        capture_output=True,
        text=True,
    )
    lowered = proc.stdout.lower()

    # Accept output mentioning "no" (no jobs) or "archivebox" (a job line),
    # or simply a zero exit status.
    mentions_jobs = any(word in lowered for word in ('no', 'archivebox'))
    assert mentions_jobs or proc.returncode == 0
def test_schedule_clear_removes_jobs(tmp_path, process):
    """`schedule --clear` exits with status 0."""
    os.chdir(tmp_path)
    cmd = ['archivebox', 'schedule', '--clear']
    proc = subprocess.run(cmd, capture_output=True, text=True)
    # Success is expected even when there are no jobs to clear.
    assert proc.returncode == 0
def test_schedule_every_requires_valid_period(tmp_path, process):
    """An unrecognised --every period is rejected or reported as invalid."""
    os.chdir(tmp_path)
    cmd = [
        'archivebox', 'schedule',
        '--every=invalid_period',
        'https://example.com/feed.xml',
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True)

    # Either a non-zero exit status or the word "invalid" on stdout is accepted.
    exited_with_error = proc.returncode != 0
    reported_invalid = 'invalid' in proc.stdout.lower()
    assert exited_with_error or reported_invalid
class TestScheduleCLI:
    """CLI-level checks for the `archivebox schedule` command."""

    def test_cli_help(self, tmp_path, process):
        """`schedule --help` exits cleanly and lists its main options."""
        os.chdir(tmp_path)
        proc = subprocess.run(
            ['archivebox', 'schedule', '--help'],
            capture_output=True,
            text=True,
        )
        assert proc.returncode == 0

        help_text = proc.stdout
        for flag in ('--every', '--show', '--clear', '--depth'):
            assert flag in help_text
if __name__ == '__main__':
    # Propagate pytest's exit status so a failing run is visible to the
    # caller (shell, CI) instead of the script always exiting 0.
    raise SystemExit(pytest.main([__file__, '-v']))

145
tests/test_search.py Normal file
View File

@@ -0,0 +1,145 @@
#!/usr/bin/env python3
"""Integration tests for archivebox search command."""
import os
import subprocess
import sqlite3
import json
import pytest
from .fixtures import process, disable_extractors_dict
def test_search_returns_snapshots(tmp_path, process, disable_extractors_dict):
    """After adding a URL, `search` prints something or exits cleanly."""
    os.chdir(tmp_path)

    # Seed the index with one snapshot.
    add_cmd = ['archivebox', 'add', '--index-only', 'https://example.com']
    subprocess.run(add_cmd, capture_output=True, env=disable_extractors_dict)

    proc = subprocess.run(['archivebox', 'search'], capture_output=True, text=True)

    # Non-blank stdout (path or URL info) or a zero exit status is accepted.
    has_output = proc.stdout.strip() != ''
    assert has_output or proc.returncode == 0
def test_search_filter_by_substring(tmp_path, process, disable_extractors_dict):
    """`search --filter-type=substring` runs cleanly or reports no snapshots."""
    os.chdir(tmp_path)
    add_cmd = ['archivebox', 'add', '--index-only', 'https://example.com']
    subprocess.run(add_cmd, capture_output=True, env=disable_extractors_dict)

    # The filter may not match if the URL isn't stored as expected, so a
    # 'No Snapshots' message on stderr is tolerated as well.
    search_cmd = ['archivebox', 'search', '--filter-type=substring', 'example']
    proc = subprocess.run(search_cmd, capture_output=True, text=True)

    ran_cleanly = proc.returncode == 0
    reported_empty = 'No Snapshots' in proc.stderr
    assert ran_cleanly or reported_empty
def test_search_sort_option(tmp_path, process, disable_extractors_dict):
    """`search --sort=url` exits with status 0 on an archive with one snapshot."""
    os.chdir(tmp_path)
    add_cmd = ['archivebox', 'add', '--index-only', 'https://example.com']
    subprocess.run(add_cmd, capture_output=True, env=disable_extractors_dict)

    search_cmd = ['archivebox', 'search', '--sort=url']
    proc = subprocess.run(search_cmd, capture_output=True, text=True)
    assert proc.returncode == 0
def test_search_with_headers_requires_format(tmp_path, process):
    """`--with-headers` without --json, --html, or --csv fails with an explanation."""
    os.chdir(tmp_path)
    cmd = ['archivebox', 'search', '--with-headers']
    proc = subprocess.run(cmd, capture_output=True, text=True)

    # The command must fail ...
    assert proc.returncode != 0
    # ... and stderr must say what is required, or at least mention json.
    err = proc.stderr.lower()
    assert any(hint in err for hint in ('requires', 'json'))
def test_search_status_option(tmp_path, process, disable_extractors_dict):
    """`search --status=indexed` exits with status 0 on an archive with one snapshot."""
    os.chdir(tmp_path)
    add_cmd = ['archivebox', 'add', '--index-only', 'https://example.com']
    subprocess.run(add_cmd, capture_output=True, env=disable_extractors_dict)

    search_cmd = ['archivebox', 'search', '--status=indexed']
    proc = subprocess.run(search_cmd, capture_output=True, text=True)
    assert proc.returncode == 0
def test_search_no_snapshots_message(tmp_path, process):
    """`search` with no snapshots added by the test still exits with status 0."""
    os.chdir(tmp_path)
    proc = subprocess.run(['archivebox', 'search'], capture_output=True, text=True)
    # Empty results are acceptable; only the exit status is checked.
    assert proc.returncode == 0
class TestSearchCLI:
    """CLI-level checks for the `archivebox search` command."""

    def test_cli_help(self, tmp_path, process):
        """`search --help` exits cleanly and lists the filter, status and sort options."""
        os.chdir(tmp_path)
        proc = subprocess.run(
            ['archivebox', 'search', '--help'],
            capture_output=True,
            text=True,
        )
        help_text = proc.stdout

        assert proc.returncode == 0
        # The filter option may appear in its long or short form.
        assert any(flag in help_text for flag in ('--filter-type', '-f'))
        for flag in ('--status', '--sort'):
            assert flag in help_text
if __name__ == '__main__':
    # Propagate pytest's exit status so a failing run is visible to the
    # caller (shell, CI) instead of the script always exiting 0.
    raise SystemExit(pytest.main([__file__, '-v']))

194
tests/test_snapshot.py Normal file
View File

@@ -0,0 +1,194 @@
#!/usr/bin/env python3
"""Integration tests for archivebox snapshot command."""
import os
import subprocess
import sqlite3
import json
import pytest
from .fixtures import process, disable_extractors_dict
def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_extractors_dict):
    """The `snapshot` command stores the URL verbatim in core_snapshot."""
    os.chdir(tmp_path)
    target_url = 'https://example.com'
    subprocess.run(
        ['archivebox', 'snapshot', target_url],
        capture_output=True,
        env=disable_extractors_dict,
    )

    db = sqlite3.connect('index.sqlite3')
    row = db.execute(
        "SELECT url FROM core_snapshot WHERE url = ?", (target_url,)
    ).fetchone()
    db.close()

    assert row is not None
    assert row[0] == 'https://example.com'
def test_snapshot_multiple_urls_creates_multiple_records(tmp_path, process, disable_extractors_dict):
    """Every URL passed to `snapshot` gets its own core_snapshot row."""
    os.chdir(tmp_path)
    cmd = ['archivebox', 'snapshot', 'https://example.com', 'https://iana.org']
    subprocess.run(cmd, capture_output=True, env=disable_extractors_dict)

    db = sqlite3.connect('index.sqlite3')
    rows = db.execute("SELECT url FROM core_snapshot ORDER BY url").fetchall()
    db.close()

    # Flatten the single-column rows into a plain list of URL strings.
    stored_urls = [row[0] for row in rows]
    assert 'https://example.com' in stored_urls
    assert 'https://iana.org' in stored_urls
    assert len(stored_urls) >= 2
def test_snapshot_tag_creates_tag_and_links_to_snapshot(tmp_path, process, disable_extractors_dict):
    """Test that --tag creates tag record and links it to the snapshot."""
    os.chdir(tmp_path)
    subprocess.run(
        ['archivebox', 'snapshot', '--tag=mytesttag',
         'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # The intermediate asserts run while the connection is open, so it is
    # closed in `finally` to avoid leaking the handle when one of them fails.
    conn = sqlite3.connect('index.sqlite3')
    try:
        c = conn.cursor()

        # Verify tag was created
        tag = c.execute(
            "SELECT id, name FROM core_tag WHERE name = ?", ('mytesttag',)
        ).fetchone()
        assert tag is not None, "Tag 'mytesttag' should exist in core_tag"
        tag_id = tag[0]

        # Verify snapshot exists
        snapshot = c.execute(
            "SELECT id FROM core_snapshot WHERE url = ?",
            ('https://example.com',),
        ).fetchone()
        assert snapshot is not None
        snapshot_id = snapshot[0]

        # Verify tag is linked to snapshot via join table
        link = c.execute(
            "SELECT * FROM core_snapshot_tags WHERE snapshot_id = ? AND tag_id = ?",
            (snapshot_id, tag_id),
        ).fetchone()
    finally:
        conn.close()

    assert link is not None, "Tag should be linked to snapshot via core_snapshot_tags"
def test_snapshot_jsonl_output_has_correct_structure(tmp_path, process, disable_extractors_dict):
    """Test that JSONL output contains required fields with correct types."""
    os.chdir(tmp_path)

    # Pass URL as argument instead of stdin for more reliable behavior
    result = subprocess.run(
        ['archivebox', 'snapshot', 'https://example.com'],
        capture_output=True,
        text=True,
        env=disable_extractors_dict,
    )

    # Parse JSONL output lines, skipping anything that is not a JSON object.
    snapshot_records = []
    for line in result.stdout.strip().split('\n'):
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        # A line such as `[]`, `1` or `null` is valid JSON but not an object;
        # calling .get() on it would raise AttributeError and abort the test.
        if isinstance(record, dict) and record.get('type') == 'Snapshot':
            snapshot_records.append(record)

    assert len(snapshot_records) >= 1, "Should output at least one Snapshot JSONL record"

    record = snapshot_records[0]
    assert record.get('type') == 'Snapshot'
    assert 'id' in record, "Snapshot record should have 'id' field"
    assert 'url' in record, "Snapshot record should have 'url' field"
    assert record['url'] == 'https://example.com'
def test_snapshot_with_tag_stores_tag_name(tmp_path, process, disable_extractors_dict):
    """A name given via --tag is stored under that exact name in core_tag."""
    os.chdir(tmp_path)

    # The URL and tag are passed as command-line arguments, not via stdin.
    cmd = ['archivebox', 'snapshot', '--tag=customtag', 'https://example.com']
    subprocess.run(cmd, capture_output=True, text=True, env=disable_extractors_dict)

    db = sqlite3.connect('index.sqlite3')
    tag_row = db.execute(
        "SELECT name FROM core_tag WHERE name = ?", ('customtag',)
    ).fetchone()
    db.close()

    assert tag_row is not None
    assert tag_row[0] == 'customtag'
def test_snapshot_with_depth_creates_crawl_object(tmp_path, process, disable_extractors_dict):
    """`snapshot --depth=1` creates a crawl row whose max_depth is 1."""
    os.chdir(tmp_path)
    cmd = ['archivebox', 'snapshot', '--depth=1', 'https://example.com']
    subprocess.run(cmd, capture_output=True, env=disable_extractors_dict)

    # Inspect the most recently created crawl row.
    db = sqlite3.connect('index.sqlite3')
    row = db.execute(
        "SELECT max_depth FROM crawls_crawl ORDER BY created_at DESC LIMIT 1"
    ).fetchone()
    db.close()

    assert row is not None, "Crawl object should be created when depth > 0"
    max_depth = row[0]
    assert max_depth == 1, "Crawl max_depth should match --depth value"
def test_snapshot_deduplicates_urls(tmp_path, process, disable_extractors_dict):
    """Snapshotting the same URL twice must leave exactly one row for it."""
    os.chdir(tmp_path)
    target_url = 'https://example.com'

    # Issue the identical command twice in a row.
    for _ in range(2):
        subprocess.run(
            ['archivebox', 'snapshot', target_url],
            capture_output=True,
            env=disable_extractors_dict,
        )

    db = sqlite3.connect('index.sqlite3')
    matches = db.execute(
        "SELECT COUNT(*) FROM core_snapshot WHERE url = ?", (target_url,)
    ).fetchone()[0]
    db.close()

    assert matches == 1, "Same URL should not create duplicate snapshots"
if __name__ == '__main__':
    # Propagate pytest's exit status so a failing run is visible to the
    # caller (shell, CI) instead of the script always exiting 0.
    raise SystemExit(pytest.main([__file__, '-v']))

197
tests/test_status.py Normal file
View File

@@ -0,0 +1,197 @@
#!/usr/bin/env python3
"""Integration tests for archivebox status command."""
import os
import subprocess
import sqlite3
import pytest
from .fixtures import process, disable_extractors_dict
def test_status_shows_index_info(tmp_path, process):
    """`status` output mentions the index."""
    os.chdir(tmp_path)
    proc = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
    # A case-insensitive match also covers the capitalised 'Index' spelling.
    assert 'index' in proc.stdout.lower()
def test_status_shows_snapshot_count(tmp_path, process, disable_extractors_dict):
    """After adding two URLs, `status` shows a '2' or mentions links."""
    os.chdir(tmp_path)

    # Add two snapshots, one command per URL.
    for url in ('https://example.com', 'https://iana.org'):
        subprocess.run(
            ['archivebox', 'add', '--index-only', url],
            capture_output=True,
            env=disable_extractors_dict,
        )

    proc = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
    stdout = proc.stdout

    # Either the digit '2' or the word "links" (any case) is accepted.
    assert '2' in stdout or 'links' in stdout.lower()
def test_status_shows_archive_size(tmp_path, process, disable_extractors_dict):
    """`status` output contains a size label or a byte-unit letter."""
    os.chdir(tmp_path)
    add_cmd = ['archivebox', 'add', '--index-only', 'https://example.com']
    subprocess.run(add_cmd, capture_output=True, env=disable_extractors_dict)

    proc = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)

    # Size info is recognised by 'Size'/'size' or a capital 'B' (bytes, KB, MB, ...).
    assert any(marker in proc.stdout for marker in ('Size', 'size', 'B'))
def test_status_shows_indexed_count(tmp_path, process, disable_extractors_dict):
    """`status` output mentions "indexed" after a snapshot has been added."""
    os.chdir(tmp_path)
    add_cmd = ['archivebox', 'add', '--index-only', 'https://example.com']
    subprocess.run(add_cmd, capture_output=True, env=disable_extractors_dict)

    proc = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
    lowered = proc.stdout.lower()
    assert 'indexed' in lowered
def test_status_shows_archived_vs_unarchived(tmp_path, process, disable_extractors_dict):
    """`status` output mentions the archived/unarchived categories."""
    os.chdir(tmp_path)

    # Add a snapshot with --index-only, i.e. without running extractors.
    add_cmd = ['archivebox', 'add', '--index-only', 'https://example.com']
    subprocess.run(add_cmd, capture_output=True, env=disable_extractors_dict)

    proc = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)

    # 'archived' is a substring of 'unarchived', so this single check
    # matches either category name.
    assert 'archived' in proc.stdout.lower()
def test_status_shows_data_directory_info(tmp_path, process):
    """Verify the status report refers to the archive or to the data directory path."""
    os.chdir(tmp_path)

    status = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
    stdout = status.stdout

    # Accept either the word "archive" (any case) or the literal tmp_path.
    mentions_archive = 'archive' in stdout.lower()
    mentions_data_dir = str(tmp_path) in stdout
    assert mentions_archive or mentions_data_dir
def test_status_shows_user_info(tmp_path, process):
    """Verify the status report contains a user / login related section."""
    os.chdir(tmp_path)

    status = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)

    # Either keyword (compared case-insensitively) counts as user info.
    report = status.stdout.lower()
    assert any(keyword in report for keyword in ('user', 'login'))
def test_status_empty_archive(tmp_path, process):
    """Verify `archivebox status` still produces a report when nothing was added."""
    os.chdir(tmp_path)

    status = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)
    stdout = status.stdout
    lowered = stdout.lower()

    # The command should succeed, or at least mention the index.
    assert status.returncode == 0 or 'index' in lowered

    # A zero count, or at minimum the "links" label, should be present.
    assert '0' in stdout or 'links' in lowered
def test_status_shows_valid_vs_invalid(tmp_path, process, disable_extractors_dict):
    """Verify the status report lists the valid / present folder categories."""
    os.chdir(tmp_path)

    # Seed the index with a single snapshot.
    add_cmd = ['archivebox', 'add', '--index-only', 'https://example.com']
    subprocess.run(add_cmd, capture_output=True, env=disable_extractors_dict)

    status = subprocess.run(['archivebox', 'status'], capture_output=True, text=True)

    # Either label (compared case-insensitively) is accepted.
    report = status.stdout.lower()
    assert any(label in report for label in ('valid', 'present'))
class TestStatusCLI:
    """Command-line interface checks for `archivebox status`."""

    def test_cli_help(self, tmp_path, process):
        """Verify `archivebox status --help` exits 0 and describes the command."""
        os.chdir(tmp_path)

        help_cmd = ['archivebox', 'status', '--help']
        proc = subprocess.run(help_cmd, capture_output=True, text=True)

        assert proc.returncode == 0

        # The help text should mention the command or what it reports.
        help_text = proc.stdout.lower()
        assert any(word in help_text for word in ('status', 'statistic'))
if __name__ == '__main__':
    # Allow running this test module directly, in verbose mode.
    pytest.main(
        [__file__, '-v'],
    )

160
tests/test_version.py Normal file
View File

@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""Integration tests for archivebox version command."""
import os
import subprocess
import json
import pytest
from .fixtures import process, disable_extractors_dict
class TestVersionQuiet:
    """Checks for the minimal (version-number-only) output modes."""

    def test_version_prints_version_number(self, tmp_path):
        """Verify `archivebox version --quiet` exits 0 and prints a dotted version."""
        os.chdir(tmp_path)

        quiet_cmd = ['archivebox', 'version', '--quiet']
        proc = subprocess.run(quiet_cmd, capture_output=True, text=True)
        assert proc.returncode == 0

        # Expect something like "0.8.0": non-empty after stripping whitespace.
        reported = proc.stdout.strip()
        assert reported

        # Semver-ish sanity check: at least major.minor.
        components = reported.split('.')
        assert len(components) >= 2

    def test_version_flag_prints_version_number(self, tmp_path):
        """Verify the top-level `--version` flag exits 0 and prints a dotted version."""
        os.chdir(tmp_path)

        flag_cmd = ['archivebox', '--version']
        proc = subprocess.run(flag_cmd, capture_output=True, text=True)
        assert proc.returncode == 0

        reported = proc.stdout.strip()
        assert reported

        # Same major.minor sanity check as for `version --quiet`.
        components = reported.split('.')
        assert len(components) >= 2
class TestVersionFull:
    """Checks for the full (non-quiet) `archivebox version` report.

    The exit code is not asserted in these tests because it may be 1 when
    binaries are missing.
    """

    def test_version_shows_system_info(self, tmp_path, process):
        """Verify the full report contains the "ArchiveBox" name."""
        os.chdir(tmp_path)

        proc = subprocess.run(['archivebox', 'version'], capture_output=True, text=True)

        report = proc.stdout
        assert 'ArchiveBox' in report

    def test_version_shows_binary_section(self, tmp_path, process):
        """Verify the full report contains a binary / dependency section."""
        os.chdir(tmp_path)

        proc = subprocess.run(['archivebox', 'version'], capture_output=True, text=True)

        # 'Dependenc' is a prefix intended to cover Dependency / Dependencies.
        report = proc.stdout
        assert any(heading in report for heading in ('Binary', 'Dependenc'))

    def test_version_shows_data_locations(self, tmp_path, process):
        """Verify the full report mentions data or code locations."""
        os.chdir(tmp_path)

        proc = subprocess.run(['archivebox', 'version'], capture_output=True, text=True)
        report = proc.stdout

        # Only "location" is compared case-insensitively; the rest are exact.
        has_exact_label = any(label in report for label in ('Data', 'DIR', 'Code'))
        has_location_word = 'location' in report.lower()
        assert has_exact_label or has_location_word
class TestVersionWithInstalledBinaries:
    """Checks for the version report after `archivebox install` has been invoked."""

    def test_version_shows_binary_status(self, tmp_path, process, disable_extractors_dict):
        """Verify the report states whether binaries are installed or not."""
        os.chdir(tmp_path)

        # Run install first; --dry-run is used to speed this up.
        install_cmd = ['archivebox', 'install', '--dry-run']
        subprocess.run(
            install_cmd,
            capture_output=True,
            text=True,
            env=disable_extractors_dict,
        )

        # Then ask for the version report under the same environment.
        proc = subprocess.run(
            ['archivebox', 'version'],
            capture_output=True,
            text=True,
            env=disable_extractors_dict,
        )

        # Either an installed/not-installed marker or the Binary heading is accepted.
        report = proc.stdout
        status_reported = 'installed' in report.lower()
        heading_present = 'Binary' in report
        assert status_reported or heading_present
class TestVersionCLI:
    """Command-line interface checks for `archivebox version`."""

    def test_cli_help(self, tmp_path):
        """Verify `archivebox version --help` exits 0 and lists the quiet option."""
        os.chdir(tmp_path)

        help_cmd = ['archivebox', 'version', '--help']
        proc = subprocess.run(help_cmd, capture_output=True, text=True)

        assert proc.returncode == 0

        # Either the long or the short spelling of the flag is accepted.
        help_text = proc.stdout
        assert any(flag in help_text for flag in ('--quiet', '-q'))

    def test_cli_invalid_option(self, tmp_path):
        """Verify an unknown option makes the command exit with a non-zero code."""
        os.chdir(tmp_path)

        bad_cmd = ['archivebox', 'version', '--invalid-option']
        proc = subprocess.run(bad_cmd, capture_output=True, text=True)

        assert proc.returncode != 0
if __name__ == '__main__':
    # Allow running this test module directly, in verbose mode.
    pytest.main(
        [__file__, '-v'],
    )