This commit is contained in:
Nick Sweeting
2026-03-15 18:45:29 -07:00
parent f97725d16f
commit 934e02695b
111 changed files with 919 additions and 461 deletions

View File

@@ -1,7 +1,6 @@
"""archivebox/tests/conftest.py - Pytest fixtures for CLI tests."""
import os
import shutil
import sys
import subprocess
import textwrap
@@ -13,6 +12,8 @@ import pytest
from archivebox.uuid_compat import uuid7
pytest_plugins = ["archivebox.tests.fixtures"]
# =============================================================================
# CLI Helpers (defined before fixtures that use them)

View File

@@ -1,9 +1,6 @@
import subprocess
import json
import sqlite3
import os
from .fixtures import *
import sqlite3
import subprocess
def test_depth_flag_is_accepted(process, disable_extractors_dict):
arg_process = subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
@@ -31,7 +28,7 @@ def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict):
def test_depth_flag_0_creates_source_file(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
arg_process = subprocess.run(
subprocess.run(
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
capture_output=True,
env=disable_extractors_dict,

View File

@@ -9,7 +9,7 @@ Tests cover:
"""
import pytest
from django.test import TestCase, Client, override_settings
from django.test import override_settings
from django.urls import reverse
from django.contrib.auth import get_user_model

View File

@@ -9,7 +9,7 @@ import os
import sys
import tempfile
import unittest
from pathlib import Path
from importlib.util import find_spec
class TestLDAPConfig(unittest.TestCase):
@@ -100,13 +100,7 @@ class TestLDAPIntegration(unittest.TestCase):
def test_django_settings_with_ldap_library_check(self):
"""Test that Django settings check for LDAP libraries when enabled."""
# Try to import django-auth-ldap to see if it's available
try:
import django_auth_ldap
import ldap
ldap_available = True
except ImportError:
ldap_available = False
ldap_available = find_spec("django_auth_ldap") is not None and find_spec("ldap") is not None
# If LDAP libraries are not available, settings should handle gracefully
if not ldap_available:

View File

@@ -5,11 +5,8 @@ Verify add creates snapshots in DB, crawls, source files, and archive directorie
"""
import os
import subprocess
import sqlite3
from pathlib import Path
from .fixtures import *
import subprocess
def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extractors_dict):
@@ -169,6 +166,30 @@ def test_add_with_tags(tmp_path, process, disable_extractors_dict):
assert 'test' in tags_str or 'example' in tags_str
def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extractors_dict):
    """Test add persists the selected persona so browser config derives from it later.

    Runs `archivebox add --persona=Default`, then checks that the crawl row
    records both the persona FK and the derived DEFAULT_PERSONA config value,
    and that the persona's on-disk scaffolding was created.
    """
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', '--persona=Default', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert result.returncode == 0
    # Inspect the single crawl row: persona_id FK must be set, and the
    # crawl's JSON config must carry DEFAULT_PERSONA = 'Default'.
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    persona_id, default_persona = c.execute(
        "SELECT persona_id, json_extract(config, '$.DEFAULT_PERSONA') FROM crawls_crawl LIMIT 1"
    ).fetchone()
    conn.close()
    assert persona_id
    assert default_persona == 'Default'
    # The persona directory tree (chrome profile + extensions) should exist on disk.
    assert (tmp_path / "personas" / "Default" / "chrome_user_data").is_dir()
    assert (tmp_path / "personas" / "Default" / "chrome_extensions").is_dir()
def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict):
"""Test that adding the same URL twice creates separate crawls and snapshots.

View File

@@ -9,7 +9,6 @@ Tests cover:
"""
import json
import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,

View File

@@ -6,9 +6,6 @@ Verify config reads/writes ArchiveBox.conf file correctly.
import os
import subprocess
from pathlib import Path
from .fixtures import *
def test_config_displays_all_config(tmp_path, process):

View File

@@ -9,14 +9,11 @@ Tests cover:
"""
import json
import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,
parse_jsonl_output,
assert_jsonl_contains_type,
create_test_url,
create_test_crawl_json,
)

View File

@@ -5,10 +5,8 @@ Verify extract re-runs extractors on existing snapshots.
"""
import os
import subprocess
import sqlite3
from .fixtures import *
import subprocess
def test_extract_runs_on_existing_snapshots(tmp_path, process, disable_extractors_dict):

View File

@@ -7,8 +7,6 @@ Verify command runs successfully and produces output.
import os
import subprocess
from .fixtures import *
def test_help_runs_successfully(tmp_path):
"""Test that help command runs and produces output."""

View File

@@ -5,14 +5,11 @@ Verify init creates correct database schema, filesystem structure, and config.
"""
import os
import subprocess
import sqlite3
from pathlib import Path
import subprocess
from archivebox.config.common import STORAGE_CONFIG
from .fixtures import *
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')

View File

@@ -5,12 +5,10 @@ Verify install detects and records binary dependencies in DB.
"""
import os
import subprocess
import sqlite3
import subprocess
from pathlib import Path
from .fixtures import *
def test_install_runs_successfully(tmp_path, process):
"""Test that install command runs without error."""

View File

@@ -6,9 +6,6 @@ Verify manage command runs Django management commands.
import os
import subprocess
import sqlite3
from .fixtures import *
def test_manage_help_works(tmp_path, process):

View File

@@ -5,11 +5,8 @@ Verify remove deletes snapshots from DB and filesystem.
"""
import os
import subprocess
import sqlite3
from pathlib import Path
from .fixtures import *
import subprocess
def test_remove_deletes_snapshot_from_db(tmp_path, process, disable_extractors_dict):

View File

@@ -8,7 +8,6 @@ Tests cover:
"""
import json
import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,

View File

@@ -10,11 +10,9 @@ Tests cover:
import json
import sqlite3
import time
from archivebox.tests.conftest import (
run_archivebox_cmd,
parse_jsonl_output,
)

View File

@@ -5,7 +5,6 @@ import os
import sqlite3
import subprocess
from .fixtures import process, disable_extractors_dict
def test_schedule_run_all_enqueues_scheduled_crawl(tmp_path, process, disable_extractors_dict):

View File

@@ -6,9 +6,6 @@ Verify search queries snapshots from DB.
import os
import subprocess
import sqlite3
from .fixtures import *
def test_search_finds_snapshots(tmp_path, process, disable_extractors_dict):

View File

@@ -6,10 +6,6 @@ Verify server can start (basic smoke tests only, no full server testing).
import os
import subprocess
import signal
import time
from .fixtures import *
def test_server_shows_usage_info(tmp_path, process):

View File

@@ -7,8 +7,6 @@ Verify shell command starts Django shell (basic smoke tests only).
import os
import subprocess
from .fixtures import *
def test_shell_command_exists(tmp_path, process):
"""Test that shell command is recognized."""

View File

@@ -9,12 +9,10 @@ Tests cover:
"""
import json
import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,
parse_jsonl_output,
assert_jsonl_contains_type,
create_test_url,
)

View File

@@ -5,12 +5,10 @@ Verify status reports accurate collection state from DB and filesystem.
"""
import os
import subprocess
import sqlite3
import subprocess
from pathlib import Path
from .fixtures import *
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
candidates = {snapshot_id}

View File

@@ -5,10 +5,8 @@ Verify update drains old dirs, reconciles DB, and queues snapshots.
"""
import os
import subprocess
import sqlite3
from .fixtures import *
import subprocess
def test_update_runs_successfully_on_empty_archive(tmp_path, process):

View File

@@ -11,7 +11,9 @@ import tempfile
import subprocess
from pathlib import Path
from .fixtures import *
from .fixtures import process
FIXTURES = (process,)
def _archivebox_cli() -> str:

View File

@@ -6,7 +6,6 @@ import subprocess
import pytest
from .fixtures import process, disable_extractors_dict
def test_config_shows_all_config_values(tmp_path, process):
@@ -49,6 +48,7 @@ def test_config_set_value_writes_to_config_file(tmp_path, process):
capture_output=True,
text=True,
)
assert result.returncode == 0, result.stderr
# Read the config file directly to verify it was written
config_file = tmp_path / 'ArchiveBox.conf'

View File

@@ -4,11 +4,9 @@
import os
import subprocess
import sqlite3
import json
import pytest
from .fixtures import process, disable_extractors_dict
def test_crawl_creates_crawl_object(tmp_path, process, disable_extractors_dict):

View File

@@ -8,7 +8,6 @@ import json
import pytest
from .fixtures import process, disable_extractors_dict
def test_extract_runs_on_snapshot_id(tmp_path, process, disable_extractors_dict):
@@ -231,6 +230,7 @@ def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict):
text=True,
env=disable_extractors_dict,
)
assert result.returncode == 0, result.stderr
# Should not error
conn = sqlite3.connect('index.sqlite3')

View File

@@ -1,8 +1,12 @@
from .fixtures import *
import json as pyjson
import sqlite3
import subprocess
from pathlib import Path
from .fixtures import disable_extractors_dict, process
FIXTURES = (disable_extractors_dict, process)
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
candidates = {snapshot_id}

View File

@@ -16,7 +16,7 @@ import subprocess
import tempfile
import unittest
from pathlib import Path
from unittest.mock import MagicMock, patch
from unittest.mock import patch
# Set up Django before importing any Django-dependent modules
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')

View File

@@ -3,13 +3,13 @@
import os
import subprocess
from pathlib import Path
import json, shutil
import sqlite3
from archivebox.config.common import STORAGE_CONFIG
from .fixtures import *
from .fixtures import disable_extractors_dict, process
FIXTURES = (disable_extractors_dict, process)
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
@@ -25,6 +25,7 @@ def test_add_link(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'],
capture_output=True, env=disable_extractors_dict)
assert add_process.returncode == 0, add_process.stderr.decode("utf-8")
# In the new architecture, URLs are saved to source files
# Check that a source file was created with the URL
@@ -41,6 +42,7 @@ def test_add_multiple_urls(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com', 'https://iana.org'],
capture_output=True, env=disable_extractors_dict)
assert add_process.returncode == 0, add_process.stderr.decode("utf-8")
# Check that a source file was created with both URLs
sources_dir = tmp_path / "sources"
@@ -61,6 +63,7 @@ def test_correct_permissions_add_command_results(tmp_path, process, disable_extr
os.chdir(tmp_path)
add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'], capture_output=True,
env=disable_extractors_dict)
assert add_process.returncode == 0, add_process.stderr.decode("utf-8")
# Check database permissions
assert oct((tmp_path / "index.sqlite3").stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)

View File

@@ -7,7 +7,6 @@ import sqlite3
import pytest
from .fixtures import process, disable_extractors_dict
class TestInstallDryRun:

View File

@@ -1,7 +1,9 @@
import json
import subprocess
from .fixtures import *
from .fixtures import disable_extractors_dict, process
FIXTURES = (disable_extractors_dict, process)
def test_search_json(process, disable_extractors_dict):
subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],

View File

@@ -10,10 +10,8 @@ Migration tests from 0.8.x to 0.9.x.
- New fields like depth, retry_at, etc.
"""
import json
import shutil
import sqlite3
import subprocess
import tempfile
import unittest
from pathlib import Path
@@ -579,7 +577,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
f"Files lost during migration: {files_before_count} -> {files_after_count}")
# Run update to trigger filesystem reorganization
print(f"\n[*] Running archivebox update to reorganize filesystem...")
print("\n[*] Running archivebox update to reorganize filesystem...")
result = run_archivebox(self.work_dir, ['update'], timeout=120)
self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}")
@@ -657,7 +655,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
# CRITICAL: Verify sample files exist in new structure
self.assertGreater(len(new_sample_files), 0,
f"Sample files not found in new structure")
"Sample files not found in new structure")
# Verify new path format
for path_key, file_path in new_sample_files.items():

View File

@@ -10,7 +10,6 @@ from pathlib import Path
import pytest
from .fixtures import process, disable_extractors_dict, recursive_test_site
def wait_for_db_condition(timeout, condition, interval=0.5):
@@ -77,7 +76,6 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs
"SAVE_ARCHIVEDOTORG": "false",
"SAVE_TITLE": "false",
"SAVE_FAVICON": "true",
"SAVE_WGET": "false",
})
proc = subprocess.Popen(

View File

@@ -1,7 +1,10 @@
import os
import sqlite3
import subprocess
from .fixtures import *
from .fixtures import disable_extractors_dict, process
FIXTURES = (disable_extractors_dict, process)
def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
"""Test removing a snapshot by URL pattern"""

View File

@@ -7,7 +7,6 @@ import subprocess
import pytest
from .fixtures import process
def _fetchone(tmp_path, query):

View File

@@ -0,0 +1,420 @@
#!/usr/bin/env python3
"""End-to-end tests for scheduling across CLI, server, API, and web UI."""
import os
import socket
import sqlite3
import subprocess
import sys
import textwrap
import time
from pathlib import Path
import pytest
import requests
from .conftest import run_python_cwd
REPO_ROOT = Path(__file__).resolve().parents[2]
def init_archive(cwd: Path) -> None:
    """Run `archivebox init --quick` inside *cwd*, failing loudly on error."""
    cmd = [sys.executable, '-m', 'archivebox', 'init', '--quick']
    proc = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True, timeout=60)
    assert proc.returncode == 0, proc.stderr
def build_test_env(port: int, **extra: str) -> dict[str, str]:
    """Build an environment dict for a test archivebox server on *port*.

    Starts from a copy of os.environ with DATA_DIR removed (so the cwd is
    used as the data dir), disables every extractor except wget, pins hosts
    to archivebox.localhost, and finally applies *extra* overrides on top.
    """
    defaults = {
        'LISTEN_HOST': f'archivebox.localhost:{port}',
        'ALLOWED_HOSTS': '*',
        'CSRF_TRUSTED_ORIGINS': f'http://admin.archivebox.localhost:{port}',
        'PUBLIC_ADD_VIEW': 'True',
        'USE_COLOR': 'False',
        'SHOW_PROGRESS': 'False',
        'TIMEOUT': '20',
        'URL_ALLOWLIST': r'127\.0\.0\.1[:/].*',
        'SAVE_ARCHIVEDOTORG': 'False',
        'SAVE_TITLE': 'False',
        'SAVE_FAVICON': 'False',
        'SAVE_WARC': 'False',
        'SAVE_PDF': 'False',
        'SAVE_SCREENSHOT': 'False',
        'SAVE_DOM': 'False',
        'SAVE_SINGLEFILE': 'False',
        'SAVE_READABILITY': 'False',
        'SAVE_MERCURY': 'False',
        'SAVE_GIT': 'False',
        'SAVE_YTDLP': 'False',
        'SAVE_HEADERS': 'False',
        'SAVE_HTMLTOTEXT': 'False',
        'SAVE_WGET': 'True',
        'USE_CHROME': 'False',
    }
    env = os.environ.copy()
    env.pop('DATA_DIR', None)
    env.update(defaults)
    env.update(extra)
    return env
def get_free_port() -> int:
    """Pick a TCP port that is currently free on 127.0.0.1.

    SO_REUSEADDR is set on the probe socket so the returned port can be
    rebound immediately by the server under test even if the probe socket
    lingers in TIME_WAIT. Note the port is released before the caller binds
    it, so a rare race with another process remains possible.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        sock.bind(('127.0.0.1', 0))
        # Port 0 asks the kernel for any free ephemeral port; read it back.
        return sock.getsockname()[1]
def start_server(cwd: Path, env: dict[str, str], port: int) -> None:
    """Launch a daemonized archivebox server on 127.0.0.1:*port* in *cwd*."""
    argv = [sys.executable, '-m', 'archivebox', 'server', '--daemonize', f'127.0.0.1:{port}']
    proc = subprocess.run(
        argv,
        cwd=cwd,
        env=env,
        capture_output=True,
        text=True,
        timeout=60,
    )
    assert proc.returncode == 0, proc.stderr
def stop_server(cwd: Path) -> None:
    """Stop the daemonized server by shutting down its supervisord process.

    Runs a small Django-bootstrapping script inside *cwd* so it resolves the
    same data dir / pid files the server was started with. Best-effort: the
    script's exit status is not checked.
    """
    script = textwrap.dedent(
        """
        import os
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        import django
        django.setup()
        from archivebox.workers.supervisord_util import stop_existing_supervisord_process
        stop_existing_supervisord_process()
        print('stopped')
        """
    )
    run_python_cwd(script, cwd=cwd, timeout=30)
def wait_for_http(port: int, host: str, path: str = '/', timeout: int = 30) -> requests.Response:
    """Poll http://127.0.0.1:*port**path* (Host header = *host*) until non-5xx.

    Retries every 0.5s; 5xx responses and connection errors both count as
    "not ready yet". Raises AssertionError with the last error on timeout.
    """
    last_exc = None
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            resp = requests.get(
                f'http://127.0.0.1:{port}{path}',
                headers={'Host': host},
                timeout=2,
                allow_redirects=False,
            )
        except requests.RequestException as exc:
            last_exc = exc
        else:
            if resp.status_code < 500:
                return resp
        time.sleep(0.5)
    raise AssertionError(f'Timed out waiting for HTTP on {host}: {last_exc}')
def make_latest_schedule_due(cwd: Path) -> None:
    """Backdate the newest schedule's template crawl by two days.

    The scheduler decides a schedule is due by comparing its template
    crawl's timestamps against the interval, so shifting them into the past
    lets tests trigger a scheduled run without actually waiting.
    """
    backdate_sql = """
            UPDATE crawls_crawl
            SET created_at = datetime('now', '-2 day'),
                modified_at = datetime('now', '-2 day')
            WHERE id = (
                SELECT template_id
                FROM crawls_crawlschedule
                ORDER BY created_at DESC
                LIMIT 1
            )
            """
    db = sqlite3.connect(cwd / 'index.sqlite3')
    try:
        db.execute(backdate_sql)
        db.commit()
    finally:
        db.close()
def get_snapshot_file_text(cwd: Path, url: str) -> str:
    """Return the text of the best captured artifact for *url*'s newest snapshot.

    Runs a Django-bootstrapped script inside *cwd* (the archive data dir)
    that:
      1. loads the most recent Snapshot for *url* and asserts it is 'sealed',
      2. searches the snapshot's output dir for html/txt output, preferring
         wget, then trafilatura, then defuddle results, falling back to any
         html/htm/txt file outside responses/ that isn't a log or cmd file,
      3. prints the first match's contents to stdout.

    Fails (nonzero exit -> AssertionError here) if the snapshot is missing,
    not yet sealed, or has no captured text files.
    """
    # NOTE: this is an f-string; {url!r} is interpolated now, while
    # {{snapshot_dir}} stays escaped for the inner script's own f-string.
    script = textwrap.dedent(
        f"""
        import os
        from pathlib import Path
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        import django
        django.setup()
        from archivebox.core.models import Snapshot
        snapshot = Snapshot.objects.filter(url={url!r}).order_by('-created_at').first()
        assert snapshot is not None, 'missing snapshot'
        assert snapshot.status == 'sealed', snapshot.status
        snapshot_dir = Path(snapshot.output_dir)
        candidates = []
        preferred_patterns = (
            'wget/**/index.html',
            'wget/**/*.html',
            'trafilatura/content.html',
            'trafilatura/content.txt',
            'defuddle/content.html',
            'defuddle/content.txt',
        )
        for pattern in preferred_patterns:
            for candidate in snapshot_dir.glob(pattern):
                if candidate.is_file():
                    candidates.append(candidate)
        if not candidates:
            for candidate in snapshot_dir.rglob('*'):
                if not candidate.is_file():
                    continue
                rel = candidate.relative_to(snapshot_dir)
                if rel.parts and rel.parts[0] == 'responses':
                    continue
                if candidate.suffix not in ('.html', '.htm', '.txt'):
                    continue
                if candidate.name in ('stdout.log', 'stderr.log', 'cmd.sh'):
                    continue
                candidates.append(candidate)
        assert candidates, f'no captured html/txt files found in {{snapshot_dir}}'
        print(candidates[0].read_text(errors='ignore'))
        """
    )
    stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60)
    assert code == 0, stderr
    return stdout
def wait_for_snapshot_capture(cwd: Path, url: str, timeout: int = 180) -> str:
    """Poll get_snapshot_file_text every 2s until it succeeds or *timeout* passes."""
    give_up_at = time.time() + timeout
    last_error = None
    while time.time() < give_up_at:
        try:
            return get_snapshot_file_text(cwd, url)
        except AssertionError as exc:
            # Snapshot not sealed / no content yet -- remember why and retry.
            last_error = exc
            time.sleep(2)
    raise AssertionError(f'timed out waiting for captured content for {url}: {last_error}')
def get_counts(cwd: Path, scheduled_url: str, one_shot_url: str) -> tuple[int, int, int]:
    """Count rows relevant to the scheduling tests in *cwd*'s index DB.

    Returns (snapshots for scheduled_url, snapshots for one_shot_url,
    schedule-linked crawls whose urls match scheduled_url).
    """
    db = sqlite3.connect(cwd / 'index.sqlite3')
    try:
        def one(sql: str, *params: str) -> int:
            # Every query here is a COUNT(*), so the first column suffices.
            return db.execute(sql, params).fetchone()[0]

        scheduled_snapshots = one(
            "SELECT COUNT(*) FROM core_snapshot WHERE url = ?", scheduled_url
        )
        one_shot_snapshots = one(
            "SELECT COUNT(*) FROM core_snapshot WHERE url = ?", one_shot_url
        )
        scheduled_crawls = one(
            """
            SELECT COUNT(*)
            FROM crawls_crawl
            WHERE schedule_id IS NOT NULL
            AND urls = ?
            """,
            scheduled_url,
        )
        return scheduled_snapshots, one_shot_snapshots, scheduled_crawls
    finally:
        db.close()
def create_admin_and_token(cwd: Path) -> str:
    """Upsert an 'apitestadmin' superuser and return a fresh API token string.

    Runs a Django-bootstrapped script inside *cwd* that get-or-creates the
    user, forces staff/superuser flags and a known password, and mints an
    APIToken valid for one day. The token value is taken from the last line
    of the script's stdout (earlier lines may contain setup output).
    """
    script = textwrap.dedent(
        """
        import os
        from datetime import timedelta
        from django.utils import timezone
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        import django
        django.setup()
        from django.contrib.auth import get_user_model
        from archivebox.api.models import APIToken
        User = get_user_model()
        user, _ = User.objects.get_or_create(
            username='apitestadmin',
            defaults={
                'email': 'apitestadmin@example.com',
                'is_staff': True,
                'is_superuser': True,
            },
        )
        user.is_staff = True
        user.is_superuser = True
        user.set_password('testpass123')
        user.save()
        token = APIToken.objects.create(
            created_by=user,
            expires=timezone.now() + timedelta(days=1),
        )
        print(token.token)
        """
    )
    stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60)
    assert code == 0, stderr
    # The token is whatever the script printed last.
    return stdout.strip().splitlines()[-1]
@pytest.mark.timeout(180)
def test_server_processes_due_cli_schedule_and_saves_real_content(tmp_path, recursive_test_site):
    """A CLI-created schedule that is due gets run by the server and archived.

    Flow: init archive -> create a daily schedule via the CLI -> backdate it
    so it is due -> start the server -> wait for real page content to land
    on disk, then verify the captured text.
    """
    os.chdir(tmp_path)
    init_archive(tmp_path)
    port = get_free_port()
    env = build_test_env(port)
    # Register a daily schedule for the local test site via the CLI.
    schedule_result = subprocess.run(
        [sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', recursive_test_site['root_url']],
        cwd=tmp_path,
        capture_output=True,
        text=True,
        env=env,
        timeout=60,
    )
    assert schedule_result.returncode == 0, schedule_result.stderr
    assert 'Created scheduled crawl' in schedule_result.stdout
    # Backdate so the server treats the schedule as immediately due.
    make_latest_schedule_due(tmp_path)
    try:
        start_server(tmp_path, env=env, port=port)
        wait_for_http(port, host=f'web.archivebox.localhost:{port}')
        captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site['root_url'], timeout=180)
        # The fixture site's page text should appear in the captured file.
        assert 'Root' in captured_text
        assert 'About' in captured_text
    finally:
        stop_server(tmp_path)
@pytest.mark.timeout(180)
def test_archivebox_add_remains_one_shot_even_when_schedule_is_due(tmp_path, recursive_test_site):
    """`archivebox add` must not run due schedules as a side effect.

    Creates a due schedule for one URL, then runs a one-shot `add` for a
    different URL, and verifies only the one-shot URL produced snapshots
    while the schedule's template crawl stayed unmaterialized.
    """
    os.chdir(tmp_path)
    init_archive(tmp_path)
    port = get_free_port()
    env = build_test_env(port)
    scheduled_url = recursive_test_site['root_url']
    one_shot_url = recursive_test_site['child_urls'][0]
    # Create a daily schedule for the root URL...
    schedule_result = subprocess.run(
        [sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', scheduled_url],
        cwd=tmp_path,
        capture_output=True,
        text=True,
        env=env,
        timeout=60,
    )
    assert schedule_result.returncode == 0, schedule_result.stderr
    # ...and backdate it so it would be due if anything checked schedules.
    make_latest_schedule_due(tmp_path)
    # One-shot add of a DIFFERENT url while the schedule is due.
    add_result = subprocess.run(
        [sys.executable, '-m', 'archivebox', 'add', '--depth=0', '--plugins=wget', one_shot_url],
        cwd=tmp_path,
        capture_output=True,
        text=True,
        env=env,
        timeout=120,
    )
    assert add_result.returncode == 0, add_result.stderr
    captured_text = wait_for_snapshot_capture(tmp_path, one_shot_url, timeout=120)
    assert 'Deep About' in captured_text or 'About' in captured_text
    # Only the one-shot url should have snapshots; the due schedule must not
    # have been triggered by `add`.
    scheduled_snapshots, one_shot_snapshots, scheduled_crawls = get_counts(tmp_path, scheduled_url, one_shot_url)
    assert one_shot_snapshots >= 1
    assert scheduled_snapshots == 0
    assert scheduled_crawls == 1  # template only, no materialized scheduled run
@pytest.mark.timeout(180)
def test_schedule_rest_api_works_over_running_server(tmp_path, recursive_test_site):
    """POST /api/v1/cli/schedule creates a schedule when authed by API token."""
    os.chdir(tmp_path)
    init_archive(tmp_path)
    port = get_free_port()
    env = build_test_env(port)
    # Create the admin + token before starting the server so the first
    # request can authenticate.
    api_token = create_admin_and_token(tmp_path)
    try:
        start_server(tmp_path, env=env, port=port)
        wait_for_http(port, host=f'api.archivebox.localhost:{port}', path='/api/v1/docs')
        response = requests.post(
            f'http://127.0.0.1:{port}/api/v1/cli/schedule',
            headers={
                'Host': f'api.archivebox.localhost:{port}',
                'X-ArchiveBox-API-Key': api_token,
            },
            json={
                'every': 'daily',
                'import_path': recursive_test_site['root_url'],
                'quiet': True,
            },
            timeout=10,
        )
        assert response.status_code == 200, response.text
        payload = response.json()
        # The CLI-over-API wrapper reports success and the created schedule ids.
        assert payload['success'] is True
        assert payload['result_format'] == 'json'
        assert len(payload['result']['created_schedule_ids']) == 1
    finally:
        stop_server(tmp_path)
@pytest.mark.timeout(180)
def test_schedule_web_ui_post_works_over_running_server(tmp_path, recursive_test_site):
    """Posting /add/ with a schedule creates a CrawlSchedule + template crawl."""
    os.chdir(tmp_path)
    init_archive(tmp_path)
    port = get_free_port()
    env = build_test_env(port, PUBLIC_ADD_VIEW='True')
    try:
        start_server(tmp_path, env=env, port=port)
        wait_for_http(port, host=f'web.archivebox.localhost:{port}', path='/add/')
        # Submit the public add form with a 'daily' schedule selected.
        response = requests.post(
            f'http://127.0.0.1:{port}/add/',
            headers={'Host': f'web.archivebox.localhost:{port}'},
            data={
                'url': recursive_test_site['root_url'],
                'depth': '0',
                'schedule': 'daily',
                'tag': 'web-ui',
                'notes': 'created from web ui',
            },
            timeout=10,
            allow_redirects=False,
        )
        # A successful form post redirects instead of re-rendering the form.
        assert response.status_code in (302, 303), response.text
        # Verify the schedule and its linked template crawl landed in the DB.
        conn = sqlite3.connect(tmp_path / 'index.sqlite3')
        try:
            row = conn.execute(
                """
                SELECT cs.schedule, c.urls, c.tags_str
                FROM crawls_crawlschedule cs
                JOIN crawls_crawl c ON c.schedule_id = cs.id
                ORDER BY cs.created_at DESC
                LIMIT 1
                """
            ).fetchone()
        finally:
            conn.close()
        assert row == ('daily', recursive_test_site['root_url'], 'web-ui')
    finally:
        stop_server(tmp_path)

View File

@@ -3,12 +3,9 @@
import os
import subprocess
import sqlite3
import json
import pytest
from .fixtures import process, disable_extractors_dict
def test_search_returns_snapshots(tmp_path, process, disable_extractors_dict):

View File

@@ -6,13 +6,11 @@ import subprocess
import sqlite3
from archivebox.machine.models import Process
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse
import uuid
import pytest
from .fixtures import process, disable_extractors_dict
def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_extractors_dict):
@@ -46,9 +44,7 @@ def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_e
snapshot_id_raw, snapshot_created_at, snapshot_url, crawl_id = snapshot_row
snapshot_id = str(uuid.UUID(snapshot_id_raw))
crawl_id, crawl_created_at, crawl_urls, crawl_created_by_id = crawl_row
username = user_row[0]
crawl_date_str = datetime.fromisoformat(crawl_created_at).strftime('%Y%m%d')
snapshot_date_str = datetime.fromisoformat(snapshot_created_at).strftime('%Y%m%d')
domain = urlparse(snapshot_url).hostname or 'unknown'

View File

@@ -3,11 +3,9 @@
import os
import subprocess
import sqlite3
import pytest
from .fixtures import process, disable_extractors_dict
def test_status_shows_index_info(tmp_path, process):

View File

@@ -1,7 +1,10 @@
import os
import sqlite3
import subprocess
from .fixtures import *
from .fixtures import disable_extractors_dict, process
FIXTURES = (disable_extractors_dict, process)
def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
"""Test that title is extracted from the page."""

View File

@@ -1,7 +1,10 @@
import json
import sqlite3
import subprocess
from .fixtures import *
from .fixtures import disable_extractors_dict, process
FIXTURES = (disable_extractors_dict, process)
def test_update_imports_orphaned_snapshots(tmp_path, process, disable_extractors_dict):
"""Test that archivebox update imports real legacy archive directories."""

View File

@@ -3,11 +3,9 @@
import os
import subprocess
import json
import pytest
from .fixtures import process, disable_extractors_dict
class TestVersionQuiet:

View File

@@ -18,11 +18,9 @@ Config priority order (highest to lowest):
"""
import os
import json
import sys
import tempfile
import subprocess
import time
from pathlib import Path
@@ -45,7 +43,7 @@ def test_config_propagation_through_worker_hierarchy():
data_dir.mkdir()
print(f"\n{'='*80}")
print(f"Test: Config Propagation Through Worker Hierarchy")
print("Test: Config Propagation Through Worker Hierarchy")
print(f"DATA_DIR: {data_dir}")
print(f"{'='*80}\n")
@@ -63,7 +61,7 @@ def test_config_propagation_through_worker_hierarchy():
timeout=60,
)
assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
print(f"✓ Archive initialized\n")
print("✓ Archive initialized\n")
# Step 2: Write custom config to ArchiveBox.conf
print("Step 2: Write custom config to ArchiveBox.conf")
@@ -90,7 +88,7 @@ SAVE_TITLE = True
SAVE_FAVICON = True
SAVE_SCREENSHOT = True
""")
print(f"✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")
print("✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")
# Step 2.5: Set Machine.config values
print("Step 2.5: Set Machine.config with custom binary path")
@@ -123,7 +121,7 @@ print(f"Machine {{machine.hostname}} config updated")
timeout=30,
)
assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}"
print(f"✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")
print("✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")
# Step 3: Create Crawl via Django ORM with custom crawl.config
print("Step 3: Create Crawl with custom crawl.config JSON")
@@ -421,7 +419,7 @@ def test_config_environment_variable_parsing():
data_dir.mkdir()
print(f"\n{'='*80}")
print(f"Test: Config Environment Variable Parsing")
print("Test: Config Environment Variable Parsing")
print(f"DATA_DIR: {data_dir}")
print(f"{'='*80}\n")
@@ -557,7 +555,7 @@ def test_parent_environment_preserved_in_hooks():
data_dir.mkdir()
print(f"\n{'='*80}")
print(f"Test: Parent Environment Preserved in Hooks")
print("Test: Parent Environment Preserved in Hooks")
print(f"DATA_DIR: {data_dir}")
print(f"{'='*80}\n")
@@ -575,7 +573,7 @@ def test_parent_environment_preserved_in_hooks():
timeout=60,
)
assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
print(f"✓ Archive initialized\n")
print("✓ Archive initialized\n")
# Create snapshot
print("Step 2: Create Snapshot")
@@ -635,7 +633,6 @@ print(snapshot.id)
timeout=120,
)
stdout = result.stdout.decode()
stderr = result.stderr.decode()
print("\n--- SnapshotWorker stderr (first 50 lines) ---")
@@ -760,7 +757,7 @@ def test_config_auto_fetch_relationships():
data_dir.mkdir()
print(f"\n{'='*80}")
print(f"Test: Config Auto-Fetch Relationships")
print("Test: Config Auto-Fetch Relationships")
print(f"DATA_DIR: {data_dir}")
print(f"{'='*80}\n")
@@ -778,7 +775,7 @@ def test_config_auto_fetch_relationships():
timeout=60,
)
assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
print(f"✓ Archive initialized\n")
print("✓ Archive initialized\n")
# Create objects with config at each level
print("Step 2: Create Crawl -> Snapshot -> ArchiveResult with config at each level")
@@ -906,7 +903,7 @@ def test_config_precedence_with_environment_vars():
data_dir.mkdir()
print(f"\n{'='*80}")
print(f"Test: Config Precedence with Environment Variables")
print("Test: Config Precedence with Environment Variables")
print(f"DATA_DIR: {data_dir}")
print(f"{'='*80}\n")
@@ -1006,7 +1003,7 @@ def test_new_environment_variables_added():
data_dir.mkdir()
print(f"\n{'='*80}")
print(f"Test: New Environment Variables Added to Config")
print("Test: New Environment Variables Added to Config")
print(f"DATA_DIR: {data_dir}")
print(f"{'='*80}\n")