mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
423 lines
13 KiB
Python
423 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
"""End-to-end tests for scheduling across CLI, server, API, and web UI."""
|
|
|
|
import os
|
|
import socket
|
|
import sqlite3
|
|
import subprocess
|
|
import sys
|
|
import textwrap
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
import requests
|
|
|
|
from .conftest import run_python_cwd
|
|
|
|
|
|
# Absolute path to the repository checkout root (two directory levels above
# this test module). NOTE(review): not referenced elsewhere in this module as
# far as visible here — confirm external use before removing.
REPO_ROOT = Path(__file__).resolve().parents[2]
|
|
|
|
|
|
def init_archive(cwd: Path) -> None:
    """Initialize a fresh ArchiveBox data directory at *cwd* via the CLI.

    Fails the calling test (AssertionError with stderr) if `archivebox init`
    exits non-zero.
    """
    cmd = [sys.executable, "-m", "archivebox", "init", "--quick"]
    proc = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True, timeout=60)
    assert proc.returncode == 0, proc.stderr
|
|
|
|
|
|
def build_test_env(port: int, **extra: str) -> dict[str, str]:
    """Return a copy of the current process environment tuned for test servers.

    Every archiving plugin except wget is disabled so runs stay fast and
    offline; host/CSRF settings are pinned to ``archivebox.localhost:<port>``.
    Any ``extra`` keyword entries are applied last and override the defaults.
    DATA_DIR is dropped so each test uses its own cwd-based data directory.
    """
    overrides = {
        "LISTEN_HOST": f"archivebox.localhost:{port}",
        "ALLOWED_HOSTS": "*",
        "CSRF_TRUSTED_ORIGINS": f"http://admin.archivebox.localhost:{port}",
        "PUBLIC_ADD_VIEW": "True",
        "USE_COLOR": "False",
        "SHOW_PROGRESS": "False",
        "TIMEOUT": "30",
        "URL_ALLOWLIST": r"127\.0\.0\.1[:/].*",
        "SAVE_ARCHIVEDOTORG": "False",
        "SAVE_TITLE": "False",
        "SAVE_FAVICON": "False",
        "SAVE_WARC": "False",
        "SAVE_PDF": "False",
        "SAVE_SCREENSHOT": "False",
        "SAVE_DOM": "False",
        "SAVE_SINGLEFILE": "False",
        "SAVE_READABILITY": "False",
        "SAVE_MERCURY": "False",
        "SAVE_GIT": "False",
        "SAVE_YTDLP": "False",
        "SAVE_HEADERS": "False",
        "SAVE_HTMLTOTEXT": "False",
        "SAVE_WGET": "True",
        "USE_CHROME": "False",
    }
    env = dict(os.environ)
    env.pop("DATA_DIR", None)
    env.update(overrides)
    env.update(extra)
    return env
|
|
|
|
|
|
def get_free_port() -> int:
    """Ask the OS for an unused loopback TCP port and return its number.

    Binding to port 0 lets the kernel pick an ephemeral port; the socket is
    closed before returning, so the port is free (but not reserved) for the
    test server that starts next.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.bind(("127.0.0.1", 0))
        _host, port = sock.getsockname()
        return port
    finally:
        sock.close()
|
|
|
|
|
|
def start_server(cwd: Path, env: dict[str, str], port: int) -> None:
    """Launch the ArchiveBox server daemonized on 127.0.0.1:*port*.

    The `--daemonize` flag makes the CLI return once the background server is
    spawned; a non-zero exit fails the test with the captured stderr.
    """
    cmd = [sys.executable, "-m", "archivebox", "server", "--daemonize", f"127.0.0.1:{port}"]
    proc = subprocess.run(
        cmd,
        cwd=cwd,
        env=env,
        capture_output=True,
        text=True,
        timeout=60,
    )
    assert proc.returncode == 0, proc.stderr
|
|
|
|
|
|
def stop_server(cwd: Path) -> None:
    """Stop the daemonized ArchiveBox server for the data dir at *cwd*.

    Runs a small Django-aware script in a subprocess (via run_python_cwd) so
    supervisord is shut down with the right settings module loaded. The
    result is deliberately not checked: this is best-effort cleanup in test
    ``finally`` blocks and must not mask the original test failure.
    """
    script = textwrap.dedent(
        """
        import os
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        import django
        django.setup()
        from archivebox.workers.supervisord_util import stop_existing_supervisord_process
        stop_existing_supervisord_process()
        print('stopped')
        """,
    )
    run_python_cwd(script, cwd=cwd, timeout=30)
|
|
|
|
|
|
def wait_for_http(port: int, host: str, path: str = "/", timeout: int = 30) -> requests.Response:
    """Poll ``http://127.0.0.1:<port><path>`` (with Host header *host*) until it answers.

    Returns the first response whose status code is below 500 (redirects are
    not followed, so a 302 counts as success). Raises AssertionError with the
    last connection error if nothing usable arrives within *timeout* seconds.
    """
    last_exc = None
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            resp = requests.get(
                f"http://127.0.0.1:{port}{path}",
                headers={"Host": host},
                timeout=2,
                allow_redirects=False,
            )
        except requests.RequestException as exc:
            last_exc = exc
        else:
            if resp.status_code < 500:
                return resp
        time.sleep(0.5)
    raise AssertionError(f"Timed out waiting for HTTP on {host}: {last_exc}")
|
|
|
|
|
|
def make_latest_schedule_due(cwd: Path) -> None:
    """Backdate the newest schedule's template crawl so the schedule is overdue.

    Rewrites created_at/modified_at of the crawl referenced by the most
    recently created CrawlSchedule directly in index.sqlite3, setting both to
    two days ago so the server-side scheduler sees a due daily schedule.
    """
    backdate_sql = """
        UPDATE crawls_crawl
        SET created_at = datetime('now', '-2 day'),
            modified_at = datetime('now', '-2 day')
        WHERE id = (
            SELECT template_id
            FROM crawls_crawlschedule
            ORDER BY created_at DESC
            LIMIT 1
        )
    """
    conn = sqlite3.connect(cwd / "index.sqlite3")
    try:
        # Connection-as-context-manager commits the transaction on success.
        with conn:
            conn.execute(backdate_sql)
    finally:
        conn.close()
|
|
|
|
|
|
def get_snapshot_file_text(cwd: Path, url: str) -> str:
    """Return the text of the best captured file for the newest snapshot of *url*.

    Runs a Django-aware helper script in a subprocess (via run_python_cwd)
    that:
      * asserts a sealed Snapshot row exists for *url*,
      * prefers wget / trafilatura / defuddle output files,
      * otherwise falls back to any captured .html/.htm/.txt file, skipping
        the raw responses/ directory and extractor logs/cmd.sh,
      * prints the first match's contents to stdout.

    Raises AssertionError (from the embedded script via the exit-code check,
    or directly) while the snapshot is missing, unsealed, or has no captured
    files yet — callers poll on that (see wait_for_snapshot_capture).
    """
    # NOTE: f-string script — {url!r} is interpolated here, while
    # {{snapshot_dir}} is escaped so it formats inside the subprocess.
    script = textwrap.dedent(
        f"""
        import os
        from pathlib import Path

        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        import django
        django.setup()

        from archivebox.core.models import Snapshot

        snapshot = Snapshot.objects.filter(url={url!r}).order_by('-created_at').first()
        assert snapshot is not None, 'missing snapshot'
        assert snapshot.status == 'sealed', snapshot.status

        snapshot_dir = Path(snapshot.output_dir)
        candidates = []
        preferred_patterns = (
            'wget/**/index.html',
            'wget/**/*.html',
            'trafilatura/content.html',
            'trafilatura/content.txt',
            'defuddle/content.html',
            'defuddle/content.txt',
        )
        for pattern in preferred_patterns:
            for candidate in snapshot_dir.glob(pattern):
                if candidate.is_file():
                    candidates.append(candidate)

        if not candidates:
            for candidate in snapshot_dir.rglob('*'):
                if not candidate.is_file():
                    continue
                rel = candidate.relative_to(snapshot_dir)
                if rel.parts and rel.parts[0] == 'responses':
                    continue
                if candidate.suffix not in ('.html', '.htm', '.txt'):
                    continue
                if candidate.name in ('stdout.log', 'stderr.log', 'cmd.sh'):
                    continue
                candidates.append(candidate)

        assert candidates, f'no captured html/txt files found in {{snapshot_dir}}'
        print(candidates[0].read_text(errors='ignore'))
        """,
    )
    stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60)
    assert code == 0, stderr
    return stdout
|
|
|
|
|
|
def wait_for_snapshot_capture(cwd: Path, url: str, timeout: int = 180) -> str:
    """Poll until the snapshot for *url* is sealed and return its captured text.

    Retries get_snapshot_file_text every 2 seconds, treating its
    AssertionError as "not ready yet"; raises AssertionError with the last
    underlying error once *timeout* seconds have elapsed.
    """
    last_error = None
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            return get_snapshot_file_text(cwd, url)
        except AssertionError as err:
            last_error = err
            time.sleep(2)
    raise AssertionError(f"timed out waiting for captured content for {url}: {last_error}")
|
|
|
|
|
|
def get_counts(cwd: Path, scheduled_url: str, one_shot_url: str) -> tuple[int, int, int]:
    """Read snapshot/crawl counts for both test URLs straight from index.sqlite3.

    Returns (scheduled_snapshots, one_shot_snapshots, scheduled_crawls) where
    scheduled_crawls counts only crawls linked to a schedule (schedule_id set)
    whose urls match *scheduled_url*.
    """
    snapshot_sql = "SELECT COUNT(*) FROM core_snapshot WHERE url = ?"
    scheduled_crawl_sql = """
        SELECT COUNT(*)
        FROM crawls_crawl
        WHERE schedule_id IS NOT NULL
        AND urls = ?
    """
    conn = sqlite3.connect(cwd / "index.sqlite3")
    try:
        def count(sql: str, value: str) -> int:
            # Each query is a single-row COUNT(*), so fetchone()[0] is the count.
            return conn.execute(sql, (value,)).fetchone()[0]

        return (
            count(snapshot_sql, scheduled_url),
            count(snapshot_sql, one_shot_url),
            count(scheduled_crawl_sql, scheduled_url),
        )
    finally:
        conn.close()
|
|
|
|
|
|
def create_admin_and_token(cwd: Path) -> str:
    """Create/refresh superuser 'apitestadmin' and return a fresh API token.

    Runs a Django-aware script in a subprocess (via run_python_cwd) that
    upserts the user (forcing is_staff/is_superuser and resetting the
    password to 'testpass123' even if the user already exists) and creates
    an APIToken valid for one day, printing the token string.
    """
    script = textwrap.dedent(
        """
        import os
        from datetime import timedelta
        from django.utils import timezone

        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        import django
        django.setup()

        from django.contrib.auth import get_user_model
        from archivebox.api.models import APIToken

        User = get_user_model()
        user, _ = User.objects.get_or_create(
            username='apitestadmin',
            defaults={
                'email': 'apitestadmin@example.com',
                'is_staff': True,
                'is_superuser': True,
            },
        )
        user.is_staff = True
        user.is_superuser = True
        user.set_password('testpass123')
        user.save()

        token = APIToken.objects.create(
            created_by=user,
            expires=timezone.now() + timedelta(days=1),
        )
        print(token.token)
        """,
    )
    stdout, stderr, code = run_python_cwd(script, cwd=cwd, timeout=60)
    assert code == 0, stderr
    # The token is the last stdout line; earlier lines may be setup noise.
    return stdout.strip().splitlines()[-1]
|
|
|
|
|
|
@pytest.mark.timeout(180)
def test_server_processes_due_cli_schedule_and_saves_real_content(tmp_path, recursive_test_site):
    """A CLI-created daily schedule is processed by the server once it is due.

    Flow: init a fresh archive -> create a daily schedule via
    `archivebox schedule` -> backdate its template crawl so the schedule is
    overdue -> start the server -> wait for the scheduler to actually
    capture the site and assert the real page content was saved.
    """
    os.chdir(tmp_path)
    init_archive(tmp_path)

    port = get_free_port()
    env = build_test_env(port)

    schedule_result = subprocess.run(
        [sys.executable, "-m", "archivebox", "schedule", "--every=daily", "--depth=0", recursive_test_site["root_url"]],
        cwd=tmp_path,
        capture_output=True,
        text=True,
        env=env,
        timeout=60,
    )
    assert schedule_result.returncode == 0, schedule_result.stderr
    assert "Created scheduled crawl" in schedule_result.stdout

    # Make the just-created schedule overdue so the server picks it up.
    make_latest_schedule_due(tmp_path)

    try:
        start_server(tmp_path, env=env, port=port)
        wait_for_http(port, host=f"web.archivebox.localhost:{port}")
        captured_text = wait_for_snapshot_capture(tmp_path, recursive_test_site["root_url"], timeout=180)
        # Markers from the fixture site prove real content was captured.
        assert "Root" in captured_text
        assert "About" in captured_text
    finally:
        stop_server(tmp_path)
|
|
|
|
|
|
@pytest.mark.timeout(180)
def test_archivebox_add_remains_one_shot_even_when_schedule_is_due(tmp_path, recursive_test_site):
    """`archivebox add` stays one-shot and must not trigger a due schedule.

    With an overdue daily schedule in the DB but no server running, a plain
    `add` of a different URL should snapshot only that URL: the scheduled
    URL gets zero snapshots and keeps exactly one (template) crawl row.
    """
    os.chdir(tmp_path)
    init_archive(tmp_path)

    port = get_free_port()
    env = build_test_env(port)
    scheduled_url = recursive_test_site["root_url"]
    one_shot_url = recursive_test_site["child_urls"][0]

    schedule_result = subprocess.run(
        [sys.executable, "-m", "archivebox", "schedule", "--every=daily", "--depth=0", scheduled_url],
        cwd=tmp_path,
        capture_output=True,
        text=True,
        env=env,
        timeout=60,
    )
    assert schedule_result.returncode == 0, schedule_result.stderr

    # The schedule is now due — a correct `add` must still ignore it.
    make_latest_schedule_due(tmp_path)

    add_result = subprocess.run(
        [sys.executable, "-m", "archivebox", "add", "--depth=0", "--plugins=wget", one_shot_url],
        cwd=tmp_path,
        capture_output=True,
        text=True,
        env=env,
        timeout=120,
    )
    assert add_result.returncode == 0, add_result.stderr
    captured_text = wait_for_snapshot_capture(tmp_path, one_shot_url, timeout=120)
    assert "Deep About" in captured_text or "About" in captured_text

    scheduled_snapshots, one_shot_snapshots, scheduled_crawls = get_counts(tmp_path, scheduled_url, one_shot_url)
    assert one_shot_snapshots >= 1
    assert scheduled_snapshots == 0
    assert scheduled_crawls == 1  # template only, no materialized scheduled run
|
|
|
|
|
@pytest.mark.timeout(180)
def test_schedule_rest_api_works_over_running_server(tmp_path, recursive_test_site):
    """POST /api/v1/cli/schedule on a live server creates exactly one schedule.

    Creates an admin API token, starts the server, calls the CLI-bridge REST
    endpoint with a daily schedule payload, and asserts the JSON envelope
    reports success with a single created schedule id.
    """
    os.chdir(tmp_path)
    init_archive(tmp_path)

    port = get_free_port()
    env = build_test_env(port)
    # Token must exist before the server starts so the request can auth.
    api_token = create_admin_and_token(tmp_path)

    try:
        start_server(tmp_path, env=env, port=port)
        wait_for_http(port, host=f"api.archivebox.localhost:{port}", path="/api/v1/docs")

        response = requests.post(
            f"http://127.0.0.1:{port}/api/v1/cli/schedule",
            headers={
                "Host": f"api.archivebox.localhost:{port}",
                "X-ArchiveBox-API-Key": api_token,
            },
            json={
                "every": "daily",
                "import_path": recursive_test_site["root_url"],
                "quiet": True,
            },
            timeout=10,
        )

        assert response.status_code == 200, response.text
        payload = response.json()
        assert payload["success"] is True
        assert payload["result_format"] == "json"
        assert len(payload["result"]["created_schedule_ids"]) == 1
    finally:
        stop_server(tmp_path)
|
|
|
|
|
|
@pytest.mark.timeout(180)
def test_schedule_web_ui_post_works_over_running_server(tmp_path, recursive_test_site):
    """Submitting the /add/ form with a schedule creates a CrawlSchedule.

    Starts the server with PUBLIC_ADD_VIEW enabled (no login/CSRF needed),
    POSTs the add form with schedule=daily, expects a redirect, then checks
    index.sqlite3 directly for the schedule joined to its crawl with the
    submitted URL and tag.
    """
    os.chdir(tmp_path)
    init_archive(tmp_path)

    port = get_free_port()
    env = build_test_env(port, PUBLIC_ADD_VIEW="True")

    try:
        start_server(tmp_path, env=env, port=port)
        wait_for_http(port, host=f"web.archivebox.localhost:{port}", path="/add/")

        response = requests.post(
            f"http://127.0.0.1:{port}/add/",
            headers={"Host": f"web.archivebox.localhost:{port}"},
            data={
                "url": recursive_test_site["root_url"],
                "depth": "0",
                "schedule": "daily",
                "tag": "web-ui",
                "notes": "created from web ui",
            },
            timeout=10,
            allow_redirects=False,
        )

        # A successful form submission redirects rather than rendering errors.
        assert response.status_code in (302, 303), response.text

        conn = sqlite3.connect(tmp_path / "index.sqlite3")
        try:
            row = conn.execute(
                """
                SELECT cs.schedule, c.urls, c.tags_str
                FROM crawls_crawlschedule cs
                JOIN crawls_crawl c ON c.schedule_id = cs.id
                ORDER BY cs.created_at DESC
                LIMIT 1
                """,
            ).fetchone()
        finally:
            conn.close()

        assert row == ("daily", recursive_test_site["root_url"], "web-ui")
    finally:
        stop_server(tmp_path)
|