mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
fix lint
This commit is contained in:
@@ -1,7 +1,6 @@
|
||||
"""archivebox/tests/conftest.py - Pytest fixtures for CLI tests."""
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import subprocess
|
||||
import textwrap
|
||||
@@ -13,6 +12,8 @@ import pytest
|
||||
|
||||
from archivebox.uuid_compat import uuid7
|
||||
|
||||
pytest_plugins = ["archivebox.tests.fixtures"]
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# CLI Helpers (defined before fixtures that use them)
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
import subprocess
|
||||
import json
|
||||
import sqlite3
|
||||
import os
|
||||
|
||||
from .fixtures import *
|
||||
import sqlite3
|
||||
import subprocess
|
||||
|
||||
def test_depth_flag_is_accepted(process, disable_extractors_dict):
|
||||
arg_process = subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
|
||||
@@ -31,7 +28,7 @@ def test_depth_flag_fails_if_it_is_not_0_or_1(process, disable_extractors_dict):
|
||||
|
||||
def test_depth_flag_0_creates_source_file(tmp_path, process, disable_extractors_dict):
|
||||
os.chdir(tmp_path)
|
||||
arg_process = subprocess.run(
|
||||
subprocess.run(
|
||||
["archivebox", "add", "--index-only", "--depth=0", "https://example.com"],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
|
||||
@@ -9,7 +9,7 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from django.test import TestCase, Client, override_settings
|
||||
from django.test import override_settings
|
||||
from django.urls import reverse
|
||||
from django.contrib.auth import get_user_model
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ import os
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from importlib.util import find_spec
|
||||
|
||||
|
||||
class TestLDAPConfig(unittest.TestCase):
|
||||
@@ -100,13 +100,7 @@ class TestLDAPIntegration(unittest.TestCase):
|
||||
|
||||
def test_django_settings_with_ldap_library_check(self):
|
||||
"""Test that Django settings check for LDAP libraries when enabled."""
|
||||
# Try to import django-auth-ldap to see if it's available
|
||||
try:
|
||||
import django_auth_ldap
|
||||
import ldap
|
||||
ldap_available = True
|
||||
except ImportError:
|
||||
ldap_available = False
|
||||
ldap_available = find_spec("django_auth_ldap") is not None and find_spec("ldap") is not None
|
||||
|
||||
# If LDAP libraries are not available, settings should handle gracefully
|
||||
if not ldap_available:
|
||||
|
||||
@@ -5,11 +5,8 @@ Verify add creates snapshots in DB, crawls, source files, and archive directorie
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import *
|
||||
import subprocess
|
||||
|
||||
|
||||
def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extractors_dict):
|
||||
@@ -169,6 +166,30 @@ def test_add_with_tags(tmp_path, process, disable_extractors_dict):
|
||||
assert 'test' in tags_str or 'example' in tags_str
|
||||
|
||||
|
||||
def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extractors_dict):
    """Test add persists the selected persona so browser config derives from it later.

    Runs ``archivebox add --persona=Default`` in ``tmp_path``, then checks that the
    first ``crawls_crawl`` row has a ``persona_id`` and a ``DEFAULT_PERSONA`` config
    value of ``'Default'``, and that the persona's ``chrome_user_data`` and
    ``chrome_extensions`` directories exist under ``personas/Default``.
    """
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--index-only', '--depth=0', '--persona=Default', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )

    # Output is captured as bytes (no text=True), so decode it for the failure message.
    assert result.returncode == 0, result.stderr.decode("utf-8", errors="replace")

    conn = sqlite3.connect("index.sqlite3")
    try:
        row = conn.execute(
            "SELECT persona_id, json_extract(config, '$.DEFAULT_PERSONA') FROM crawls_crawl LIMIT 1"
        ).fetchone()
    finally:
        # Close the connection even when the query itself fails.
        conn.close()

    # fetchone() returns None when the table is empty; fail with a readable
    # message instead of a TypeError from tuple unpacking.
    assert row is not None, "archivebox add did not create any crawls_crawl row"
    persona_id, default_persona = row

    assert persona_id
    assert default_persona == 'Default'
    assert (tmp_path / "personas" / "Default" / "chrome_user_data").is_dir()
    assert (tmp_path / "personas" / "Default" / "chrome_extensions").is_dir()
|
||||
|
||||
|
||||
def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that adding the same URL twice creates separate crawls and snapshots.
|
||||
|
||||
|
||||
@@ -9,7 +9,6 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
|
||||
@@ -6,9 +6,6 @@ Verify config reads/writes ArchiveBox.conf file correctly.
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_config_displays_all_config(tmp_path, process):
|
||||
|
||||
@@ -9,14 +9,11 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
parse_jsonl_output,
|
||||
assert_jsonl_contains_type,
|
||||
create_test_url,
|
||||
create_test_crawl_json,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -5,10 +5,8 @@ Verify extract re-runs extractors on existing snapshots.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
import subprocess
|
||||
|
||||
|
||||
def test_extract_runs_on_existing_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
@@ -7,8 +7,6 @@ Verify command runs successfully and produces output.
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_help_runs_successfully(tmp_path):
|
||||
"""Test that help command runs and produces output."""
|
||||
|
||||
@@ -5,14 +5,11 @@ Verify init creates correct database schema, filesystem structure, and config.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
|
||||
|
||||
|
||||
@@ -5,12 +5,10 @@ Verify install detects and records binary dependencies in DB.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_install_runs_successfully(tmp_path, process):
|
||||
"""Test that install command runs without error."""
|
||||
|
||||
@@ -6,9 +6,6 @@ Verify manage command runs Django management commands.
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_manage_help_works(tmp_path, process):
|
||||
|
||||
@@ -5,11 +5,8 @@ Verify remove deletes snapshots from DB and filesystem.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import *
|
||||
import subprocess
|
||||
|
||||
|
||||
def test_remove_deletes_snapshot_from_db(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
@@ -8,7 +8,6 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
|
||||
@@ -10,11 +10,9 @@ Tests cover:
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
import time
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
parse_jsonl_output,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ import os
|
||||
import sqlite3
|
||||
import subprocess
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_schedule_run_all_enqueues_scheduled_crawl(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
@@ -6,9 +6,6 @@ Verify search queries snapshots from DB.
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_search_finds_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
@@ -6,10 +6,6 @@ Verify server can start (basic smoke tests only, no full server testing).
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import signal
|
||||
import time
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_server_shows_usage_info(tmp_path, process):
|
||||
|
||||
@@ -7,8 +7,6 @@ Verify shell command starts Django shell (basic smoke tests only).
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_shell_command_exists(tmp_path, process):
|
||||
"""Test that shell command is recognized."""
|
||||
|
||||
@@ -9,12 +9,10 @@ Tests cover:
|
||||
"""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
parse_jsonl_output,
|
||||
assert_jsonl_contains_type,
|
||||
create_test_url,
|
||||
)
|
||||
|
||||
|
||||
@@ -5,12 +5,10 @@ Verify status reports accurate collection state from DB and filesystem.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
|
||||
candidates = {snapshot_id}
|
||||
|
||||
@@ -5,10 +5,8 @@ Verify update drains old dirs, reconciles DB, and queues snapshots.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
import subprocess
|
||||
|
||||
|
||||
def test_update_runs_successfully_on_empty_archive(tmp_path, process):
|
||||
|
||||
@@ -11,7 +11,9 @@ import tempfile
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import *
|
||||
from .fixtures import process
|
||||
|
||||
FIXTURES = (process,)
|
||||
|
||||
|
||||
def _archivebox_cli() -> str:
|
||||
|
||||
@@ -6,7 +6,6 @@ import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_config_shows_all_config_values(tmp_path, process):
|
||||
@@ -49,6 +48,7 @@ def test_config_set_value_writes_to_config_file(tmp_path, process):
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
assert result.returncode == 0, result.stderr
|
||||
|
||||
# Read the config file directly to verify it was written
|
||||
config_file = tmp_path / 'ArchiveBox.conf'
|
||||
|
||||
@@ -4,11 +4,9 @@
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_crawl_creates_crawl_object(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
@@ -8,7 +8,6 @@ import json
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_extract_runs_on_snapshot_id(tmp_path, process, disable_extractors_dict):
|
||||
@@ -231,6 +230,7 @@ def test_extract_multiple_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
assert result.returncode == 0, result.stderr
|
||||
|
||||
# Should not error
|
||||
conn = sqlite3.connect('index.sqlite3')
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
from .fixtures import *
|
||||
import json as pyjson
|
||||
import sqlite3
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from .fixtures import disable_extractors_dict, process
|
||||
|
||||
FIXTURES = (disable_extractors_dict, process)
|
||||
|
||||
|
||||
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
|
||||
candidates = {snapshot_id}
|
||||
|
||||
@@ -16,7 +16,7 @@ import subprocess
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
from unittest.mock import patch
|
||||
|
||||
# Set up Django before importing any Django-dependent modules
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
|
||||
|
||||
@@ -3,13 +3,13 @@
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
import json, shutil
|
||||
import sqlite3
|
||||
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
from .fixtures import *
|
||||
from .fixtures import disable_extractors_dict, process
|
||||
|
||||
FIXTURES = (disable_extractors_dict, process)
|
||||
|
||||
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
|
||||
|
||||
@@ -25,6 +25,7 @@ def test_add_link(tmp_path, process, disable_extractors_dict):
|
||||
os.chdir(tmp_path)
|
||||
add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'],
|
||||
capture_output=True, env=disable_extractors_dict)
|
||||
assert add_process.returncode == 0, add_process.stderr.decode("utf-8")
|
||||
|
||||
# In the new architecture, URLs are saved to source files
|
||||
# Check that a source file was created with the URL
|
||||
@@ -41,6 +42,7 @@ def test_add_multiple_urls(tmp_path, process, disable_extractors_dict):
|
||||
os.chdir(tmp_path)
|
||||
add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com', 'https://iana.org'],
|
||||
capture_output=True, env=disable_extractors_dict)
|
||||
assert add_process.returncode == 0, add_process.stderr.decode("utf-8")
|
||||
|
||||
# Check that a source file was created with both URLs
|
||||
sources_dir = tmp_path / "sources"
|
||||
@@ -61,6 +63,7 @@ def test_correct_permissions_add_command_results(tmp_path, process, disable_extr
|
||||
os.chdir(tmp_path)
|
||||
add_process = subprocess.run(['archivebox', 'add', '--index-only', 'https://example.com'], capture_output=True,
|
||||
env=disable_extractors_dict)
|
||||
assert add_process.returncode == 0, add_process.stderr.decode("utf-8")
|
||||
|
||||
# Check database permissions
|
||||
assert oct((tmp_path / "index.sqlite3").stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
|
||||
|
||||
@@ -7,7 +7,6 @@ import sqlite3
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
class TestInstallDryRun:
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
import json
|
||||
import subprocess
|
||||
|
||||
from .fixtures import *
|
||||
from .fixtures import disable_extractors_dict, process
|
||||
|
||||
FIXTURES = (disable_extractors_dict, process)
|
||||
|
||||
def test_search_json(process, disable_extractors_dict):
|
||||
subprocess.run(["archivebox", "add", "--index-only", "https://example.com", "--depth=0"],
|
||||
|
||||
@@ -10,10 +10,8 @@ Migration tests from 0.8.x to 0.9.x.
|
||||
- New fields like depth, retry_at, etc.
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
@@ -579,7 +577,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
f"Files lost during migration: {files_before_count} -> {files_after_count}")
|
||||
|
||||
# Run update to trigger filesystem reorganization
|
||||
print(f"\n[*] Running archivebox update to reorganize filesystem...")
|
||||
print("\n[*] Running archivebox update to reorganize filesystem...")
|
||||
result = run_archivebox(self.work_dir, ['update'], timeout=120)
|
||||
self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}")
|
||||
|
||||
@@ -657,7 +655,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
|
||||
# CRITICAL: Verify sample files exist in new structure
|
||||
self.assertGreater(len(new_sample_files), 0,
|
||||
f"Sample files not found in new structure")
|
||||
"Sample files not found in new structure")
|
||||
|
||||
# Verify new path format
|
||||
for path_key, file_path in new_sample_files.items():
|
||||
|
||||
@@ -10,7 +10,6 @@ from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict, recursive_test_site
|
||||
|
||||
|
||||
def wait_for_db_condition(timeout, condition, interval=0.5):
|
||||
@@ -77,7 +76,6 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process, recurs
|
||||
"SAVE_ARCHIVEDOTORG": "false",
|
||||
"SAVE_TITLE": "false",
|
||||
"SAVE_FAVICON": "true",
|
||||
"SAVE_WGET": "false",
|
||||
})
|
||||
|
||||
proc = subprocess.Popen(
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
import os
|
||||
import sqlite3
|
||||
import subprocess
|
||||
|
||||
from .fixtures import *
|
||||
from .fixtures import disable_extractors_dict, process
|
||||
|
||||
FIXTURES = (disable_extractors_dict, process)
|
||||
|
||||
def test_remove_single_snapshot(tmp_path, process, disable_extractors_dict):
|
||||
"""Test removing a snapshot by URL pattern"""
|
||||
|
||||
@@ -7,7 +7,6 @@ import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process
|
||||
|
||||
|
||||
def _fetchone(tmp_path, query):
|
||||
|
||||
420
archivebox/tests/test_schedule_e2e.py
Normal file
420
archivebox/tests/test_schedule_e2e.py
Normal file
@@ -0,0 +1,420 @@
|
||||
#!/usr/bin/env python3
|
||||
"""End-to-end tests for scheduling across CLI, server, API, and web UI."""
|
||||
|
||||
import os
|
||||
import socket
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import sys
|
||||
import textwrap
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from .conftest import run_python_cwd
|
||||
|
||||
|
||||
# Third ancestor directory of this file's resolved path (parents[2]).
# NOTE(review): presumably the repository checkout root, given this file lives
# at archivebox/tests/test_schedule_e2e.py — confirm.
REPO_ROOT = Path(__file__).resolve().parents[2]
|
||||
|
||||
|
||||
def init_archive(cwd: Path) -> None:
    """Run ``archivebox init --quick`` inside ``cwd`` using the current interpreter.

    Fails with the captured stderr as the assertion message when the command
    exits with a non-zero status. The subprocess is limited to 60 seconds.
    """
    init_cmd = [sys.executable, '-m', 'archivebox', 'init', '--quick']
    completed = subprocess.run(init_cmd, cwd=cwd, capture_output=True, text=True, timeout=60)
    assert completed.returncode == 0, completed.stderr
|
||||
|
||||
|
||||
def build_test_env(port: int, **extra: str) -> dict[str, str]:
    """Build the environment mapping passed to ArchiveBox subprocesses in these tests.

    Starts from the current process environment without ``DATA_DIR``, layers the
    fixed test settings on top (host names derived from ``port``, every ``SAVE_*``
    option listed below switched off except ``SAVE_WGET``), and finally applies
    ``extra`` so callers can override any of them.
    """
    # Inherit everything from the parent process except DATA_DIR.
    env = {name: value for name, value in os.environ.items() if name != 'DATA_DIR'}

    server_settings = {
        'LISTEN_HOST': f'archivebox.localhost:{port}',
        'ALLOWED_HOSTS': '*',
        'CSRF_TRUSTED_ORIGINS': f'http://admin.archivebox.localhost:{port}',
        'PUBLIC_ADD_VIEW': 'True',
        'USE_COLOR': 'False',
        'SHOW_PROGRESS': 'False',
        'TIMEOUT': '20',
        'URL_ALLOWLIST': r'127\.0\.0\.1[:/].*',
    }
    disabled_savers = (
        'SAVE_ARCHIVEDOTORG',
        'SAVE_TITLE',
        'SAVE_FAVICON',
        'SAVE_WARC',
        'SAVE_PDF',
        'SAVE_SCREENSHOT',
        'SAVE_DOM',
        'SAVE_SINGLEFILE',
        'SAVE_READABILITY',
        'SAVE_MERCURY',
        'SAVE_GIT',
        'SAVE_YTDLP',
        'SAVE_HEADERS',
        'SAVE_HTMLTOTEXT',
    )
    extractor_settings = dict.fromkeys(disabled_savers, 'False')
    extractor_settings['SAVE_WGET'] = 'True'
    extractor_settings['USE_CHROME'] = 'False'

    # Later layers win: fixed defaults first, caller-supplied overrides last.
    for layer in (server_settings, extractor_settings, extra):
        env.update(layer)
    return env
|
||||
|
||||
|
||||
def get_free_port() -> int:
    """Return a TCP port number the OS reports as free on 127.0.0.1.

    Binds a throwaway socket to port 0 so the OS assigns a port, reads the
    assigned number back, and closes the socket before returning.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.bind(('127.0.0.1', 0))
        # getsockname() yields (host, port) for AF_INET sockets.
        port = probe.getsockname()[1]
    finally:
        probe.close()
    return port
|
||||
|
||||
|
||||
def start_server(cwd: Path, env: dict[str, str], port: int) -> None:
    """Launch ``archivebox server --daemonize`` bound to ``127.0.0.1:<port>``.

    The command runs in ``cwd`` with the supplied ``env`` and a 60 second limit.
    A non-zero exit status fails the caller with the captured stderr.
    """
    bind_address = f'127.0.0.1:{port}'
    server_cmd = [sys.executable, '-m', 'archivebox', 'server', '--daemonize', bind_address]
    completed = subprocess.run(
        server_cmd,
        cwd=cwd,
        env=env,
        capture_output=True,
        text=True,
        timeout=60,
    )
    assert completed.returncode == 0, completed.stderr
|
||||
|
||||
|
||||
def stop_server(cwd: Path) -> None:
    """Stop the supervisord process for the collection in ``cwd``.

    Executes a small Python script through ``run_python_cwd`` that sets up Django
    and calls ``stop_existing_supervisord_process()``. The script's output and
    exit code are intentionally not inspected here.
    """
    shutdown_script = textwrap.dedent(
        """
        import os
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        import django
        django.setup()
        from archivebox.workers.supervisord_util import stop_existing_supervisord_process
        stop_existing_supervisord_process()
        print('stopped')
        """
    )
    run_python_cwd(shutdown_script, cwd=cwd, timeout=30)
|
||||
|
||||
|
||||
def wait_for_http(port: int, host: str, path: str = '/', timeout: int = 30) -> requests.Response:
    """Poll ``http://127.0.0.1:<port><path>`` until it answers with a non-5xx status.

    Args:
        port: TCP port on 127.0.0.1 to connect to.
        host: Value sent in the ``Host`` header of every request.
        path: URL path to request.
        timeout: Overall number of seconds to keep polling.

    Returns:
        The first response whose status code is below 500 (redirects are not followed).

    Raises:
        AssertionError: If no such response arrives in time. The message carries
            the last request error or the last 5xx status that was seen.
    """
    url = f'http://127.0.0.1:{port}{path}'
    # Use the monotonic clock so system clock adjustments cannot stretch or
    # shorten the polling window.
    deadline = time.monotonic() + timeout
    last_failure = None
    while time.monotonic() < deadline:
        try:
            response = requests.get(
                url,
                headers={'Host': host},
                timeout=2,
                allow_redirects=False,
            )
        except requests.RequestException as exc:
            last_failure = exc
        else:
            if response.status_code < 500:
                return response
            # Remember server errors too, so a timeout caused only by 5xx
            # replies does not end up reported as "None".
            last_failure = f'HTTP {response.status_code}'
        time.sleep(0.5)
    raise AssertionError(f'Timed out waiting for HTTP on {host}: {last_failure}')
|
||||
|
||||
|
||||
def make_latest_schedule_due(cwd: Path) -> None:
    """Backdate the template crawl of the most recently created schedule by two days.

    Opens ``index.sqlite3`` in ``cwd`` and sets ``created_at`` and ``modified_at``
    of the ``crawls_crawl`` row referenced by the newest ``crawls_crawlschedule``
    row (via ``template_id``) to "now minus 2 days".
    NOTE(review): presumably this is what makes the schedule count as due — confirm
    against the scheduler's due-date logic.
    """
    backdate_sql = """
        UPDATE crawls_crawl
        SET created_at = datetime('now', '-2 day'),
            modified_at = datetime('now', '-2 day')
        WHERE id = (
            SELECT template_id
            FROM crawls_crawlschedule
            ORDER BY created_at DESC
            LIMIT 1
        )
    """
    db = sqlite3.connect(cwd / 'index.sqlite3')
    try:
        db.execute(backdate_sql)
        db.commit()
    finally:
        db.close()
|
||||
|
||||
|
||||
def get_snapshot_file_text(cwd: Path, url: str) -> str:
    """Return the text of one captured html/txt file for the newest snapshot of ``url``.

    A helper script is run through ``run_python_cwd`` in ``cwd``. It loads the most
    recently created ``Snapshot`` with that URL, asserts it exists and has status
    ``'sealed'``, and collects files under its output directory: first the
    preferred wget/trafilatura/defuddle patterns, then, if none match, any
    ``.html``/``.htm``/``.txt`` file outside ``responses/`` that is not a log or
    ``cmd.sh`` file. The first candidate's text is printed and returned as stdout.

    Raises:
        AssertionError: If the helper script exits with a non-zero code (the
            message is the script's stderr).
    """
    # The URL is embedded with !r so it arrives in the script as a quoted literal;
    # doubled braces keep the inner f-string placeholder for the script itself.
    lookup_script = textwrap.dedent(
        f"""
        import os
        from pathlib import Path

        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        import django
        django.setup()

        from archivebox.core.models import Snapshot

        snapshot = Snapshot.objects.filter(url={url!r}).order_by('-created_at').first()
        assert snapshot is not None, 'missing snapshot'
        assert snapshot.status == 'sealed', snapshot.status

        snapshot_dir = Path(snapshot.output_dir)
        candidates = []
        preferred_patterns = (
            'wget/**/index.html',
            'wget/**/*.html',
            'trafilatura/content.html',
            'trafilatura/content.txt',
            'defuddle/content.html',
            'defuddle/content.txt',
        )
        for pattern in preferred_patterns:
            for candidate in snapshot_dir.glob(pattern):
                if candidate.is_file():
                    candidates.append(candidate)

        if not candidates:
            for candidate in snapshot_dir.rglob('*'):
                if not candidate.is_file():
                    continue
                rel = candidate.relative_to(snapshot_dir)
                if rel.parts and rel.parts[0] == 'responses':
                    continue
                if candidate.suffix not in ('.html', '.htm', '.txt'):
                    continue
                if candidate.name in ('stdout.log', 'stderr.log', 'cmd.sh'):
                    continue
                candidates.append(candidate)

        assert candidates, f'no captured html/txt files found in {{snapshot_dir}}'
        print(candidates[0].read_text(errors='ignore'))
        """
    )
    captured_out, captured_err, exit_code = run_python_cwd(lookup_script, cwd=cwd, timeout=60)
    assert exit_code == 0, captured_err
    return captured_out
|
||||
|
||||
|
||||
def wait_for_snapshot_capture(cwd: Path, url: str, timeout: int = 180) -> str:
    """Poll ``get_snapshot_file_text`` until it succeeds and return the captured text.

    Args:
        cwd: Directory passed through to ``get_snapshot_file_text``.
        url: Snapshot URL passed through to ``get_snapshot_file_text``.
        timeout: Overall number of seconds to keep retrying.

    Returns:
        The text returned by the first successful ``get_snapshot_file_text`` call.

    Raises:
        AssertionError: If every attempt within ``timeout`` raised
            ``AssertionError``; the message includes the last such error.
    """
    # Use the monotonic clock so system clock adjustments cannot stretch or
    # shorten the polling window.
    deadline = time.monotonic() + timeout
    last_error = None
    while time.monotonic() < deadline:
        try:
            return get_snapshot_file_text(cwd, url)
        except AssertionError as err:
            last_error = err
            # Pause before the next attempt.
            time.sleep(2)
    raise AssertionError(f'timed out waiting for captured content for {url}: {last_error}')
|
||||
|
||||
|
||||
def get_counts(cwd: Path, scheduled_url: str, one_shot_url: str) -> tuple[int, int, int]:
    """Count snapshots and scheduled crawls recorded in ``cwd / 'index.sqlite3'``.

    Returns a 3-tuple of:
      1. ``core_snapshot`` rows whose ``url`` equals ``scheduled_url``,
      2. ``core_snapshot`` rows whose ``url`` equals ``one_shot_url``,
      3. ``crawls_crawl`` rows with a non-NULL ``schedule_id`` whose ``urls``
         equals ``scheduled_url``.
    """
    snapshot_count_sql = "SELECT COUNT(*) FROM core_snapshot WHERE url = ?"
    scheduled_crawl_count_sql = """
        SELECT COUNT(*)
        FROM crawls_crawl
        WHERE schedule_id IS NOT NULL
          AND urls = ?
    """
    db = sqlite3.connect(cwd / 'index.sqlite3')
    try:
        def count(sql: str, value: str) -> int:
            # COUNT(*) always yields exactly one row with one column.
            return db.execute(sql, (value,)).fetchone()[0]

        scheduled_snapshots = count(snapshot_count_sql, scheduled_url)
        one_shot_snapshots = count(snapshot_count_sql, one_shot_url)
        scheduled_crawls = count(scheduled_crawl_count_sql, scheduled_url)
        return scheduled_snapshots, one_shot_snapshots, scheduled_crawls
    finally:
        db.close()
|
||||
|
||||
|
||||
def create_admin_and_token(cwd: Path) -> str:
    """Create (or update) the ``apitestadmin`` superuser and return a new API token.

    A helper script is run through ``run_python_cwd`` in ``cwd``. It gets or
    creates the user, forces the staff/superuser flags and password, creates an
    ``APIToken`` expiring one day from now, and prints the token value. The last
    line of the script's stdout is returned.

    Raises:
        AssertionError: If the helper script exits with a non-zero code (the
            message is the script's stderr).
    """
    provisioning_script = textwrap.dedent(
        """
        import os
        from datetime import timedelta
        from django.utils import timezone

        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        import django
        django.setup()

        from django.contrib.auth import get_user_model
        from archivebox.api.models import APIToken

        User = get_user_model()
        user, _ = User.objects.get_or_create(
            username='apitestadmin',
            defaults={
                'email': 'apitestadmin@example.com',
                'is_staff': True,
                'is_superuser': True,
            },
        )
        user.is_staff = True
        user.is_superuser = True
        user.set_password('testpass123')
        user.save()

        token = APIToken.objects.create(
            created_by=user,
            expires=timezone.now() + timedelta(days=1),
        )
        print(token.token)
        """
    )
    captured_out, captured_err, exit_code = run_python_cwd(provisioning_script, cwd=cwd, timeout=60)
    assert exit_code == 0, captured_err
    # Only the final stdout line is the token; anything printed before it is ignored.
    output_lines = captured_out.strip().splitlines()
    return output_lines[-1]
|
||||
|
||||
|
||||
@pytest.mark.timeout(180)
def test_server_processes_due_cli_schedule_and_saves_real_content(tmp_path, recursive_test_site):
    """A schedule created via the CLI and made due is captured once the server runs.

    Creates a daily schedule for the test site's root URL, backdates it with
    ``make_latest_schedule_due``, starts the daemonized server, and waits until
    captured content for the root URL contains both 'Root' and 'About'.
    """
    os.chdir(tmp_path)
    init_archive(tmp_path)

    port = get_free_port()
    env = build_test_env(port)

    root_url = recursive_test_site['root_url']
    schedule_cmd = [sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', root_url]
    schedule_result = subprocess.run(
        schedule_cmd,
        cwd=tmp_path,
        env=env,
        capture_output=True,
        text=True,
        timeout=60,
    )
    assert schedule_result.returncode == 0, schedule_result.stderr
    assert 'Created scheduled crawl' in schedule_result.stdout

    make_latest_schedule_due(tmp_path)

    try:
        start_server(tmp_path, env=env, port=port)
        wait_for_http(port, host=f'web.archivebox.localhost:{port}')
        # NOTE(review): this 180s wait equals the pytest timeout mark above, so the
        # mark will presumably fire before this helper's own timeout message can —
        # confirm whether that is intended.
        captured_text = wait_for_snapshot_capture(tmp_path, root_url, timeout=180)
        assert 'Root' in captured_text
        assert 'About' in captured_text
    finally:
        # Always shut the daemonized server down, even when an assertion failed.
        stop_server(tmp_path)
|
||||
|
||||
|
||||
@pytest.mark.timeout(180)
def test_archivebox_add_remains_one_shot_even_when_schedule_is_due(tmp_path, recursive_test_site):
    """``archivebox add`` captures only its own URL and does not run a due schedule.

    Creates and backdates a daily schedule for the root URL, then runs a plain
    ``archivebox add`` for a child URL. Afterwards the child URL must have at
    least one snapshot, the scheduled URL none, and exactly one scheduled crawl
    row (the template) must exist for the scheduled URL.
    """
    os.chdir(tmp_path)
    init_archive(tmp_path)

    port = get_free_port()
    env = build_test_env(port)
    scheduled_url = recursive_test_site['root_url']
    one_shot_url = recursive_test_site['child_urls'][0]

    # Keyword arguments shared by both CLI invocations below.
    cli_kwargs = dict(cwd=tmp_path, capture_output=True, text=True, env=env)

    schedule_cmd = [sys.executable, '-m', 'archivebox', 'schedule', '--every=daily', '--depth=0', scheduled_url]
    schedule_result = subprocess.run(schedule_cmd, timeout=60, **cli_kwargs)
    assert schedule_result.returncode == 0, schedule_result.stderr

    make_latest_schedule_due(tmp_path)

    # NOTE(review): the worst-case step timeouts below (120s + 120s) exceed the
    # 180s pytest timeout mark — confirm the budget is intended.
    add_cmd = [sys.executable, '-m', 'archivebox', 'add', '--depth=0', '--plugins=wget', one_shot_url]
    add_result = subprocess.run(add_cmd, timeout=120, **cli_kwargs)
    assert add_result.returncode == 0, add_result.stderr

    captured_text = wait_for_snapshot_capture(tmp_path, one_shot_url, timeout=120)
    assert 'Deep About' in captured_text or 'About' in captured_text

    counts = get_counts(tmp_path, scheduled_url, one_shot_url)
    scheduled_snapshots, one_shot_snapshots, scheduled_crawls = counts
    assert one_shot_snapshots >= 1
    assert scheduled_snapshots == 0
    assert scheduled_crawls == 1  # template only, no materialized scheduled run
|
||||
|
||||
|
||||
@pytest.mark.timeout(180)
def test_schedule_rest_api_works_over_running_server(tmp_path, recursive_test_site):
    """POSTing to ``/api/v1/cli/schedule`` on a running server creates one schedule.

    Authenticates with a freshly created API token and expects a 200 JSON reply
    reporting success, ``result_format == 'json'``, and exactly one created
    schedule id.
    """
    os.chdir(tmp_path)
    init_archive(tmp_path)

    port = get_free_port()
    env = build_test_env(port)
    api_token = create_admin_and_token(tmp_path)

    api_host = f'api.archivebox.localhost:{port}'
    try:
        start_server(tmp_path, env=env, port=port)
        wait_for_http(port, host=api_host, path='/api/v1/docs')

        request_headers = {
            'Host': api_host,
            'X-ArchiveBox-API-Key': api_token,
        }
        request_body = {
            'every': 'daily',
            'import_path': recursive_test_site['root_url'],
            'quiet': True,
        }
        response = requests.post(
            f'http://127.0.0.1:{port}/api/v1/cli/schedule',
            headers=request_headers,
            json=request_body,
            timeout=10,
        )

        assert response.status_code == 200, response.text
        payload = response.json()
        assert payload['success'] is True
        assert payload['result_format'] == 'json'
        assert len(payload['result']['created_schedule_ids']) == 1
    finally:
        # Always shut the daemonized server down, even when an assertion failed.
        stop_server(tmp_path)
|
||||
|
||||
|
||||
@pytest.mark.timeout(180)
def test_schedule_web_ui_post_works_over_running_server(tmp_path, recursive_test_site):
    """Submitting the web ``/add/`` form with a schedule stores a scheduled crawl.

    Posts the form to a running server, expects a 302/303 redirect, and then
    checks that the newest ``crawls_crawlschedule`` row joined to its crawl holds
    the submitted schedule, URL and tag.
    """
    os.chdir(tmp_path)
    init_archive(tmp_path)

    port = get_free_port()
    env = build_test_env(port, PUBLIC_ADD_VIEW='True')

    web_host = f'web.archivebox.localhost:{port}'
    latest_schedule_sql = """
        SELECT cs.schedule, c.urls, c.tags_str
        FROM crawls_crawlschedule cs
        JOIN crawls_crawl c ON c.schedule_id = cs.id
        ORDER BY cs.created_at DESC
        LIMIT 1
    """
    try:
        start_server(tmp_path, env=env, port=port)
        wait_for_http(port, host=web_host, path='/add/')

        form_fields = {
            'url': recursive_test_site['root_url'],
            'depth': '0',
            'schedule': 'daily',
            'tag': 'web-ui',
            'notes': 'created from web ui',
        }
        response = requests.post(
            f'http://127.0.0.1:{port}/add/',
            headers={'Host': web_host},
            data=form_fields,
            timeout=10,
            allow_redirects=False,
        )

        assert response.status_code in (302, 303), response.text

        db = sqlite3.connect(tmp_path / 'index.sqlite3')
        try:
            row = db.execute(latest_schedule_sql).fetchone()
        finally:
            db.close()

        assert row == ('daily', recursive_test_site['root_url'], 'web-ui')
    finally:
        # Always shut the daemonized server down, even when an assertion failed.
        stop_server(tmp_path)
|
||||
@@ -3,12 +3,9 @@
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_search_returns_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
@@ -6,13 +6,11 @@ import subprocess
|
||||
import sqlite3
|
||||
from archivebox.machine.models import Process
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
import uuid
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_extractors_dict):
|
||||
@@ -46,9 +44,7 @@ def test_snapshot_creates_snapshot_with_correct_url(tmp_path, process, disable_e
|
||||
|
||||
snapshot_id_raw, snapshot_created_at, snapshot_url, crawl_id = snapshot_row
|
||||
snapshot_id = str(uuid.UUID(snapshot_id_raw))
|
||||
crawl_id, crawl_created_at, crawl_urls, crawl_created_by_id = crawl_row
|
||||
username = user_row[0]
|
||||
crawl_date_str = datetime.fromisoformat(crawl_created_at).strftime('%Y%m%d')
|
||||
snapshot_date_str = datetime.fromisoformat(snapshot_created_at).strftime('%Y%m%d')
|
||||
domain = urlparse(snapshot_url).hostname or 'unknown'
|
||||
|
||||
|
||||
@@ -3,11 +3,9 @@
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
def test_status_shows_index_info(tmp_path, process):
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
import os
|
||||
import sqlite3
|
||||
import subprocess
|
||||
|
||||
from .fixtures import *
|
||||
from .fixtures import disable_extractors_dict, process
|
||||
|
||||
FIXTURES = (disable_extractors_dict, process)
|
||||
|
||||
def test_title_is_extracted(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that title is extracted from the page."""
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
import json
|
||||
import sqlite3
|
||||
import subprocess
|
||||
|
||||
from .fixtures import *
|
||||
from .fixtures import disable_extractors_dict, process
|
||||
|
||||
FIXTURES = (disable_extractors_dict, process)
|
||||
|
||||
def test_update_imports_orphaned_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that archivebox update imports real legacy archive directories."""
|
||||
|
||||
@@ -3,11 +3,9 @@
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from .fixtures import process, disable_extractors_dict
|
||||
|
||||
|
||||
class TestVersionQuiet:
|
||||
|
||||
@@ -18,11 +18,9 @@ Config priority order (highest to lowest):
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import sys
|
||||
import tempfile
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@@ -45,7 +43,7 @@ def test_config_propagation_through_worker_hierarchy():
|
||||
data_dir.mkdir()
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Test: Config Propagation Through Worker Hierarchy")
|
||||
print("Test: Config Propagation Through Worker Hierarchy")
|
||||
print(f"DATA_DIR: {data_dir}")
|
||||
print(f"{'='*80}\n")
|
||||
|
||||
@@ -63,7 +61,7 @@ def test_config_propagation_through_worker_hierarchy():
|
||||
timeout=60,
|
||||
)
|
||||
assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
|
||||
print(f"✓ Archive initialized\n")
|
||||
print("✓ Archive initialized\n")
|
||||
|
||||
# Step 2: Write custom config to ArchiveBox.conf
|
||||
print("Step 2: Write custom config to ArchiveBox.conf")
|
||||
@@ -90,7 +88,7 @@ SAVE_TITLE = True
|
||||
SAVE_FAVICON = True
|
||||
SAVE_SCREENSHOT = True
|
||||
""")
|
||||
print(f"✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")
|
||||
print("✓ Wrote config file with TIMEOUT=999, all plugins enabled\n")
|
||||
|
||||
# Step 2.5: Set Machine.config values
|
||||
print("Step 2.5: Set Machine.config with custom binary path")
|
||||
@@ -123,7 +121,7 @@ print(f"Machine {{machine.hostname}} config updated")
|
||||
timeout=30,
|
||||
)
|
||||
assert result.returncode == 0, f"Set machine config failed: {result.stderr.decode()}"
|
||||
print(f"✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")
|
||||
print("✓ Set Machine.config with CUSTOM_MACHINE_KEY=from_machine_config, WGET_BINARY=/custom/machine/wget\n")
|
||||
|
||||
# Step 3: Create Crawl via Django ORM with custom crawl.config
|
||||
print("Step 3: Create Crawl with custom crawl.config JSON")
|
||||
@@ -421,7 +419,7 @@ def test_config_environment_variable_parsing():
|
||||
data_dir.mkdir()
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Test: Config Environment Variable Parsing")
|
||||
print("Test: Config Environment Variable Parsing")
|
||||
print(f"DATA_DIR: {data_dir}")
|
||||
print(f"{'='*80}\n")
|
||||
|
||||
@@ -557,7 +555,7 @@ def test_parent_environment_preserved_in_hooks():
|
||||
data_dir.mkdir()
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Test: Parent Environment Preserved in Hooks")
|
||||
print("Test: Parent Environment Preserved in Hooks")
|
||||
print(f"DATA_DIR: {data_dir}")
|
||||
print(f"{'='*80}\n")
|
||||
|
||||
@@ -575,7 +573,7 @@ def test_parent_environment_preserved_in_hooks():
|
||||
timeout=60,
|
||||
)
|
||||
assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
|
||||
print(f"✓ Archive initialized\n")
|
||||
print("✓ Archive initialized\n")
|
||||
|
||||
# Create snapshot
|
||||
print("Step 2: Create Snapshot")
|
||||
@@ -635,7 +633,6 @@ print(snapshot.id)
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
stdout = result.stdout.decode()
|
||||
stderr = result.stderr.decode()
|
||||
|
||||
print("\n--- SnapshotWorker stderr (first 50 lines) ---")
|
||||
@@ -760,7 +757,7 @@ def test_config_auto_fetch_relationships():
|
||||
data_dir.mkdir()
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Test: Config Auto-Fetch Relationships")
|
||||
print("Test: Config Auto-Fetch Relationships")
|
||||
print(f"DATA_DIR: {data_dir}")
|
||||
print(f"{'='*80}\n")
|
||||
|
||||
@@ -778,7 +775,7 @@ def test_config_auto_fetch_relationships():
|
||||
timeout=60,
|
||||
)
|
||||
assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
|
||||
print(f"✓ Archive initialized\n")
|
||||
print("✓ Archive initialized\n")
|
||||
|
||||
# Create objects with config at each level
|
||||
print("Step 2: Create Crawl -> Snapshot -> ArchiveResult with config at each level")
|
||||
@@ -906,7 +903,7 @@ def test_config_precedence_with_environment_vars():
|
||||
data_dir.mkdir()
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Test: Config Precedence with Environment Variables")
|
||||
print("Test: Config Precedence with Environment Variables")
|
||||
print(f"DATA_DIR: {data_dir}")
|
||||
print(f"{'='*80}\n")
|
||||
|
||||
@@ -1006,7 +1003,7 @@ def test_new_environment_variables_added():
|
||||
data_dir.mkdir()
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Test: New Environment Variables Added to Config")
|
||||
print("Test: New Environment Variables Added to Config")
|
||||
print(f"DATA_DIR: {data_dir}")
|
||||
print(f"{'='*80}\n")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user