Files
ArchiveBox/archivebox/core/tests.py
2026-03-15 20:47:28 -07:00

383 lines
14 KiB
Python

"""Tests for the core views, especially AddView."""
import importlib
import os
import django
from unittest.mock import patch
from typing import TypeVar, cast
from django.forms import BaseForm
# Set up Django before importing any Django-dependent modules
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
django.setup()
TestCase = importlib.import_module('django.test').TestCase
Client = importlib.import_module('django.test').Client
User = importlib.import_module('django.contrib.auth.models').User
reverse = importlib.import_module('django.urls').reverse
Crawl = importlib.import_module('archivebox.crawls.models').Crawl
CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedule
Tag = importlib.import_module('archivebox.core.models').Tag
SERVER_CONFIG = importlib.import_module('archivebox.config.common').SERVER_CONFIG
T = TypeVar('T')
def require(value: T | None) -> T:
    """Narrow an Optional value: return it if present, fail the test otherwise."""
    if value is not None:
        return value
    raise AssertionError('Expected value to be present')
class AddViewTests(TestCase):
"""Tests for the AddView (crawl creation form)."""
def setUp(self):
"""Set up test user and client."""
self.client = Client()
self.user = User.objects.create_user(
username='testuser',
password='testpass123',
email='test@example.com'
)
self.client.login(username='testuser', password='testpass123')
self.add_url = reverse('add')
def test_add_view_get_requires_auth(self):
"""Test that GET /add requires authentication."""
self.client.logout()
response = self.client.get(self.add_url)
# Should redirect to login or show 403/404
self.assertIn(response.status_code, [302, 403, 404])
def test_add_view_get_shows_form(self):
"""Test that GET /add shows the form with all fields."""
response = self.client.get(self.add_url)
self.assertEqual(response.status_code, 200)
# Check that form fields are present
self.assertContains(response, 'name="url"')
self.assertContains(response, 'name="tag"')
self.assertContains(response, 'name="depth"')
self.assertContains(response, 'name="notes"')
self.assertContains(response, 'name="schedule"')
self.assertContains(response, 'name="persona"')
self.assertContains(response, 'name="overwrite"')
self.assertContains(response, 'name="update"')
self.assertContains(response, 'name="index_only"')
# Check for plugin groups
self.assertContains(response, 'name="chrome_plugins"')
self.assertContains(response, 'name="archiving_plugins"')
self.assertContains(response, 'name="parsing_plugins"')
def test_add_view_shows_tag_autocomplete(self):
"""Test that tag autocomplete datalist is rendered."""
# Create some tags
Tag.objects.create(name='test-tag-1')
Tag.objects.create(name='test-tag-2')
response = self.client.get(self.add_url)
self.assertEqual(response.status_code, 200)
# Check for datalist with tags
self.assertContains(response, 'id="tag-datalist"')
self.assertContains(response, 'test-tag-1')
self.assertContains(response, 'test-tag-2')
def test_add_view_shows_plugin_presets(self):
"""Test that plugin preset buttons are rendered."""
response = self.client.get(self.add_url)
self.assertEqual(response.status_code, 200)
self.assertContains(response, 'Quick Archive')
self.assertContains(response, 'Full Chrome')
self.assertContains(response, 'Text Only')
self.assertContains(response, 'Select All')
self.assertContains(response, 'Clear All')
def test_add_view_shows_links_to_resources(self):
"""Test that helpful links are present."""
response = self.client.get(self.add_url)
self.assertEqual(response.status_code, 200)
# Link to plugin documentation
self.assertContains(response, '/admin/environment/plugins/')
# Link to create new persona
self.assertContains(response, '/admin/personas/persona/add/')
def test_add_basic_crawl_without_schedule(self):
"""Test creating a basic crawl without a schedule."""
response = self.client.post(self.add_url, {
'url': 'https://example.com\nhttps://example.org',
'tag': 'test-tag',
'depth': '0',
'notes': 'Test crawl notes',
})
# Should redirect to crawl admin page
self.assertEqual(response.status_code, 302)
# Check that crawl was created
self.assertEqual(Crawl.objects.count(), 1)
crawl = require(Crawl.objects.first())
self.assertIn('https://example.com', crawl.urls)
self.assertIn('https://example.org', crawl.urls)
self.assertEqual(crawl.tags_str, 'test-tag')
self.assertEqual(crawl.max_depth, 0)
self.assertEqual(crawl.notes, 'Test crawl notes')
self.assertEqual(crawl.created_by, self.user)
# No schedule should be created
self.assertIsNone(crawl.schedule)
self.assertEqual(CrawlSchedule.objects.count(), 0)
def test_add_crawl_with_schedule(self):
"""Test creating a crawl with a repeat schedule."""
response = self.client.post(self.add_url, {
'url': 'https://example.com',
'tag': 'scheduled',
'depth': '1',
'notes': 'Daily crawl',
'schedule': 'daily',
})
self.assertEqual(response.status_code, 302)
# Check that crawl and schedule were created
self.assertEqual(Crawl.objects.count(), 1)
self.assertEqual(CrawlSchedule.objects.count(), 1)
crawl = require(Crawl.objects.first())
schedule = require(CrawlSchedule.objects.first())
self.assertEqual(crawl.schedule, schedule)
self.assertEqual(schedule.template, crawl)
self.assertEqual(schedule.schedule, 'daily')
self.assertTrue(schedule.is_enabled)
self.assertEqual(schedule.created_by, self.user)
def test_add_crawl_with_cron_schedule(self):
"""Test creating a crawl with a cron format schedule."""
response = self.client.post(self.add_url, {
'url': 'https://example.com',
'depth': '0',
'schedule': '0 */6 * * *', # Every 6 hours
})
self.assertEqual(response.status_code, 302)
schedule = require(CrawlSchedule.objects.first())
self.assertEqual(schedule.schedule, '0 */6 * * *')
def test_add_crawl_with_plugins(self):
"""Test creating a crawl with specific plugins selected."""
response = self.client.post(self.add_url, {
'url': 'https://example.com',
'depth': '0',
'chrome_plugins': ['screenshot', 'dom'],
'archiving_plugins': ['wget'],
})
self.assertEqual(response.status_code, 302)
crawl = require(Crawl.objects.first())
plugins = crawl.config.get('PLUGINS', '')
# Should contain the selected plugins
self.assertIn('screenshot', plugins)
self.assertIn('dom', plugins)
self.assertIn('wget', plugins)
def test_add_crawl_with_depth_range(self):
"""Test creating crawls with different depth values (0-4)."""
for depth in range(5):
response = self.client.post(self.add_url, {
'url': f'https://example{depth}.com',
'depth': str(depth),
})
self.assertEqual(response.status_code, 302)
self.assertEqual(Crawl.objects.count(), 5)
for i, crawl in enumerate(Crawl.objects.order_by('created_at')):
self.assertEqual(crawl.max_depth, i)
def test_add_crawl_with_advanced_options(self):
"""Test creating a crawl with advanced options."""
response = self.client.post(self.add_url, {
'url': 'https://example.com',
'depth': '0',
'persona': 'CustomPersona',
'overwrite': True,
'update': True,
'index_only': True,
})
self.assertEqual(response.status_code, 302)
crawl = require(Crawl.objects.first())
config = crawl.config
self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
self.assertEqual(config.get('OVERWRITE'), True)
self.assertEqual(config.get('ONLY_NEW'), False) # opposite of update
self.assertEqual(config.get('INDEX_ONLY'), True)
def test_add_crawl_with_custom_config(self):
"""Test creating a crawl with custom config overrides."""
# Note: Django test client can't easily POST the KeyValueWidget format,
# so this test would need to use the form directly or mock the cleaned_data
# For now, we'll skip this test or mark it as TODO
pass
def test_add_public_anonymous_custom_config_is_silently_stripped(self):
"""Anonymous users cannot override crawl config, even with PUBLIC_ADD_VIEW enabled."""
self.client.logout()
with patch.object(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True):
response = self.client.post(self.add_url, {
'url': 'https://example.com',
'depth': '0',
'config': '{"YTDLP_ARGS_EXTRA":["--exec","id > /tmp/pwned"]}',
})
self.assertEqual(response.status_code, 302)
crawl = require(Crawl.objects.order_by('-created_at').first())
self.assertNotIn('YTDLP_ARGS_EXTRA', crawl.config)
def test_add_authenticated_non_admin_custom_config_is_silently_stripped(self):
"""Authenticated non-admin users cannot override crawl config."""
response = self.client.post(self.add_url, {
'url': 'https://example.com',
'depth': '0',
'config': '{"YTDLP_ARGS_EXTRA":["--exec","id > /tmp/pwned"]}',
})
self.assertEqual(response.status_code, 302)
crawl = require(Crawl.objects.order_by('-created_at').first())
self.assertNotIn('YTDLP_ARGS_EXTRA', crawl.config)
def test_add_staff_admin_custom_config_is_allowed(self):
"""Admin users can override crawl config."""
self.client.logout()
User.objects.create_user(
username='adminuser',
password='adminpass123',
email='admin@example.com',
is_staff=True,
)
self.client.login(username='adminuser', password='adminpass123')
response = self.client.post(self.add_url, {
'url': 'https://example.com',
'depth': '0',
'config': '{"YTDLP_ARGS_EXTRA":["--exec","echo hello"]}',
})
self.assertEqual(response.status_code, 302)
crawl = require(Crawl.objects.order_by('-created_at').first())
self.assertEqual(crawl.config.get('YTDLP_ARGS_EXTRA'), ['--exec', 'echo hello'])
def test_add_empty_urls_fails(self):
"""Test that submitting without URLs fails validation."""
response = self.client.post(self.add_url, {
'url': '',
'depth': '0',
})
# Should show form again with errors, not redirect
self.assertEqual(response.status_code, 200)
self.assertFormError(cast(BaseForm, response.context['form']), 'url', 'This field is required.')
def test_add_invalid_urls_fails(self):
"""Test that invalid URLs fail validation."""
response = self.client.post(self.add_url, {
'url': 'not-a-url',
'depth': '0',
})
# Should show form again with errors
self.assertEqual(response.status_code, 200)
# Check for validation error (URL regex should fail)
self.assertContains(response, 'error')
def test_add_success_message_without_schedule(self):
"""Test that success message is shown without schedule link."""
response = self.client.post(self.add_url, {
'url': 'https://example.com\nhttps://example.org',
'depth': '0',
}, follow=True)
# Check success message mentions crawl creation
messages = list(response.context['messages'])
self.assertEqual(len(messages), 1)
message_text = str(messages[0])
self.assertIn('Created crawl with 2 starting URL', message_text)
self.assertIn('View Crawl', message_text)
self.assertNotIn('scheduled to repeat', message_text)
def test_add_success_message_with_schedule(self):
"""Test that success message includes schedule link."""
response = self.client.post(self.add_url, {
'url': 'https://example.com',
'depth': '0',
'schedule': 'weekly',
}, follow=True)
# Check success message mentions schedule
messages = list(response.context['messages'])
self.assertEqual(len(messages), 1)
message_text = str(messages[0])
self.assertIn('Created crawl', message_text)
self.assertIn('scheduled to repeat weekly', message_text)
self.assertIn('View Crawl', message_text)
def test_add_crawl_creates_source_file(self):
"""Test that crawl creation saves URLs to sources file."""
response = self.client.post(self.add_url, {
'url': 'https://example.com',
'depth': '0',
})
self.assertEqual(response.status_code, 302)
# Check that source file was created in sources/ directory
from archivebox.config import CONSTANTS
sources_dir = CONSTANTS.SOURCES_DIR
# Should have created a source file
source_files = list(sources_dir.glob('*__web_ui_add_by_user_*.txt'))
self.assertGreater(len(source_files), 0)
def test_multiple_tags_are_saved(self):
"""Test that multiple comma-separated tags are saved."""
response = self.client.post(self.add_url, {
'url': 'https://example.com',
'depth': '0',
'tag': 'tag1,tag2,tag3',
})
self.assertEqual(response.status_code, 302)
crawl = require(Crawl.objects.first())
self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')
def test_crawl_redirects_to_admin_change_page(self):
"""Test that successful submission redirects to crawl admin page."""
response = self.client.post(self.add_url, {
'url': 'https://example.com',
'depth': '0',
})
crawl = require(Crawl.objects.first())
expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'
self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)