"""Tests for the core views, especially AddView."""
import os
import django
# Set up Django before importing any Django-dependent modules
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
django.setup()
from django.test import TestCase, Client
from django.contrib.auth.models import User
from django.urls import reverse
from archivebox.crawls.models import Crawl, CrawlSchedule
from archivebox.core.models import Tag


class AddViewTests(TestCase):
    """Tests for the AddView (crawl creation form)."""

    def setUp(self):
        """Set up test user and client."""
        self.client = Client()
        self.user = User.objects.create_user(
            username='testuser',
            password='testpass123',
            email='test@example.com',
        )
        self.client.login(username='testuser', password='testpass123')
        self.add_url = reverse('add')

    def test_add_view_get_requires_auth(self):
        """Test that GET /add requires authentication."""
        self.client.logout()
        response = self.client.get(self.add_url)
        # Should redirect to login or show 403/404
        self.assertIn(response.status_code, [302, 403, 404])

    def test_add_view_get_shows_form(self):
        """Test that GET /add shows the form with all fields."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Check that form fields are present
        self.assertContains(response, 'name="url"')
        self.assertContains(response, 'name="tag"')
        self.assertContains(response, 'name="depth"')
        self.assertContains(response, 'name="notes"')
        self.assertContains(response, 'name="schedule"')
        self.assertContains(response, 'name="persona"')
        self.assertContains(response, 'name="overwrite"')
        self.assertContains(response, 'name="update"')
        self.assertContains(response, 'name="index_only"')
        # Check for plugin groups
        self.assertContains(response, 'name="chrome_plugins"')
        self.assertContains(response, 'name="archiving_plugins"')
        self.assertContains(response, 'name="parsing_plugins"')

    def test_add_view_shows_tag_autocomplete(self):
        """Test that tag autocomplete datalist is rendered."""
        # Create some tags
        Tag.objects.create(name='test-tag-1')
        Tag.objects.create(name='test-tag-2')
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Check for datalist with tags
        self.assertContains(response, 'id="tag-datalist"')
        self.assertContains(response, 'test-tag-1')
        self.assertContains(response, 'test-tag-2')

    def test_add_view_shows_plugin_presets(self):
        """Test that plugin preset buttons are rendered."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        self.assertContains(response, 'Quick Archive')
        self.assertContains(response, 'Full Chrome')
        self.assertContains(response, 'Text Only')
        self.assertContains(response, 'Select All')
        self.assertContains(response, 'Clear All')

    def test_add_view_shows_links_to_resources(self):
        """Test that helpful links are present."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Link to plugin documentation
        self.assertContains(response, '/admin/environment/plugins/')
        # Link to create new persona
        self.assertContains(response, '/admin/personas/persona/add/')

    def test_add_basic_crawl_without_schedule(self):
        """Test creating a basic crawl without a schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'tag': 'test-tag',
            'depth': '0',
            'notes': 'Test crawl notes',
        })
        # Should redirect to crawl admin page
        self.assertEqual(response.status_code, 302)
        # Check that crawl was created
        self.assertEqual(Crawl.objects.count(), 1)
        crawl = Crawl.objects.first()
        self.assertIn('https://example.com', crawl.urls)
        self.assertIn('https://example.org', crawl.urls)
        self.assertEqual(crawl.tags_str, 'test-tag')
        self.assertEqual(crawl.max_depth, 0)
        self.assertEqual(crawl.notes, 'Test crawl notes')
        self.assertEqual(crawl.created_by, self.user)
        # No schedule should be created
        self.assertIsNone(crawl.schedule)
        self.assertEqual(CrawlSchedule.objects.count(), 0)

    def test_add_crawl_with_schedule(self):
        """Test creating a crawl with a repeat schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'tag': 'scheduled',
            'depth': '1',
            'notes': 'Daily crawl',
            'schedule': 'daily',
        })
        self.assertEqual(response.status_code, 302)
        # Check that crawl and schedule were created
        self.assertEqual(Crawl.objects.count(), 1)
        self.assertEqual(CrawlSchedule.objects.count(), 1)
        crawl = Crawl.objects.first()
        schedule = CrawlSchedule.objects.first()
        self.assertEqual(crawl.schedule, schedule)
        self.assertEqual(schedule.template, crawl)
        self.assertEqual(schedule.schedule, 'daily')
        self.assertTrue(schedule.is_enabled)
        self.assertEqual(schedule.created_by, self.user)

    def test_add_crawl_with_cron_schedule(self):
        """Test creating a crawl with a cron format schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': '0 */6 * * *',  # Every 6 hours
        })
        self.assertEqual(response.status_code, 302)
        schedule = CrawlSchedule.objects.first()
        self.assertEqual(schedule.schedule, '0 */6 * * *')

    def test_add_crawl_with_plugins(self):
        """Test creating a crawl with specific plugins selected."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'chrome_plugins': ['screenshot', 'dom'],
            'archiving_plugins': ['wget'],
        })
        self.assertEqual(response.status_code, 302)
        crawl = Crawl.objects.first()
        plugins = crawl.config.get('PLUGINS', '')
        # Should contain the selected plugins
        self.assertIn('screenshot', plugins)
        self.assertIn('dom', plugins)
        self.assertIn('wget', plugins)

    def test_add_crawl_with_depth_range(self):
        """Test creating crawls with different depth values (0-4)."""
        for depth in range(5):
            response = self.client.post(self.add_url, {
                'url': f'https://example{depth}.com',
                'depth': str(depth),
            })
            self.assertEqual(response.status_code, 302)
        self.assertEqual(Crawl.objects.count(), 5)
        for i, crawl in enumerate(Crawl.objects.order_by('created_at')):
            self.assertEqual(crawl.max_depth, i)

    def test_add_crawl_with_advanced_options(self):
        """Test creating a crawl with advanced options."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'persona': 'CustomPersona',
            'overwrite': True,
            'update': True,
            'index_only': True,
        })
        self.assertEqual(response.status_code, 302)
        crawl = Crawl.objects.first()
        config = crawl.config
        self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
        self.assertEqual(config.get('OVERWRITE'), True)
        self.assertEqual(config.get('ONLY_NEW'), False)  # opposite of update
        self.assertEqual(config.get('INDEX_ONLY'), True)

    def test_add_crawl_with_custom_config(self):
        """Test creating a crawl with custom config overrides."""
        # The Django test client can't easily POST the KeyValueWidget format,
        # so this test needs to drive the form directly or mock its
        # cleaned_data instead of going through self.client.post().
        self.skipTest('TODO: exercise custom config overrides via the form directly')
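
    # A possible shape for the test above, once implemented (hypothetical
    # sketch: assumes the AddView form is importable as AddForm from
    # archivebox.core.forms and exposes the overrides via cleaned_data):
    #
    #     from archivebox.core.forms import AddForm
    #     form = AddForm(data={'url': 'https://example.com', 'depth': '0'})
    #     self.assertTrue(form.is_valid(), form.errors)
    #     self.assertIsInstance(form.cleaned_data.get('custom_config'), dict)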

    def test_add_empty_urls_fails(self):
        """Test that submitting without URLs fails validation."""
        response = self.client.post(self.add_url, {
            'url': '',
            'depth': '0',
        })
        # Should show form again with errors, not redirect
        self.assertEqual(response.status_code, 200)
        # Django 4.1+ assertFormError() takes the form instance, not the response
        self.assertFormError(response.context['form'], 'url', 'This field is required.')

    def test_add_invalid_urls_fails(self):
        """Test that invalid URLs fail validation."""
        response = self.client.post(self.add_url, {
            'url': 'not-a-url',
            'depth': '0',
        })
        # Should show form again with errors
        self.assertEqual(response.status_code, 200)
        # Check for validation error (URL regex should fail)
        self.assertContains(response, 'error')

    def test_add_success_message_without_schedule(self):
        """Test that success message is shown without schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'depth': '0',
        }, follow=True)
        # Check success message mentions crawl creation
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])
        self.assertIn('Created crawl with 2 starting URL', message_text)
        self.assertIn('View Crawl', message_text)
        self.assertNotIn('scheduled to repeat', message_text)

    def test_add_success_message_with_schedule(self):
        """Test that success message includes schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': 'weekly',
        }, follow=True)
        # Check success message mentions schedule
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])
        self.assertIn('Created crawl', message_text)
        self.assertIn('scheduled to repeat weekly', message_text)
        self.assertIn('View Crawl', message_text)

    def test_add_crawl_creates_source_file(self):
        """Test that crawl creation saves URLs to a sources file."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })
        self.assertEqual(response.status_code, 302)
        # Check that a source file was created in the sources/ directory
        from archivebox.config import CONSTANTS
        sources_dir = CONSTANTS.SOURCES_DIR
        source_files = list(sources_dir.glob('*__web_ui_add_by_user_*.txt'))
        self.assertGreater(len(source_files), 0)

    def test_multiple_tags_are_saved(self):
        """Test that multiple comma-separated tags are saved."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'tag': 'tag1,tag2,tag3',
        })
        self.assertEqual(response.status_code, 302)
        crawl = Crawl.objects.first()
        self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')

    def test_crawl_redirects_to_admin_change_page(self):
        """Test that successful submission redirects to crawl admin page."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })
        crawl = Crawl.objects.first()
        expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'
        self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)
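

# Running this module directly (a minimal sketch; normally these tests run via
# Django's test runner, e.g. `archivebox manage test`, which handles creating
# the test database). The test label may need adjusting to match INSTALLED_APPS.
if __name__ == '__main__':
    from django.conf import settings
    from django.test.utils import get_runner

    TestRunner = get_runner(settings)
    failures = TestRunner(verbosity=2).run_tests(['archivebox.core.tests'])
    raise SystemExit(bool(failures))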