"""Tests for the core views, especially AddView."""
|
|
|
|
import os
|
|
import django
|
|
|
|
# Set up Django before importing any Django-dependent modules
|
|
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
|
|
django.setup()
|
|
|
|
from django.test import TestCase, Client
|
|
from django.contrib.auth.models import User
|
|
from django.urls import reverse
|
|
|
|
from archivebox.crawls.models import Crawl, CrawlSchedule
|
|
from archivebox.core.models import Tag
|
|
|
|
|
|
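# These tests can be invoked through Django's test runner, e.g. via the
# `archivebox manage` passthrough (a sketch; the exact app label for this
# module is an assumption):
#
#     archivebox manage test core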
class AddViewTests(TestCase):
    """Tests for the AddView (crawl creation form)."""

    def setUp(self):
        """Set up test user and client."""
        self.client = Client()
        self.user = User.objects.create_user(
            username='testuser',
            password='testpass123',
            email='test@example.com'
        )
        self.client.login(username='testuser', password='testpass123')
        self.add_url = reverse('add')

    def test_add_view_get_requires_auth(self):
        """Test that GET /add requires authentication."""
        self.client.logout()
        response = self.client.get(self.add_url)
        # Should redirect to login or show 403/404
        self.assertIn(response.status_code, [302, 403, 404])

    def test_add_view_get_shows_form(self):
        """Test that GET /add shows the form with all fields."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        # Check that form fields are present
        self.assertContains(response, 'name="url"')
        self.assertContains(response, 'name="tag"')
        self.assertContains(response, 'name="depth"')
        self.assertContains(response, 'name="notes"')
        self.assertContains(response, 'name="schedule"')
        self.assertContains(response, 'name="persona"')
        self.assertContains(response, 'name="overwrite"')
        self.assertContains(response, 'name="update"')
        self.assertContains(response, 'name="index_only"')

        # Check for plugin groups
        self.assertContains(response, 'name="chrome_plugins"')
        self.assertContains(response, 'name="archiving_plugins"')
        self.assertContains(response, 'name="parsing_plugins"')

    def test_add_view_shows_tag_autocomplete(self):
        """Test that tag autocomplete datalist is rendered."""
        # Create some tags
        Tag.objects.create(name='test-tag-1')
        Tag.objects.create(name='test-tag-2')

        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        # Check for datalist with tags
        self.assertContains(response, 'id="tag-datalist"')
        self.assertContains(response, 'test-tag-1')
        self.assertContains(response, 'test-tag-2')

    def test_add_view_shows_plugin_presets(self):
        """Test that plugin preset buttons are rendered."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        self.assertContains(response, 'Quick Archive')
        self.assertContains(response, 'Full Chrome')
        self.assertContains(response, 'Text Only')
        self.assertContains(response, 'Select All')
        self.assertContains(response, 'Clear All')

    def test_add_view_shows_links_to_resources(self):
        """Test that helpful links are present."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        # Link to plugin documentation
        self.assertContains(response, '/admin/environment/plugins/')

        # Link to create new persona
        self.assertContains(response, '/admin/personas/persona/add/')

    def test_add_basic_crawl_without_schedule(self):
        """Test creating a basic crawl without a schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'tag': 'test-tag',
            'depth': '0',
            'notes': 'Test crawl notes',
        })

        # Should redirect to crawl admin page
        self.assertEqual(response.status_code, 302)

        # Check that crawl was created
        self.assertEqual(Crawl.objects.count(), 1)
        crawl = Crawl.objects.first()

        self.assertIn('https://example.com', crawl.urls)
        self.assertIn('https://example.org', crawl.urls)
        self.assertEqual(crawl.tags_str, 'test-tag')
        self.assertEqual(crawl.max_depth, 0)
        self.assertEqual(crawl.notes, 'Test crawl notes')
        self.assertEqual(crawl.created_by, self.user)

        # No schedule should be created
        self.assertIsNone(crawl.schedule)
        self.assertEqual(CrawlSchedule.objects.count(), 0)

    def test_add_crawl_with_schedule(self):
        """Test creating a crawl with a repeat schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'tag': 'scheduled',
            'depth': '1',
            'notes': 'Daily crawl',
            'schedule': 'daily',
        })

        self.assertEqual(response.status_code, 302)

        # Check that crawl and schedule were created
        self.assertEqual(Crawl.objects.count(), 1)
        self.assertEqual(CrawlSchedule.objects.count(), 1)

        crawl = Crawl.objects.first()
        schedule = CrawlSchedule.objects.first()

        self.assertEqual(crawl.schedule, schedule)
        self.assertEqual(schedule.template, crawl)
        self.assertEqual(schedule.schedule, 'daily')
        self.assertTrue(schedule.is_enabled)
        self.assertEqual(schedule.created_by, self.user)

    def test_add_crawl_with_cron_schedule(self):
        """Test creating a crawl with a cron-format schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': '0 */6 * * *',  # Every 6 hours
        })

        self.assertEqual(response.status_code, 302)

        schedule = CrawlSchedule.objects.first()
        self.assertEqual(schedule.schedule, '0 */6 * * *')

    def test_add_crawl_with_plugins(self):
        """Test creating a crawl with specific plugins selected."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'chrome_plugins': ['screenshot', 'dom'],
            'archiving_plugins': ['wget'],
        })

        self.assertEqual(response.status_code, 302)

        crawl = Crawl.objects.first()
        plugins = crawl.config.get('PLUGINS', '')

        # Should contain the selected plugins
        self.assertIn('screenshot', plugins)
        self.assertIn('dom', plugins)
        self.assertIn('wget', plugins)

    def test_add_crawl_with_depth_range(self):
        """Test creating crawls with different depth values (0-4)."""
        for depth in range(5):
            response = self.client.post(self.add_url, {
                'url': f'https://example{depth}.com',
                'depth': str(depth),
            })

            self.assertEqual(response.status_code, 302)

        self.assertEqual(Crawl.objects.count(), 5)

        for i, crawl in enumerate(Crawl.objects.order_by('created_at')):
            self.assertEqual(crawl.max_depth, i)

    def test_add_crawl_with_advanced_options(self):
        """Test creating a crawl with advanced options."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'persona': 'CustomPersona',
            'overwrite': True,
            'update': True,
            'index_only': True,
        })

        self.assertEqual(response.status_code, 302)

        crawl = Crawl.objects.first()
        config = crawl.config

        self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
        self.assertEqual(config.get('OVERWRITE'), True)
        self.assertEqual(config.get('ONLY_NEW'), False)  # opposite of update
        self.assertEqual(config.get('INDEX_ONLY'), True)

    def test_add_crawl_with_custom_config(self):
        """Test creating a crawl with custom config overrides."""
        # Note: the Django test client can't easily POST the KeyValueWidget
        # format, so this test would need to use the form directly or mock
        # the cleaned_data. Skip explicitly rather than `pass`, so the test
        # run reports a skip instead of a false positive.
        self.skipTest('TODO: KeyValueWidget POST format not supported by the test client')

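    # A rough sketch of how the custom-config case might be exercised at the
    # form level instead of through the test client (hypothetical: the form
    # import path and the 'custom_config' field name are assumptions, not
    # verified against the actual AddView form class):
    #
    #     from archivebox.core.forms import AddLinkForm  # hypothetical path
    #     form = AddLinkForm(data={'url': 'https://example.com', 'depth': '0'})
    #     self.assertTrue(form.is_valid())
    #     form.cleaned_data['custom_config'] = {'CHROME_TIMEOUT': '120'}
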
    def test_add_empty_urls_fails(self):
        """Test that submitting without URLs fails validation."""
        response = self.client.post(self.add_url, {
            'url': '',
            'depth': '0',
        })

        # Should re-render the form with errors, not redirect
        self.assertEqual(response.status_code, 200)
        # Django 4.1+ assertFormError signature: pass the form object itself
        self.assertFormError(response.context['form'], 'url', 'This field is required.')

    def test_add_invalid_urls_fails(self):
        """Test that invalid URLs fail validation."""
        response = self.client.post(self.add_url, {
            'url': 'not-a-url',
            'depth': '0',
        })

        # Should show form again with errors
        self.assertEqual(response.status_code, 200)
        # Check for validation error (URL regex should fail)
        self.assertContains(response, 'error')

    def test_add_success_message_without_schedule(self):
        """Test that success message is shown without schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'depth': '0',
        }, follow=True)

        # Check success message mentions crawl creation
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])

        self.assertIn('Created crawl with 2 starting URL', message_text)
        self.assertIn('View Crawl', message_text)
        self.assertNotIn('scheduled to repeat', message_text)

    def test_add_success_message_with_schedule(self):
        """Test that success message includes schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': 'weekly',
        }, follow=True)

        # Check success message mentions schedule
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])

        self.assertIn('Created crawl', message_text)
        self.assertIn('scheduled to repeat weekly', message_text)
        self.assertIn('View Crawl', message_text)

    def test_add_crawl_creates_source_file(self):
        """Test that crawl creation saves URLs to sources file."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })

        self.assertEqual(response.status_code, 302)

        # Check that source file was created in sources/ directory
        from archivebox.config import CONSTANTS
        sources_dir = CONSTANTS.SOURCES_DIR

        # Should have created a source file
        source_files = list(sources_dir.glob('*__web_ui_add_by_user_*.txt'))
        self.assertGreater(len(source_files), 0)

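    # Note: the source files written under CONSTANTS.SOURCES_DIR above are real
    # files on disk and aren't cleaned up automatically. A tearDown along these
    # lines could avoid leftovers between runs (a sketch, assuming the data dir
    # used by the test run is disposable):
    #
    #     def tearDown(self):
    #         from archivebox.config import CONSTANTS
    #         for f in CONSTANTS.SOURCES_DIR.glob('*__web_ui_add_by_user_*.txt'):
    #             f.unlink(missing_ok=True)
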
    def test_multiple_tags_are_saved(self):
        """Test that multiple comma-separated tags are saved."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'tag': 'tag1,tag2,tag3',
        })

        self.assertEqual(response.status_code, 302)

        crawl = Crawl.objects.first()
        self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')

    def test_crawl_redirects_to_admin_change_page(self):
        """Test that successful submission redirects to crawl admin page."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })

        crawl = Crawl.objects.first()
        expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'

        self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)