Files
ArchiveBox/archivebox/tests/test_crawl_admin.py
Nick Sweeting b749b26c5d wip
2026-03-23 03:58:32 -07:00

233 lines
7.4 KiB
Python

from typing import cast
import pytest
from django.contrib.auth import get_user_model
from django.contrib.auth.models import UserManager
from django.urls import reverse
from archivebox.crawls.admin import CrawlAdminForm
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
# Module-level pytest mark: applies the django_db marker to every test in this file.
pytestmark = pytest.mark.django_db
# The project's active user model, resolved through Django rather than imported directly.
User = get_user_model()
# Host header sent with every admin request made by the tests in this file.
# NOTE(review): presumably the admin UI is only served on this subdomain — confirm against the URL/host routing config.
ADMIN_HOST = "admin.archivebox.localhost:8000"
@pytest.fixture
def admin_user(db):
    """Create and return the superuser the admin tests log in as.

    The account uses the username ``crawladmin`` and the password
    ``testpassword``; tests that call ``client.login`` rely on those values.
    """
    # cast() has no runtime effect; it only tells type checkers that the
    # default manager exposes create_superuser().
    manager = cast(UserManager, User.objects)
    superuser = manager.create_superuser(
        username="crawladmin",
        email="crawladmin@test.com",
        password="testpassword",
    )
    return superuser
@pytest.fixture
def crawl(admin_user):
    """Create and return a Crawl with two seed URLs and the tags ``alpha`` and ``beta``.

    The crawl is owned by the ``admin_user`` fixture.
    """
    seed_urls = "https://example.com\nhttps://example.org"
    new_crawl = Crawl.objects.create(
        urls=seed_urls,
        tags_str="alpha,beta",
        created_by=admin_user,
    )
    return new_crawl
def test_crawl_admin_change_view_renders_tag_editor_widget(client, admin_user, crawl):
    """The crawl admin change page renders the tag editor and the crawl's tag names.

    Fetches the change view for the ``crawl`` fixture (tags ``alpha`` and
    ``beta``) while logged in as the superuser from ``admin_user``.
    """
    # Client.login() signals failure by returning False instead of raising.
    # Checking it here makes a broken login fail at the real cause rather than
    # as a confusing status-code mismatch further down.
    assert client.login(username="crawladmin", password="testpassword"), "admin login failed"

    change_url = reverse("admin:crawls_crawl_change", args=[crawl.pk])
    response = client.get(change_url, HTTP_HOST=ADMIN_HOST)

    assert response.status_code == 200
    page = response.content
    assert b'name="tags_editor"' in page
    assert b"tag-editor-container" in page
    assert b"alpha" in page
    assert b"beta" in page
def test_crawl_admin_add_view_renders_url_filter_alias_fields(client, admin_user):
    """The crawl admin add page exposes the URL allowlist/denylist form fields.

    Also checks that the "Same domain only" text appears on the page.
    """
    # Client.login() signals failure by returning False instead of raising.
    # Checking it here makes a broken login fail at the real cause rather than
    # as a confusing status-code mismatch further down.
    assert client.login(username="crawladmin", password="testpassword"), "admin login failed"

    add_url = reverse("admin:crawls_crawl_add")
    response = client.get(add_url, HTTP_HOST=ADMIN_HOST)

    assert response.status_code == 200
    page = response.content
    assert b'name="url_filters_allowlist"' in page
    assert b'name="url_filters_denylist"' in page
    assert b"Same domain only" in page
def test_crawl_admin_form_saves_tags_editor_to_tags_str(crawl, admin_user):
    """Saving CrawlAdminForm stores the tags editor value and limit/filter fields on the crawl.

    Submits a tag list containing a case-variant duplicate (``Alpha``) and
    expects ``tags_str`` to end up as ``alpha,beta,gamma``. The ``max_urls``,
    ``max_size`` and URL filter inputs are expected both on the model fields
    and under the matching upper-case keys of ``crawl.config``.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    size_limit = 45 * 1024 * 1024
    allowlist_text = "example.com\n*.example.com"
    denylist_text = "static.example.com"

    submitted = {
        "created_at": crawl.created_at.strftime(timestamp_format),
        "urls": crawl.urls,
        "config": "{}",
        "max_depth": "0",
        "max_urls": "3",
        "max_size": str(size_limit),
        "tags_editor": "alpha, beta, Alpha, gamma",
        "url_filters_allowlist": allowlist_text,
        "url_filters_denylist": denylist_text,
        "persona_id": "",
        "label": "",
        "notes": "",
        "schedule": "",
        "status": crawl.status,
        "retry_at": crawl.retry_at.strftime(timestamp_format),
        "created_by": str(admin_user.pk),
        "num_uses_failed": "0",
        "num_uses_succeeded": "0",
    }
    form = CrawlAdminForm(data=submitted, instance=crawl)

    assert form.is_valid(), form.errors

    saved_crawl = form.save()
    # Reload from the database so the assertions check persisted values.
    saved_crawl.refresh_from_db()

    assert saved_crawl.tags_str == "alpha,beta,gamma"
    assert saved_crawl.max_urls == 3
    assert saved_crawl.max_size == size_limit

    saved_config = saved_crawl.config
    assert saved_config["MAX_URLS"] == 3
    assert saved_config["MAX_SIZE"] == size_limit
    assert saved_config["URL_ALLOWLIST"] == allowlist_text
    assert saved_config["URL_DENYLIST"] == denylist_text
def test_crawl_admin_delete_snapshot_action_removes_snapshot_and_url(client, admin_user):
    """The snapshot-delete admin action removes the snapshot row and its URL from the crawl.

    Creates a crawl with one URL and one matching snapshot, POSTs to the
    ``crawls_crawl_snapshot_delete`` admin endpoint, and expects a JSON
    ``{"ok": true}`` response.
    """
    target_url = "https://example.com/remove-me"
    crawl = Crawl.objects.create(
        urls=target_url,
        created_by=admin_user,
    )
    snapshot = Snapshot.objects.create(
        crawl=crawl,
        url=target_url,
    )

    # Client.login() signals failure by returning False instead of raising.
    # Checking it here makes a broken login fail at the real cause rather than
    # as a confusing status-code mismatch further down.
    assert client.login(username="crawladmin", password="testpassword"), "admin login failed"

    delete_url = reverse("admin:crawls_crawl_snapshot_delete", args=[crawl.pk, snapshot.pk])
    response = client.post(delete_url, HTTP_HOST=ADMIN_HOST)

    assert response.status_code == 200
    assert response.json()["ok"] is True
    assert not Snapshot.objects.filter(pk=snapshot.pk).exists()

    # Reload to observe the URL list as persisted by the admin action.
    crawl.refresh_from_db()
    assert target_url not in crawl.urls
def test_crawl_admin_exclude_domain_action_prunes_urls_and_pending_snapshots(client, admin_user):
    """The exclude-domain admin action denylists a domain and prunes its queued work.

    Starts from a crawl with two ``cdn.example.com`` URLs and one
    ``example.com`` URL, a QUEUED snapshot on the CDN domain and a SEALED
    snapshot on the root domain. After POSTing to the
    ``crawls_crawl_snapshot_exclude_domain`` endpoint for the queued snapshot,
    the test expects:

    * the response to report ``cdn.example.com`` as the excluded domain,
    * the crawl's own denylist to contain exactly that domain,
    * both CDN URLs to be gone from ``crawl.urls`` while the root URL stays,
    * the queued snapshot to be deleted and the sealed one to be kept.
    """
    cdn_asset_url = "https://cdn.example.com/asset.js"
    cdn_second_url = "https://cdn.example.com/second.js"
    root_url = "https://example.com/root"

    crawl = Crawl.objects.create(
        urls="\n".join([cdn_asset_url, cdn_second_url, root_url]),
        created_by=admin_user,
    )
    queued_snapshot = Snapshot.objects.create(
        crawl=crawl,
        url=cdn_asset_url,
        status=Snapshot.StatusChoices.QUEUED,
    )
    preserved_snapshot = Snapshot.objects.create(
        crawl=crawl,
        url=root_url,
        status=Snapshot.StatusChoices.SEALED,
    )

    # Client.login() signals failure by returning False instead of raising.
    # Checking it here makes a broken login fail at the real cause rather than
    # as a confusing status-code mismatch further down.
    assert client.login(username="crawladmin", password="testpassword"), "admin login failed"

    exclude_url = reverse(
        "admin:crawls_crawl_snapshot_exclude_domain",
        args=[crawl.pk, queued_snapshot.pk],
    )
    response = client.post(exclude_url, HTTP_HOST=ADMIN_HOST)

    assert response.status_code == 200
    payload = response.json()
    assert payload["ok"] is True
    assert payload["domain"] == "cdn.example.com"

    # Reload to observe the denylist and URL list as persisted by the action.
    crawl.refresh_from_db()
    assert crawl.get_url_denylist(use_effective_config=False) == ["cdn.example.com"]
    assert cdn_asset_url not in crawl.urls
    assert cdn_second_url not in crawl.urls
    assert root_url in crawl.urls

    assert not Snapshot.objects.filter(pk=queued_snapshot.pk).exists()
    assert Snapshot.objects.filter(pk=preserved_snapshot.pk).exists()
def test_snapshot_from_json_trims_markdown_suffixes_on_discovered_urls(crawl):
    """Snapshot.from_json strips a trailing ``)**`` markdown remnant from the URL."""
    record = {"url": "https://docs.sweeting.me/s/youtube-favorites)**"}
    expected_url = "https://docs.sweeting.me/s/youtube-favorites"

    snapshot = Snapshot.from_json(
        record,
        overrides={"crawl": crawl},
        queue_for_extraction=False,
    )

    assert snapshot is not None
    assert snapshot.url == expected_url
def test_create_snapshots_from_urls_respects_url_allowlist_and_denylist(admin_user):
    """create_snapshots_from_urls only yields URLs that pass the crawl's allow/deny lists.

    With ``example.com`` allowlisted and ``static.example.com`` denylisted,
    only the ``https://example.com/root`` seed is expected to produce a
    snapshot.
    """
    seed_urls = [
        "https://example.com/root",
        "https://static.example.com/app.js",
        "https://other.test/page",
    ]
    filter_config = {
        "URL_ALLOWLIST": "example.com",
        "URL_DENYLIST": "static.example.com",
    }
    crawl = Crawl.objects.create(
        urls="\n".join(seed_urls),
        created_by=admin_user,
        config=filter_config,
    )

    created = crawl.create_snapshots_from_urls()

    created_urls = [snap.url for snap in created]
    assert created_urls == ["https://example.com/root"]
def test_url_filter_regex_lists_preserve_commas_and_split_on_newlines_only(admin_user):
    """Regex filter entries containing commas stay intact; only newlines separate entries.

    The allowlist holds two newline-separated patterns, one of which contains
    commas, and the denylist holds a single comma-containing pattern. The test
    checks the parsed lists and that snapshot creation applies them.
    """
    allow_example = r"^https://example\.com/(root|path,with,commas)$"
    allow_other = r"^https://other\.test/page$"
    deny_commas = r"^https://example\.com/path,with,commas$"

    seed_urls = [
        "https://example.com/root",
        "https://example.com/path,with,commas",
        "https://other.test/page",
    ]
    crawl = Crawl.objects.create(
        urls="\n".join(seed_urls),
        created_by=admin_user,
        config={
            "URL_ALLOWLIST": allow_example + "\n" + allow_other,
            "URL_DENYLIST": deny_commas,
        },
    )

    parsed_allowlist = crawl.get_url_allowlist(use_effective_config=False)
    assert parsed_allowlist == [allow_example, allow_other]

    parsed_denylist = crawl.get_url_denylist(use_effective_config=False)
    assert parsed_denylist == [deny_commas]

    created = crawl.create_snapshots_from_urls()
    created_urls = [snap.url for snap in created]
    assert created_urls == [
        "https://example.com/root",
        "https://other.test/page",
    ]