import re
import pytest
from django.contrib.auth import get_user_model
from django.urls import reverse
from archivebox.config.common import SERVER_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.core.models import Tag
from archivebox.crawls.models import Crawl
pytestmark = pytest.mark.django_db
User = get_user_model()
WEB_HOST = "web.archivebox.localhost:8000"
ADMIN_HOST = "admin.archivebox.localhost:8000"
@pytest.fixture
def admin_user(db):
return User.objects.create_superuser(
username="addviewadmin",
email="addviewadmin@test.com",
password="testpassword",
)
def test_add_view_renders_tag_editor_and_url_filter_fields(client, admin_user, monkeypatch):
monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
response = client.get(reverse("add"), HTTP_HOST=WEB_HOST)
body = response.content.decode()
assert response.status_code == 200
assert "tag-editor-container" in body
assert 'name="url_filters_allowlist"' in body
assert 'name="url_filters_denylist"' in body
assert "Same domain only" in body
assert 'name="persona"' in body
assert "Overwrite existing snapshots" not in body
assert "Update/retry previously failed URLs" not in body
assert "Index only dry run (add crawl but don't archive yet)" in body
assert 'name="notes"' in body
assert 'name="max_urls"' in body
assert 'name="max_size"' in body
assert 'Crawl Plugins")
assert "data-url-regex=" in body
assert 'id="url-highlight-layer"' in body
assert 'id="detected-urls-list"' in body
assert "detected-url-toggle-btn" in body
def test_add_view_checks_configured_search_backend_by_default(client, monkeypatch):
monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
monkeypatch.setattr(SEARCH_BACKEND_CONFIG, "SEARCH_BACKEND_ENGINE", "sqlite")
response = client.get(reverse("add"), HTTP_HOST=WEB_HOST)
body = response.content.decode()
assert response.status_code == 200
assert re.search(
r']* checked\b',
body,
)
assert "const requiredSearchPlugin = 'search_backend_sqlite';" in body
def test_add_view_creates_crawl_with_tag_and_url_filter_overrides(client, admin_user, monkeypatch):
monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
client.force_login(admin_user)
response = client.post(
reverse("add"),
data={
"url": "https://example.com\nhttps://cdn.example.com/asset.js",
"tag": "alpha,beta",
"depth": "1",
"max_urls": "3",
"max_size": "45mb",
"url_filters_allowlist": "example.com\n*.example.com",
"url_filters_denylist": "cdn.example.com",
"notes": "Created from /add/",
"schedule": "",
"persona": "Default",
"index_only": "",
"config": "{}",
},
HTTP_HOST=WEB_HOST,
)
assert response.status_code == 302
crawl = Crawl.objects.order_by("-created_at").first()
assert crawl is not None
assert crawl.tags_str == "alpha,beta"
assert crawl.notes == "Created from /add/"
assert crawl.max_urls == 3
assert crawl.max_size == 45 * 1024 * 1024
assert crawl.config.get("DEFAULT_PERSONA") == "Default"
assert crawl.config["MAX_URLS"] == 3
assert crawl.config["MAX_SIZE"] == 45 * 1024 * 1024
assert crawl.config["URL_ALLOWLIST"] == "example.com\n*.example.com"
assert crawl.config["URL_DENYLIST"] == "cdn.example.com"
assert "OVERWRITE" not in crawl.config
assert "ONLY_NEW" not in crawl.config
def test_add_view_starts_background_runner_after_creating_crawl(client, admin_user, monkeypatch):
monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
client.force_login(admin_user)
runner_calls = []
monkeypatch.setattr("archivebox.services.runner.ensure_background_runner", lambda: runner_calls.append(True) or True)
response = client.post(
reverse("add"),
data={
"url": "https://example.com",
"tag": "",
"depth": "0",
"max_urls": "0",
"max_size": "0",
"url_filters_allowlist": "",
"url_filters_denylist": "",
"notes": "",
"schedule": "",
"persona": "Default",
"index_only": "",
"config": "{}",
},
HTTP_HOST=WEB_HOST,
)
assert response.status_code == 302
assert runner_calls == [True]
def test_add_view_extracts_urls_from_mixed_text_input(client, admin_user, monkeypatch):
monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
client.force_login(admin_user)
response = client.post(
reverse("add"),
data={
"url": "\n".join(
[
"https://sweeting.me,https://google.com",
"Notes: [ArchiveBox](https://github.com/ArchiveBox/ArchiveBox), https://news.ycombinator.com",
"[Wiki](https://en.wikipedia.org/wiki/Classification_(machine_learning))",
'{"items":["https://example.com/three"]}',
"csv,https://example.com/four",
],
),
"tag": "",
"depth": "0",
"max_urls": "0",
"max_size": "0",
"url_filters_allowlist": "",
"url_filters_denylist": "",
"notes": "",
"schedule": "",
"persona": "Default",
"index_only": "",
"config": "{}",
},
HTTP_HOST=WEB_HOST,
)
assert response.status_code == 302
crawl = Crawl.objects.order_by("-created_at").first()
assert crawl is not None
assert crawl.urls == "\n".join(
[
"https://sweeting.me",
"https://google.com",
"https://github.com/ArchiveBox/ArchiveBox",
"https://news.ycombinator.com",
"https://en.wikipedia.org/wiki/Classification_(machine_learning)",
"https://example.com/three",
"https://example.com/four",
],
)
def test_add_view_trims_trailing_punctuation_from_markdown_urls(client, admin_user, monkeypatch):
monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
client.force_login(admin_user)
response = client.post(
reverse("add"),
data={
"url": "\n".join(
[
"Docs: https://github.com/ArchiveBox/ArchiveBox.",
"Issue: https://github.com/abc?abc#234234?.",
],
),
"tag": "",
"depth": "0",
"max_urls": "0",
"max_size": "0",
"url_filters_allowlist": "",
"url_filters_denylist": "",
"notes": "",
"schedule": "",
"persona": "Default",
"index_only": "",
"config": "{}",
},
HTTP_HOST=WEB_HOST,
)
assert response.status_code == 302
crawl = Crawl.objects.order_by("-created_at").first()
assert crawl is not None
assert crawl.urls == "\n".join(
[
"https://github.com/ArchiveBox/ArchiveBox",
"https://github.com/abc?abc#234234",
],
)
def test_add_view_exposes_api_token_for_tag_widget_autocomplete(client, admin_user, monkeypatch):
monkeypatch.setattr(SERVER_CONFIG, "PUBLIC_ADD_VIEW", True)
client.force_login(admin_user)
response = client.get(reverse("add"), HTTP_HOST=WEB_HOST)
assert response.status_code == 200
assert b"window.ARCHIVEBOX_API_KEY" in response.content
def test_tags_autocomplete_requires_auth_when_public_snapshots_list_disabled(client, settings):
settings.PUBLIC_SNAPSHOTS_LIST = False
settings.PUBLIC_INDEX = False
Tag.objects.create(name="archive")
response = client.get(
reverse("api-1:tags_autocomplete"),
{"q": "a"},
HTTP_HOST=ADMIN_HOST,
)
assert response.status_code == 401
def test_tags_autocomplete_allows_public_access_when_public_snapshots_list_enabled(client, settings):
settings.PUBLIC_SNAPSHOTS_LIST = True
settings.PUBLIC_INDEX = False
Tag.objects.create(name="archive")
response = client.get(
reverse("api-1:tags_autocomplete"),
{"q": "a"},
HTTP_HOST=ADMIN_HOST,
)
assert response.status_code == 200
assert response.json()["tags"][0]["name"] == "archive"
def test_tags_autocomplete_allows_authenticated_user_when_public_snapshots_list_disabled(client, admin_user, settings):
settings.PUBLIC_SNAPSHOTS_LIST = False
settings.PUBLIC_INDEX = False
Tag.objects.create(name="archive")
client.force_login(admin_user)
response = client.get(
reverse("api-1:tags_autocomplete"),
{"q": "a"},
HTTP_HOST=ADMIN_HOST,
)
assert response.status_code == 200
assert response.json()["tags"][0]["name"] == "archive"