WIP: checkpoint working tree before rebasing onto dev

This commit is contained in:
Nick Sweeting
2026-03-22 20:23:45 -07:00
parent a6548df8d0
commit f400a2cd67
87 changed files with 12607 additions and 1808 deletions

View File

@@ -0,0 +1,195 @@
import re
import pytest
from django.contrib.auth import get_user_model
from django.urls import reverse
from archivebox.config.common import SERVER_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.core.models import Tag
from archivebox.crawls.models import Crawl
pytestmark = pytest.mark.django_db
User = get_user_model()
WEB_HOST = 'web.archivebox.localhost:8000'
ADMIN_HOST = 'admin.archivebox.localhost:8000'
@pytest.fixture
def admin_user(db):
    """Superuser account used by the authenticated /add/ view tests."""
    credentials = {
        'username': 'addviewadmin',
        'email': 'addviewadmin@test.com',
        'password': 'testpassword',
    }
    return User.objects.create_superuser(**credentials)
def test_add_view_renders_tag_editor_and_url_filter_fields(client, admin_user, monkeypatch):
    """The /add/ page should render the tag editor, URL filter inputs, and URL-detection UI."""
    monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
    response = client.get(reverse('add'), HTTP_HOST=WEB_HOST)
    body = response.content.decode()
    assert response.status_code == 200
    # Tag editor widget and crawl-scoping filter inputs
    assert 'tag-editor-container' in body
    assert 'name="url_filters_allowlist"' in body
    assert 'name="url_filters_denylist"' in body
    assert 'Same domain only' in body
    assert 'name="persona"' in body
    # Legacy overwrite/retry options must no longer be rendered
    assert 'Overwrite existing snapshots' not in body
    assert 'Update/retry previously failed URLs' not in body
    # BUG FIX: the apostrophe in "don't" terminated the original single-quoted
    # literal (SyntaxError); use double quotes so the string parses.
    assert "Index only dry run (add crawl but don't archive yet)" in body
    assert 'name="notes"' in body
    assert '<input type="text" name="notes"' in body
    # Persona field must appear before the Crawl Plugins section
    assert body.index('name="persona"') < body.index('<h3>Crawl Plugins</h3>')
    # URL detection/highlighting UI elements
    assert 'data-url-regex=' in body
    assert 'id="url-highlight-layer"' in body
    assert 'id="detected-urls-list"' in body
    assert 'detected-url-toggle-btn' in body
def test_add_view_checks_configured_search_backend_by_default(client, monkeypatch):
    """The configured search backend's plugin checkbox should be pre-checked on /add/."""
    monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
    monkeypatch.setattr(SEARCH_BACKEND_CONFIG, 'SEARCH_BACKEND_ENGINE', 'sqlite')
    response = client.get(reverse('add'), HTTP_HOST=WEB_HOST)
    body = response.content.decode()
    assert response.status_code == 200
    # The checkbox for the active backend must render with the `checked` attribute
    assert re.search(
        r'<input type="checkbox" name="search_plugins" value="search_backend_sqlite"[^>]* checked\b',
        body,
    )
    # Inline JS should pin the required search plugin to the same backend name
    assert "const requiredSearchPlugin = 'search_backend_sqlite';" in body
def test_add_view_creates_crawl_with_tag_and_url_filter_overrides(client, admin_user, monkeypatch):
    """Posting /add/ with tags and URL filters should persist them on the new Crawl."""
    monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
    client.force_login(admin_user)
    form_data = {
        'url': 'https://example.com\nhttps://cdn.example.com/asset.js',
        'tag': 'alpha,beta',
        'depth': '1',
        'url_filters_allowlist': 'example.com\n*.example.com',
        'url_filters_denylist': 'cdn.example.com',
        'notes': 'Created from /add/',
        'schedule': '',
        'persona': 'Default',
        'index_only': '',
        'config': '{}',
    }
    response = client.post(reverse('add'), data=form_data, HTTP_HOST=WEB_HOST)
    assert response.status_code == 302
    crawl = Crawl.objects.order_by('-created_at').first()
    assert crawl is not None
    # Tags/notes are stored directly; filters and persona land in crawl.config
    assert crawl.tags_str == 'alpha,beta'
    assert crawl.notes == 'Created from /add/'
    assert crawl.config.get('DEFAULT_PERSONA') == 'Default'
    assert crawl.config['URL_ALLOWLIST'] == 'example.com\n*.example.com'
    assert crawl.config['URL_DENYLIST'] == 'cdn.example.com'
    # Legacy flags must not leak into config when the form fields are unset
    assert 'OVERWRITE' not in crawl.config
    assert 'ONLY_NEW' not in crawl.config
def test_add_view_extracts_urls_from_mixed_text_input(client, admin_user, monkeypatch):
    """URLs embedded in CSV, markdown, JSON, and free text should all be extracted, in order."""
    monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
    client.force_login(admin_user)
    response = client.post(
        reverse('add'),
        data={
            # Mixed formats: comma-separated, markdown links (incl. parens in path),
            # JSON arrays, and CSV cells
            'url': '\n'.join([
                'https://sweeting.me,https://google.com',
                'Notes: [ArchiveBox](https://github.com/ArchiveBox/ArchiveBox), https://news.ycombinator.com',
                '[Wiki](https://en.wikipedia.org/wiki/Classification_(machine_learning))',
                '{"items":["https://example.com/three"]}',
                'csv,https://example.com/four',
            ]),
            'tag': '',
            'depth': '0',
            'url_filters_allowlist': '',
            'url_filters_denylist': '',
            'notes': '',
            'schedule': '',
            'persona': 'Default',
            'index_only': '',
            'config': '{}',
        },
        HTTP_HOST=WEB_HOST,
    )
    assert response.status_code == 302
    crawl = Crawl.objects.order_by('-created_at').first()
    assert crawl is not None
    # Extraction preserves input order and deduplicates nothing here
    assert crawl.urls == '\n'.join([
        'https://sweeting.me',
        'https://google.com',
        'https://github.com/ArchiveBox/ArchiveBox',
        'https://news.ycombinator.com',
        'https://en.wikipedia.org/wiki/Classification_(machine_learning)',
        'https://example.com/three',
        'https://example.com/four',
    ])
def test_add_view_exposes_api_token_for_tag_widget_autocomplete(client, admin_user, monkeypatch):
    """Logged-in /add/ page should embed an API key for the tag autocomplete widget."""
    monkeypatch.setattr(SERVER_CONFIG, 'PUBLIC_ADD_VIEW', True)
    client.force_login(admin_user)
    page = client.get(reverse('add'), HTTP_HOST=WEB_HOST)
    assert page.status_code == 200
    assert b'window.ARCHIVEBOX_API_KEY' in page.content
def test_tags_autocomplete_requires_auth_when_public_snapshots_list_disabled(client, settings):
    """Anonymous autocomplete requests are rejected when public listings are off."""
    settings.PUBLIC_SNAPSHOTS_LIST = False
    settings.PUBLIC_INDEX = False
    Tag.objects.create(name='archive')
    endpoint = reverse('api-1:tags_autocomplete')
    resp = client.get(endpoint, {'q': 'a'}, HTTP_HOST=ADMIN_HOST)
    assert resp.status_code == 401
def test_tags_autocomplete_allows_public_access_when_public_snapshots_list_enabled(client, settings):
    """Anonymous autocomplete is allowed when PUBLIC_SNAPSHOTS_LIST is on."""
    settings.PUBLIC_SNAPSHOTS_LIST = True
    settings.PUBLIC_INDEX = False
    Tag.objects.create(name='archive')
    endpoint = reverse('api-1:tags_autocomplete')
    resp = client.get(endpoint, {'q': 'a'}, HTTP_HOST=ADMIN_HOST)
    assert resp.status_code == 200
    assert resp.json()['tags'][0]['name'] == 'archive'
def test_tags_autocomplete_allows_authenticated_user_when_public_snapshots_list_disabled(client, admin_user, settings):
    """A logged-in user can use autocomplete even with public listings disabled."""
    settings.PUBLIC_SNAPSHOTS_LIST = False
    settings.PUBLIC_INDEX = False
    Tag.objects.create(name='archive')
    client.force_login(admin_user)
    endpoint = reverse('api-1:tags_autocomplete')
    resp = client.get(endpoint, {'q': 'a'}, HTTP_HOST=ADMIN_HOST)
    assert resp.status_code == 200
    assert resp.json()['tags'][0]['name'] == 'archive'

View File

@@ -0,0 +1,151 @@
from archivebox.base_models.admin import KeyValueWidget
def test_key_value_widget_renders_enum_autocomplete_metadata(monkeypatch):
    """KeyValueWidget should embed enum options and per-field JS helpers for known keys."""
    monkeypatch.setattr(
        KeyValueWidget,
        '_get_config_options',
        lambda self: {
            'CHROME_WAIT_FOR': {
                'plugin': 'chrome',
                'type': 'string',
                'default': 'networkidle2',
                'description': 'Page load completion condition',
                'enum': ['domcontentloaded', 'load', 'networkidle0', 'networkidle2'],
            },
        },
    )
    html = str(
        KeyValueWidget().render(
            'config',
            {'CHROME_WAIT_FOR': 'load'},
            attrs={'id': 'id_config'},
        )
    )
    # Enum list is serialized into the widget's metadata JSON
    assert '"enum": ["domcontentloaded", "load", "networkidle0", "networkidle2"]' in html
    assert 'class="kv-value-options"' in html
    assert 'class="kv-help"' in html
    # JS helper functions are namespaced by the field id ('id_config')
    assert 'configureValueInput_id_config' in html
    assert 'describeMeta_id_config' in html
    assert 'validateValueAgainstMeta_id_config' in html
def test_key_value_widget_renders_numeric_and_pattern_constraints(monkeypatch):
    """Numeric min/max and regex pattern constraints should be serialized into the widget."""
    monkeypatch.setattr(
        KeyValueWidget,
        '_get_config_options',
        lambda self: {
            'TIMEOUT': {
                'plugin': 'base',
                'type': 'integer',
                'default': 60,
                'description': 'Timeout in seconds',
                'minimum': 5,
                'maximum': 120,
            },
            'CHROME_RESOLUTION': {
                'plugin': 'chrome',
                'type': 'string',
                'default': '1440,2000',
                'description': 'Viewport resolution',
                'pattern': '^\\d+,\\d+$',
            },
        },
    )
    html = str(KeyValueWidget().render('config', {}, attrs={'id': 'id_config'}))
    assert '"minimum": 5' in html
    assert '"maximum": 120' in html
    # Backslashes are escaped once more by JSON serialization in the rendered HTML
    assert '"pattern": "^\\\\d+,\\\\d+$"' in html
    assert 'Expected: ' in html
    assert 'Example: ' in html
    assert 'setValueValidationState_id_config' in html
    assert 'coerceValueForStorage_id_config' in html
def test_key_value_widget_accepts_common_boolean_spellings(monkeypatch):
    """Boolean config keys should offer True/False options with tolerant JS parsing."""
    monkeypatch.setattr(
        KeyValueWidget,
        '_get_config_options',
        lambda self: {
            'DEBUG': {
                'plugin': 'base',
                'type': 'boolean',
                'default': False,
                'description': 'Enable debug mode',
            },
        },
    )
    html = str(KeyValueWidget().render('config', {'DEBUG': 'True'}, attrs={'id': 'id_config'}))
    assert "enumValues = ['True', 'False']" in html
    # Widget JS normalizes case and accepts '1'/'0' spellings as booleans
    assert "raw.toLowerCase()" in html
    assert "lowered === 'true' || raw === '1'" in html
    assert "lowered === 'false' || raw === '0'" in html
def test_key_value_widget_shows_array_and_object_examples_and_binary_rules(monkeypatch):
    """Array/object keys get JSON examples; *_BINARY keys get path validation rules."""
    monkeypatch.setattr(
        KeyValueWidget,
        '_get_config_options',
        lambda self: {
            'WGET_ARGS_EXTRA': {
                'plugin': 'wget',
                'type': 'array',
                'default': [],
                'description': 'Extra arguments to append to wget command',
            },
            'SAVE_ALLOWLIST': {
                'plugin': 'base',
                'type': 'object',
                'default': {},
                'description': 'Regex allowlist mapped to enabled methods',
            },
            'WGET_BINARY': {
                'plugin': 'wget',
                'type': 'string',
                'default': 'wget',
                'description': 'Path to wget binary',
            },
        },
    )
    html = str(KeyValueWidget().render('config', {}, attrs={'id': 'id_config'}))
    # Type-appropriate inline examples for array/object/binary values
    assert 'Example: ["--extra-arg"]' in html
    assert 'Example: {"^https://example\\\\.com": ["wget"]}' in html
    assert 'Example: wget or /usr/bin/wget' in html
    # Binary keys are validated via a dedicated JS helper keyed by name suffix
    assert 'validateBinaryValue_id_config' in html
    assert "meta.key.endsWith('_BINARY')" in html
    assert "Binary paths cannot contain quotes" in html
def test_key_value_widget_falls_back_to_binary_validation_for_unknown_binary_keys(monkeypatch):
    """Keys ending in _BINARY but absent from config options still get binary validation."""
    monkeypatch.setattr(
        KeyValueWidget,
        '_get_config_options',
        lambda self: {
            'CHROME_BINARY': {
                'plugin': 'base',
                'type': 'string',
                'default': '',
                'description': 'Resolved Chromium/Chrome binary path shared across plugins',
            },
        },
    )
    # NODE_BINARY is not in the mocked options — the widget must still treat it
    # as a binary path via the endsWith('_BINARY') fallback.
    html = str(
        KeyValueWidget().render(
            'config',
            {'NODE_BINARY': '/opt/homebrew/bin/node'},
            attrs={'id': 'id_config'},
        )
    )
    assert 'function getMetaForKey_id_config' in html
    assert "if (key.endsWith('_BINARY'))" in html
    assert 'Path to binary executable' in html

View File

@@ -0,0 +1,127 @@
import pytest
from django.contrib.admin.sites import AdminSite
from uuid import uuid4
pytestmark = pytest.mark.django_db
def _create_snapshot():
    """Create a minimal STARTED Snapshot (plus its parent Crawl) for admin tests."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    owner_pk = get_or_create_system_user_pk()
    parent_crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=owner_pk,
    )
    snapshot = Snapshot.objects.create(
        url="https://example.com",
        crawl=parent_crawl,
        status=Snapshot.StatusChoices.STARTED,
    )
    return snapshot
def _create_machine():
    """Create a throwaway Machine row with unique guid/hw_uuid per call."""
    from archivebox.machine.models import Machine
    fields = dict(
        guid=f'test-guid-{uuid4()}',
        hostname='test-host',
        hw_in_docker=False,
        hw_in_vm=False,
        hw_manufacturer='Test',
        hw_product='Test Product',
        hw_uuid=f'test-hw-{uuid4()}',
        os_arch='arm64',
        os_family='darwin',
        os_platform='macOS',
        os_release='14.0',
        os_kernel='Darwin',
        stats={},
        config={},
    )
    return Machine.objects.create(**fields)
def _create_iface(machine):
    """Attach a fixed-value NetworkInterface row to *machine* for link tests."""
    from archivebox.machine.models import NetworkInterface
    fields = dict(
        machine=machine,
        mac_address='00:11:22:33:44:66',
        ip_public='203.0.113.11',
        ip_local='10.0.0.11',
        dns_server='1.1.1.1',
        hostname='test-host',
        iface='en0',
        isp='Test ISP',
        city='Test City',
        region='Test Region',
        country='Test Country',
    )
    return NetworkInterface.objects.create(**fields)
def test_archiveresult_admin_links_plugin_and_process():
    """ArchiveResult admin columns should link to the plugin page and the Process row."""
    from archivebox.core.admin_archiveresults import ArchiveResultAdmin
    from archivebox.core.models import ArchiveResult
    from archivebox.machine.models import Process
    snapshot = _create_snapshot()
    iface = _create_iface(_create_machine())
    process = Process.objects.create(
        machine=iface.machine,
        iface=iface,
        process_type=Process.TypeChoices.HOOK,
        pwd=str(snapshot.output_dir / 'wget'),
        cmd=['/tmp/on_Snapshot__06_wget.finite.bg.py', '--url=https://example.com'],
        status=Process.StatusChoices.EXITED,
    )
    result = ArchiveResult.objects.create(
        snapshot=snapshot,
        plugin='wget',
        hook_name='on_Snapshot__06_wget.finite.bg.py',
        process=process,
        status=ArchiveResult.StatusChoices.SUCCEEDED,
    )
    # Render the two custom list-display callables directly off the ModelAdmin
    admin = ArchiveResultAdmin(ArchiveResult, AdminSite())
    plugin_html = str(admin.plugin_with_icon(result))
    process_html = str(admin.process_link(result))
    assert '/admin/environment/plugins/builtin.wget/' in plugin_html
    assert f'/admin/machine/process/{process.id}/change' in process_html
def test_process_admin_links_binary_and_iface():
    """Process admin columns should link to the Binary and NetworkInterface rows."""
    from archivebox.machine.admin import ProcessAdmin
    from archivebox.machine.models import Binary, Process
    machine = _create_machine()
    iface = _create_iface(machine)
    binary = Binary.objects.create(
        machine=machine,
        name='wget',
        abspath='/usr/local/bin/wget',
        version='1.21.2',
        binprovider='env',
        binproviders='env',
        status=Binary.StatusChoices.INSTALLED,
    )
    process = Process.objects.create(
        machine=machine,
        iface=iface,
        binary=binary,
        process_type=Process.TypeChoices.HOOK,
        pwd='/tmp/wget',
        cmd=['/tmp/on_Snapshot__06_wget.finite.bg.py', '--url=https://example.com'],
        status=Process.StatusChoices.EXITED,
    )
    # Render the custom link callables directly off the ModelAdmin
    admin = ProcessAdmin(Process, AdminSite())
    binary_html = str(admin.binary_link(process))
    iface_html = str(admin.iface_link(process))
    assert f'/admin/machine/binary/{binary.id}/change' in binary_html
    assert f'/admin/machine/networkinterface/{iface.id}/change' in iface_html

View File

@@ -9,11 +9,13 @@ Tests cover:
"""
import pytest
import uuid
from typing import cast
from django.test import override_settings
from django.urls import reverse
from django.contrib.auth import get_user_model
from django.contrib.auth.models import UserManager
from django.utils import timezone
pytestmark = pytest.mark.django_db
@@ -195,6 +197,232 @@ class TestAdminSnapshotListView:
assert b'snapshot-view-list' in response.content
assert b'snapshot-view-grid' in response.content
def test_binary_change_view_renders(self, client, admin_user, db):
    """Binary admin change form should load without FieldError."""
    from archivebox.machine.models import Machine, Binary
    # Minimal Machine row; unique guid/hw_uuid avoid collisions across tests
    machine = Machine.objects.create(
        guid=f'test-guid-{uuid.uuid4()}',
        hostname='test-host',
        hw_in_docker=False,
        hw_in_vm=False,
        hw_manufacturer='Test',
        hw_product='Test Product',
        hw_uuid=f'test-hw-{uuid.uuid4()}',
        os_arch='x86_64',
        os_family='darwin',
        os_platform='darwin',
        os_release='test',
        os_kernel='test-kernel',
        stats={},
    )
    binary = Binary.objects.create(
        machine=machine,
        name='gallery-dl',
        binproviders='env',
        binprovider='env',
        abspath='/opt/homebrew/bin/gallery-dl',
        version='1.26.9',
        sha256='abc123',
        status=Binary.StatusChoices.INSTALLED,
    )
    client.login(username='testadmin', password='testpassword')
    url = f'/admin/machine/binary/{binary.pk}/change/'
    response = client.get(url, HTTP_HOST=ADMIN_HOST)
    assert response.status_code == 200
    assert b'gallery-dl' in response.content
def test_change_view_renders_real_redo_failed_action(self, client, admin_user, snapshot):
    """Snapshot change page should link to the per-object redo-failed action URL."""
    client.login(username='testadmin', password='testpassword')
    change_url = reverse('admin:core_snapshot_change', args=[snapshot.pk])
    page = client.get(change_url, HTTP_HOST=ADMIN_HOST)
    assert page.status_code == 200
    expected_action = f'/admin/core/snapshot/{snapshot.pk}/redo-failed/'.encode()
    assert expected_action in page.content
def test_redo_failed_action_requeues_snapshot(self, client, admin_user, snapshot, monkeypatch):
    """POSTing redo-failed should enqueue the snapshot once and redirect back to it."""
    import archivebox.core.admin_snapshots as admin_snapshots
    calls = []

    def record_call(obj, overwrite=False, methods=None):
        # Capture the exact arguments the admin action passes through
        calls.append((str(obj.pk), overwrite, methods))
        return 1

    monkeypatch.setattr(admin_snapshots, 'bg_archive_snapshot', record_call)
    client.login(username='testadmin', password='testpassword')
    action_url = reverse('admin:core_snapshot_redo_failed', args=[snapshot.pk])
    response = client.post(action_url, HTTP_HOST=ADMIN_HOST)
    assert response.status_code == 302
    assert calls == [(str(snapshot.pk), False, None)]
    assert response['Location'].endswith(f'/admin/core/snapshot/{snapshot.pk}/change/')
class TestArchiveResultAdminListView:
    """Tests for the ArchiveResult admin changelist rendering."""

    def test_list_view_renders_readonly_tags_and_noresults_status(self, client, admin_user, snapshot):
        """Changelist should show snapshot tags read-only and the 'No Results' status label."""
        from archivebox.core.models import ArchiveResult, Tag
        tag = Tag.objects.create(name='Alpha Research')
        snapshot.tags.add(tag)
        ArchiveResult.objects.create(
            snapshot=snapshot,
            plugin='title',
            status=ArchiveResult.StatusChoices.NORESULTS,
            output_str='No title found',
        )
        client.login(username='testadmin', password='testpassword')
        response = client.get(reverse('admin:core_archiveresult_changelist'), HTTP_HOST=ADMIN_HOST)
        assert response.status_code == 200
        assert b'Alpha Research' in response.content
        # Tags render via the read-only inline tag editor, not an editable widget
        assert b'tag-editor-inline readonly' in response.content
        assert b'No Results' in response.content

    def test_archiveresult_model_has_no_retry_at_field(self):
        """retry_at was removed from ArchiveResult; guard against its reintroduction."""
        from archivebox.core.models import ArchiveResult
        assert 'retry_at' not in {field.name for field in ArchiveResult._meta.fields}
class TestLiveProgressView:
    """Tests for /live_progress merging Process rows with ArchiveResult rows."""

    def test_live_progress_routes_crawl_process_rows_to_crawl_setup(self, client, admin_user, snapshot, db):
        """on_Crawl__* hook processes should appear under the crawl's setup_plugins."""
        import archivebox.machine.models as machine_models
        from archivebox.machine.models import Machine, Process
        # Reset the cached current-machine singleton so Machine.current() re-resolves
        machine_models._CURRENT_MACHINE = None
        machine = Machine.current()
        Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.RUNNING,
            pid=43210,
            cmd=['/plugins/chrome/on_Crawl__91_chrome_wait.js', '--url=https://example.com'],
            env={
                'CRAWL_ID': str(snapshot.crawl_id),
                'SNAPSHOT_ID': str(snapshot.id),
            },
            started_at=timezone.now(),
        )
        client.login(username='testadmin', password='testpassword')
        response = client.get(reverse('live_progress'), HTTP_HOST=ADMIN_HOST)
        assert response.status_code == 200
        payload = response.json()
        active_crawl = next(crawl for crawl in payload['active_crawls'] if crawl['id'] == str(snapshot.crawl_id))
        setup_entry = next(item for item in active_crawl['setup_plugins'] if item['source'] == 'process')
        active_snapshot = next(item for item in active_crawl['active_snapshots'] if item['id'] == str(snapshot.id))
        # Crawl-level hook shows up in setup, not in the snapshot's plugin list
        assert setup_entry['label'] == 'chrome wait'
        assert setup_entry['status'] == 'started'
        assert active_crawl['worker_pid'] == 43210
        assert active_snapshot['all_plugins'] == []

    def test_live_progress_uses_snapshot_process_rows_before_archiveresults(self, client, admin_user, snapshot, db):
        """Snapshot hook processes should surface even before any ArchiveResult exists."""
        import archivebox.machine.models as machine_models
        from archivebox.machine.models import Machine, Process
        machine_models._CURRENT_MACHINE = None
        machine = Machine.current()
        Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.RUNNING,
            pid=43211,
            cmd=['/plugins/title/on_Snapshot__10_title.py', '--url=https://example.com'],
            env={
                'CRAWL_ID': str(snapshot.crawl_id),
                'SNAPSHOT_ID': str(snapshot.id),
            },
            started_at=timezone.now(),
        )
        client.login(username='testadmin', password='testpassword')
        response = client.get(reverse('live_progress'), HTTP_HOST=ADMIN_HOST)
        assert response.status_code == 200
        payload = response.json()
        active_crawl = next(crawl for crawl in payload['active_crawls'] if crawl['id'] == str(snapshot.crawl_id))
        active_snapshot = next(item for item in active_crawl['active_snapshots'] if item['id'] == str(snapshot.id))
        # The entry is sourced from the live Process row, not a projected result
        assert active_snapshot['all_plugins'][0]['source'] == 'process'
        assert active_snapshot['all_plugins'][0]['label'] == 'title'
        assert active_snapshot['all_plugins'][0]['status'] == 'started'
        assert active_snapshot['worker_pid'] == 43211

    def test_live_progress_merges_process_rows_with_archiveresults_when_present(self, client, admin_user, snapshot, db):
        """Process rows and ArchiveResults for the same snapshot should both be listed."""
        import archivebox.machine.models as machine_models
        from archivebox.core.models import ArchiveResult
        from archivebox.machine.models import Machine, Process
        machine_models._CURRENT_MACHINE = None
        machine = Machine.current()
        Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.RUNNING,
            pid=54321,
            cmd=['/plugins/chrome/on_Snapshot__11_chrome_wait.js', '--url=https://example.com'],
            env={
                'CRAWL_ID': str(snapshot.crawl_id),
                'SNAPSHOT_ID': str(snapshot.id),
            },
            started_at=timezone.now(),
        )
        ArchiveResult.objects.create(
            snapshot=snapshot,
            plugin='title',
            status=ArchiveResult.StatusChoices.STARTED,
        )
        client.login(username='testadmin', password='testpassword')
        response = client.get(reverse('live_progress'), HTTP_HOST=ADMIN_HOST)
        assert response.status_code == 200
        payload = response.json()
        active_crawl = next(crawl for crawl in payload['active_crawls'] if crawl['id'] == str(snapshot.crawl_id))
        active_snapshot = next(item for item in active_crawl['active_snapshots'] if item['id'] == str(snapshot.id))
        sources = {item['source'] for item in active_snapshot['all_plugins']}
        plugins = {item['plugin'] for item in active_snapshot['all_plugins']}
        assert sources == {'archiveresult', 'process'}
        assert 'title' in plugins
        assert 'chrome' in plugins

    def test_live_progress_omits_pid_for_exited_process_rows(self, client, admin_user, snapshot, db):
        """Exited (exit_code=0) processes report 'succeeded' and drop the pid key."""
        import archivebox.machine.models as machine_models
        from archivebox.machine.models import Machine, Process
        machine_models._CURRENT_MACHINE = None
        machine = Machine.current()
        Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.EXITED,
            exit_code=0,
            pid=99999,
            cmd=['/plugins/title/on_Snapshot__10_title.py', '--url=https://example.com'],
            env={
                'CRAWL_ID': str(snapshot.crawl_id),
                'SNAPSHOT_ID': str(snapshot.id),
            },
            started_at=timezone.now(),
            ended_at=timezone.now(),
        )
        client.login(username='testadmin', password='testpassword')
        response = client.get(reverse('live_progress'), HTTP_HOST=ADMIN_HOST)
        assert response.status_code == 200
        payload = response.json()
        active_crawl = next(crawl for crawl in payload['active_crawls'] if crawl['id'] == str(snapshot.crawl_id))
        active_snapshot = next(item for item in active_crawl['active_snapshots'] if item['id'] == str(snapshot.id))
        process_entry = next(item for item in active_snapshot['all_plugins'] if item['source'] == 'process')
        assert process_entry['status'] == 'succeeded'
        assert 'pid' not in process_entry
class TestAdminSnapshotSearch:
"""Tests for admin snapshot search functionality."""

View File

@@ -0,0 +1,305 @@
from pathlib import Path
from uuid import uuid4
import pytest
from django.db import connection
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
from abx_dl.orchestrator import create_bus
pytestmark = pytest.mark.django_db
def _cleanup_machine_process_rows() -> None:
    """Purge all machine_process rows so projection tests don't leak state."""
    with connection.cursor() as cur:
        cur.execute("DELETE FROM machine_process")
def _create_snapshot():
    """Create a minimal STARTED Snapshot (plus its parent Crawl) for projection tests."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    owner_pk = get_or_create_system_user_pk()
    parent_crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=owner_pk,
    )
    snapshot = Snapshot.objects.create(
        url="https://example.com",
        crawl=parent_crawl,
        status=Snapshot.StatusChoices.STARTED,
    )
    return snapshot
def _create_machine():
    """Create a throwaway Machine row with unique guid/hw_uuid per call."""
    from archivebox.machine.models import Machine
    fields = dict(
        guid=f'test-guid-{uuid4()}',
        hostname='test-host',
        hw_in_docker=False,
        hw_in_vm=False,
        hw_manufacturer='Test',
        hw_product='Test Product',
        hw_uuid=f'test-hw-{uuid4()}',
        os_arch='arm64',
        os_family='darwin',
        os_platform='macOS',
        os_release='14.0',
        os_kernel='Darwin',
        stats={},
        config={},
    )
    return Machine.objects.create(**fields)
def _create_iface(machine):
    """Attach a fixed-value NetworkInterface row to *machine* for hydration tests."""
    from archivebox.machine.models import NetworkInterface
    fields = dict(
        machine=machine,
        mac_address='00:11:22:33:44:55',
        ip_public='203.0.113.10',
        ip_local='10.0.0.10',
        dns_server='1.1.1.1',
        hostname='test-host',
        iface='en0',
        isp='Test ISP',
        city='Test City',
        region='Test Region',
        country='Test Country',
    )
    return NetworkInterface.objects.create(**fields)
def test_process_completed_projects_inline_archiveresult():
    """A ProcessCompletedEvent carrying inline JSONL should project a SUCCEEDED ArchiveResult."""
    from archivebox.core.models import ArchiveResult
    from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
    from archivebox.services.process_service import ProcessService
    snapshot = _create_snapshot()
    # Fake the hook's on-disk output so _collect_output_metadata finds a file
    plugin_dir = Path(snapshot.output_dir) / "wget"
    plugin_dir.mkdir(parents=True, exist_ok=True)
    (plugin_dir / "index.html").write_text("<html>ok</html>")
    bus = create_bus(name="test_inline_archiveresult")
    process_service = ProcessService(bus)
    service = ArchiveResultService(bus, process_service=process_service)
    event = ProcessCompletedEvent(
        plugin_name="wget",
        hook_name="on_Snapshot__06_wget.finite.bg",
        stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"succeeded","output_str":"wget/index.html"}\n' % snapshot.id,
        stderr="",
        exit_code=0,
        output_dir=str(plugin_dir),
        output_files=["index.html"],
        process_id="proc-inline",
        snapshot_id=str(snapshot.id),
        start_ts="2026-03-22T12:00:00+00:00",
        end_ts="2026-03-22T12:00:01+00:00",
    )
    output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
    # Call the projection directly with the parsed inline record
    service._project_from_process_completed(
        event,
        {
            "snapshot_id": str(snapshot.id),
            "plugin": "wget",
            "hook_name": "on_Snapshot__06_wget.finite.bg",
            "status": "succeeded",
            "output_str": "wget/index.html",
        },
        output_files,
        output_size,
        output_mimetypes,
    )
    result = ArchiveResult.objects.get(snapshot=snapshot, plugin="wget", hook_name="on_Snapshot__06_wget.finite.bg")
    assert result.status == ArchiveResult.StatusChoices.SUCCEEDED
    assert result.output_str == "wget/index.html"
    assert "index.html" in result.output_files
    _cleanup_machine_process_rows()
def test_process_completed_projects_synthetic_failed_archiveresult():
    """A failed process with no inline JSONL should project a synthetic FAILED ArchiveResult."""
    from archivebox.core.models import ArchiveResult
    from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
    from archivebox.services.process_service import ProcessService
    snapshot = _create_snapshot()
    plugin_dir = Path(snapshot.output_dir) / "chrome"
    plugin_dir.mkdir(parents=True, exist_ok=True)
    bus = create_bus(name="test_synthetic_archiveresult")
    process_service = ProcessService(bus)
    service = ArchiveResultService(bus, process_service=process_service)
    # exit_code=-1 + empty stdout simulates a hook killed by the timeout
    event = ProcessCompletedEvent(
        plugin_name="chrome",
        hook_name="on_Snapshot__11_chrome_wait",
        stdout="",
        stderr="Hook timed out after 60 seconds",
        exit_code=-1,
        output_dir=str(plugin_dir),
        output_files=[],
        process_id="proc-failed",
        snapshot_id=str(snapshot.id),
        start_ts="2026-03-22T12:00:00+00:00",
        end_ts="2026-03-22T12:01:00+00:00",
    )
    output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
    service._project_from_process_completed(
        event,
        {
            "plugin": "chrome",
            "hook_name": "on_Snapshot__11_chrome_wait",
            "status": "failed",
            "output_str": "Hook timed out after 60 seconds",
            "error": "Hook timed out after 60 seconds",
        },
        output_files,
        output_size,
        output_mimetypes,
    )
    result = ArchiveResult.objects.get(snapshot=snapshot, plugin="chrome", hook_name="on_Snapshot__11_chrome_wait")
    assert result.status == ArchiveResult.StatusChoices.FAILED
    assert result.output_str == "Hook timed out after 60 seconds"
    # The error text is also preserved in the result's notes field
    assert "Hook timed out" in result.notes
    _cleanup_machine_process_rows()
def test_process_completed_projects_noresults_archiveresult():
    """An inline JSONL record with status=noresults should project a NORESULTS ArchiveResult."""
    from archivebox.core.models import ArchiveResult
    from archivebox.services.archive_result_service import ArchiveResultService, _collect_output_metadata
    from archivebox.services.process_service import ProcessService
    snapshot = _create_snapshot()
    plugin_dir = Path(snapshot.output_dir) / "title"
    plugin_dir.mkdir(parents=True, exist_ok=True)
    bus = create_bus(name="test_noresults_archiveresult")
    process_service = ProcessService(bus)
    service = ArchiveResultService(bus, process_service=process_service)
    event = ProcessCompletedEvent(
        plugin_name="title",
        hook_name="on_Snapshot__54_title.js",
        stdout='{"snapshot_id":"%s","type":"ArchiveResult","status":"noresults","output_str":"No title found"}\n' % snapshot.id,
        stderr="",
        exit_code=0,
        output_dir=str(plugin_dir),
        output_files=[],
        process_id="proc-noresults",
        snapshot_id=str(snapshot.id),
        start_ts="2026-03-22T12:00:00+00:00",
        end_ts="2026-03-22T12:00:01+00:00",
    )
    output_files, output_size, output_mimetypes = _collect_output_metadata(plugin_dir)
    service._project_from_process_completed(
        event,
        {
            "snapshot_id": str(snapshot.id),
            "plugin": "title",
            "hook_name": "on_Snapshot__54_title.js",
            "status": "noresults",
            "output_str": "No title found",
        },
        output_files,
        output_size,
        output_mimetypes,
    )
    result = ArchiveResult.objects.get(snapshot=snapshot, plugin="title", hook_name="on_Snapshot__54_title.js")
    assert result.status == ArchiveResult.StatusChoices.NORESULTS
    assert result.output_str == "No title found"
    _cleanup_machine_process_rows()
def test_process_started_hydrates_binary_and_iface_from_existing_binary_records(monkeypatch):
    """ProcessStartedEvent should link the new Process row to a matching Binary and iface."""
    from archivebox.machine.models import Binary, NetworkInterface
    from archivebox.services.process_service import ProcessService
    machine = _create_machine()
    iface = _create_iface(machine)
    # Avoid a real network probe; pin the 'current' interface to our fixture
    monkeypatch.setattr(NetworkInterface, 'current', classmethod(lambda cls, refresh=False: iface))
    binary = Binary.objects.create(
        machine=machine,
        name='postlight-parser',
        abspath='/tmp/postlight-parser',
        version='2.2.3',
        binprovider='npm',
        binproviders='npm',
        status=Binary.StatusChoices.INSTALLED,
    )
    bus = create_bus(name="test_process_started_binary_hydration")
    service = ProcessService(bus)
    # MERCURY_BINARY matches the Binary row's abspath; NODE_BINARY has no row
    event = ProcessStartedEvent(
        plugin_name="mercury",
        hook_name="on_Snapshot__57_mercury.py",
        hook_path="/plugins/mercury/on_Snapshot__57_mercury.py",
        hook_args=["--url=https://example.com"],
        output_dir="/tmp/mercury",
        env={
            "MERCURY_BINARY": binary.abspath,
            "NODE_BINARY": "/tmp/node",
        },
        timeout=60,
        pid=4321,
        process_id="proc-mercury",
        snapshot_id="",
        start_ts="2026-03-22T12:00:00+00:00",
    )
    service._project_started(event)
    process = service._get_or_create_process(event)
    assert process.binary_id == binary.id
    assert process.iface_id == iface.id
def test_process_started_uses_node_binary_for_js_hooks_without_plugin_binary(monkeypatch):
    """JS hooks with no plugin-specific binary should fall back to the node Binary row."""
    from archivebox.machine.models import Binary, NetworkInterface
    from archivebox.services.process_service import ProcessService
    machine = _create_machine()
    iface = _create_iface(machine)
    # Avoid a real network probe; pin the 'current' interface to our fixture
    monkeypatch.setattr(NetworkInterface, 'current', classmethod(lambda cls, refresh=False: iface))
    node = Binary.objects.create(
        machine=machine,
        name='node',
        abspath='/tmp/node',
        version='22.0.0',
        binprovider='env',
        binproviders='env',
        status=Binary.StatusChoices.INSTALLED,
    )
    bus = create_bus(name="test_process_started_node_fallback")
    service = ProcessService(bus)
    # Only NODE_BINARY in env: the .js hook has no dedicated plugin binary
    event = ProcessStartedEvent(
        plugin_name="parse_dom_outlinks",
        hook_name="on_Snapshot__75_parse_dom_outlinks.js",
        hook_path="/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js",
        hook_args=["--url=https://example.com"],
        output_dir="/tmp/parse-dom-outlinks",
        env={
            "NODE_BINARY": node.abspath,
        },
        timeout=60,
        pid=9876,
        process_id="proc-parse-dom-outlinks",
        snapshot_id="",
        start_ts="2026-03-22T12:00:00+00:00",
    )
    service._project_started(event)
    process = service._get_or_create_process(event)
    assert process.binary_id == node.id
    assert process.iface_id == iface.id

View File

@@ -44,6 +44,27 @@ def test_add_single_url_creates_snapshot_in_db(tmp_path, process, disable_extrac
assert snapshots[0][0] == 'https://example.com'
def test_add_bg_creates_root_snapshot_rows_immediately(tmp_path, process, disable_extractors_dict):
    """Background add should create root snapshots immediately so the queue is visible in the DB."""
    os.chdir(tmp_path)
    result = subprocess.run(
        ['archivebox', 'add', '--bg', '--depth=0', 'https://example.com'],
        capture_output=True,
        env=disable_extractors_dict,
    )
    # Surface the CLI's stderr on failure instead of a bare returncode mismatch
    assert result.returncode == 0, f"archivebox add --bg failed: {result.stderr}"
    conn = sqlite3.connect("index.sqlite3")
    try:
        # Ensure the connection is closed even if the query raises
        c = conn.cursor()
        snapshots = c.execute("SELECT url, status FROM core_snapshot").fetchall()
    finally:
        conn.close()
    # Exactly one root snapshot, already 'queued' before any worker picks it up
    assert len(snapshots) == 1
    assert snapshots[0][0] == 'https://example.com'
    assert snapshots[0][1] == 'queued'
def test_add_creates_crawl_record(tmp_path, process, disable_extractors_dict):
"""Test that add command creates a Crawl record in the database."""
os.chdir(tmp_path)
@@ -217,6 +238,32 @@ def test_add_records_selected_persona_on_crawl(tmp_path, process, disable_extrac
assert persona_id
assert default_persona == 'Default'
assert (tmp_path / "personas" / "Default" / "chrome_user_data").is_dir()
def test_add_records_url_filter_overrides_on_crawl(tmp_path, process, disable_extractors_dict):
    """--domain-allowlist/--domain-denylist CLI flags should be persisted into crawl config."""
    os.chdir(tmp_path)
    result = subprocess.run(
        [
            'archivebox', 'add', '--index-only', '--depth=0',
            '--domain-allowlist=example.com,*.example.com',
            '--domain-denylist=static.example.com',
            'https://example.com',
        ],
        capture_output=True,
        env=disable_extractors_dict,
    )
    assert result.returncode == 0
    # Read the persisted config JSON straight out of the crawl row
    conn = sqlite3.connect("index.sqlite3")
    c = conn.cursor()
    allowlist, denylist = c.execute(
        "SELECT json_extract(config, '$.URL_ALLOWLIST'), json_extract(config, '$.URL_DENYLIST') FROM crawls_crawl LIMIT 1"
    ).fetchone()
    conn.close()
    assert allowlist == 'example.com,*.example.com'
    assert denylist == 'static.example.com'
    # NOTE(review): this assert checks persona chrome_extensions setup, which looks
    # unrelated to URL filters — confirm it belongs in this test.
    assert (tmp_path / "personas" / "Default" / "chrome_extensions").is_dir()

View File

@@ -16,6 +16,13 @@ from archivebox.tests.conftest import (
create_test_url,
)
PROJECTOR_TEST_ENV = {
'PLUGINS': 'favicon',
'SAVE_FAVICON': 'True',
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
}
class TestArchiveResultCreate:
"""Tests for `archivebox archiveresult create`."""
@@ -38,13 +45,14 @@ class TestArchiveResultCreate:
assert code == 0, f"Command failed: {stderr}"
records = parse_jsonl_output(stdout2)
# Should have the Snapshot passed through and ArchiveResult created
# Should have the Snapshot passed through and an ArchiveResult request emitted
types = [r.get('type') for r in records]
assert 'Snapshot' in types
assert 'ArchiveResult' in types
ar = next(r for r in records if r['type'] == 'ArchiveResult')
assert ar['plugin'] == 'title'
assert 'id' not in ar
def test_create_with_specific_plugin(self, initialized_archive):
"""Create archive result for specific plugin."""
@@ -122,15 +130,33 @@ class TestArchiveResultList:
def test_list_filter_by_status(self, initialized_archive):
"""Filter archive results by status."""
# Create snapshot and archive result
# Create snapshot and materialize an archive result via the runner
url = create_test_url()
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
snapshot = parse_jsonl_output(stdout1)[0]
run_archivebox_cmd(
['archiveresult', 'create', '--plugin=title'],
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=favicon'],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
run_archivebox_cmd(
['run'],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
env=PROJECTOR_TEST_ENV,
)
created = parse_jsonl_output(
run_archivebox_cmd(
['archiveresult', 'list', '--plugin=favicon'],
data_dir=initialized_archive,
)[0]
)[0]
run_archivebox_cmd(
['archiveresult', 'update', '--status=queued'],
stdin=json.dumps(created),
data_dir=initialized_archive,
)
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'list', '--status=queued'],
@@ -147,21 +173,28 @@ class TestArchiveResultList:
url = create_test_url()
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
snapshot = parse_jsonl_output(stdout1)[0]
run_archivebox_cmd(
['archiveresult', 'create', '--plugin=title'],
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=favicon'],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
run_archivebox_cmd(
['run'],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
env=PROJECTOR_TEST_ENV,
)
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'list', '--plugin=title'],
['archiveresult', 'list', '--plugin=favicon'],
data_dir=initialized_archive,
)
assert code == 0
records = parse_jsonl_output(stdout)
for r in records:
assert r['plugin'] == 'title'
assert r['plugin'] == 'favicon'
def test_list_with_limit(self, initialized_archive):
"""Limit number of results."""
@@ -170,11 +203,18 @@ class TestArchiveResultList:
url = create_test_url()
stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], data_dir=initialized_archive)
snapshot = parse_jsonl_output(stdout1)[0]
run_archivebox_cmd(
['archiveresult', 'create', '--plugin=title'],
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=favicon'],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
run_archivebox_cmd(
['run'],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
env=PROJECTOR_TEST_ENV,
)
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'list', '--limit=2'],
@@ -196,11 +236,22 @@ class TestArchiveResultUpdate:
snapshot = parse_jsonl_output(stdout1)[0]
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=title'],
['archiveresult', 'create', '--plugin=favicon'],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
stdout_run, _, _ = run_archivebox_cmd(
['run'],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
env=PROJECTOR_TEST_ENV,
)
stdout_list, _, _ = run_archivebox_cmd(
['archiveresult', 'list', '--plugin=favicon'],
data_dir=initialized_archive,
)
ar = parse_jsonl_output(stdout_list)[0]
stdout3, stderr, code = run_archivebox_cmd(
['archiveresult', 'update', '--status=failed'],
@@ -225,11 +276,22 @@ class TestArchiveResultDelete:
snapshot = parse_jsonl_output(stdout1)[0]
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=title'],
['archiveresult', 'create', '--plugin=favicon'],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
stdout_run, _, _ = run_archivebox_cmd(
['run'],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
env=PROJECTOR_TEST_ENV,
)
stdout_list, _, _ = run_archivebox_cmd(
['archiveresult', 'list', '--plugin=favicon'],
data_dir=initialized_archive,
)
ar = parse_jsonl_output(stdout_list)[0]
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'delete'],
@@ -247,11 +309,22 @@ class TestArchiveResultDelete:
snapshot = parse_jsonl_output(stdout1)[0]
stdout2, _, _ = run_archivebox_cmd(
['archiveresult', 'create', '--plugin=title'],
['archiveresult', 'create', '--plugin=favicon'],
stdin=json.dumps(snapshot),
data_dir=initialized_archive,
)
ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
stdout_run, _, _ = run_archivebox_cmd(
['run'],
stdin=stdout2,
data_dir=initialized_archive,
timeout=120,
env=PROJECTOR_TEST_ENV,
)
stdout_list, _, _ = run_archivebox_cmd(
['archiveresult', 'list', '--plugin=favicon'],
data_dir=initialized_archive,
)
ar = parse_jsonl_output(stdout_list)[0]
stdout, stderr, code = run_archivebox_cmd(
['archiveresult', 'delete', '--yes'],

View File

@@ -83,7 +83,7 @@ class TestCrawlCreate:
assert code == 0
records = parse_jsonl_output(stdout)
assert 'test-tag' in records[0].get('tags_str', '')
assert 'test-tag' in records[0].get('tags', '')
def test_create_pass_through_other_types(self, initialized_archive):
"""Pass-through records of other types unchanged."""

View File

@@ -173,6 +173,20 @@ def test_collect_urls_from_plugins_reads_only_parser_outputs(tmp_path):
assert collect_urls_from_plugins(tmp_path / "nonexistent") == []
def test_collect_urls_from_plugins_trims_markdown_suffixes(tmp_path):
from archivebox.hooks import collect_urls_from_plugins
(tmp_path / "parse_html_urls").mkdir()
(tmp_path / "parse_html_urls" / "urls.jsonl").write_text(
'{"url":"https://docs.sweeting.me/s/youtube-favorites)**"}\n',
encoding="utf-8",
)
urls = collect_urls_from_plugins(tmp_path)
assert len(urls) == 1
assert urls[0]["url"] == "https://docs.sweeting.me/s/youtube-favorites"
def test_crawl_create_stdout_pipes_into_run(initialized_archive):
"""`archivebox crawl create | archivebox run` should queue and materialize snapshots."""
url = create_test_url()
@@ -269,8 +283,13 @@ def test_archiveresult_list_stdout_pipes_into_run(initialized_archive):
)
assert ar_create_code == 0, ar_create_stderr
created_records = parse_jsonl_output(ar_create_stdout)
archiveresult = next(record for record in created_records if record.get("type") == "ArchiveResult")
run_archivebox_cmd(
["run"],
stdin=ar_create_stdout,
data_dir=initialized_archive,
timeout=120,
env=PIPE_TEST_ENV,
)
list_stdout, list_stderr, list_code = run_archivebox_cmd(
["archiveresult", "list", "--plugin=favicon"],
@@ -278,6 +297,8 @@ def test_archiveresult_list_stdout_pipes_into_run(initialized_archive):
)
assert list_code == 0, list_stderr
_assert_stdout_is_jsonl_only(list_stdout)
listed_records = parse_jsonl_output(list_stdout)
archiveresult = next(record for record in listed_records if record.get("type") == "ArchiveResult")
run_stdout, run_stderr, run_code = run_archivebox_cmd(
["run"],

View File

@@ -8,6 +8,9 @@ Tests cover:
"""
import json
import sys
import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,
@@ -266,3 +269,182 @@ class TestRunEmpty:
assert code == 0
assert 'No records to process' in stderr
class TestRunDaemonMode:
def test_run_daemon_processes_stdin_before_runner(self, monkeypatch):
from archivebox.cli import archivebox_run
class FakeStdin:
def isatty(self):
return False
monkeypatch.setattr(sys, "stdin", FakeStdin())
calls = []
monkeypatch.setattr(
archivebox_run,
"process_stdin_records",
lambda: calls.append("stdin") or 0,
)
monkeypatch.setattr(
archivebox_run,
"run_runner",
lambda daemon=False: calls.append(f"runner:{daemon}") or 0,
)
with pytest.raises(SystemExit) as exit_info:
archivebox_run.main.callback(daemon=True, crawl_id=None, snapshot_id=None, binary_id=None)
assert exit_info.value.code == 0
assert calls == ["stdin", "runner:True"]
def test_run_daemon_skips_runner_if_stdin_processing_fails(self, monkeypatch):
from archivebox.cli import archivebox_run
class FakeStdin:
def isatty(self):
return False
monkeypatch.setattr(sys, "stdin", FakeStdin())
monkeypatch.setattr(archivebox_run, "process_stdin_records", lambda: 1)
monkeypatch.setattr(
archivebox_run,
"run_runner",
lambda daemon=False: (_ for _ in ()).throw(AssertionError("runner should not start after stdin failure")),
)
with pytest.raises(SystemExit) as exit_info:
archivebox_run.main.callback(daemon=True, crawl_id=None, snapshot_id=None, binary_id=None)
assert exit_info.value.code == 1
@pytest.mark.django_db
class TestRecoverOrphanedCrawls:
def test_recover_orphaned_crawl_requeues_started_crawl_without_active_processes(self):
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.services.runner import recover_orphaned_crawls
crawl = Crawl.objects.create(
urls='https://example.com',
created_by_id=get_or_create_system_user_pk(),
status=Crawl.StatusChoices.STARTED,
retry_at=None,
)
Snapshot.objects.create(
url='https://example.com',
crawl=crawl,
status=Snapshot.StatusChoices.QUEUED,
retry_at=None,
)
recovered = recover_orphaned_crawls()
crawl.refresh_from_db()
assert recovered == 1
assert crawl.status == Crawl.StatusChoices.STARTED
assert crawl.retry_at is not None
def test_recover_orphaned_crawl_skips_active_child_processes(self):
import archivebox.machine.models as machine_models
from django.utils import timezone
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.machine.models import Machine, Process
from archivebox.services.runner import recover_orphaned_crawls
crawl = Crawl.objects.create(
urls='https://example.com',
created_by_id=get_or_create_system_user_pk(),
status=Crawl.StatusChoices.STARTED,
retry_at=None,
)
snapshot = Snapshot.objects.create(
url='https://example.com',
crawl=crawl,
status=Snapshot.StatusChoices.QUEUED,
retry_at=None,
)
machine_models._CURRENT_MACHINE = None
machine = Machine.current()
Process.objects.create(
machine=machine,
process_type=Process.TypeChoices.HOOK,
status=Process.StatusChoices.RUNNING,
cmd=['/plugins/chrome/on_Crawl__91_chrome_wait.js'],
env={
'CRAWL_ID': str(crawl.id),
'SNAPSHOT_ID': str(snapshot.id),
},
started_at=timezone.now(),
)
recovered = recover_orphaned_crawls()
crawl.refresh_from_db()
assert recovered == 0
assert crawl.retry_at is None
def test_recover_orphaned_crawl_seals_when_all_snapshots_are_already_sealed(self):
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.services.runner import recover_orphaned_crawls
crawl = Crawl.objects.create(
urls='https://example.com',
created_by_id=get_or_create_system_user_pk(),
status=Crawl.StatusChoices.STARTED,
retry_at=None,
)
Snapshot.objects.create(
url='https://example.com',
crawl=crawl,
status=Snapshot.StatusChoices.SEALED,
retry_at=None,
)
recovered = recover_orphaned_crawls()
crawl.refresh_from_db()
assert recovered == 1
assert crawl.status == Crawl.StatusChoices.SEALED
assert crawl.retry_at is None
@pytest.mark.django_db
class TestRecoverOrphanedSnapshots:
def test_recover_orphaned_snapshot_requeues_started_snapshot_without_active_processes(self):
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.services.runner import recover_orphaned_snapshots
crawl = Crawl.objects.create(
urls='https://example.com',
created_by_id=get_or_create_system_user_pk(),
status=Crawl.StatusChoices.SEALED,
retry_at=None,
)
snapshot = Snapshot.objects.create(
url='https://example.com',
crawl=crawl,
status=Snapshot.StatusChoices.STARTED,
retry_at=None,
)
recovered = recover_orphaned_snapshots()
snapshot.refresh_from_db()
crawl.refresh_from_db()
assert recovered == 1
assert snapshot.status == Snapshot.StatusChoices.QUEUED
assert snapshot.retry_at is not None
assert crawl.status == Crawl.StatusChoices.QUEUED
assert crawl.retry_at is not None

View File

@@ -6,6 +6,15 @@ Verify server can start (basic smoke tests only, no full server testing).
import os
import subprocess
import sys
from unittest.mock import Mock
def test_sqlite_connections_use_explicit_30_second_busy_timeout():
from archivebox.core.settings import SQLITE_CONNECTION_OPTIONS
assert SQLITE_CONNECTION_OPTIONS["OPTIONS"]["timeout"] == 30
assert "PRAGMA busy_timeout = 30000;" in SQLITE_CONNECTION_OPTIONS["OPTIONS"]["init_command"]
def test_server_shows_usage_info(tmp_path, process):
@@ -39,3 +48,64 @@ def test_server_init_flag(tmp_path, process):
assert result.returncode == 0
assert '--init' in result.stdout or 'init' in result.stdout.lower()
def test_runner_worker_uses_current_interpreter():
"""The supervised runner should use the active Python environment, not PATH."""
from archivebox.workers.supervisord_util import RUNNER_WORKER
assert RUNNER_WORKER["command"] == f"{sys.executable} -m archivebox run --daemon"
def test_reload_workers_use_current_interpreter_and_supervisord_managed_runner():
from archivebox.workers.supervisord_util import RUNNER_WATCH_WORKER, RUNSERVER_WORKER
runserver = RUNSERVER_WORKER("127.0.0.1", "8000", reload=True, pidfile="/tmp/runserver.pid")
watcher = RUNNER_WATCH_WORKER("/tmp/runserver.pid")
assert runserver["name"] == "worker_runserver"
assert runserver["command"] == f"{sys.executable} -m archivebox manage runserver 127.0.0.1:8000"
assert 'ARCHIVEBOX_RUNSERVER="1"' in runserver["environment"]
assert 'ARCHIVEBOX_AUTORELOAD="1"' in runserver["environment"]
assert 'ARCHIVEBOX_RUNSERVER_PIDFILE="/tmp/runserver.pid"' in runserver["environment"]
assert watcher["name"] == "worker_runner_watch"
assert watcher["command"] == f"{sys.executable} -m archivebox manage runner_watch --pidfile=/tmp/runserver.pid"
def test_stop_existing_background_runner_cleans_up_and_stops_orchestrators():
from archivebox.cli.archivebox_server import stop_existing_background_runner
runner_a = Mock()
runner_a.kill_tree = Mock()
runner_a.terminate = Mock()
runner_b = Mock()
runner_b.kill_tree = Mock(side_effect=RuntimeError("boom"))
runner_b.terminate = Mock()
process_model = Mock()
process_model.StatusChoices.RUNNING = "running"
process_model.TypeChoices.ORCHESTRATOR = "orchestrator"
queryset = Mock()
queryset.order_by.return_value = [runner_a, runner_b]
process_model.objects.filter.return_value = queryset
supervisor = Mock()
stop_worker = Mock()
log = Mock()
stopped = stop_existing_background_runner(
machine=Mock(),
process_model=process_model,
supervisor=supervisor,
stop_worker_fn=stop_worker,
log=log,
)
assert stopped == 2
assert process_model.cleanup_stale_running.call_count == 2
stop_worker.assert_any_call(supervisor, "worker_runner")
stop_worker.assert_any_call(supervisor, "worker_runner_watch")
runner_a.kill_tree.assert_called_once_with(graceful_timeout=2.0)
runner_b.terminate.assert_called_once_with(graceful_timeout=2.0)
log.assert_called_once()

View File

@@ -74,7 +74,7 @@ class TestSnapshotCreate:
assert code == 0
records = parse_jsonl_output(stdout)
assert 'test-tag' in records[0].get('tags_str', '')
assert 'test-tag' in records[0].get('tags', '')
def test_create_pass_through_other_types(self, initialized_archive):
"""Pass-through records of other types unchanged."""

View File

@@ -0,0 +1,326 @@
from datetime import timedelta
from types import SimpleNamespace
import pytest
from django.test import RequestFactory
from django.utils import timezone
from archivebox.config import views as config_views
from archivebox.core import views as core_views
from archivebox.machine.models import Binary
pytestmark = pytest.mark.django_db
def test_get_db_binaries_by_name_collapses_youtube_dl_aliases(monkeypatch):
now = timezone.now()
records = [
SimpleNamespace(
name='youtube-dl',
version='',
binprovider='',
abspath='/usr/bin/youtube-dl',
status=Binary.StatusChoices.INSTALLED,
modified_at=now,
),
SimpleNamespace(
name='yt-dlp',
version='2026.03.01',
binprovider='pip',
abspath='/usr/bin/yt-dlp',
status=Binary.StatusChoices.INSTALLED,
modified_at=now + timedelta(seconds=1),
),
]
monkeypatch.setattr(config_views.Binary, 'objects', SimpleNamespace(all=lambda: records))
binaries = config_views.get_db_binaries_by_name()
assert 'yt-dlp' in binaries
assert 'youtube-dl' not in binaries
assert binaries['yt-dlp'].version == '2026.03.01'
def test_binaries_list_view_uses_db_version_and_hides_youtube_dl_alias(monkeypatch):
request = RequestFactory().get('/admin/environment/binaries/')
request.user = SimpleNamespace(is_superuser=True)
db_binary = SimpleNamespace(
name='youtube-dl',
version='2026.03.01',
binprovider='pip',
abspath='/usr/bin/yt-dlp',
status=Binary.StatusChoices.INSTALLED,
sha256='',
modified_at=timezone.now(),
)
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {'yt-dlp': db_binary})
context = config_views.binaries_list_view.__wrapped__(request)
assert len(context['table']['Binary Name']) == 1
assert str(context['table']['Binary Name'][0].link_item) == 'yt-dlp'
assert context['table']['Found Version'][0] == '✅ 2026.03.01'
assert context['table']['Provided By'][0] == 'pip'
assert context['table']['Found Abspath'][0] == '/usr/bin/yt-dlp'
def test_binaries_list_view_only_shows_persisted_records(monkeypatch):
request = RequestFactory().get('/admin/environment/binaries/')
request.user = SimpleNamespace(is_superuser=True)
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {})
context = config_views.binaries_list_view.__wrapped__(request)
assert context['table']['Binary Name'] == []
assert context['table']['Found Version'] == []
assert context['table']['Provided By'] == []
assert context['table']['Found Abspath'] == []
def test_binary_detail_view_uses_canonical_db_record(monkeypatch):
request = RequestFactory().get('/admin/environment/binaries/youtube-dl/')
request.user = SimpleNamespace(is_superuser=True)
db_binary = SimpleNamespace(
id='019d14cc-6c40-7793-8ff1-0f8bb050e8a3',
name='yt-dlp',
version='2026.03.01',
binprovider='pip',
abspath='/usr/bin/yt-dlp',
sha256='abc123',
status=Binary.StatusChoices.INSTALLED,
modified_at=timezone.now(),
)
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {'yt-dlp': db_binary})
context = config_views.binary_detail_view.__wrapped__(request, key='youtube-dl')
section = context['data'][0]
assert context['title'] == 'yt-dlp'
assert section['fields']['name'] == 'yt-dlp'
assert section['fields']['version'] == '2026.03.01'
assert section['fields']['binprovider'] == 'pip'
assert section['fields']['abspath'] == '/usr/bin/yt-dlp'
assert '/admin/machine/binary/019d14cc-6c40-7793-8ff1-0f8bb050e8a3/change/?_changelist_filters=q%3Dyt-dlp' in section['description']
def test_binary_detail_view_marks_unrecorded_binary(monkeypatch):
request = RequestFactory().get('/admin/environment/binaries/wget/')
request.user = SimpleNamespace(is_superuser=True)
monkeypatch.setattr(config_views, 'get_db_binaries_by_name', lambda: {})
context = config_views.binary_detail_view.__wrapped__(request, key='wget')
section = context['data'][0]
assert section['description'] == 'No persisted Binary record found'
assert section['fields']['status'] == 'unrecorded'
assert section['fields']['binprovider'] == 'not recorded'
def test_plugin_detail_view_renders_config_in_dedicated_sections(monkeypatch):
request = RequestFactory().get('/admin/environment/plugins/builtin.example/')
request.user = SimpleNamespace(is_superuser=True)
plugin_config = {
'title': 'Example Plugin',
'description': 'Example config used to verify plugin metadata rendering.',
'type': 'object',
'required_plugins': ['chrome'],
'required_binaries': ['example-cli'],
'output_mimetypes': ['text/plain', 'application/json'],
'properties': {
'EXAMPLE_ENABLED': {
'type': 'boolean',
'description': 'Enable the example plugin.',
'x-fallback': 'CHECK_SSL_VALIDITY',
},
'EXAMPLE_BINARY': {
'type': 'string',
'default': 'gallery-dl',
'description': 'Filesystem path for example output.',
'x-aliases': ['USE_EXAMPLE_BINARY'],
},
},
}
monkeypatch.setattr(config_views, 'get_filesystem_plugins', lambda: {
'builtin.example': {
'id': 'builtin.example',
'name': 'example',
'source': 'builtin',
'path': '/plugins/example',
'hooks': ['on_Snapshot__01_example.py'],
'config': plugin_config,
}
})
monkeypatch.setattr(config_views, 'get_machine_admin_url', lambda: '/admin/machine/machine/test-machine/change/')
context = config_views.plugin_detail_view.__wrapped__(request, key='builtin.example')
assert context['title'] == 'example'
assert len(context['data']) == 5
summary_section, hooks_section, metadata_section, config_section, properties_section = context['data']
assert summary_section['fields'] == {
'id': 'builtin.example',
'name': 'example',
'source': 'builtin',
}
assert '/plugins/example' in summary_section['description']
assert 'https://archivebox.github.io/abx-plugins/#example' in summary_section['description']
assert hooks_section['name'] == 'Hooks'
assert hooks_section['fields'] == {}
assert 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/example/on_Snapshot__01_example.py' in hooks_section['description']
assert 'on_Snapshot__01_example.py' in hooks_section['description']
assert metadata_section['name'] == 'Plugin Metadata'
assert metadata_section['fields'] == {}
assert 'Example Plugin' in metadata_section['description']
assert 'Example config used to verify plugin metadata rendering.' in metadata_section['description']
assert 'https://archivebox.github.io/abx-plugins/#chrome' in metadata_section['description']
assert '/admin/environment/binaries/example-cli/' in metadata_section['description']
assert 'text/plain' in metadata_section['description']
assert 'application/json' in metadata_section['description']
assert config_section['name'] == 'config.json'
assert config_section['fields'] == {}
assert '<pre style=' in config_section['description']
assert 'EXAMPLE_ENABLED' in config_section['description']
assert '<span style="color: #0550ae;">"properties"</span>' in config_section['description']
assert properties_section['name'] == 'Config Properties'
assert properties_section['fields'] == {}
assert '/admin/machine/machine/test-machine/change/' in properties_section['description']
assert '/admin/machine/binary/' in properties_section['description']
assert '/admin/environment/binaries/' in properties_section['description']
assert 'EXAMPLE_ENABLED' in properties_section['description']
assert 'boolean' in properties_section['description']
assert 'Enable the example plugin.' in properties_section['description']
assert '/admin/environment/config/EXAMPLE_ENABLED/' in properties_section['description']
assert '/admin/environment/config/CHECK_SSL_VALIDITY/' in properties_section['description']
assert '/admin/environment/config/USE_EXAMPLE_BINARY/' in properties_section['description']
assert '/admin/environment/binaries/gallery-dl/' in properties_section['description']
assert 'EXAMPLE_BINARY' in properties_section['description']
def test_get_config_definition_link_keeps_core_config_search_link(monkeypatch):
monkeypatch.setattr(core_views, 'find_plugin_for_config_key', lambda key: None)
url, label = core_views.get_config_definition_link('CHECK_SSL_VALIDITY')
assert 'github.com/search' in url
assert 'CHECK_SSL_VALIDITY' in url
assert label == 'archivebox/config'
def test_get_config_definition_link_uses_plugin_config_json_for_plugin_options(monkeypatch):
plugin_dir = core_views.BUILTIN_PLUGINS_DIR / 'parse_dom_outlinks'
monkeypatch.setattr(core_views, 'find_plugin_for_config_key', lambda key: 'parse_dom_outlinks')
monkeypatch.setattr(core_views, 'iter_plugin_dirs', lambda: [plugin_dir])
url, label = core_views.get_config_definition_link('PARSE_DOM_OUTLINKS_ENABLED')
assert url == 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json'
assert label == 'abx_plugins/plugins/parse_dom_outlinks/config.json'
def test_live_config_value_view_renames_source_field_and_uses_plugin_definition_link(monkeypatch):
request = RequestFactory().get('/admin/environment/config/PARSE_DOM_OUTLINKS_ENABLED/')
request.user = SimpleNamespace(is_superuser=True)
monkeypatch.setattr(core_views, 'get_all_configs', lambda: {})
monkeypatch.setattr(core_views, 'get_flat_config', lambda: {})
monkeypatch.setattr(core_views, 'get_config', lambda: {'PARSE_DOM_OUTLINKS_ENABLED': True})
monkeypatch.setattr(core_views, 'find_config_default', lambda key: 'True')
monkeypatch.setattr(core_views, 'find_config_type', lambda key: 'bool')
monkeypatch.setattr(core_views, 'find_config_source', lambda key, merged: 'Default')
monkeypatch.setattr(core_views, 'key_is_safe', lambda key: True)
monkeypatch.setattr(core_views.CONSTANTS, 'CONFIG_FILE', SimpleNamespace(exists=lambda: False))
from archivebox.machine.models import Machine
from archivebox.config.configset import BaseConfigSet
monkeypatch.setattr(Machine, 'current', classmethod(lambda cls: SimpleNamespace(id='machine-id', config={})))
monkeypatch.setattr(BaseConfigSet, 'load_from_file', classmethod(lambda cls, path: {}))
monkeypatch.setattr(
core_views,
'get_config_definition_link',
lambda key: (
'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/parse_dom_outlinks/config.json',
'abx_plugins/plugins/parse_dom_outlinks/config.json',
),
)
context = core_views.live_config_value_view.__wrapped__(request, key='PARSE_DOM_OUTLINKS_ENABLED')
section = context['data'][0]
assert 'Currently read from' in section['fields']
assert 'Source' not in section['fields']
assert section['fields']['Currently read from'] == 'Default'
assert 'abx_plugins/plugins/parse_dom_outlinks/config.json' in section['help_texts']['Type']
def test_find_config_source_prefers_environment_over_machine_and_file(monkeypatch):
monkeypatch.setenv('CHECK_SSL_VALIDITY', 'false')
from archivebox.machine.models import Machine
from archivebox.config.configset import BaseConfigSet
monkeypatch.setattr(
Machine,
'current',
classmethod(lambda cls: SimpleNamespace(id='machine-id', config={'CHECK_SSL_VALIDITY': 'true'})),
)
monkeypatch.setattr(
BaseConfigSet,
'load_from_file',
classmethod(lambda cls, path: {'CHECK_SSL_VALIDITY': 'true'}),
)
assert core_views.find_config_source('CHECK_SSL_VALIDITY', {'CHECK_SSL_VALIDITY': False}) == 'Environment'
def test_live_config_value_view_priority_text_matches_runtime_precedence(monkeypatch):
request = RequestFactory().get('/admin/environment/config/CHECK_SSL_VALIDITY/')
request.user = SimpleNamespace(is_superuser=True)
monkeypatch.setattr(core_views, 'get_all_configs', lambda: {})
monkeypatch.setattr(core_views, 'get_flat_config', lambda: {'CHECK_SSL_VALIDITY': True})
monkeypatch.setattr(core_views, 'get_config', lambda: {'CHECK_SSL_VALIDITY': False})
monkeypatch.setattr(core_views, 'find_config_default', lambda key: 'True')
monkeypatch.setattr(core_views, 'find_config_type', lambda key: 'bool')
monkeypatch.setattr(core_views, 'key_is_safe', lambda key: True)
from archivebox.machine.models import Machine
from archivebox.config.configset import BaseConfigSet
monkeypatch.setattr(
Machine,
'current',
classmethod(lambda cls: SimpleNamespace(id='machine-id', config={'CHECK_SSL_VALIDITY': 'true'})),
)
monkeypatch.setattr(
BaseConfigSet,
'load_from_file',
classmethod(lambda cls, path: {'CHECK_SSL_VALIDITY': 'true'}),
)
monkeypatch.setattr(core_views.CONSTANTS, 'CONFIG_FILE', SimpleNamespace(exists=lambda: True))
monkeypatch.setenv('CHECK_SSL_VALIDITY', 'false')
context = core_views.live_config_value_view.__wrapped__(request, key='CHECK_SSL_VALIDITY')
section = context['data'][0]
assert section['fields']['Currently read from'] == 'Environment'
help_text = section['help_texts']['Currently read from']
assert help_text.index('Environment') < help_text.index('Machine') < help_text.index('Config File') < help_text.index('Default')
assert 'Configuration Sources (highest priority first):' in section['help_texts']['Value']

View File

@@ -0,0 +1,220 @@
from typing import cast
import pytest
from django.contrib.auth import get_user_model
from django.contrib.auth.models import UserManager
from django.urls import reverse
from archivebox.crawls.admin import CrawlAdminForm
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
pytestmark = pytest.mark.django_db
User = get_user_model()
ADMIN_HOST = 'admin.archivebox.localhost:8000'
@pytest.fixture
def admin_user(db):
return cast(UserManager, User.objects).create_superuser(
username='crawladmin',
email='crawladmin@test.com',
password='testpassword',
)
@pytest.fixture
def crawl(admin_user):
return Crawl.objects.create(
urls='https://example.com\nhttps://example.org',
tags_str='alpha,beta',
created_by=admin_user,
)
def test_crawl_admin_change_view_renders_tag_editor_widget(client, admin_user, crawl):
client.login(username='crawladmin', password='testpassword')
response = client.get(
reverse('admin:crawls_crawl_change', args=[crawl.pk]),
HTTP_HOST=ADMIN_HOST,
)
assert response.status_code == 200
assert b'name="tags_editor"' in response.content
assert b'tag-editor-container' in response.content
assert b'alpha' in response.content
assert b'beta' in response.content
def test_crawl_admin_add_view_renders_url_filter_alias_fields(client, admin_user):
client.login(username='crawladmin', password='testpassword')
response = client.get(
reverse('admin:crawls_crawl_add'),
HTTP_HOST=ADMIN_HOST,
)
assert response.status_code == 200
assert b'name="url_filters_allowlist"' in response.content
assert b'name="url_filters_denylist"' in response.content
assert b'Same domain only' in response.content
def test_crawl_admin_form_saves_tags_editor_to_tags_str(crawl, admin_user):
form = CrawlAdminForm(
data={
'created_at': crawl.created_at.strftime('%Y-%m-%d %H:%M:%S'),
'urls': crawl.urls,
'config': '{}',
'max_depth': '0',
'tags_editor': 'alpha, beta, Alpha, gamma',
'url_filters_allowlist': 'example.com\n*.example.com',
'url_filters_denylist': 'static.example.com',
'persona_id': '',
'label': '',
'notes': '',
'schedule': '',
'status': crawl.status,
'retry_at': crawl.retry_at.strftime('%Y-%m-%d %H:%M:%S'),
'created_by': str(admin_user.pk),
'num_uses_failed': '0',
'num_uses_succeeded': '0',
},
instance=crawl,
)
assert form.is_valid(), form.errors
updated = form.save()
updated.refresh_from_db()
assert updated.tags_str == 'alpha,beta,gamma'
assert updated.config['URL_ALLOWLIST'] == 'example.com\n*.example.com'
assert updated.config['URL_DENYLIST'] == 'static.example.com'
def test_crawl_admin_delete_snapshot_action_removes_snapshot_and_url(client, admin_user):
crawl = Crawl.objects.create(
urls='https://example.com/remove-me',
created_by=admin_user,
)
snapshot = Snapshot.objects.create(
crawl=crawl,
url='https://example.com/remove-me',
)
client.login(username='crawladmin', password='testpassword')
response = client.post(
reverse('admin:crawls_crawl_snapshot_delete', args=[crawl.pk, snapshot.pk]),
HTTP_HOST=ADMIN_HOST,
)
assert response.status_code == 200
assert response.json()['ok'] is True
assert not Snapshot.objects.filter(pk=snapshot.pk).exists()
crawl.refresh_from_db()
assert 'https://example.com/remove-me' not in crawl.urls
def test_crawl_admin_exclude_domain_action_prunes_urls_and_pending_snapshots(client, admin_user):
    """Excluding a domain should denylist it, drop its URLs from the crawl, and delete only pending snapshots."""
    crawl = Crawl.objects.create(
        urls='\n'.join([
            'https://cdn.example.com/asset.js',
            'https://cdn.example.com/second.js',
            'https://example.com/root',
        ]),
        created_by=admin_user,
    )
    # queued snapshot on the excluded domain — expected to be removed
    queued_snapshot = Snapshot.objects.create(
        crawl=crawl,
        url='https://cdn.example.com/asset.js',
        status=Snapshot.StatusChoices.QUEUED,
    )
    # sealed snapshot on a different domain — expected to survive
    preserved_snapshot = Snapshot.objects.create(
        crawl=crawl,
        url='https://example.com/root',
        status=Snapshot.StatusChoices.SEALED,
    )
    client.login(username='crawladmin', password='testpassword')
    response = client.post(
        reverse('admin:crawls_crawl_snapshot_exclude_domain', args=[crawl.pk, queued_snapshot.pk]),
        HTTP_HOST=ADMIN_HOST,
    )
    assert response.status_code == 200
    payload = response.json()
    assert payload['ok'] is True
    assert payload['domain'] == 'cdn.example.com'
    crawl.refresh_from_db()
    # the domain is denylisted on the crawl itself, not via the effective (merged) config
    assert crawl.get_url_denylist(use_effective_config=False) == ['cdn.example.com']
    # every URL on the excluded domain is pruned, others are kept
    assert 'https://cdn.example.com/asset.js' not in crawl.urls
    assert 'https://cdn.example.com/second.js' not in crawl.urls
    assert 'https://example.com/root' in crawl.urls
    assert not Snapshot.objects.filter(pk=queued_snapshot.pk).exists()
    assert Snapshot.objects.filter(pk=preserved_snapshot.pk).exists()
def test_snapshot_from_json_trims_markdown_suffixes_on_discovered_urls(crawl):
    """Trailing markdown punctuation on an extracted URL should be stripped."""
    dirty_url = 'https://docs.sweeting.me/s/youtube-favorites)**'
    snapshot = Snapshot.from_json(
        {'url': dirty_url},
        overrides={'crawl': crawl},
        queue_for_extraction=False,
    )
    assert snapshot is not None
    assert snapshot.url == 'https://docs.sweeting.me/s/youtube-favorites'
def test_create_snapshots_from_urls_respects_url_allowlist_and_denylist(admin_user):
    """Only URLs matching the allowlist and not the denylist should produce snapshots."""
    crawl = Crawl.objects.create(
        urls='\n'.join([
            'https://example.com/root',           # matches allowlist
            'https://static.example.com/app.js',  # matches allowlist but also denylist
            'https://other.test/page',            # matches neither list
        ]),
        created_by=admin_user,
        config={
            'URL_ALLOWLIST': 'example.com',
            'URL_DENYLIST': 'static.example.com',
        },
    )
    created = crawl.create_snapshots_from_urls()
    # only the allowed, non-denylisted URL gets a snapshot
    assert [snapshot.url for snapshot in created] == ['https://example.com/root']
def test_url_filter_regex_lists_preserve_commas_and_split_on_newlines_only(admin_user):
    """Filter lists must split on newlines only, so regex patterns containing commas stay intact."""
    crawl = Crawl.objects.create(
        urls='\n'.join([
            'https://example.com/root',
            'https://example.com/path,with,commas',
            'https://other.test/page',
        ]),
        created_by=admin_user,
        config={
            # two allowlist patterns joined by a newline; the commas are part of the regexes
            'URL_ALLOWLIST': r'^https://example\.com/(root|path,with,commas)$' + '\n' + r'^https://other\.test/page$',
            'URL_DENYLIST': r'^https://example\.com/path,with,commas$',
        },
    )
    # patterns must come back whole — a comma-split would shred them
    assert crawl.get_url_allowlist(use_effective_config=False) == [
        r'^https://example\.com/(root|path,with,commas)$',
        r'^https://other\.test/page$',
    ]
    assert crawl.get_url_denylist(use_effective_config=False) == [
        r'^https://example\.com/path,with,commas$',
    ]
    created = crawl.create_snapshots_from_urls()
    # the comma-containing URL is denylisted; the other two pass the allowlist
    assert [snapshot.url for snapshot in created] == [
        'https://example.com/root',
        'https://other.test/page',
    ]

View File

@@ -14,7 +14,7 @@ Tests cover:
import os
from datetime import timedelta
from typing import cast
from unittest.mock import patch
from unittest.mock import Mock, patch
import pytest
from django.test import TestCase
@@ -89,11 +89,45 @@ class TestMachineModel(TestCase):
assert result is not None
self.assertEqual(result.config.get('WGET_BINARY'), '/usr/bin/wget')
def test_machine_from_jsonl_strips_legacy_chromium_version(self):
    """Machine.from_json() should ignore legacy browser version keys."""
    Machine.current()  # ensure a current Machine row exists before importing
    record = {
        'config': {
            'WGET_BINARY': '/usr/bin/wget',
            'CHROMIUM_VERSION': '123.4.5',  # legacy key that must be dropped on import
        },
    }
    result = Machine.from_json(record)
    self.assertIsNotNone(result)
    assert result is not None  # narrow Optional for type checkers
    # valid config keys survive, the legacy key does not
    self.assertEqual(result.config.get('WGET_BINARY'), '/usr/bin/wget')
    self.assertNotIn('CHROMIUM_VERSION', result.config)
def test_machine_from_jsonl_invalid(self):
    """Machine.from_json() should return None for invalid records."""
    self.assertIsNone(Machine.from_json({'invalid': 'record'}))
def test_machine_current_strips_legacy_chromium_version(self):
    """Machine.current() should clean legacy browser version keys from persisted config."""
    import archivebox.machine.models as models
    machine = Machine.current()
    machine.config = {
        'CHROME_BINARY': '/tmp/chromium',
        'CHROMIUM_VERSION': '123.4.5',  # legacy key persisted in the DB
    }
    machine.save(update_fields=['config'])
    # seed the module-level reference — presumably what current() consults first
    models._CURRENT_MACHINE = machine
    refreshed = Machine.current()
    # valid key survives, legacy key is scrubbed
    self.assertEqual(refreshed.config.get('CHROME_BINARY'), '/tmp/chromium')
    self.assertNotIn('CHROMIUM_VERSION', refreshed.config)
def test_machine_manager_current(self):
"""Machine.objects.current() should return current machine."""
machine = Machine.current()
@@ -131,6 +165,36 @@ class TestNetworkInterfaceModel(TestCase):
interface = NetworkInterface.current()
self.assertIsNotNone(interface)
def test_networkinterface_current_refresh_creates_new_interface_when_properties_change(self):
    """Refreshing should persist a new NetworkInterface row when the host network fingerprint changes."""
    import archivebox.machine.models as models
    first = {
        'mac_address': 'aa:bb:cc:dd:ee:01',
        'ip_public': '1.1.1.1',
        'ip_local': '192.168.1.10',
        'dns_server': '8.8.8.8',
        'hostname': 'host-a',
        'iface': 'en0',
        'isp': 'ISP A',
        'city': 'City',
        'region': 'Region',
        'country': 'Country',
    }
    # same host fingerprint except for changed public/local IPs
    second = {
        **first,
        'ip_public': '2.2.2.2',
        'ip_local': '10.0.0.5',
    }
    # feed the two fingerprints to consecutive refresh calls
    with patch.object(models, 'get_host_network', side_effect=[first, second]):
        interface1 = NetworkInterface.current(refresh=True)
        interface2 = NetworkInterface.current(refresh=True)
    # a distinct row is created for the changed network, on the same machine
    self.assertNotEqual(interface1.id, interface2.id)
    self.assertEqual(interface1.machine_id, interface2.machine_id)
    self.assertEqual(NetworkInterface.objects.filter(machine=interface1.machine).count(), 2)
class TestBinaryModel(TestCase):
"""Test the Binary model."""
@@ -360,6 +424,8 @@ class TestProcessCurrent(TestCase):
self.assertEqual(proc.pid, os.getpid())
self.assertEqual(proc.status, Process.StatusChoices.RUNNING)
self.assertIsNotNone(proc.machine)
self.assertIsNotNone(proc.iface)
self.assertEqual(proc.iface.machine_id, proc.machine_id)
self.assertIsNotNone(proc.started_at)
def test_process_current_caches(self):
@@ -375,6 +441,12 @@ class TestProcessCurrent(TestCase):
result = Process._detect_process_type()
self.assertEqual(result, Process.TypeChoices.ORCHESTRATOR)
def test_process_detect_type_runner_watch(self):
    """runner_watch should be classified as a worker, not the orchestrator itself."""
    fake_argv = ['archivebox', 'manage', 'runner_watch', '--pidfile=/tmp/runserver.pid']
    with patch('sys.argv', fake_argv):
        detected = Process._detect_process_type()
    self.assertEqual(detected, Process.TypeChoices.WORKER)
def test_process_detect_type_cli(self):
"""_detect_process_type should detect CLI commands."""
with patch('sys.argv', ['archivebox', 'add', 'http://example.com']):
@@ -387,6 +459,27 @@ class TestProcessCurrent(TestCase):
result = Process._detect_process_type()
self.assertEqual(result, Process.TypeChoices.BINARY)
def test_process_proc_allows_interpreter_wrapped_script(self):
    """Process.proc should accept a script recorded in DB when wrapped by an interpreter in psutil."""
    # DB row records the bare script path as the command...
    proc = Process.objects.create(
        machine=Machine.current(),
        cmd=['/tmp/on_Crawl__90_chrome_launch.daemon.bg.js', '--url=https://example.com/'],
        pid=12345,
        status=Process.StatusChoices.RUNNING,
        started_at=timezone.now(),
    )
    # ...while psutil reports the same start time but argv prefixed with the interpreter
    os_proc = Mock()
    os_proc.create_time.return_value = proc.started_at.timestamp()
    os_proc.cmdline.return_value = [
        'node',
        '/tmp/on_Crawl__90_chrome_launch.daemon.bg.js',
        '--url=https://example.com/',
    ]
    with patch('archivebox.machine.models.psutil.Process', return_value=os_proc):
        # the mismatch in argv[0] must not cause proc to be rejected
        self.assertIs(proc.proc, os_proc)
class TestProcessHierarchy(TestCase):
"""Test Process parent/child relationships."""

View File

@@ -0,0 +1,191 @@
import pytest
from typing import cast
from django.contrib.auth import get_user_model
from django.contrib.auth.models import UserManager
from django.urls import reverse
from archivebox.personas.importers import (
PersonaImportResult,
discover_persona_template_profiles,
import_persona_from_source,
resolve_browser_profile_source,
resolve_custom_import_source,
)
pytestmark = pytest.mark.django_db
User = get_user_model()
ADMIN_HOST = "admin.archivebox.localhost:8000"
@pytest.fixture
def admin_user(db):
    """Superuser account used to exercise the personas admin views."""
    manager = cast(UserManager, User.objects)
    return manager.create_superuser(
        username="personaadmin",
        email="personaadmin@test.com",
        password="testpassword",
    )
def _make_profile_source(tmp_path):
    """Create a minimal on-disk Chrome profile under tmp_path and resolve it into an import source."""
    user_data_dir = tmp_path / "Chrome User Data"
    default_profile = user_data_dir / "Default"
    default_profile.mkdir(parents=True)
    # a Preferences file is what marks the directory as a real profile
    (default_profile / "Preferences").write_text("{}")
    return resolve_browser_profile_source(
        browser="chrome",
        user_data_dir=user_data_dir,
        profile_dir="Default",
        browser_binary="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    )
def test_resolve_custom_import_source_accepts_exact_profile_dir(tmp_path):
    """Passing the path of a specific profile directory should resolve to a browser-profile source."""
    user_data_dir = tmp_path / "Brave User Data"
    profile_path = user_data_dir / "Profile 2"
    profile_path.mkdir(parents=True)
    (profile_path / "Preferences").write_text("{}")
    resolved = resolve_custom_import_source(str(profile_path))
    assert resolved.kind == "browser-profile"
    assert resolved.user_data_dir == user_data_dir.resolve()
    assert resolved.profile_dir == "Profile 2"
def test_resolve_custom_import_source_accepts_cdp_url():
    """A websocket devtools URL should resolve to a CDP import source."""
    cdp_url = "ws://127.0.0.1:9222/devtools/browser/test-session"
    resolved = resolve_custom_import_source(cdp_url)
    assert resolved.kind == "cdp"
    assert resolved.cdp_url == cdp_url
def test_discover_persona_template_profiles_finds_chrome_profile_dirs(tmp_path):
    """A personas/<name>/chrome_profile/<Profile>/Preferences layout should be discovered."""
    personas_dir = tmp_path / "personas"
    chrome_profile = personas_dir / "ExistingPersona" / "chrome_profile"
    default_profile = chrome_profile / "Default"
    default_profile.mkdir(parents=True)
    (default_profile / "Preferences").write_text("{}")  # marker file for a real profile
    discovered = discover_persona_template_profiles(personas_dir=personas_dir)
    assert len(discovered) == 1
    assert discovered[0].browser == "persona"
    assert discovered[0].source_name == "ExistingPersona"
    assert discovered[0].profile_dir == "Default"
    assert discovered[0].user_data_dir == chrome_profile.resolve()
def test_discover_persona_template_profiles_finds_home_abx_personas(monkeypatch, tmp_path):
    """Discovery should also find personas under ~/.config/abx/personas."""
    from archivebox.config.constants import CONSTANTS
    # point the configured personas dir at a nonexistent path so only the home dir can match
    monkeypatch.setattr(CONSTANTS, "PERSONAS_DIR", tmp_path / "missing-data-personas")
    monkeypatch.setattr("archivebox.personas.importers.Path.home", lambda: tmp_path)
    chrome_profile = tmp_path / ".config" / "abx" / "personas" / "HomePersona" / "chrome_profile"
    default_profile = chrome_profile / "Default"
    default_profile.mkdir(parents=True)
    (default_profile / "Preferences").write_text("{}")  # marker file for a real profile
    discovered = discover_persona_template_profiles()
    assert len(discovered) == 1
    assert discovered[0].browser == "persona"
    assert discovered[0].source_name == "HomePersona"
    assert discovered[0].profile_dir == "Default"
    assert discovered[0].user_data_dir == chrome_profile.resolve()
def test_persona_admin_add_view_renders_import_ui(client, admin_user, monkeypatch, tmp_path):
    """The persona admin add page should render the browser-profile import UI."""
    source = _make_profile_source(tmp_path)
    # both the form and the admin module call the discovery helper; stub them consistently
    monkeypatch.setattr("archivebox.personas.forms.discover_local_browser_profiles", lambda: [source])
    monkeypatch.setattr("archivebox.personas.admin.discover_local_browser_profiles", lambda: [source])
    client.login(username="personaadmin", password="testpassword")
    response = client.get(reverse("admin:personas_persona_add"), HTTP_HOST=ADMIN_HOST)
    assert response.status_code == 200
    assert b"Bootstrap a persona from a real browser session" in response.content
    assert b"Google Chrome / Default" in response.content  # discovered profile is listed
    assert b"auth.json" in response.content
def test_import_persona_from_source_copies_user_agent_to_persona_config(admin_user, monkeypatch, tmp_path):
    """A user agent reported by the browser export should be saved into the persona's config."""
    from archivebox.personas.models import Persona
    source = _make_profile_source(tmp_path)
    persona = Persona.objects.create(name="AgentPersona", created_by=admin_user)

    def fake_export_browser_state(**kwargs):
        # presumably (ok, state, message) — shape mirrored from the real exporter; verify if it changes
        return True, {"user_agent": "Mozilla/5.0 Test Imported UA"}, "ok"

    monkeypatch.setattr("archivebox.personas.importers.export_browser_state", fake_export_browser_state)
    # all copy/cookie/storage steps disabled: only the UA import path is exercised
    result = import_persona_from_source(
        persona,
        source,
        copy_profile=False,
        import_cookies=False,
        capture_storage=False,
    )
    persona.refresh_from_db()
    assert result.user_agent_imported is True
    assert persona.config["USER_AGENT"] == "Mozilla/5.0 Test Imported UA"
def test_persona_admin_add_post_runs_shared_importer(client, admin_user, monkeypatch, tmp_path):
    """Submitting the admin add form with import options should delegate to the shared importer."""
    from archivebox.personas.models import Persona
    source = _make_profile_source(tmp_path)
    monkeypatch.setattr("archivebox.personas.forms.discover_local_browser_profiles", lambda: [source])
    monkeypatch.setattr("archivebox.personas.admin.discover_local_browser_profiles", lambda: [source])
    calls = {}  # records how the form invoked the importer

    def fake_import(persona, selected_source, **kwargs):
        calls["persona_name"] = persona.name
        calls["source"] = selected_source
        calls["kwargs"] = kwargs
        # simulate the importer writing its output files into the persona dir
        (persona.path / "cookies.txt").parent.mkdir(parents=True, exist_ok=True)
        (persona.path / "cookies.txt").write_text("# Netscape HTTP Cookie File\n")
        (persona.path / "auth.json").write_text('{"TYPE":"auth","cookies":[],"localStorage":{},"sessionStorage":{}}\n')
        return PersonaImportResult(
            source=selected_source,
            profile_copied=True,
            cookies_imported=True,
            storage_captured=True,
        )

    monkeypatch.setattr("archivebox.personas.forms.import_persona_from_source", fake_import)
    client.login(username="personaadmin", password="testpassword")
    response = client.post(
        reverse("admin:personas_persona_add"),
        {
            "name": "ImportedPersona",
            "created_by": str(admin_user.pk),
            "config": "{}",
            "import_mode": "discovered",
            "import_discovered_profile": source.choice_value,
            # all three import option checkboxes enabled
            "import_copy_profile": "on",
            "import_extract_cookies": "on",
            "import_capture_storage": "on",
            "_save": "Save",
        },
        HTTP_HOST=ADMIN_HOST,
    )
    assert response.status_code == 302  # successful admin save redirects
    persona = Persona.objects.get(name="ImportedPersona")
    # the form called the shared importer with the selected source and all options on
    assert calls["persona_name"] == "ImportedPersona"
    assert calls["source"].profile_dir == "Default"
    assert calls["kwargs"] == {
        "copy_profile": True,
        "import_cookies": True,
        "capture_storage": True,
    }
    # the persona now points at the files the importer produced
    assert persona.COOKIES_FILE.endswith("cookies.txt")
    assert persona.AUTH_STORAGE_FILE.endswith("auth.json")

View File

@@ -0,0 +1,640 @@
import asyncio
import subprocess
from types import SimpleNamespace
import pytest
from django.test import RequestFactory
pytestmark = pytest.mark.django_db
class _DummyBus:
def __init__(self, name: str):
self.name = name
async def stop(self):
return None
class _DummyService:
def __init__(self, *args, **kwargs):
pass
class _DummyAbxServices:
def __init__(self):
self.process = SimpleNamespace(wait_for_background_monitors=self._wait)
async def _wait(self):
return None
async def _call_sync(func, *args, **kwargs):
return func(*args, **kwargs)
def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
    """Each concurrently-running snapshot must get its own event bus, not share the crawl's."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module
    crawl = Crawl.objects.create(
        urls='https://blog.sweeting.me\nhttps://sweeting.me',
        created_by_id=get_or_create_system_user_pk(),
    )
    snapshot_a = Snapshot.objects.create(
        url='https://blog.sweeting.me',
        crawl=crawl,
        status=Snapshot.StatusChoices.QUEUED,
    )
    snapshot_b = Snapshot.objects.create(
        url='https://sweeting.me',
        crawl=crawl,
        status=Snapshot.StatusChoices.QUEUED,
    )
    created_buses: list[_DummyBus] = []

    def fake_create_bus(*, name, total_timeout=3600.0, **kwargs):
        # record every bus the runner creates so we can count them afterwards
        bus = _DummyBus(name)
        created_buses.append(bus)
        return bus

    monkeypatch.setattr(runner_module, 'create_bus', fake_create_bus)
    monkeypatch.setattr(runner_module, 'discover_plugins', lambda: {})
    # neuter every service the runner wires up — only bus creation matters here
    monkeypatch.setattr(runner_module, 'ProcessService', _DummyService)
    monkeypatch.setattr(runner_module, 'MachineService', _DummyService)
    monkeypatch.setattr(runner_module, 'BinaryService', _DummyService)
    monkeypatch.setattr(runner_module, 'TagService', _DummyService)
    monkeypatch.setattr(runner_module, 'CrawlService', _DummyService)
    monkeypatch.setattr(runner_module, 'SnapshotService', _DummyService)
    monkeypatch.setattr(runner_module, 'ArchiveResultService', _DummyService)
    monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
    download_calls = []

    async def fake_download(*, url, bus, config_overrides, snapshot, **kwargs):
        # capture which bus and config each snapshot download received
        download_calls.append(
            {
                'url': url,
                'bus': bus,
                'snapshot_id': config_overrides['SNAPSHOT_ID'],
                'source_url': config_overrides['SOURCE_URL'],
                'abx_snapshot_id': snapshot.id,
            }
        )
        await asyncio.sleep(0)
        return []

    monkeypatch.setattr(runner_module, 'download', fake_download)
    crawl_runner = runner_module.CrawlRunner(crawl)
    # pre-baked run data keyed by snapshot id, bypassing the DB load inside _run_snapshot
    snapshot_data = {
        str(snapshot_a.id): {
            'id': str(snapshot_a.id),
            'url': snapshot_a.url,
            'title': snapshot_a.title,
            'timestamp': snapshot_a.timestamp,
            'bookmarked_at': snapshot_a.bookmarked_at.isoformat() if snapshot_a.bookmarked_at else "",
            'created_at': snapshot_a.created_at.isoformat() if snapshot_a.created_at else "",
            'tags': snapshot_a.tags_str(),
            'depth': snapshot_a.depth,
            'parent_snapshot_id': str(snapshot_a.parent_snapshot_id) if snapshot_a.parent_snapshot_id else None,
            'output_dir': str(snapshot_a.output_dir),
            'config': crawl_runner._snapshot_config(snapshot_a),
        },
        str(snapshot_b.id): {
            'id': str(snapshot_b.id),
            'url': snapshot_b.url,
            'title': snapshot_b.title,
            'timestamp': snapshot_b.timestamp,
            'bookmarked_at': snapshot_b.bookmarked_at.isoformat() if snapshot_b.bookmarked_at else "",
            'created_at': snapshot_b.created_at.isoformat() if snapshot_b.created_at else "",
            'tags': snapshot_b.tags_str(),
            'depth': snapshot_b.depth,
            'parent_snapshot_id': str(snapshot_b.parent_snapshot_id) if snapshot_b.parent_snapshot_id else None,
            'output_dir': str(snapshot_b.output_dir),
            'config': crawl_runner._snapshot_config(snapshot_b),
        },
    }
    monkeypatch.setattr(crawl_runner, '_load_snapshot_run_data', lambda snapshot_id: snapshot_data[snapshot_id])

    async def run_both():
        # run both snapshots concurrently to exercise bus isolation
        await asyncio.gather(
            crawl_runner._run_snapshot(str(snapshot_a.id)),
            crawl_runner._run_snapshot(str(snapshot_b.id)),
        )

    asyncio.run(run_both())
    assert len(download_calls) == 2
    assert {call['snapshot_id'] for call in download_calls} == {str(snapshot_a.id), str(snapshot_b.id)}
    assert {call['source_url'] for call in download_calls} == {snapshot_a.url, snapshot_b.url}
    # the two downloads must have received two distinct bus objects
    assert len({id(call['bus']) for call in download_calls}) == 2
    assert len(created_buses) == 3  # 1 crawl bus + 2 isolated snapshot buses
def test_ensure_background_runner_starts_when_none_running(monkeypatch):
    """With no orchestrator process recorded, ensure_background_runner should spawn a daemon."""
    import archivebox.machine.models as machine_models
    from archivebox.services import runner as runner_module
    popen_calls = []

    class DummyPopen:
        # capture the spawn arguments instead of launching anything
        def __init__(self, args, **kwargs):
            popen_calls.append((args, kwargs))

    monkeypatch.setattr(machine_models.Process, 'cleanup_stale_running', classmethod(lambda cls, machine=None: 0))
    monkeypatch.setattr(machine_models.Machine, 'current', classmethod(lambda cls: SimpleNamespace(id='machine-1')))
    # pretend no matching Process rows exist, so a runner must be started
    monkeypatch.setattr(
        machine_models.Process.objects,
        'filter',
        lambda **kwargs: SimpleNamespace(exists=lambda: False),
    )
    monkeypatch.setattr(runner_module.subprocess, 'Popen', DummyPopen)
    started = runner_module.ensure_background_runner(allow_under_pytest=True)
    assert started is True
    assert len(popen_calls) == 1
    # spawned detached: `archivebox run --daemon` with stdin closed off
    assert popen_calls[0][0] == [runner_module.sys.executable, '-m', 'archivebox', 'run', '--daemon']
    assert popen_calls[0][1]['stdin'] is subprocess.DEVNULL
def test_ensure_background_runner_skips_when_orchestrator_running(monkeypatch):
    """When an orchestrator process already exists, no new runner should be spawned."""
    import archivebox.machine.models as machine_models
    from archivebox.services import runner as runner_module
    monkeypatch.setattr(machine_models.Process, 'cleanup_stale_running', classmethod(lambda cls, machine=None: 0))
    monkeypatch.setattr(machine_models.Machine, 'current', classmethod(lambda cls: SimpleNamespace(id='machine-1')))
    # pretend a matching Process row exists
    monkeypatch.setattr(
        machine_models.Process.objects,
        'filter',
        lambda **kwargs: SimpleNamespace(exists=lambda: True),
    )
    # any attempt to spawn fails the test immediately
    monkeypatch.setattr(
        runner_module.subprocess,
        'Popen',
        lambda *args, **kwargs: (_ for _ in ()).throw(AssertionError('runner should not be spawned')),
    )
    started = runner_module.ensure_background_runner(allow_under_pytest=True)
    assert started is False
def test_runner_prepare_refreshes_network_interface_and_attaches_current_process(monkeypatch):
    """CrawlRunner._prepare should refresh the NetworkInterface and attach it to the current Process."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.services import runner as runner_module
    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
    )

    class _Iface:
        # minimal NetworkInterface stand-in
        id = 'iface-1'
        machine = SimpleNamespace(id='machine-1')
        machine_id = 'machine-1'

    saved_updates = []

    class _Proc:
        # minimal Process stand-in; records which fields _prepare saves
        iface_id = None
        machine_id = 'machine-1'
        iface = None
        machine = None

        def save(self, *, update_fields):
            saved_updates.append(tuple(update_fields))

    proc = _Proc()
    monkeypatch.setattr(runner_module, 'discover_plugins', lambda: {})
    monkeypatch.setattr(runner_module, 'create_bus', lambda **kwargs: _DummyBus(kwargs['name']))
    monkeypatch.setattr(runner_module, 'ProcessService', _DummyService)
    monkeypatch.setattr(runner_module, 'MachineService', _DummyService)
    monkeypatch.setattr(runner_module, 'BinaryService', _DummyService)
    monkeypatch.setattr(runner_module, 'TagService', _DummyService)
    monkeypatch.setattr(runner_module, 'CrawlService', _DummyService)
    monkeypatch.setattr(runner_module, 'SnapshotService', _DummyService)
    monkeypatch.setattr(runner_module, 'ArchiveResultService', _DummyService)
    from archivebox.machine.models import NetworkInterface, Process
    from archivebox.config import configset as configset_module
    refresh_calls = []
    # record the refresh flag passed to NetworkInterface.current and return the stub iface
    monkeypatch.setattr(NetworkInterface, 'current', classmethod(lambda cls, refresh=False: refresh_calls.append(refresh) or _Iface()))
    monkeypatch.setattr(Process, 'current', classmethod(lambda cls: proc))
    monkeypatch.setattr(configset_module, 'get_config', lambda **kwargs: {})
    crawl_runner = runner_module.CrawlRunner(crawl)
    crawl_runner._prepare()
    # exactly one refresh=True lookup, then iface/machine attached and persisted
    assert refresh_calls == [True]
    assert proc.iface is not None
    assert proc.machine == proc.iface.machine
    assert saved_updates == [('iface', 'machine', 'modified_at')]
def test_create_crawl_api_queues_crawl_without_spawning_runner(monkeypatch):
    """The create-crawl API should queue the crawl for later pickup rather than run it inline."""
    from django.contrib.auth import get_user_model
    from archivebox.api.v1_crawls import CrawlCreateSchema, create_crawl
    user = get_user_model().objects.create_superuser(
        username='runner-api-admin',
        email='runner-api-admin@example.com',
        password='testpassword',
    )
    request = RequestFactory().post('/api/v1/crawls')
    request.user = user
    crawl = create_crawl(
        request,
        CrawlCreateSchema(
            urls=['https://example.com'],
            max_depth=0,
            tags=[],
            tags_str='',
            label='',
            notes='',
            config={},
        ),
    )
    assert str(crawl.id)
    # queued with a retry_at timestamp, so a background runner will pick it up
    assert crawl.status == 'queued'
    assert crawl.retry_at is not None
def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
    """CrawlRunner.run() must not seal a crawl that is_finished() reports as unfinished."""
    import asgiref.sync
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module
    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.STARTED,
    )
    snapshot = Snapshot.objects.create(
        url='https://example.com',
        crawl=crawl,
        status=Snapshot.StatusChoices.STARTED,
    )
    monkeypatch.setattr(runner_module, '_attach_bus_trace', lambda bus: None)
    monkeypatch.setattr(runner_module, '_stop_bus_trace', lambda bus: asyncio.sleep(0))
    monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
    monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
    # replace sync_to_async with a direct synchronous call shim
    monkeypatch.setattr(
        asgiref.sync,
        'sync_to_async',
        lambda func, thread_sensitive=True: (lambda *args, **kwargs: _call_sync(func, *args, **kwargs)),
    )
    monkeypatch.setattr(Crawl.objects, 'get', lambda id: crawl)
    monkeypatch.setattr(crawl, 'is_finished', lambda: False)  # the crawl still has work left
    monkeypatch.setattr(crawl, 'save', lambda *args, **kwargs: None)
    # stub every phase of the runner so only the finalization logic executes
    monkeypatch.setattr(runner_module.CrawlRunner, '_prepare', lambda self: None)
    monkeypatch.setattr(runner_module.CrawlRunner, '_create_live_ui', lambda self: None)
    monkeypatch.setattr(runner_module.CrawlRunner, '_initial_snapshot_ids', lambda self: [str(snapshot.id)])
    monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_setup', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, 'enqueue_snapshot', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_wait_for_snapshot_tasks', lambda self: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_cleanup', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_cleanup_persona', lambda self: None)
    asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
    # unfinished crawl stays un-sealed and remains scheduled for retry
    assert crawl.status != Crawl.StatusChoices.SEALED
    assert crawl.retry_at is not None
def test_crawl_runner_finalizes_with_sync_to_async_for_is_finished(monkeypatch):
    """The finalization step must call crawl.is_finished() through sync_to_async (ORM off the event loop)."""
    import asgiref.sync
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module
    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.STARTED,
    )
    snapshot = Snapshot.objects.create(
        url='https://example.com',
        crawl=crawl,
        status=Snapshot.StatusChoices.STARTED,
    )
    monkeypatch.setattr(runner_module, 'create_bus', lambda *args, **kwargs: _DummyBus('runner'))
    monkeypatch.setattr(runner_module, 'discover_plugins', lambda: {})
    monkeypatch.setattr(runner_module, 'ProcessService', _DummyService)
    monkeypatch.setattr(runner_module, 'MachineService', _DummyService)
    monkeypatch.setattr(runner_module, 'BinaryService', _DummyService)
    monkeypatch.setattr(runner_module, 'TagService', _DummyService)
    monkeypatch.setattr(runner_module, 'CrawlService', _DummyService)
    monkeypatch.setattr(runner_module, 'SnapshotService', _DummyService)
    monkeypatch.setattr(runner_module, 'ArchiveResultService', _DummyService)
    monkeypatch.setattr(runner_module, '_attach_bus_trace', lambda bus: None)
    monkeypatch.setattr(runner_module, '_stop_bus_trace', lambda bus: asyncio.sleep(0))
    monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
    monkeypatch.setattr(Crawl.objects, 'get', lambda id: crawl)
    monkeypatch.setattr(crawl, 'save', lambda *args, **kwargs: None)
    monkeypatch.setattr(crawl, 'cleanup', lambda: None)
    # stub every runner phase so only finalization runs
    monkeypatch.setattr(runner_module.CrawlRunner, '_prepare', lambda self: None)
    monkeypatch.setattr(runner_module.CrawlRunner, '_create_live_ui', lambda self: None)
    monkeypatch.setattr(runner_module.CrawlRunner, '_initial_snapshot_ids', lambda self: [str(snapshot.id)])
    monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_setup', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, 'enqueue_snapshot', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_wait_for_snapshot_tasks', lambda self: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_cleanup', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_cleanup_persona', lambda self: None)
    sync_to_async_wrapped: list[str] = []
    sync_to_async_active = False

    def fake_sync_to_async(func, thread_sensitive=True):
        # instrumented sync_to_async: flags when a wrapped function is executing
        async def wrapper(*args, **kwargs):
            nonlocal sync_to_async_active
            sync_to_async_wrapped.append(getattr(func, '__name__', repr(func)))
            previous = sync_to_async_active
            sync_to_async_active = True
            try:
                return func(*args, **kwargs)
            finally:
                sync_to_async_active = previous
        return wrapper

    def guarded_is_finished():
        # fails unless called via the instrumented sync_to_async wrapper
        assert sync_to_async_active is True
        return False

    monkeypatch.setattr(asgiref.sync, 'sync_to_async', fake_sync_to_async)
    monkeypatch.setattr(crawl, 'is_finished', guarded_is_finished)
    asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
    crawl.refresh_from_db()
    assert crawl.status == Crawl.StatusChoices.STARTED
    assert crawl.retry_at is not None
    assert 'guarded_is_finished' in sync_to_async_wrapped
def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
    """A snapshot task that already failed must propagate its exception out of the wait."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.services import runner as runner_module
    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
    )
    runner = runner_module.CrawlRunner(crawl)

    async def run_test():
        failed = asyncio.get_running_loop().create_future()
        failed.set_exception(RuntimeError('snapshot failed'))
        runner.snapshot_tasks['snap-1'] = failed
        with pytest.raises(RuntimeError, match='snapshot failed'):
            await runner._wait_for_snapshot_tasks()

    asyncio.run(run_test())
def test_wait_for_snapshot_tasks_returns_after_completed_tasks_are_pruned():
    """Completed snapshot tasks should be pruned and the wait should return promptly."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.services import runner as runner_module
    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
    )
    runner = runner_module.CrawlRunner(crawl)

    async def finish_snapshot() -> None:
        await asyncio.sleep(0)

    async def run_test():
        pending = asyncio.create_task(finish_snapshot())
        runner.snapshot_tasks['snap-1'] = pending
        # must return well within the timeout once the task completes
        await asyncio.wait_for(runner._wait_for_snapshot_tasks(), timeout=0.5)
        assert runner.snapshot_tasks == {}

    asyncio.run(run_test())
def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
    """crawl.cleanup() must run before the abx crawl-cleanup hook after the snapshot phase."""
    import asgiref.sync
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module
    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.STARTED,
    )
    snapshot = Snapshot.objects.create(
        url='https://example.com',
        crawl=crawl,
        status=Snapshot.StatusChoices.STARTED,
    )
    monkeypatch.setattr(runner_module, '_attach_bus_trace', lambda bus: None)
    monkeypatch.setattr(runner_module, '_stop_bus_trace', lambda bus: asyncio.sleep(0))
    monkeypatch.setattr(runner_module, 'setup_abx_services', lambda *args, **kwargs: _DummyAbxServices())
    monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")
    # replace sync_to_async with a direct synchronous call shim
    monkeypatch.setattr(
        asgiref.sync,
        'sync_to_async',
        lambda func, thread_sensitive=True: (lambda *args, **kwargs: _call_sync(func, *args, **kwargs)),
    )
    monkeypatch.setattr(Crawl.objects, 'get', lambda id: crawl)
    monkeypatch.setattr(crawl, 'is_finished', lambda: False)
    monkeypatch.setattr(crawl, 'save', lambda *args, **kwargs: None)
    # stub all other runner phases so only the cleanup ordering is observed
    monkeypatch.setattr(runner_module.CrawlRunner, '_prepare', lambda self: None)
    monkeypatch.setattr(runner_module.CrawlRunner, '_create_live_ui', lambda self: None)
    monkeypatch.setattr(runner_module.CrawlRunner, '_initial_snapshot_ids', lambda self: [str(snapshot.id)])
    monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_setup', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, 'enqueue_snapshot', lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_wait_for_snapshot_tasks', lambda self: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, '_cleanup_persona', lambda self: None)
    cleanup_calls = []
    # both cleanup hooks append a marker so their relative order can be asserted
    monkeypatch.setattr(runner_module.CrawlRunner, '_run_crawl_cleanup', lambda self, snapshot_id: cleanup_calls.append('abx_cleanup') or asyncio.sleep(0))
    monkeypatch.setattr(crawl, 'cleanup', lambda: cleanup_calls.append('crawl_cleanup'))
    asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
    assert cleanup_calls == ['crawl_cleanup', 'abx_cleanup']
def test_abx_process_service_background_monitor_finishes_after_process_exit(monkeypatch, tmp_path):
    """The background-process monitor should finish promptly once the daemon exits, even with an open stdout stream."""
    from abx_dl.models import Process as AbxProcess, now_iso
    from abx_dl.services.process_service import ProcessService
    from abx_dl.events import ProcessCompletedEvent
    # build a bare ProcessService instance without running its __init__
    service = object.__new__(ProcessService)
    service.emit_jsonl = False
    emitted_events = []

    async def fake_emit_event(event, *, detach_from_parent):
        emitted_events.append((event, detach_from_parent))

    async def fake_stream_stdout(**kwargs):
        # simulate a stdout reader that blocks forever until cancelled
        try:
            await asyncio.Event().wait()
        except asyncio.CancelledError:
            return ["daemon output\n"]

    service._emit_event = fake_emit_event
    monkeypatch.setattr(service, '_stream_stdout', fake_stream_stdout)

    class FakeAsyncProcess:
        # asyncio-subprocess stand-in whose wait() resolves immediately with rc 0
        def __init__(self):
            self.pid = 42424
            self.returncode = None

        async def wait(self):
            await asyncio.sleep(0)
            self.returncode = 0
            return 0

    plugin_output_dir = tmp_path / 'chrome'
    plugin_output_dir.mkdir()
    stdout_file = plugin_output_dir / 'on_Crawl__90_chrome_launch.daemon.bg.stdout.log'
    stderr_file = plugin_output_dir / 'on_Crawl__90_chrome_launch.daemon.bg.stderr.log'
    stderr_file.write_text('')
    pid_file = plugin_output_dir / 'on_Crawl__90_chrome_launch.daemon.bg.pid'
    pid_file.write_text('12345')
    proc = AbxProcess(
        cmd=['hook'],
        pwd=str(plugin_output_dir),
        timeout=60,
        started_at=now_iso(),
        plugin='chrome',
        hook_name='on_Crawl__90_chrome_launch.daemon.bg',
    )
    process = FakeAsyncProcess()
    event = SimpleNamespace(
        plugin_name='chrome',
        hook_name='on_Crawl__90_chrome_launch.daemon.bg',
        hook_path='hook',
        hook_args=['--url=https://example.org/'],
        env={},
        output_dir=str(plugin_output_dir),
        timeout=60,
        snapshot_id='snap-1',
        is_background=True,
    )

    async def run_test():
        # must complete well within the timeout despite the blocked stdout reader
        await asyncio.wait_for(
            service._monitor_background_process(
                event=event,
                proc=proc,
                process=process,
                plugin_output_dir=plugin_output_dir,
                stdout_file=stdout_file,
                stderr_file=stderr_file,
                pid_file=pid_file,
                files_before=set(),
            ),
            timeout=0.5,
        )

    asyncio.run(run_test())
    # pid file is cleaned up and a completion event was emitted
    assert pid_file.exists() is False
    assert any(isinstance(event, ProcessCompletedEvent) for event, _ in emitted_events)
def test_run_pending_crawls_runs_due_snapshot_in_place(monkeypatch):
    """A due QUEUED snapshot in a SEALED crawl is dispatched inline via run_crawl."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module

    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.SEALED,
    )
    snapshot = Snapshot.objects.create(
        url='https://example.com',
        crawl=crawl,
        status=Snapshot.StatusChoices.QUEUED,
        retry_at=runner_module.timezone.now(),
    )

    # Make both processing locks always claimable so the scheduler picks the
    # snapshot up immediately.
    monkeypatch.setattr(type(snapshot), 'claim_processing_lock', lambda self, lock_seconds=60: True)
    monkeypatch.setattr(type(crawl), 'claim_processing_lock', lambda self, lock_seconds=60: True)

    recorded: list[tuple[str, list[str] | None, bool]] = []

    def fake_run_crawl(crawl_id, snapshot_ids=None, selected_plugins=None, process_discovered_snapshots_inline=True):
        # Record the dispatch, then mark the snapshot finished so the
        # scheduler loop terminates instead of re-queueing it.
        recorded.append((crawl_id, snapshot_ids, process_discovered_snapshots_inline))
        snapshot.status = Snapshot.StatusChoices.SEALED
        snapshot.retry_at = None
        snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])

    monkeypatch.setattr(runner_module, 'run_crawl', fake_run_crawl)

    outcome = runner_module.run_pending_crawls(daemon=False)

    assert outcome == 0
    assert recorded == [(str(crawl.id), [str(snapshot.id)], False)]
def test_run_pending_crawls_prioritizes_new_queued_crawl_before_snapshot_backlog(monkeypatch):
    """A freshly QUEUED crawl is scheduled before an older crawl's snapshot backlog."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module

    older_crawl = Crawl.objects.create(
        urls='https://older.example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.STARTED,
    )
    older_snapshot = Snapshot.objects.create(
        url='https://older.example.com',
        crawl=older_crawl,
        status=Snapshot.StatusChoices.QUEUED,
        retry_at=runner_module.timezone.now(),
    )
    newer_crawl = Crawl.objects.create(
        urls='https://newer.example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.QUEUED,
        retry_at=runner_module.timezone.now(),
    )

    # Every lock is claimable, so scheduling order alone decides what runs first.
    for candidate in (older_snapshot, older_crawl, newer_crawl):
        monkeypatch.setattr(type(candidate), 'claim_processing_lock', lambda self, lock_seconds=60: True)

    class _StopScheduling(Exception):
        """Raised by the fake run_crawl to halt the scheduler after one dispatch."""

    recorded: list[tuple[str, list[str] | None, bool]] = []

    def fake_run_crawl(crawl_id, snapshot_ids=None, selected_plugins=None, process_discovered_snapshots_inline=True):
        recorded.append((crawl_id, snapshot_ids, process_discovered_snapshots_inline))
        raise _StopScheduling

    monkeypatch.setattr(runner_module, 'run_crawl', fake_run_crawl)

    with pytest.raises(_StopScheduling):
        runner_module.run_pending_crawls(daemon=False)

    # The first (and only) dispatch must be the newer QUEUED crawl, not the backlog.
    assert recorded == [(str(newer_crawl.id), None, False)]

View File

@@ -0,0 +1,205 @@
import json
from datetime import datetime
from typing import cast
import pytest
from django.contrib.auth import get_user_model
from django.contrib.auth.models import UserManager
from django.urls import reverse
from django.utils import timezone

# Every test in this module touches the database.
pytestmark = pytest.mark.django_db

# Configured user model (swappable via AUTH_USER_MODEL).
User = get_user_model()

# Host header that routes requests to the admin subdomain of the dev server.
ADMIN_HOST = 'admin.archivebox.localhost:8000'
@pytest.fixture
def admin_user(db):
    """Superuser account used to authenticate admin views in these tests."""
    manager = cast(UserManager, User.objects)
    return manager.create_superuser(
        username='tagadmin',
        email='tagadmin@test.com',
        password='testpassword',
    )
@pytest.fixture
def api_token(admin_user):
    """API key string for the admin user, created on demand."""
    from archivebox.api.auth import get_or_create_api_token
    token_obj = get_or_create_api_token(admin_user)
    assert token_obj is not None
    return token_obj.token
@pytest.fixture
def crawl(admin_user):
    """Minimal crawl owned by the admin user for snapshots to attach to."""
    from archivebox.crawls.models import Crawl
    new_crawl = Crawl.objects.create(urls='https://example.com', created_by=admin_user)
    return new_crawl
@pytest.fixture
def tagged_data(crawl, admin_user):
    """One tag attached to two snapshots; yields (tag, [snapshot, snapshot])."""
    from archivebox.core.models import Snapshot, Tag

    tag = Tag.objects.create(name='Alpha Research', created_by=admin_user)
    snapshots = [
        Snapshot.objects.create(url='https://example.com/one', title='Example One', crawl=crawl),
        Snapshot.objects.create(url='https://example.com/two', title='Example Two', crawl=crawl),
    ]
    for snapshot in snapshots:
        snapshot.tags.add(tag)
    return tag, snapshots
def test_tag_admin_changelist_renders_custom_ui(client, admin_user, tagged_data):
    """The tag changelist ships the live-search/filter widgets and tag cards."""
    client.login(username='tagadmin', password='testpassword')
    response = client.get(reverse('admin:core_tag_changelist'), HTTP_HOST=ADMIN_HOST)
    assert response.status_code == 200
    expected_fragments = (
        b'id="tag-live-search"',
        b'id="tag-sort-select"',
        b'id="tag-created-by-select"',
        b'id="tag-year-select"',
        b'id="tag-has-snapshots-select"',
        b'Alpha Research',
        b'class="tag-card"',
    )
    for fragment in expected_fragments:
        assert fragment in response.content
def test_tag_admin_add_view_renders_similar_tag_reference(client, admin_user):
    """The tag add form includes the similar-tags helper widget."""
    client.login(username='tagadmin', password='testpassword')
    response = client.get(reverse('admin:core_tag_add'), HTTP_HOST=ADMIN_HOST)
    assert response.status_code == 200
    body = response.content
    assert b'Similar Tags' in body
    assert b'data-tag-name-input="1"' in body
def test_tag_search_api_returns_card_payload(client, api_token, tagged_data):
    """search_tags returns default filter state plus a full card per matching tag."""
    tag, snapshots = tagged_data
    response = client.get(
        reverse('api-1:search_tags'),
        {'q': 'Alpha', 'api_key': api_token},
        HTTP_HOST=ADMIN_HOST,
    )
    assert response.status_code == 200
    payload = response.json()

    # Defaults reported back for an otherwise-unfiltered query.
    assert payload['sort'] == 'created_desc'
    assert payload['created_by'] == ''
    assert payload['year'] == ''
    assert payload['has_snapshots'] == 'all'

    card = payload['tags'][0]
    assert card['id'] == tag.id
    assert card['name'] == 'Alpha Research'
    assert card['num_snapshots'] == 2
    assert card['snapshots'][0]['title'] in {'Example One', 'Example Two'}
    assert card['export_jsonl_url'].endswith(f'/api/v1/core/tag/{tag.id}/snapshots.jsonl')
    assert card['filter_url'].endswith(f'/admin/core/snapshot/?tags__id__exact={tag.id}')
    assert {entry['url'] for entry in card['snapshots']} == {snap.url for snap in snapshots}
def test_tag_search_api_respects_sort_and_filters(client, api_token, admin_user, crawl, tagged_data):
    """sort/created_by/year/has_snapshots query params narrow the tag results."""
    from archivebox.core.models import Snapshot, Tag

    other_user = cast(UserManager, User.objects).create_user(
        username='tagother',
        email='tagother@test.com',
        password='unused',
    )
    tag_with_snapshots = tagged_data[0]
    empty_tag = Tag.objects.create(name='Zulu Empty', created_by=other_user)
    alpha_tag = Tag.objects.create(name='Alpha Empty', created_by=other_user)
    Snapshot.objects.create(
        url='https://example.com/three',
        title='Example Three',
        crawl=crawl,
    ).tags.add(alpha_tag)

    # Backdate created_at so the year filter can tell the tags apart.
    backdates = {
        empty_tag.pk: datetime(2024, 1, 1, 12, 0, 0),
        alpha_tag.pk: datetime(2025, 1, 1, 12, 0, 0),
        tag_with_snapshots.pk: datetime(2026, 1, 1, 12, 0, 0),
    }
    for pk, naive_created_at in backdates.items():
        Tag.objects.filter(pk=pk).update(created_at=timezone.make_aware(naive_created_at))

    query = {
        'sort': 'name_desc',
        'created_by': str(other_user.pk),
        'year': '2024',
        'has_snapshots': 'no',
        'api_key': api_token,
    }
    response = client.get(reverse('api-1:search_tags'), query, HTTP_HOST=ADMIN_HOST)
    assert response.status_code == 200

    payload = response.json()
    assert payload['sort'] == 'name_desc'
    assert payload['created_by'] == str(other_user.pk)
    assert payload['year'] == '2024'
    assert payload['has_snapshots'] == 'no'
    # Only the snapshot-less 2024 tag owned by other_user should survive the filters.
    assert [entry['name'] for entry in payload['tags']] == ['Zulu Empty']
def test_tag_rename_api_updates_slug(client, api_token, tagged_data):
    """Renaming a tag via the API also regenerates its slug."""
    tag, _ = tagged_data
    rename_url = f"{reverse('api-1:rename_tag', args=[tag.id])}?api_key={api_token}"
    response = client.post(
        rename_url,
        data=json.dumps({'name': 'Alpha Archive'}),
        content_type='application/json',
        HTTP_HOST=ADMIN_HOST,
    )
    assert response.status_code == 200
    tag.refresh_from_db()
    assert (tag.name, tag.slug) == ('Alpha Archive', 'alpha-archive')
def test_tag_snapshots_export_returns_jsonl(client, api_token, tagged_data):
    """The tag snapshot export endpoint serves NDJSON with tag names inlined."""
    tag, _ = tagged_data
    response = client.get(
        reverse('api-1:tag_snapshots_export', args=[tag.id]),
        {'api_key': api_token},
        HTTP_HOST=ADMIN_HOST,
    )
    assert response.status_code == 200
    assert response['Content-Type'].startswith('application/x-ndjson')
    assert f'tag-{tag.slug}-snapshots.jsonl' in response['Content-Disposition']
    exported = response.content.decode()
    assert '"type": "Snapshot"' in exported
    assert '"tags": "Alpha Research"' in exported
def test_tag_urls_export_returns_plain_text_urls(client, api_token, tagged_data):
    """The tag URL export endpoint returns one snapshot URL per non-empty line."""
    tag, snapshots = tagged_data
    response = client.get(
        reverse('api-1:tag_urls_export', args=[tag.id]),
        {'api_key': api_token},
        HTTP_HOST=ADMIN_HOST,
    )
    assert response.status_code == 200
    assert response['Content-Type'].startswith('text/plain')
    assert f'tag-{tag.slug}-urls.txt' in response['Content-Disposition']
    exported_urls = {line for line in response.content.decode().splitlines() if line}
    assert exported_urls == {snapshot.url for snapshot in snapshots}

View File

@@ -55,6 +55,7 @@ def _build_script(body: str) -> str:
get_admin_host,
get_api_host,
get_web_host,
get_public_host,
get_snapshot_host,
get_original_host,
get_listen_subdomain,
@@ -198,6 +199,7 @@ class TestUrlRouting:
web_host = get_web_host()
admin_host = get_admin_host()
api_host = get_api_host()
public_host = get_public_host()
snapshot_host = get_snapshot_host(snapshot_id)
original_host = get_original_host(domain)
base_host = SERVER_CONFIG.LISTEN_HOST
@@ -208,6 +210,7 @@ class TestUrlRouting:
assert web_host == "web.archivebox.localhost:8000"
assert admin_host == "admin.archivebox.localhost:8000"
assert api_host == "api.archivebox.localhost:8000"
assert public_host == "public.archivebox.localhost:8000"
assert snapshot_host == f"{snapshot_id}.archivebox.localhost:8000"
assert original_host == f"{domain}.archivebox.localhost:8000"
assert get_listen_subdomain(web_host) == "web"
@@ -302,6 +305,20 @@ class TestUrlRouting:
assert resp.status_code == 200
assert response_body(resp) == response_file.read_bytes()
resp = client.get("/index.html", HTTP_HOST=snapshot_host)
assert resp.status_code == 200
snapshot_html = response_body(resp).decode("utf-8", "ignore")
assert f"http://{snapshot_host}/" in snapshot_html
assert "See all files..." in snapshot_html
assert ">WARC<" not in snapshot_html
assert ">Media<" not in snapshot_html
assert ">Git<" not in snapshot_html
resp = client.get("/?files=1", HTTP_HOST=snapshot_host)
assert resp.status_code == 200
files_html = response_body(resp).decode("utf-8", "ignore")
assert output_rel.split("/", 1)[0] in files_html
print("OK")
"""
)
@@ -479,6 +496,7 @@ class TestUrlRouting:
snapshot_host = get_snapshot_host(snapshot_id)
admin_host = get_admin_host()
web_host = get_web_host()
public_host = get_public_host()
client = Client()
@@ -491,10 +509,17 @@ class TestUrlRouting:
assert resp.status_code == 200
live_html = response_body(resp).decode("utf-8", "ignore")
assert f"http://{snapshot_host}/" in live_html
assert "http://web.archivebox.localhost:8000" in live_html
assert f"http://{public_host}/static/archive.png" in live_html
assert ">WARC<" not in live_html
assert ">Media<" not in live_html
assert ">Git<" not in live_html
static_html = Path(snapshot.output_dir, "index.html").read_text(encoding="utf-8", errors="ignore")
assert f"http://{snapshot_host}/" in static_html
assert f"http://{public_host}/static/archive.png" in static_html
assert ">WARC<" not in static_html
assert ">Media<" not in static_html
assert ">Git<" not in static_html
client.login(username="testadmin", password="testpassword")
resp = client.get(f"/admin/core/snapshot/{snapshot_id}/change/", HTTP_HOST=admin_host)