bump package versions

This commit is contained in:
Nick Sweeting
2026-03-15 20:47:28 -07:00
parent bc21d4bfdb
commit 9de084da65
32 changed files with 469 additions and 711 deletions

View File

@@ -1,7 +1,20 @@
__package__ = 'archivebox.core'
from typing import TYPE_CHECKING, Any
from django.contrib import admin
from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
from admin_data_views.admin import (
admin_data_index_view as adv_admin_data_index_view,
get_admin_data_urls as adv_get_admin_data_urls,
get_app_list as adv_get_app_list,
)
if TYPE_CHECKING:
from django.http import HttpRequest
from django.template.response import TemplateResponse
from django.urls import URLPattern, URLResolver
from admin_data_views.typing import AppDict
class ArchiveBoxAdmin(admin.AdminSite):
@@ -10,6 +23,20 @@ class ArchiveBoxAdmin(admin.AdminSite):
site_title = 'Admin'
namespace = 'admin'
def get_app_list(self, request: 'HttpRequest', app_label: str | None = None) -> list['AppDict']:
if app_label is None:
return adv_get_app_list(self, request)
return adv_get_app_list(self, request, app_label)
def admin_data_index_view(self, request: 'HttpRequest', **kwargs: Any) -> 'TemplateResponse':
return adv_admin_data_index_view(self, request, **kwargs)
def get_admin_data_urls(self) -> list['URLResolver | URLPattern']:
return adv_get_admin_data_urls(self)
def get_urls(self) -> list['URLResolver | URLPattern']:
return self.get_admin_data_urls() + super().get_urls()
archivebox_admin = ArchiveBoxAdmin()
# Note: delete_selected is enabled per-model via actions = ['delete_selected'] in each ModelAdmin
@@ -17,13 +44,6 @@ archivebox_admin = ArchiveBoxAdmin()
# patch admin with methods to add data views (implemented by admin_data_views package)
# https://github.com/MrThearMan/django-admin-data-views
# https://mrthearman.github.io/django-admin-data-views/setup/
archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore
archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore
archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
############### Admin Data View sections are defined in settings.ADMIN_DATA_VIEWS #########

View File

@@ -1,9 +1,9 @@
__package__ = 'archivebox.core'
from typing import Optional, Dict, Iterable, Any, List
from typing import Optional, Dict, Iterable, Any, List, Sequence, cast
import uuid
from archivebox.uuid_compat import uuid7
from datetime import datetime, timedelta
from django_stubs_ext.db.models import TypedModelMeta
import os
import json
@@ -20,6 +20,7 @@ from django.core.cache import cache
from django.urls import reverse_lazy
from django.contrib import admin
from django.conf import settings
from django.utils.safestring import mark_safe
from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size, atomic_write
@@ -51,7 +52,7 @@ class Tag(ModelWithUUID):
snapshot_set: models.Manager['Snapshot']
class Meta(TypedModelMeta):
class Meta(ModelWithUUID.Meta):
app_label = 'core'
verbose_name = "Tag"
verbose_name_plural = "Tags"
@@ -88,7 +89,7 @@ class Tag(ModelWithUUID):
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_tag', args=[self.id])
return str(reverse_lazy('api-1:get_tag', args=[self.id]))
def to_json(self) -> dict:
"""
@@ -104,7 +105,7 @@ class Tag(ModelWithUUID):
}
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None):
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] | None = None):
"""
Create/update Tag from JSON dict.
@@ -259,7 +260,7 @@ class SnapshotQuerySet(models.QuerySet):
})
class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)): # ty: ignore[unsupported-base]
"""Manager for Snapshot model - uses SnapshotQuerySet for chainable methods"""
def filter(self, *args, **kwargs):
@@ -283,8 +284,8 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
from django.db import transaction
if atomic:
with transaction.atomic():
return self.delete()
return self.delete()
return self.get_queryset().delete()
return self.get_queryset().delete()
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
@@ -318,10 +319,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
StatusChoices = ModelWithStateMachine.StatusChoices
active_state = StatusChoices.STARTED
crawl_id: uuid.UUID
parent_snapshot_id: uuid.UUID | None
_prefetched_objects_cache: dict[str, Any]
objects = SnapshotManager()
archiveresult_set: models.Manager['ArchiveResult']
class Meta(TypedModelMeta):
class Meta(
ModelWithOutputDir.Meta,
ModelWithConfig.Meta,
ModelWithNotes.Meta,
ModelWithHealthStats.Meta,
ModelWithStateMachine.Meta,
):
app_label = 'core'
verbose_name = "Snapshot"
verbose_name_plural = "Snapshots"
@@ -663,6 +674,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
candidates = cls.objects.filter(url=url, timestamp__startswith=timestamp)
if candidates.count() == 1:
snapshot = candidates.first()
if snapshot is None:
return None
print(f"[DEBUG load_from_directory] Found via fuzzy match: {snapshot.timestamp}")
return snapshot
elif candidates.count() > 1:
@@ -751,14 +764,16 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
)
@staticmethod
def _select_best_timestamp(index_timestamp: str, folder_name: str) -> Optional[str]:
def _select_best_timestamp(index_timestamp: object | None, folder_name: str) -> Optional[str]:
"""
Select best timestamp from index.json vs folder name.
Validates range (1995-2035).
Prefers index.json if valid.
"""
def is_valid_timestamp(ts):
def is_valid_timestamp(ts: object | None) -> bool:
if not isinstance(ts, (str, int, float)):
return False
try:
ts_int = int(float(ts))
# 1995-01-01 to 2035-12-31
@@ -769,12 +784,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
index_valid = is_valid_timestamp(index_timestamp) if index_timestamp else False
folder_valid = is_valid_timestamp(folder_name)
if index_valid:
return str(int(float(index_timestamp)))
elif folder_valid:
return str(int(float(folder_name)))
else:
return None
if index_valid and index_timestamp is not None:
return str(int(float(str(index_timestamp))))
if folder_valid:
return str(int(float(str(folder_name))))
return None
@classmethod
def _ensure_unique_timestamp(cls, url: str, timestamp: str) -> str:
@@ -1039,7 +1053,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
)
index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME
result = {
result: dict[str, Any] = {
'snapshot': None,
'archive_results': [],
'binaries': [],
@@ -1210,7 +1224,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return merged
@classmethod
def _merge_snapshots(cls, snapshots: list['Snapshot']):
def _merge_snapshots(cls, snapshots: Sequence['Snapshot']):
"""
Merge exact duplicates.
Keep oldest, union files + ArchiveResults.
@@ -1271,19 +1285,21 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
@admin.display(description='Tags')
def tags_str(self, nocache=True) -> str | None:
calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
prefetched_cache = getattr(self, '_prefetched_objects_cache', {})
if 'tags' in prefetched_cache:
return calc_tags_str()
cache_key = f'{self.pk}-tags'
return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()
def icons(self, path: Optional[str] = None) -> str:
"""Generate HTML icons showing which extractor plugins have succeeded for this snapshot"""
from django.utils.html import format_html, mark_safe
from django.utils.html import format_html
cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
def calc_icons():
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
prefetched_cache = getattr(self, '_prefetched_objects_cache', {})
if 'archiveresult_set' in prefetched_cache:
archive_results = {r.plugin: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
else:
# Filter for results that have either output_files or output_str
@@ -1331,7 +1347,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_snapshot', args=[self.id])
return str(reverse_lazy('api-1:get_snapshot', args=[self.id]))
def get_absolute_url(self):
return f'/{self.archive_path}'
@@ -1341,23 +1357,28 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return url_domain(self.url)
@property
def output_dir(self):
def title_stripped(self) -> str:
return (self.title or '').strip()
@property
def output_dir(self) -> Path:
"""The filesystem path to the snapshot's output directory."""
import os
current_path = self.get_storage_path_for_version(self.fs_version)
if current_path.exists():
return str(current_path)
return current_path
# Check for backwards-compat symlink
old_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
if old_path.is_symlink():
return str(Path(os.readlink(old_path)).resolve())
link_target = Path(os.readlink(old_path))
return (old_path.parent / link_target).resolve() if not link_target.is_absolute() else link_target.resolve()
elif old_path.exists():
return str(old_path)
return old_path
return str(current_path)
return current_path
def ensure_legacy_archive_symlink(self) -> None:
"""Ensure the legacy archive/<timestamp> path resolves to this snapshot."""
@@ -1405,7 +1426,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
date_base = crawl.created_at or self.created_at or timezone.now()
date_str = date_base.strftime('%Y%m%d')
domain = self.extract_domain_from_url(self.url)
username = crawl.created_by.username if crawl.created_by_id else 'system'
username = crawl.created_by.username if getattr(crawl, 'created_by_id', None) else 'system'
crawl_dir = DATA_DIR / 'users' / username / 'crawls' / date_str / domain / str(crawl.id)
link_path = crawl_dir / 'snapshots' / domain / str(self.id)
@@ -1591,7 +1612,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
}
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] | None = None, queue_for_extraction: bool = True):
"""
Create/update Snapshot from JSON dict.
@@ -1859,7 +1880,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'is_sealed': is_sealed,
}
def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
def retry_failed_archiveresults(self, retry_at: Optional[datetime] = None) -> int:
"""
Reset failed/skipped ArchiveResults to queued for retry.
@@ -2163,20 +2184,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
cols = cols or ['timestamp', 'is_archived', 'url']
return separator.join(to_json(data.get(col, ''), indent=None).ljust(ljust) for col in cols)
def write_json_details(self, out_dir: Optional[str] = None) -> None:
def write_json_details(self, out_dir: Path | str | None = None) -> None:
"""Write JSON index file for this snapshot to its output directory"""
out_dir = out_dir or self.output_dir
path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
output_dir = Path(out_dir) if out_dir is not None else self.output_dir
path = output_dir / CONSTANTS.JSON_INDEX_FILENAME
atomic_write(str(path), self.to_dict(extended=True))
def write_html_details(self, out_dir: Optional[str] = None) -> None:
def write_html_details(self, out_dir: Path | str | None = None) -> None:
"""Write HTML detail page for this snapshot to its output directory"""
from django.template.loader import render_to_string
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.configset import get_config
from archivebox.misc.logging_util import printable_filesize
out_dir = out_dir or self.output_dir
output_dir = Path(out_dir) if out_dir is not None else self.output_dir
config = get_config()
SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
TITLE_LOADING_MSG = 'Not yet archived...'
@@ -2198,12 +2219,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
for plugin in preview_priority:
out = outputs_by_plugin.get(plugin)
if out and out.get('path'):
best_preview_path = out['path']
best_preview_path = str(out['path'])
best_result = out
break
if best_preview_path == 'about:blank' and outputs:
best_preview_path = outputs[0].get('path') or 'about:blank'
best_preview_path = str(outputs[0].get('path') or 'about:blank')
best_result = outputs[0]
context = {
**self.to_dict(extended=True),
@@ -2223,7 +2244,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'archiveresults': outputs,
}
rendered_html = render_to_string('snapshot.html', context)
atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
atomic_write(str(output_dir / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
# =========================================================================
# Helper Methods
@@ -2285,6 +2306,8 @@ class SnapshotMachine(BaseStateMachine):
# Manual event (can also be triggered by last ArchiveResult finishing)
seal = started.to(sealed)
snapshot: Snapshot
def can_start(self) -> bool:
can_start = bool(self.snapshot.url)
return can_start
@@ -2332,7 +2355,7 @@ class SnapshotMachine(BaseStateMachine):
if remaining_active == 0 and crawl.status == crawl.StatusChoices.STARTED:
print(f'[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]', file=sys.stderr)
# Seal the parent crawl
crawl.sm.seal()
cast(Any, crawl).sm.seal()
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
@@ -2391,7 +2414,15 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
state_field_name = 'status'
active_state = StatusChoices.STARTED
class Meta(TypedModelMeta):
snapshot_id: uuid.UUID
process_id: uuid.UUID | None
class Meta(
ModelWithOutputDir.Meta,
ModelWithConfig.Meta,
ModelWithNotes.Meta,
ModelWithStateMachine.Meta,
):
app_label = 'core'
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results Log'
@@ -2442,7 +2473,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
return record
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None):
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] | None = None):
"""
Create/update ArchiveResult from JSON dict.
@@ -2469,7 +2500,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Get or create by snapshot_id + plugin
try:
from archivebox.core.models import Snapshot
snapshot = Snapshot.objects.get(id=snapshot_id)
result, _ = ArchiveResult.objects.get_or_create(
@@ -2531,7 +2561,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_archiveresult', args=[self.id])
return str(reverse_lazy('api-1:get_archiveresult', args=[self.id]))
def get_absolute_url(self):
return f'/{self.snapshot.archive_path}/{self.plugin}'
@@ -3198,6 +3228,8 @@ class ArchiveResultMachine(BaseStateMachine):
# Reason: backoff should always retry→started, then started→final states
)
archiveresult: ArchiveResult
def can_start(self) -> bool:
"""Pure function - check if AR can start (has valid URL)."""
return bool(self.archiveresult.snapshot.url)
@@ -3259,7 +3291,7 @@ class ArchiveResultMachine(BaseStateMachine):
process = self.archiveresult.process
# If process is NOT running anymore, reap the background hook
if not process.is_running():
if not process.is_running:
self.archiveresult.update_from_output()
# Check if now in final state after reaping
return self.archiveresult.status in (
@@ -3331,7 +3363,7 @@ class ArchiveResultMachine(BaseStateMachine):
if remaining_active == 0:
print(f'[cyan] 🔒 All archiveresults finished for snapshot {snapshot.url}, sealing snapshot[/cyan]', file=sys.stderr)
# Seal the parent snapshot
snapshot.sm.seal()
cast(Any, snapshot).sm.seal()
@succeeded.enter
def enter_succeeded(self):

View File

@@ -3,6 +3,8 @@ __package__ = "archivebox.core"
import os
import sys
import inspect
import importlib
from typing import Any, cast
from pathlib import Path
@@ -119,8 +121,8 @@ try:
try:
# Try to import django-auth-ldap (will fail if not installed)
from django_auth_ldap.config import LDAPSearch
import ldap
LDAPSearch = importlib.import_module("django_auth_ldap.config").LDAPSearch
ldap = importlib.import_module("ldap")
# Configure LDAP authentication
AUTH_LDAP_SERVER_URI = LDAP_CONFIG.LDAP_SERVER_URI
@@ -130,7 +132,7 @@ try:
# Configure user search
AUTH_LDAP_USER_SEARCH = LDAPSearch(
LDAP_CONFIG.LDAP_USER_BASE,
ldap.SCOPE_SUBTREE,
getattr(ldap, "SCOPE_SUBTREE", 2),
LDAP_CONFIG.LDAP_USER_FILTER,
)
@@ -432,7 +434,7 @@ LOGGING = SETTINGS_LOGGING
# Add default webhook configuration to the User model
SIGNAL_WEBHOOKS_CUSTOM_MODEL = "archivebox.api.models.OutboundWebhook"
SIGNAL_WEBHOOKS = {
SIGNAL_WEBHOOKS: dict[str, object] = {
"HOOKS": {
# ... is a special sigil value that means "use the default autogenerated hooks"
"django.contrib.auth.models.User": ...,
@@ -444,7 +446,8 @@ SIGNAL_WEBHOOKS = {
}
# Avoid background threads touching sqlite connections (especially during tests/migrations).
if DATABASES["default"]["ENGINE"].endswith("sqlite3"):
default_database = cast(dict[str, Any], DATABASES["default"])
if str(default_database["ENGINE"]).endswith("sqlite3"):
SIGNAL_WEBHOOKS["TASK_HANDLER"] = "signal_webhooks.handlers.sync_task_handler"
################################################################################
@@ -551,10 +554,8 @@ if DEBUG_TOOLBAR:
MIDDLEWARE = [*MIDDLEWARE, "debug_toolbar.middleware.DebugToolbarMiddleware"]
if DEBUG:
from django_autotyping.typing import AutotypingSettingsDict
INSTALLED_APPS += ["django_autotyping"]
AUTOTYPING: AutotypingSettingsDict = {
AUTOTYPING = {
"STUBS_GENERATION": {
"LOCAL_STUBS_DIR": PACKAGE_DIR / "typings",
}

View File

@@ -1,5 +1,7 @@
"""Template tags for accessing config values in templates."""
from typing import Any
from django import template
from archivebox.config.configset import get_config as _get_config
@@ -8,7 +10,7 @@ register = template.Library()
@register.simple_tag
def get_config(key: str) -> any:
def get_config(key: str) -> Any:
"""
Get a config value by key.

View File

@@ -4,6 +4,9 @@ import importlib
import os
import django
from unittest.mock import patch
from typing import TypeVar, cast
from django.forms import BaseForm
# Set up Django before importing any Django-dependent modules
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
@@ -18,6 +21,14 @@ CrawlSchedule = importlib.import_module('archivebox.crawls.models').CrawlSchedul
Tag = importlib.import_module('archivebox.core.models').Tag
SERVER_CONFIG = importlib.import_module('archivebox.config.common').SERVER_CONFIG
T = TypeVar('T')
def require(value: T | None) -> T:
    """Assert that *value* is present (not None) and return it.

    Used in tests to narrow ``Optional`` queryset results (e.g.
    ``Crawl.objects.first()``) to a non-None value for the type checker.

    Raises:
        AssertionError: if ``value`` is None.
    """
    if value is None:
        raise AssertionError('Expected value to be present')
    return value
class AddViewTests(TestCase):
"""Tests for the AddView (crawl creation form)."""
@@ -111,7 +122,7 @@ class AddViewTests(TestCase):
# Check that crawl was created
self.assertEqual(Crawl.objects.count(), 1)
crawl = Crawl.objects.first()
crawl = require(Crawl.objects.first())
self.assertIn('https://example.com', crawl.urls)
self.assertIn('https://example.org', crawl.urls)
@@ -140,8 +151,8 @@ class AddViewTests(TestCase):
self.assertEqual(Crawl.objects.count(), 1)
self.assertEqual(CrawlSchedule.objects.count(), 1)
crawl = Crawl.objects.first()
schedule = CrawlSchedule.objects.first()
crawl = require(Crawl.objects.first())
schedule = require(CrawlSchedule.objects.first())
self.assertEqual(crawl.schedule, schedule)
self.assertEqual(schedule.template, crawl)
@@ -159,7 +170,7 @@ class AddViewTests(TestCase):
self.assertEqual(response.status_code, 302)
schedule = CrawlSchedule.objects.first()
schedule = require(CrawlSchedule.objects.first())
self.assertEqual(schedule.schedule, '0 */6 * * *')
def test_add_crawl_with_plugins(self):
@@ -173,7 +184,7 @@ class AddViewTests(TestCase):
self.assertEqual(response.status_code, 302)
crawl = Crawl.objects.first()
crawl = require(Crawl.objects.first())
plugins = crawl.config.get('PLUGINS', '')
# Should contain the selected plugins
@@ -209,7 +220,7 @@ class AddViewTests(TestCase):
self.assertEqual(response.status_code, 302)
crawl = Crawl.objects.first()
crawl = require(Crawl.objects.first())
config = crawl.config
self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
@@ -236,7 +247,7 @@ class AddViewTests(TestCase):
})
self.assertEqual(response.status_code, 302)
crawl = Crawl.objects.order_by('-created_at').first()
crawl = require(Crawl.objects.order_by('-created_at').first())
self.assertNotIn('YTDLP_ARGS_EXTRA', crawl.config)
def test_add_authenticated_non_admin_custom_config_is_silently_stripped(self):
@@ -248,7 +259,7 @@ class AddViewTests(TestCase):
})
self.assertEqual(response.status_code, 302)
crawl = Crawl.objects.order_by('-created_at').first()
crawl = require(Crawl.objects.order_by('-created_at').first())
self.assertNotIn('YTDLP_ARGS_EXTRA', crawl.config)
def test_add_staff_admin_custom_config_is_allowed(self):
@@ -269,7 +280,7 @@ class AddViewTests(TestCase):
})
self.assertEqual(response.status_code, 302)
crawl = Crawl.objects.order_by('-created_at').first()
crawl = require(Crawl.objects.order_by('-created_at').first())
self.assertEqual(crawl.config.get('YTDLP_ARGS_EXTRA'), ['--exec', 'echo hello'])
def test_add_empty_urls_fails(self):
@@ -281,7 +292,7 @@ class AddViewTests(TestCase):
# Should show form again with errors, not redirect
self.assertEqual(response.status_code, 200)
self.assertFormError(response, 'form', 'url', 'This field is required.')
self.assertFormError(cast(BaseForm, response.context['form']), 'url', 'This field is required.')
def test_add_invalid_urls_fails(self):
"""Test that invalid URLs fail validation."""
@@ -355,7 +366,7 @@ class AddViewTests(TestCase):
self.assertEqual(response.status_code, 302)
crawl = Crawl.objects.first()
crawl = require(Crawl.objects.first())
self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')
def test_crawl_redirects_to_admin_change_page(self):
@@ -365,7 +376,7 @@ class AddViewTests(TestCase):
'depth': '0',
})
crawl = Crawl.objects.first()
crawl = require(Crawl.objects.first())
expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'
self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)

View File

@@ -4,6 +4,7 @@ from django.urls import path, re_path, include
from django.views import static
from django.conf import settings
from django.views.generic.base import RedirectView
from django.http import HttpRequest
from archivebox.misc.serve_static import serve_static
@@ -53,7 +54,7 @@ urlpatterns = [
path("api/", include('archivebox.api.urls'), name='api'),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('error/', lambda *_: 1/0), # type: ignore
path('error/', lambda request: _raise_test_error(request)),
# path('jet_api/', include('jet_django.urls')), Enable to use https://www.jetadmin.io/integrations/django
@@ -61,6 +62,10 @@ urlpatterns = [
path('', HomepageView.as_view(), name='Home'),
]
def _raise_test_error(_request: HttpRequest):
    # Handler for the /error/ URL: intentionally raises so error reporting
    # (500 pages, logging, Sentry-style integrations) can be exercised.
    raise ZeroDivisionError('Intentional test error route')
if settings.DEBUG_TOOLBAR:
urlpatterns += [path('__debug__/', include("debug_toolbar.urls"))]

View File

@@ -5,13 +5,14 @@ import posixpath
from glob import glob, escape
from django.utils import timezone
import inspect
from typing import Callable, get_type_hints
from typing import Callable, cast, get_type_hints
from pathlib import Path
from urllib.parse import urlparse
from django.shortcuts import render, redirect
from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden
from django.utils.html import format_html, mark_safe
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from django.views import View
from django.views.generic.list import ListView
from django.views.generic import FormView
@@ -21,7 +22,7 @@ from django.contrib.auth.mixins import UserPassesTestMixin
from django.views.decorators.csrf import csrf_exempt
from django.utils.decorators import method_decorator
from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.typing import TableContext, ItemContext, SectionData
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
@@ -854,7 +855,7 @@ class AddView(UserPassesTestMixin, FormView):
def _can_override_crawl_config(self) -> bool:
user = self.request.user
return bool(user.is_authenticated and (user.is_superuser or user.is_staff))
return bool(user.is_authenticated and (getattr(user, 'is_superuser', False) or getattr(user, 'is_staff', False)))
def _get_custom_config_overrides(self, form: AddLinkForm) -> dict:
custom_config = form.cleaned_data.get("config") or {}
@@ -906,7 +907,7 @@ class AddView(UserPassesTestMixin, FormView):
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
created_by_name = self.request.user.username if self.request.user.is_authenticated else 'web'
created_by_name = getattr(self.request.user, 'username', 'web') if self.request.user.is_authenticated else 'web'
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{created_by_id}.txt'
@@ -1015,8 +1016,8 @@ class WebAddView(AddView):
return super().dispatch(request, *args, **kwargs)
def get(self, request, url: str):
requested_url = urldecode(url)
def get(self, request: HttpRequest, *args: object, **kwargs: object):
requested_url = urldecode(str(kwargs.get('url') or (args[0] if args else '')))
if not requested_url:
raise Http404
@@ -1025,6 +1026,7 @@ class WebAddView(AddView):
return redirect(f'/{snapshot.url_path}')
add_url = self._normalize_add_url(requested_url)
assert self.form_class is not None
defaults_form = self.form_class()
form_data = {
'url': add_url,
@@ -1045,6 +1047,7 @@ class WebAddView(AddView):
crawl = self._create_crawl_from_form(form)
snapshot = Snapshot.from_json({'url': add_url, 'tags': form.cleaned_data.get('tag', '')}, overrides={'crawl': crawl})
assert snapshot is not None
return redirect(f'/{snapshot.url_path}')
@@ -1385,7 +1388,7 @@ def find_config_type(key: str) -> str:
# Try to get from pydantic model_fields first (more reliable)
if hasattr(config, 'model_fields') and key in config.model_fields:
field = config.model_fields[key]
if hasattr(field, 'annotation'):
if hasattr(field, 'annotation') and field.annotation is not None:
try:
return str(field.annotation.__name__)
except AttributeError:
@@ -1448,7 +1451,7 @@ def find_config_source(key: str, merged_config: dict) -> str:
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
CONFIGS = get_all_configs()
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
assert getattr(request.user, 'is_superuser', False), 'Must be a superuser to view configuration settings.'
# Get merged config that includes Machine.config overrides
try:
@@ -1519,7 +1522,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
CONFIGS = get_all_configs()
FLAT_CONFIG = get_flat_config()
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
assert getattr(request.user, 'is_superuser', False), 'Must be a superuser to view configuration settings.'
# Get merged config
merged_config = get_config()
@@ -1575,62 +1578,62 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
section_header = mark_safe(f'[DYNAMIC CONFIG] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, calculated at runtime)</small>')
section_data = cast(SectionData, {
"name": section_header,
"description": None,
"fields": {
'Key': key,
'Type': find_config_type(key),
'Value': final_value,
'Source': find_config_source(key, merged_config),
},
"help_texts": {
'Key': mark_safe(f'''
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">Documentation</a> &nbsp;
<span style="display: {"inline" if aliases else "none"}">
Aliases: {", ".join(aliases)}
</span>
'''),
'Type': mark_safe(f'''
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
See full definition in <code>archivebox/config</code>...
</a>
'''),
'Value': mark_safe(f'''
{'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
<br/><hr/><br/>
<b>Configuration Sources (in priority order):</b><br/><br/>
{sources_html}
<br/><br/>
<p style="display: {"block" if key in FLAT_CONFIG and key not in CONSTANTS_CONFIG else "none"}">
<i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
<br/><br/>
<code>archivebox config --set {key}="{
val.strip("'")
if (val := find_config_default(key)) else
(str(FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
}"</code>
</p>
'''),
'Source': mark_safe(f'''
The value shown in the "Value" field comes from the <b>{find_config_source(key, merged_config)}</b> source.
<br/><br/>
Priority order (highest to lowest):
<ol>
<li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)
{f'<br/><a href="{machine_admin_url}">→ Edit <code>{key}</code> in Machine.config for this server</a>' if machine_admin_url else ''}
</li>
<li><b style="color: blue">Environment</b> - Environment variables</li>
<li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>
<li><b style="color: gray">Default</b> - Default value from code</li>
</ol>
{f'<br/><b>Tip:</b> To override <code>{key}</code> on this machine, <a href="{machine_admin_url}">edit the Machine.config field</a> and add:<br/><code>{{"\\"{key}\\": "your_value_here"}}</code>' if machine_admin_url and key not in CONSTANTS_CONFIG else ''}
'''),
},
})
return ItemContext(
slug=key,
title=key,
data=[
{
"name": section_header,
"description": None,
"fields": {
'Key': key,
'Type': find_config_type(key),
'Value': final_value,
'Source': find_config_source(key, merged_config),
},
"help_texts": {
'Key': mark_safe(f'''
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">Documentation</a> &nbsp;
<span style="display: {"inline" if aliases else "none"}">
Aliases: {", ".join(aliases)}
</span>
'''),
'Type': mark_safe(f'''
<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{key}&type=code">
See full definition in <code>archivebox/config</code>...
</a>
'''),
'Value': mark_safe(f'''
{'<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>' if not key_is_safe(key) else ''}
<br/><hr/><br/>
<b>Configuration Sources (in priority order):</b><br/><br/>
{sources_html}
<br/><br/>
<p style="display: {"block" if key in FLAT_CONFIG and key not in CONSTANTS_CONFIG else "none"}">
<i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
<br/><br/>
<code>archivebox config --set {key}="{
val.strip("'")
if (val := find_config_default(key)) else
(str(FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
}"</code>
</p>
'''),
'Source': mark_safe(f'''
The value shown in the "Value" field comes from the <b>{find_config_source(key, merged_config)}</b> source.
<br/><br/>
Priority order (highest to lowest):
<ol>
<li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)
{f'<br/><a href="{machine_admin_url}">→ Edit <code>{key}</code> in Machine.config for this server</a>' if machine_admin_url else ''}
</li>
<li><b style="color: blue">Environment</b> - Environment variables</li>
<li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>
<li><b style="color: gray">Default</b> - Default value from code</li>
</ol>
{f'<br/><b>💡 Tip:</b> To override <code>{key}</code> on this machine, <a href="{machine_admin_url}">edit the Machine.config field</a> and add:<br/><code>{{"\\"{key}\\": "your_value_here"}}</code>' if machine_admin_url and key not in CONSTANTS_CONFIG else ''}
'''),
},
},
],
data=[section_data],
)

View File

@@ -16,7 +16,7 @@ class TagEditorWidget(forms.Widget):
- Press Enter or Space to create new tags (auto-creates if doesn't exist)
- Uses AJAX for autocomplete and tag creation
"""
template_name = None # We render manually
template_name = "" # We render manually
class Media:
css = {'all': []}