WIP: checkpoint working tree before rebasing onto dev

This commit is contained in:
Nick Sweeting
2026-03-22 20:23:45 -07:00
parent a6548df8d0
commit f400a2cd67
87 changed files with 12607 additions and 1808 deletions

View File

@@ -1,8 +1,11 @@
__package__ = 'archivebox.crawls'
from django import forms
from django.utils.html import format_html, format_html_join
from django.http import JsonResponse, HttpRequest, HttpResponseNotAllowed
from django.shortcuts import get_object_or_404, redirect
from django.urls import path, reverse
from django.utils.html import escape, format_html, format_html_join
from django.utils import timezone
from django.utils.safestring import mark_safe
from django.contrib import admin, messages
from django.db.models import Count, Q
@@ -13,16 +16,19 @@ from django_object_actions import action
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.core.models import Snapshot
from archivebox.core.widgets import TagEditorWidget
from archivebox.crawls.models import Crawl, CrawlSchedule
def render_snapshots_list(snapshots_qs, limit=20):
def render_snapshots_list(snapshots_qs, limit=20, crawl=None):
"""Render a nice inline list view of snapshots with status, title, URL, and progress."""
snapshots = snapshots_qs.order_by('-created_at')[:limit].annotate(
total_results=Count('archiveresult'),
succeeded_results=Count('archiveresult', filter=Q(archiveresult__status='succeeded')),
failed_results=Count('archiveresult', filter=Q(archiveresult__status='failed')),
started_results=Count('archiveresult', filter=Q(archiveresult__status='started')),
skipped_results=Count('archiveresult', filter=Q(archiveresult__status='skipped')),
)
if not snapshots:
@@ -43,17 +49,57 @@ def render_snapshots_list(snapshots_qs, limit=20):
# Calculate progress
total = snapshot.total_results
done = snapshot.succeeded_results + snapshot.failed_results
succeeded = snapshot.succeeded_results
failed = snapshot.failed_results
running = snapshot.started_results
skipped = snapshot.skipped_results
done = succeeded + failed + skipped
pending = max(total - done - running, 0)
progress_pct = int((done / total) * 100) if total > 0 else 0
progress_text = f'{done}/{total}' if total > 0 else '-'
progress_title = (
f'{succeeded} succeeded, {failed} failed, {running} running, '
f'{pending} pending, {skipped} skipped'
)
progress_color = '#28a745'
if failed:
progress_color = '#dc3545'
elif running:
progress_color = '#17a2b8'
elif pending:
progress_color = '#ffc107'
# Truncate title and URL
title = (snapshot.title or 'Untitled')[:60]
if len(snapshot.title or '') > 60:
snapshot_title = snapshot.title or 'Untitled'
title = snapshot_title[:60]
if len(snapshot_title) > 60:
title += '...'
url_display = snapshot.url[:50]
if len(snapshot.url) > 50:
url_display += '...'
delete_button = ''
exclude_button = ''
if crawl is not None:
delete_url = reverse('admin:crawls_crawl_snapshot_delete', args=[crawl.pk, snapshot.pk])
exclude_url = reverse('admin:crawls_crawl_snapshot_exclude_domain', args=[crawl.pk, snapshot.pk])
delete_button = f'''
<button type="button"
class="crawl-snapshots-action"
data-post-url="{escape(delete_url)}"
data-confirm="Delete this snapshot from the crawl?"
title="Delete this snapshot from the crawl and remove its URL from the crawl queue."
aria-label="Delete snapshot"
style="border: 1px solid #ddd; background: #fff; color: #666; border-radius: 4px; width: 28px; height: 28px; cursor: pointer;">🗑</button>
'''
exclude_button = f'''
<button type="button"
class="crawl-snapshots-action"
data-post-url="{escape(exclude_url)}"
data-confirm="Exclude this domain from the crawl? This removes matching queued URLs, deletes pending matching snapshots, and blocks future matches."
title="Exclude this domain from this crawl. This removes matching URLs from the crawl queue, deletes pending matching snapshots, and blocks future matches."
aria-label="Exclude domain from crawl"
style="border: 1px solid #ddd; background: #fff; color: #666; border-radius: 4px; width: 28px; height: 28px; cursor: pointer;">⊘</button>
'''
# Format date
date_str = snapshot.created_at.strftime('%Y-%m-%d %H:%M') if snapshot.created_at else '-'
@@ -74,18 +120,18 @@ def render_snapshots_list(snapshots_qs, limit=20):
</td>
<td style="padding: 6px 8px; max-width: 300px;">
<a href="{snapshot.admin_change_url}" style="color: #417690; text-decoration: none; font-weight: 500;"
title="{snapshot.title or 'Untitled'}">{title}</a>
title="{escape(snapshot_title)}">{escape(title)}</a>
</td>
<td style="padding: 6px 8px; max-width: 250px;">
<a href="{snapshot.url}" target="_blank"
<a href="{escape(snapshot.url)}" target="_blank"
style="color: #666; text-decoration: none; font-family: monospace; font-size: 11px;"
title="{snapshot.url}">{url_display}</a>
title="{escape(snapshot.url)}">{escape(url_display)}</a>
</td>
<td style="padding: 6px 8px; white-space: nowrap; text-align: center;">
<div style="display: inline-flex; align-items: center; gap: 6px;">
<div style="display: inline-flex; align-items: center; gap: 6px;" title="{escape(progress_title)}">
<div style="width: 60px; height: 6px; background: #eee; border-radius: 3px; overflow: hidden;">
<div style="width: {progress_pct}%; height: 100%;
background: {'#28a745' if snapshot.failed_results == 0 else '#ffc107' if snapshot.succeeded_results > 0 else '#dc3545'};
background: {progress_color};
transition: width 0.3s;"></div>
</div>
<a href="/admin/core/archiveresult/?snapshot__id__exact={snapshot.id}"
@@ -96,6 +142,7 @@ def render_snapshots_list(snapshots_qs, limit=20):
<td style="padding: 6px 8px; white-space: nowrap; color: #888; font-size: 11px;">
{date_str}
</td>
{"<td style=\"padding: 6px 8px; white-space: nowrap; text-align: right;\"><div style=\"display: inline-flex; gap: 6px;\">%s%s</div></td>" % (exclude_button, delete_button) if crawl is not None else ""}
</tr>
''')
@@ -111,7 +158,7 @@ def render_snapshots_list(snapshots_qs, limit=20):
'''
return mark_safe(f'''
<div style="border: 1px solid #ddd; border-radius: 6px; overflow: hidden; max-width: 100%;">
<div data-crawl-snapshots-list style="border: 1px solid #ddd; border-radius: 6px; overflow: hidden; max-width: 100%;">
<table style="width: 100%; border-collapse: collapse; font-size: 13px;">
<thead>
<tr style="background: #f5f5f5; border-bottom: 2px solid #ddd;">
@@ -121,6 +168,7 @@ def render_snapshots_list(snapshots_qs, limit=20):
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">URL</th>
<th style="padding: 8px; text-align: center; font-weight: 600; color: #333;">Progress</th>
<th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Created</th>
{'<th style="padding: 8px; text-align: right; font-weight: 600; color: #333;">Actions</th>' if crawl is not None else ''}
</tr>
</thead>
<tbody>
@@ -129,11 +177,197 @@ def render_snapshots_list(snapshots_qs, limit=20):
</tbody>
</table>
</div>
{'''
<script>
(function() {
if (window.__archiveboxCrawlSnapshotActionsBound) {
return;
}
window.__archiveboxCrawlSnapshotActionsBound = true;
function getCookie(name) {
var cookieValue = null;
if (!document.cookie) {
return cookieValue;
}
var cookies = document.cookie.split(';');
for (var i = 0; i < cookies.length; i++) {
var cookie = cookies[i].trim();
if (cookie.substring(0, name.length + 1) === (name + '=')) {
cookieValue = decodeURIComponent(cookie.substring(name.length + 1));
break;
}
}
return cookieValue;
}
document.addEventListener('click', function(event) {
var button = event.target.closest('.crawl-snapshots-action');
if (!button) {
return;
}
event.preventDefault();
var confirmMessage = button.getAttribute('data-confirm');
if (confirmMessage && !window.confirm(confirmMessage)) {
return;
}
button.disabled = true;
fetch(button.getAttribute('data-post-url'), {
method: 'POST',
credentials: 'same-origin',
headers: {
'X-CSRFToken': getCookie('csrftoken') || '',
'X-Requested-With': 'XMLHttpRequest'
}
}).then(function(response) {
return response.json().then(function(data) {
if (!response.ok) {
throw new Error(data.error || 'Request failed');
}
return data;
});
}).then(function() {
window.location.reload();
}).catch(function(error) {
button.disabled = false;
window.alert(error.message || 'Request failed');
});
});
})();
</script>
''' if crawl is not None else ''}
''')
class URLFiltersWidget(forms.Widget):
    """Admin widget that edits URL_ALLOWLIST / URL_DENYLIST as two textareas
    plus a "same domain only" checkbox.

    The widget renders named sub-inputs (``<name>_allowlist``,
    ``<name>_denylist``, ``<name>_same_domain_only``) and reassembles them
    into a single dict in value_from_datadict().  The inline <script> mirrors
    the domains of URLs typed into #id_urls into the allowlist textarea
    whenever the checkbox is ticked.
    """

    def render(self, name, value, attrs=None, renderer=None):
        """Return the widget's HTML for *value* (a dict, or anything falsy)."""
        # Any non-dict value (None, empty string) means "no filters configured".
        value = value if isinstance(value, dict) else {}
        widget_id = (attrs or {}).get('id', name)
        # Escaped here because both strings are interpolated into textarea bodies below.
        allowlist = escape(value.get('allowlist', '') or '')
        denylist = escape(value.get('denylist', '') or '')
        # NOTE(review): value.get('same_domain_only') is never rendered as a
        # pre-checked checkbox below, so a saved True state is not reflected
        # when the form is redisplayed -- confirm whether that is intentional.
        return mark_safe(f'''
<div id="{widget_id}_container" style="min-width: 420px;">
<input type="hidden" name="{name}" value="">
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px;">
<div>
<label for="{widget_id}_allowlist" style="display: block; font-weight: 600; margin-bottom: 4px;">Allowlist</label>
<textarea id="{widget_id}_allowlist" name="{name}_allowlist" rows="3"
style="width: 100%; font-family: monospace; font-size: 12px;"
placeholder="example.com&#10;*.example.com">{allowlist}</textarea>
</div>
<div>
<label for="{widget_id}_denylist" style="display: block; font-weight: 600; margin-bottom: 4px;">Denylist</label>
<textarea id="{widget_id}_denylist" name="{name}_denylist" rows="3"
style="width: 100%; font-family: monospace; font-size: 12px;"
placeholder="static.example.com">{denylist}</textarea>
</div>
</div>
<label style="display: inline-flex; align-items: center; gap: 6px; margin-top: 8px; font-weight: 500;">
<input type="checkbox" id="{widget_id}_same_domain_only" name="{name}_same_domain_only" value="1">
Same domain only
</label>
<p style="color: #666; font-size: 11px; margin: 6px 0 0 0;">
Enter domains, wildcards, or regex patterns. Denylist takes precedence over allowlist.
</p>
<script>
(function() {{
if (window.__archiveboxUrlFilterEditors && window.__archiveboxUrlFilterEditors['{widget_id}']) {{
return;
}}
window.__archiveboxUrlFilterEditors = window.__archiveboxUrlFilterEditors || {{}};
window.__archiveboxUrlFilterEditors['{widget_id}'] = true;
var urlsField = document.getElementById('id_urls');
var allowlistField = document.getElementById('{widget_id}_allowlist');
var sameDomainOnly = document.getElementById('{widget_id}_same_domain_only');
function extractUrl(line) {{
var trimmed = (line || '').trim();
if (!trimmed || trimmed.charAt(0) === '#') {{
return '';
}}
if (trimmed.charAt(0) === '{{') {{
try {{
var record = JSON.parse(trimmed);
return String(record.url || '').trim();
}} catch (error) {{
return '';
}}
}}
return trimmed;
}}
function syncAllowlistFromUrls() {{
if (!urlsField || !allowlistField || !sameDomainOnly || !sameDomainOnly.checked) {{
return;
}}
var domains = [];
var seen = Object.create(null);
urlsField.value.split(/\\n+/).forEach(function(line) {{
var url = extractUrl(line);
if (!url) {{
return;
}}
try {{
var parsed = new URL(url);
var domain = (parsed.hostname || '').toLowerCase();
if (domain && !seen[domain]) {{
seen[domain] = true;
domains.push(domain);
}}
}} catch (error) {{
return;
}}
}});
allowlistField.value = domains.join('\\n');
}}
if (sameDomainOnly) {{
sameDomainOnly.addEventListener('change', syncAllowlistFromUrls);
}}
if (urlsField) {{
urlsField.addEventListener('input', syncAllowlistFromUrls);
urlsField.addEventListener('change', syncAllowlistFromUrls);
}}
}})();
</script>
</div>
''')

    def value_from_datadict(self, data, files, name):
        """Reassemble the three sub-inputs into the field's single dict value."""
        return {
            'allowlist': data.get(f'{name}_allowlist', ''),
            'denylist': data.get(f'{name}_denylist', ''),
            'same_domain_only': data.get(f'{name}_same_domain_only') in ('1', 'on', 'true'),
        }
class URLFiltersField(forms.Field):
    """Form field whose Python value is a dict of URL filter settings."""

    widget = URLFiltersWidget

    def to_python(self, value):
        """Pass dict values through unchanged; anything else resets to defaults."""
        if not isinstance(value, dict):
            return {'allowlist': '', 'denylist': '', 'same_domain_only': False}
        return value
class CrawlAdminForm(forms.ModelForm):
"""Custom form for Crawl admin to render urls field as textarea."""
tags_editor = forms.CharField(
label='Tags',
required=False,
widget=TagEditorWidget(),
help_text='Type tag names and press Enter or Space to add. Click × to remove.',
)
url_filters = URLFiltersField(
label='URL Filters',
required=False,
help_text='Set URL_ALLOWLIST / URL_DENYLIST for this crawl.',
)
class Meta:
model = Crawl
@@ -144,8 +378,62 @@ class CrawlAdminForm(forms.ModelForm):
'style': 'width: 100%; font-family: monospace; font-size: 13px;',
'placeholder': 'https://example.com\nhttps://example2.com\n# Comments start with #',
}),
'notes': forms.Textarea(attrs={
'rows': 1,
'style': 'width: 100%; min-height: 0; resize: vertical;',
}),
}
def __init__(self, *args, **kwargs):
    """Seed the tag editor and URL-filter widget from the saved instance (if any)."""
    super().__init__(*args, **kwargs)
    existing = self.instance if (self.instance and self.instance.pk) else None
    if existing is None:
        return
    saved_config = dict(existing.config or {})
    self.initial['tags_editor'] = existing.tags_str
    self.initial['url_filters'] = {
        'allowlist': saved_config.get('URL_ALLOWLIST', ''),
        'denylist': saved_config.get('URL_DENYLIST', ''),
        'same_domain_only': False,
    }
def clean_tags_editor(self):
    """Normalize the comma-separated tag string: strip whitespace, drop empties,
    and remove case-insensitive duplicates (first spelling wins)."""
    raw = self.cleaned_data.get('tags_editor', '')
    kept = []
    seen_lowercase = set()
    for candidate in raw.split(','):
        candidate = candidate.strip()
        if not candidate:
            continue
        key = candidate.lower()
        if key not in seen_lowercase:
            seen_lowercase.add(key)
            kept.append(candidate)
    return ','.join(kept)
def clean_url_filters(self):
    """Canonicalize the url_filters dict: dedupe/strip each pattern list via
    Crawl.split_filter_patterns and coerce the checkbox to a bool."""
    raw = self.cleaned_data.get('url_filters') or {}
    normalized = {}
    for key in ('allowlist', 'denylist'):
        patterns = Crawl.split_filter_patterns(raw.get(key, ''))
        normalized[key] = '\n'.join(patterns)
    normalized['same_domain_only'] = bool(raw.get('same_domain_only'))
    return normalized
def save(self, commit=True):
    """Persist tags and URL filters alongside the model's own fields.

    With commit=False the caller is responsible for saving; with commit=True
    the crawl is saved, the new filters are applied to queued work, and any
    deferred m2m save is flushed.
    """
    instance = super().save(commit=False)
    instance.tags_str = self.cleaned_data.get('tags_editor', '')
    filters = self.cleaned_data.get('url_filters') or {}
    instance.set_url_filters(filters.get('allowlist', ''), filters.get('denylist', ''))
    if not commit:
        return instance
    instance.save()
    # Re-apply filters so queued URLs/snapshots matching new deny rules are pruned.
    instance.apply_crawl_config_filters()
    save_m2m = getattr(self, '_save_m2m', None)
    if callable(save_m2m):
        save_m2m()
    return instance
class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
form = CrawlAdminForm
@@ -161,11 +449,11 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
'classes': ('card', 'wide'),
}),
('Info', {
'fields': ('label', 'notes', 'tags_str'),
'fields': ('label', 'notes', 'tags_editor'),
'classes': ('card',),
}),
('Settings', {
'fields': ('max_depth', 'config'),
'fields': (('max_depth', 'url_filters'), 'config'),
'classes': ('card',),
}),
('Status', {
@@ -185,6 +473,28 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
'classes': ('card', 'wide'),
}),
)
add_fieldsets = (
('URLs', {
'fields': ('urls',),
'classes': ('card', 'wide'),
}),
('Info', {
'fields': ('label', 'notes', 'tags_editor'),
'classes': ('card',),
}),
('Settings', {
'fields': (('max_depth', 'url_filters'), 'config'),
'classes': ('card',),
}),
('Status', {
'fields': ('status', 'retry_at'),
'classes': ('card',),
}),
('Relations', {
'fields': ('schedule', 'created_by'),
'classes': ('card',),
}),
)
list_filter = ('max_depth', 'schedule', 'created_by', 'status', 'retry_at')
ordering = ['-created_at', '-retry_at']
@@ -199,6 +509,25 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
num_snapshots_cached=Count('snapshot_set')
)
def get_fieldsets(self, request, obj=None):
    """Use the add-form fieldsets until the crawl exists, then the full set."""
    if obj:
        return self.fieldsets
    return self.add_fieldsets
def get_urls(self):
    """Prepend the per-snapshot action endpoints to the stock admin URLs."""
    snapshot_actions = [
        path(
            '<path:object_id>/snapshot/<path:snapshot_id>/delete/',
            self.admin_site.admin_view(self.delete_snapshot_view),
            name='crawls_crawl_snapshot_delete',
        ),
        path(
            '<path:object_id>/snapshot/<path:snapshot_id>/exclude-domain/',
            self.admin_site.admin_view(self.exclude_domain_view),
            name='crawls_crawl_snapshot_exclude_domain',
        ),
    ]
    # Custom routes first so they win over the default '<path:object_id>/' patterns.
    return snapshot_actions + super().get_urls()
@admin.action(description='Delete selected crawls')
def delete_selected_batched(self, request, queryset):
"""Delete crawls in a single transaction to avoid SQLite concurrency issues."""
@@ -218,8 +547,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
@action(label='Recrawl', description='Create a new crawl with the same settings')
def recrawl(self, request, obj):
"""Duplicate this crawl as a new crawl with the same URLs and settings."""
from django.utils import timezone
from django.shortcuts import redirect
# Validate URLs (required for crawl to start)
if not obj.urls:
@@ -252,7 +579,37 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
return getattr(obj, 'num_snapshots_cached', obj.snapshot_set.count())
def snapshots(self, obj):
return render_snapshots_list(obj.snapshot_set.all())
return render_snapshots_list(obj.snapshot_set.all(), crawl=obj)
def delete_snapshot_view(self, request: HttpRequest, object_id: str, snapshot_id: str):
    """POST-only admin endpoint: delete one snapshot belonging to a crawl.

    Cancels the snapshot's running hooks if it is STARTED, removes its URL
    from the crawl's queued URL list, then deletes the snapshot row.
    Returns JSON: {'ok': True, 'snapshot_id': <uuid str>, 'removed_urls': <int>}.
    """
    if request.method != 'POST':
        return HttpResponseNotAllowed(['POST'])
    crawl = get_object_or_404(Crawl, pk=object_id)
    snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl)
    if snapshot.status == Snapshot.StatusChoices.STARTED:
        snapshot.cancel_running_hooks()
    removed_urls = crawl.prune_url(snapshot.url)
    # BUG FIX: capture the id BEFORE delete() -- Django sets the instance's
    # pk to None after deletion, so serializing it afterwards yielded "None".
    deleted_snapshot_id = str(snapshot.id)
    snapshot.delete()
    return JsonResponse({
        'ok': True,
        'snapshot_id': deleted_snapshot_id,
        'removed_urls': removed_urls,
    })
def exclude_domain_view(self, request: HttpRequest, object_id: str, snapshot_id: str):
    """POST-only admin endpoint: add the snapshot's domain to the crawl denylist.

    Delegates to Crawl.exclude_domain() and returns its result merged into a
    JSON payload with 'ok': True.
    """
    if request.method != 'POST':
        return HttpResponseNotAllowed(['POST'])
    crawl = get_object_or_404(Crawl, pk=object_id)
    snapshot = get_object_or_404(Snapshot, pk=snapshot_id, crawl=crawl)
    outcome = crawl.exclude_domain(snapshot.url)
    payload = {'ok': True}
    payload.update(outcome)
    return JsonResponse(payload)
@admin.display(description='Schedule', ordering='schedule')
def schedule_str(self, obj):

View File

@@ -2,9 +2,12 @@ __package__ = 'archivebox.crawls'
from typing import TYPE_CHECKING
import uuid
import json
import re
from datetime import timedelta
from archivebox.uuid_compat import uuid7
from pathlib import Path
from urllib.parse import urlparse
from django.db import models
from django.core.validators import MaxValueValidator, MinValueValidator
@@ -141,22 +144,21 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
return f'[...{short_id}] {first_url[:120]}'
def save(self, *args, **kwargs):
is_new = self._state.adding
super().save(*args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
log_worker_event(
worker_type='DB',
event='Created Crawl',
indent_level=1,
metadata={
'id': str(self.id),
'first_url': first_url[:64],
'max_depth': self.max_depth,
'status': self.status,
},
)
# if is_new:
# from archivebox.misc.logging_util import log_worker_event
# first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
# log_worker_event(
# worker_type='DB',
# event='Created Crawl',
# indent_level=1,
# metadata={
# 'id': str(self.id),
# 'first_url': first_url[:64],
# 'max_depth': self.max_depth,
# 'status': self.status,
# },
# )
@property
def api_url(self) -> str:
@@ -248,6 +250,222 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if url.strip() and not url.strip().startswith('#')
]
@staticmethod
def normalize_domain(value: str) -> str:
    """Extract a lowercase hostname key from a URL or bare domain string.

    Non-default ports are encoded as 'host_port' (underscore-separated, the
    same shape _pattern_matches_url() splits on).  Returns '' for anything
    that cannot be parsed into a hostname.
    """
    raw = (value or '').strip().lower()
    if not raw:
        return ''
    # Bare domains (no scheme, no path) get a dummy scheme so urlparse finds the host.
    if '://' not in raw and '/' not in raw:
        raw = 'https://' + raw.lstrip('.')
    try:
        parts = urlparse(raw)
        host = parts.hostname or ''
        port = parts.port  # may raise ValueError for malformed ports
    except Exception:
        return ''
    if not host:
        return ''
    return f'{host}_{port}' if port else host
@staticmethod
def split_filter_patterns(value) -> list[str]:
    """Normalize allow/deny filter input into an ordered, deduped pattern list.

    Accepts a newline-separated string, a list of values, or anything else
    (treated as empty).  Patterns are stripped; empty entries and exact
    duplicates are dropped while preserving first-seen order.
    """
    if isinstance(value, str):
        candidates = value.splitlines()
    elif isinstance(value, list):
        candidates = value
    else:
        candidates = []
    # dict preserves insertion order, giving us ordered dedup for free
    ordered = {}
    for candidate in candidates:
        cleaned = str(candidate or '').strip()
        if cleaned:
            ordered.setdefault(cleaned, None)
    return list(ordered)
@classmethod
def _pattern_matches_url(cls, url: str, pattern: str) -> bool:
    """Return True if *url* matches a single allow/deny *pattern*.

    Two pattern styles are supported:
    - Domain-style patterns (composed only of word chars, '.', '*', ':', '-'):
      matched against the URL's normalized hostname.  '*.example.com' matches
      subdomains only; a bare 'example.com' matches the domain itself AND any
      subdomain.  Patterns carrying a port (normalized to 'host_port') match
      when host+port are equal, otherwise fall back to host/suffix matching.
    - Anything else is treated as a regex and re.search()ed against the full
      URL; invalid regexes never match.
    """
    normalized_pattern = str(pattern or '').strip()
    if not normalized_pattern:
        return False
    # Heuristic: patterns made only of [\w.*:-] look like domains, not regexes.
    if re.fullmatch(r'[\w.*:-]+', normalized_pattern):
        wildcard_only_subdomains = normalized_pattern.startswith('*.')
        normalized_domain = cls.normalize_domain(
            normalized_pattern[2:] if wildcard_only_subdomains else normalized_pattern
        )
        normalized_url_domain = cls.normalize_domain(url)
        if not normalized_domain or not normalized_url_domain:
            return False
        # normalize_domain() encodes ports as 'host_port'; strip them for host comparison.
        pattern_host = normalized_domain.split('_', 1)[0]
        url_host = normalized_url_domain.split('_', 1)[0]
        if wildcard_only_subdomains:
            return url_host.endswith(f'.{pattern_host}')
        if normalized_url_domain == normalized_domain:
            return True
        return url_host == pattern_host or url_host.endswith(f'.{pattern_host}')
    try:
        return bool(re.search(normalized_pattern, url))
    except re.error:
        # A malformed regex pattern simply never matches rather than crashing.
        return False
def get_url_allowlist(self, *, use_effective_config: bool = False, snapshot=None) -> list[str]:
    """Return URL_ALLOWLIST patterns, from self.config or the merged effective config."""
    if not use_effective_config:
        return self.split_filter_patterns((self.config or {}).get('URL_ALLOWLIST', ''))
    from archivebox.config.configset import get_config
    effective = get_config(crawl=self, snapshot=snapshot)
    return self.split_filter_patterns(effective.get('URL_ALLOWLIST', ''))
def get_url_denylist(self, *, use_effective_config: bool = False, snapshot=None) -> list[str]:
    """Return URL_DENYLIST patterns, from self.config or the merged effective config."""
    if not use_effective_config:
        return self.split_filter_patterns((self.config or {}).get('URL_DENYLIST', ''))
    from archivebox.config.configset import get_config
    effective = get_config(crawl=self, snapshot=snapshot)
    return self.split_filter_patterns(effective.get('URL_DENYLIST', ''))
def url_passes_filters(self, url: str, *, snapshot=None, use_effective_config: bool = True) -> bool:
    """True if *url* survives this crawl's filters.

    Denylist wins over allowlist; an empty allowlist means "allow everything".
    """
    denylist = self.get_url_denylist(use_effective_config=use_effective_config, snapshot=snapshot)
    allowlist = self.get_url_allowlist(use_effective_config=use_effective_config, snapshot=snapshot)
    if any(self._pattern_matches_url(url, pattern) for pattern in denylist):
        return False
    if not allowlist:
        return True
    return any(self._pattern_matches_url(url, pattern) for pattern in allowlist)
def set_url_filters(self, allowlist, denylist) -> None:
    """Write URL_ALLOWLIST / URL_DENYLIST into self.config (does NOT save).

    Empty pattern lists remove the corresponding key entirely rather than
    storing an empty value.
    """
    updated = dict(self.config or {})
    for key, raw_patterns in (('URL_ALLOWLIST', allowlist), ('URL_DENYLIST', denylist)):
        patterns = self.split_filter_patterns(raw_patterns)
        if patterns:
            updated[key] = '\n'.join(patterns)
        else:
            updated.pop(key, None)
    self.config = updated
def apply_crawl_config_filters(self) -> dict[str, int]:
    """Enforce this crawl's own URL filters against already-queued work.

    Prunes queued URLs that no longer pass the filters, then deletes QUEUED /
    STARTED snapshots whose URLs fail (cancelling running hooks first).
    Returns counts: {'removed_urls': int, 'deleted_snapshots': int}.
    """
    from archivebox.core.models import Snapshot
    removed_urls = self.prune_urls(
        lambda url: not self.url_passes_filters(url, use_effective_config=False)
    )
    active_statuses = [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
    doomed = []
    for snapshot in self.snapshot_set.filter(status__in=active_statuses).only('pk', 'url', 'status'):
        if not self.url_passes_filters(snapshot.url, snapshot=snapshot, use_effective_config=False):
            doomed.append(snapshot)
    deleted_count = 0
    if doomed:
        # Stop in-flight work before deleting the rows out from under it.
        for snapshot in doomed:
            if snapshot.status == Snapshot.StatusChoices.STARTED:
                snapshot.cancel_running_hooks()
        deleted_count, _details = self.snapshot_set.filter(
            pk__in=[snapshot.pk for snapshot in doomed]
        ).delete()
    return {
        'removed_urls': len(removed_urls),
        'deleted_snapshots': deleted_count,
    }
def _iter_url_lines(self) -> list[tuple[str, str]]:
entries: list[tuple[str, str]] = []
for raw_line in (self.urls or '').splitlines():
stripped = raw_line.strip()
if not stripped:
continue
if stripped.startswith('#'):
entries.append((raw_line.rstrip(), ''))
continue
try:
entry = json.loads(stripped)
entries.append((raw_line.rstrip(), str(entry.get('url', '') or '').strip()))
except json.JSONDecodeError:
entries.append((raw_line.rstrip(), stripped))
return entries
def prune_urls(self, predicate) -> list[str]:
    """Remove queued URL lines for which predicate(url) is truthy.

    Comment and blank-url lines are always kept.  The crawl is saved only
    when the URL list actually changed.  Returns the removed URLs.
    """
    survivors: list[str] = []
    removed: list[str] = []
    for raw_line, url in self._iter_url_lines():
        if url and predicate(url):
            removed.append(url)
        else:
            survivors.append(raw_line)
    rebuilt = '\n'.join(survivors)
    if rebuilt != (self.urls or ''):
        self.urls = rebuilt
        self.save(update_fields=['urls', 'modified_at'])
    return removed
def prune_url(self, url: str) -> int:
    """Remove a single exact URL from the crawl queue; returns how many lines matched."""
    needle = (url or '').strip()
    return len(self.prune_urls(lambda candidate: candidate == needle))
def exclude_domain(self, domain: str) -> dict[str, int | str | bool]:
    """Add *domain* to this crawl's denylist and prune matching queued work.

    Returns a summary dict: the normalized domain, whether a new denylist
    entry was created, and how many queued URLs / snapshots were removed.
    An unparseable domain yields the all-empty summary.
    """
    normalized = self.normalize_domain(domain)
    if not normalized:
        return {
            'domain': '',
            'created': False,
            'removed_urls': 0,
            'deleted_snapshots': 0,
        }
    denylist = self.get_url_denylist(use_effective_config=False)
    is_new_entry = normalized not in denylist
    if is_new_entry:
        denylist.append(normalized)
        self.set_url_filters(
            self.get_url_allowlist(use_effective_config=False),
            denylist,
        )
        self.save(update_fields=['config', 'modified_at'])
    # Always re-apply filters, even if the domain was already denied.
    pruned = self.apply_crawl_config_filters()
    return {
        'domain': normalized,
        'created': is_new_entry,
        'removed_urls': pruned['removed_urls'],
        'deleted_snapshots': pruned['deleted_snapshots'],
    }
def get_system_task(self) -> str | None:
urls = self.get_urls_list()
if len(urls) != 1:
@@ -284,11 +502,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
Returns:
True if URL was added, False if skipped (duplicate or depth exceeded)
"""
import json
from archivebox.misc.util import fix_url_from_markdown
url = entry.get('url', '')
url = fix_url_from_markdown(str(entry.get('url', '') or '').strip())
if not url:
return False
if not self.url_passes_filters(url):
return False
depth = entry.get('depth', 1)
@@ -301,20 +521,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
return False
# Check if already in urls (parse existing JSONL entries)
existing_urls = set()
for line in self.urls.splitlines():
if not line.strip():
continue
try:
existing_entry = json.loads(line)
existing_urls.add(existing_entry.get('url', ''))
except json.JSONDecodeError:
existing_urls.add(line.strip())
existing_urls = {url for _raw_line, url in self._iter_url_lines() if url}
if url in existing_urls:
return False
# Append as JSONL
entry = {**entry, 'url': url}
jsonl_entry = json.dumps(entry)
self.urls = (self.urls.rstrip() + '\n' + jsonl_entry).lstrip('\n')
self.save(update_fields=['urls', 'modified_at'])
@@ -327,15 +540,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
Returns:
List of newly created Snapshot objects
"""
import sys
import json
from archivebox.core.models import Snapshot
from archivebox.misc.util import fix_url_from_markdown
created_snapshots = []
print(f'[cyan]DEBUG create_snapshots_from_urls: self.urls={repr(self.urls)}[/cyan]', file=sys.stderr)
print(f'[cyan]DEBUG create_snapshots_from_urls: lines={self.urls.splitlines()}[/cyan]', file=sys.stderr)
for line in self.urls.splitlines():
if not line.strip():
continue
@@ -343,13 +552,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
# Parse JSONL or plain URL
try:
entry = json.loads(line)
url = entry.get('url', '')
url = fix_url_from_markdown(str(entry.get('url', '') or '').strip())
depth = entry.get('depth', 0)
title = entry.get('title')
timestamp = entry.get('timestamp')
tags = entry.get('tags', '')
except json.JSONDecodeError:
url = line.strip()
url = fix_url_from_markdown(line.strip())
depth = 0
title = None
timestamp = None
@@ -357,6 +566,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if not url:
continue
if not self.url_passes_filters(url):
continue
# Skip if depth exceeds max_depth
if depth > self.max_depth: