remove Seed model in favor of Crawl as template

This commit is contained in:
Nick Sweeting
2025-12-25 01:52:38 -08:00
parent 28e6c5bb65
commit bb53228ebf
30 changed files with 785 additions and 690 deletions

View File

@@ -33,15 +33,18 @@ GLOBAL_CONTEXT = {}
class SnapshotActionForm(ActionForm):
tags = forms.ModelMultipleChoiceField(
label='Edit tags',
queryset=Tag.objects.all(),
required=False,
widget=FilteredSelectMultiple(
'core_tag__name',
False,
),
)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Define tags field in __init__ to avoid database access during app initialization
self.fields['tags'] = forms.ModelMultipleChoiceField(
label='Edit tags',
queryset=Tag.objects.all(),
required=False,
widget=FilteredSelectMultiple(
'core_tag__name',
False,
),
)
# TODO: allow selecting actions for specific extractors? is this useful?
# extractor = forms.ChoiceField(
@@ -165,14 +168,69 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def admin_actions(self, obj):
# Build the HTML button bar for one object in the admin (presumably a Snapshot,
# since this sits on SnapshotAdmin and reads obj.timestamp / obj.url / obj.pk).
# The markup is produced with format_html(), which fills the `{}` placeholders
# positionally from the arguments listed after the template string.
#
# NOTE(review): as shown here the template contains ten `{}` placeholders but
# only seven positional arguments are passed. The three single-line anchors at
# the top of the template look like the pre-change buttons retained by the diff
# rendering -- confirm against the real file that only the <div> version remains.
#
# NOTE(review): obj.url is placed directly into an href (format_html only
# HTML-escapes it); confirm elsewhere that non-http(s) schemes such as
# `javascript:` cannot be stored on this field.
return format_html(
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
'''
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}">Summary page ➡️</a> &nbsp; &nbsp;
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}/index.html#all">Result files 📑</a> &nbsp; &nbsp;
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/admin/core/snapshot/?id__exact={}">Admin actions ⚙️</a>
<div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: center;">
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/archive/{}"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📄 Summary Page
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/archive/{}/index.html#all"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
📁 Result Files
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="{}"
target="_blank"
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
🔗 Original URL
</a>
<span style="border-left: 1px solid #e2e8f0; height: 24px; margin: 0 4px;"></span>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Get missing extractors"
onmouseover="this.style.background='#d1fae5';"
onmouseout="this.style.background='#ecfdf5';">
⬇️ Get Missing
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #eff6ff; border: 1px solid #bfdbfe; border-radius: 8px; color: #1e40af; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Create a fresh new snapshot of this URL"
onmouseover="this.style.background='#dbeafe';"
onmouseout="this.style.background='#eff6ff';">
🆕 Archive Again
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fffbeb; border: 1px solid #fde68a; border-radius: 8px; color: #92400e; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Re-run all extractors (overwrite existing)"
onmouseover="this.style.background='#fef3c7';"
onmouseout="this.style.background='#fffbeb';">
🔄 Redo All
</a>
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fef2f2; border: 1px solid #fecaca; border-radius: 8px; color: #991b1b; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
href="/admin/core/snapshot/?id__exact={}"
title="Permanently delete this snapshot"
onmouseover="this.style.background='#fee2e2';"
onmouseout="this.style.background='#fef2f2';">
☠️ Delete
</a>
</div>
<p style="margin-top: 12px; font-size: 12px; color: #64748b;">
<b>Tip:</b> Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
</p>
''',
# Positional values, in template order for the seven buttons inside the <div>:
# timestamp (Summary Page), timestamp (Result Files), url (Original URL),
# then pk four times (Get Missing / Archive Again / Redo All / Delete), all of
# which point at the same filtered admin list URL.
obj.timestamp,
obj.timestamp,
obj.url,
obj.pk,
obj.pk,
obj.pk,
obj.pk,
)

View File

@@ -0,0 +1,101 @@
# Generated by Django 6.0 on 2025-12-25 09:34
import archivebox.base_models.models
import django.db.models.deletion
import django.utils.timezone
import uuid
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
# Schema migration for the `core` app. It drops the `output_dir` field from
# ArchiveResult and Snapshot, re-declares a number of existing fields on
# ArchiveResult, Snapshot, SnapshotTag and Tag, and finally sets a
# (snapshot, tag) uniqueness constraint on SnapshotTag.
# The operations run in the order listed; do not reorder them.

# Must be applied after core migration 0025 and after the migration that
# creates the configured user model, because the ForeignKeys below point at
# settings.AUTH_USER_MODEL.
dependencies = [
('core', '0025_allow_duplicate_urls_per_crawl'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
# --- Remove the output_dir field from both models ---
migrations.RemoveField(
model_name='archiveresult',
name='output_dir',
),
migrations.RemoveField(
model_name='snapshot',
name='output_dir',
),
# --- ArchiveResult field definitions ---
migrations.AlterField(
model_name='archiveresult',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
# Required FK to the user model (default=None, no null=True); deleting the
# user cascades to their archive results.
migrations.AlterField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(db_index=True, max_length=32),
),
# ArchiveResult keeps an integer AutoField primary key; the UUID lives in
# the separate nullable-but-unique `uuid` field declared further down.
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.AutoField(editable=False, primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
),
# NOTE(review): uuid.uuid7 is only provided by the standard-library `uuid`
# module from Python 3.14 onward -- confirm the project's minimum Python
# version, otherwise importing this migration raises AttributeError.
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True, unique=True),
),
# --- Snapshot field definitions ---
migrations.AlterField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
# Same shape as ArchiveResult.created_by: required FK, cascade on user delete.
migrations.AlterField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
),
# Nullable, non-editable timestamp that defaults to None.
migrations.AlterField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
# Snapshot uses a UUID primary key generated by uuid.uuid7 (see the
# Python-version note on ArchiveResult.uuid above the `uuid` field).
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
# Disabled operation: would re-declare Snapshot.tags as a many-to-many through
# core.SnapshotTag. NOTE(review): the reason it is commented out is not visible
# here -- confirm before re-enabling or deleting it.
# migrations.AlterField(
# model_name='snapshot',
# name='tags',
# field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
# ),
# --- SnapshotTag / Tag ---
migrations.AlterField(
model_name='snapshottag',
name='id',
field=models.AutoField(primary_key=True, serialize=False),
),
# The default is a callable imported from archivebox.base_models.models;
# presumably it returns the primary key of a system user -- verify there.
migrations.AlterField(
model_name='tag',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
),
# Each (snapshot, tag) pair may appear at most once in the SnapshotTag table.
migrations.AlterUniqueTogether(
name='snapshottag',
unique_together={('snapshot', 'tag')},
),
]

View File

@@ -59,7 +59,7 @@ INSTALLED_APPS = [
"config", # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
"machine", # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
"workers", # handles starting and managing background workers and processes (orchestrators and actors)
"crawls", # handles Seed, Crawl, and CrawlSchedule models and management
"crawls", # handles Crawl and CrawlSchedule models and management
"personas", # handles Persona and session management
"core", # core django model with Snapshot, ArchiveResult, etc.
"api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
@@ -194,10 +194,6 @@ DATABASES = {
"NAME": DATABASE_NAME,
**SQLITE_CONNECTION_OPTIONS,
},
"queue": {
"NAME": CONSTANTS.QUEUE_DATABASE_FILE,
**SQLITE_CONNECTION_OPTIONS,
},
# "filestore": {
# "NAME": CONSTANTS.FILESTORE_DATABASE_FILE,
# **SQLITE_CONNECTION_OPTIONS,

View File

@@ -2,8 +2,6 @@ __package__ = 'archivebox.core'
import re
import os
import shutil
import tempfile
import logging
@@ -11,7 +9,6 @@ import pydantic
import django.template
from archivebox.config import CONSTANTS
from archivebox.misc.logging import IS_TTY
IGNORABLE_URL_PATTERNS = [
@@ -79,7 +76,6 @@ SETTINGS_LOGGING = {
"formatters": {
"rich": {
"datefmt": "[%Y-%m-%d %H:%M:%S]",
# "format": "{asctime} {levelname} {module} {name} {message} {username}",
"format": "%(name)s %(message)s",
},
"outbound_webhooks": {
@@ -99,26 +95,13 @@ SETTINGS_LOGGING = {
},
},
"handlers": {
# "console": {
# "level": "DEBUG",
# 'formatter': 'simple',
# "class": "logging.StreamHandler",
# 'filters': ['noisyrequestsfilter', 'add_extra_logging_attrs'],
# },
"default": {
"class": "rich.logging.RichHandler",
"formatter": "rich",
"level": "DEBUG",
"markup": False,
"rich_tracebacks": IS_TTY,
"rich_tracebacks": False, # Use standard Python tracebacks (no frame/box)
"filters": ["noisyrequestsfilter"],
"tracebacks_suppress": [
django,
pydantic,
],
"tracebacks_width": shutil.get_terminal_size((100, 10)).columns - 1,
"tracebacks_word_wrap": False,
"tracebacks_show_locals": False,
},
"logfile": {
"level": "INFO",
@@ -132,7 +115,7 @@ SETTINGS_LOGGING = {
"outbound_webhooks": {
"class": "rich.logging.RichHandler",
"markup": False,
"rich_tracebacks": True,
"rich_tracebacks": False, # Use standard Python tracebacks (no frame/box)
"formatter": "outbound_webhooks",
},
# "mail_admins": {

View File

@@ -15,7 +15,7 @@ from statemachine import State, StateMachine
# from workers.actor import ActorType
from core.models import Snapshot, ArchiveResult
from crawls.models import Crawl, Seed
from crawls.models import Crawl
class SnapshotMachine(StateMachine, strict_states=True):
@@ -247,17 +247,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
)
self.archiveresult.save(write_indexes=True)
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
# Also update Crawl and Seed health stats if snapshot has a crawl
# Also update Crawl health stats if snapshot has a crawl
snapshot = self.archiveresult.snapshot
if snapshot.crawl_id:
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first()
if crawl:
Seed.objects.filter(pk=crawl).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
@failed.enter
def enter_failed(self):
@@ -268,17 +265,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
end_ts=timezone.now(),
)
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_failed=F('num_uses_failed') + 1)
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_failed=F('num_uses_failed') + 1)
# Also update Crawl and Seed health stats if snapshot has a crawl
# Also update Crawl health stats if snapshot has a crawl
snapshot = self.archiveresult.snapshot
if snapshot.crawl_id:
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_failed=F('num_uses_failed') + 1)
crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first()
if crawl:
Seed.objects.filter(pk=crawl).update(num_uses_failed=F('num_uses_failed') + 1)
@skipped.enter
def enter_skipped(self):

View File

@@ -33,7 +33,7 @@ from archivebox.search import query_search_index
from core.models import Snapshot
from core.forms import AddLinkForm
from crawls.models import Seed, Crawl
from crawls.models import Crawl
from archivebox.hooks import get_extractors, get_extractor_name
@@ -119,7 +119,11 @@ class SnapshotView(View):
if result_file.name in existing_files or result_file.name == 'index.html':
continue
file_size = result_file.stat().st_size or 0
# Skip circular symlinks and other stat() failures
try:
file_size = result_file.stat().st_size or 0
except OSError:
continue
if file_size > min_size_threshold:
archiveresults[result_file.name] = {
@@ -471,14 +475,16 @@ class AddView(UserPassesTestMixin, FormView):
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
# 2. create a new Crawl with the URLs from the file
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
seed = Seed.from_file(
sources_file,
urls_content = sources_file.read_text()
crawl = Crawl.objects.create(
urls=urls_content,
extractor=parser,
max_depth=depth,
tags_str=tag,
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
parser=parser,
tag=tag,
created_by=self.request.user.pk,
created_by_id=self.request.user.pk,
config={
# 'ONLY_NEW': not update,
# 'INDEX_ONLY': index_only,
@@ -486,9 +492,8 @@ class AddView(UserPassesTestMixin, FormView):
'DEPTH': depth,
'EXTRACTORS': extractors or '',
# 'DEFAULT_PERSONA': persona or 'Default',
})
# 3. create a new Crawl pointing to the Seed
crawl = Crawl.from_seed(seed, max_depth=depth)
}
)
# 4. start the Orchestrator & wait until it completes
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
@@ -569,19 +574,7 @@ def live_progress_view(request):
# Count URLs in the crawl (for when snapshots haven't been created yet)
urls_count = 0
if crawl.urls:
urls_count = len([u for u in crawl.urls.split('\n') if u.strip()])
elif crawl.seed and crawl.seed.uri:
# Try to get URL count from seed
if crawl.seed.uri.startswith('file:///'):
try:
from pathlib import Path
seed_file = Path(crawl.seed.uri.replace('file://', ''))
if seed_file.exists():
urls_count = len([l for l in seed_file.read_text().split('\n') if l.strip() and not l.startswith('#')])
except:
pass
else:
urls_count = 1 # Single URL seed
urls_count = len([u for u in crawl.urls.split('\n') if u.strip() and not u.startswith('#')])
# Calculate crawl progress
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
@@ -635,8 +628,8 @@ def live_progress_view(request):
})
# Check if crawl can start (for debugging stuck crawls)
can_start = bool(crawl.seed and crawl.seed.uri)
seed_uri = crawl.seed.uri[:60] if crawl.seed and crawl.seed.uri else None
can_start = bool(crawl.urls)
urls_preview = crawl.urls[:60] if crawl.urls else None
# Check if retry_at is in the future (would prevent worker from claiming)
retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
@@ -657,7 +650,7 @@ def live_progress_view(request):
'pending_snapshots': pending_snapshots,
'active_snapshots': active_snapshots_for_crawl,
'can_start': can_start,
'seed_uri': seed_uri,
'urls_preview': urls_preview,
'retry_at_future': retry_at_future,
'seconds_until_retry': seconds_until_retry,
})