mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 23:37:58 +10:00
remove Seed model in favor of Crawl as template
This commit is contained in:
@@ -33,15 +33,18 @@ GLOBAL_CONTEXT = {}
|
||||
|
||||
|
||||
class SnapshotActionForm(ActionForm):
|
||||
tags = forms.ModelMultipleChoiceField(
|
||||
label='Edit tags',
|
||||
queryset=Tag.objects.all(),
|
||||
required=False,
|
||||
widget=FilteredSelectMultiple(
|
||||
'core_tag__name',
|
||||
False,
|
||||
),
|
||||
)
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
# Define tags field in __init__ to avoid database access during app initialization
|
||||
self.fields['tags'] = forms.ModelMultipleChoiceField(
|
||||
label='Edit tags',
|
||||
queryset=Tag.objects.all(),
|
||||
required=False,
|
||||
widget=FilteredSelectMultiple(
|
||||
'core_tag__name',
|
||||
False,
|
||||
),
|
||||
)
|
||||
|
||||
# TODO: allow selecting actions for specific extractors? is this useful?
|
||||
# extractor = forms.ChoiceField(
|
||||
@@ -165,14 +168,69 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
|
||||
def admin_actions(self, obj):
|
||||
return format_html(
|
||||
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
|
||||
'''
|
||||
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}">Summary page ➡️</a>
|
||||
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}/index.html#all">Result files 📑</a>
|
||||
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/admin/core/snapshot/?id__exact={}">Admin actions ⚙️</a>
|
||||
<div style="display: flex; flex-wrap: wrap; gap: 12px; align-items: center;">
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/archive/{}"
|
||||
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||
📄 Summary Page
|
||||
</a>
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/archive/{}/index.html#all"
|
||||
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||
📁 Result Files
|
||||
</a>
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; color: #334155; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="{}"
|
||||
target="_blank"
|
||||
onmouseover="this.style.background='#f1f5f9'; this.style.borderColor='#cbd5e1';"
|
||||
onmouseout="this.style.background='#f8fafc'; this.style.borderColor='#e2e8f0';">
|
||||
🔗 Original URL
|
||||
</a>
|
||||
|
||||
<span style="border-left: 1px solid #e2e8f0; height: 24px; margin: 0 4px;"></span>
|
||||
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #ecfdf5; border: 1px solid #a7f3d0; border-radius: 8px; color: #065f46; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/admin/core/snapshot/?id__exact={}"
|
||||
title="Get missing extractors"
|
||||
onmouseover="this.style.background='#d1fae5';"
|
||||
onmouseout="this.style.background='#ecfdf5';">
|
||||
⬇️ Get Missing
|
||||
</a>
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #eff6ff; border: 1px solid #bfdbfe; border-radius: 8px; color: #1e40af; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/admin/core/snapshot/?id__exact={}"
|
||||
title="Create a fresh new snapshot of this URL"
|
||||
onmouseover="this.style.background='#dbeafe';"
|
||||
onmouseout="this.style.background='#eff6ff';">
|
||||
🆕 Archive Again
|
||||
</a>
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fffbeb; border: 1px solid #fde68a; border-radius: 8px; color: #92400e; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/admin/core/snapshot/?id__exact={}"
|
||||
title="Re-run all extractors (overwrite existing)"
|
||||
onmouseover="this.style.background='#fef3c7';"
|
||||
onmouseout="this.style.background='#fffbeb';">
|
||||
🔄 Redo All
|
||||
</a>
|
||||
<a class="btn" style="display: inline-flex; align-items: center; gap: 6px; padding: 10px 16px; background: #fef2f2; border: 1px solid #fecaca; border-radius: 8px; color: #991b1b; text-decoration: none; font-size: 14px; font-weight: 500; transition: all 0.15s;"
|
||||
href="/admin/core/snapshot/?id__exact={}"
|
||||
title="Permanently delete this snapshot"
|
||||
onmouseover="this.style.background='#fee2e2';"
|
||||
onmouseout="this.style.background='#fef2f2';">
|
||||
☠️ Delete
|
||||
</a>
|
||||
</div>
|
||||
<p style="margin-top: 12px; font-size: 12px; color: #64748b;">
|
||||
<b>Tip:</b> Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
|
||||
</p>
|
||||
''',
|
||||
obj.timestamp,
|
||||
obj.timestamp,
|
||||
obj.url,
|
||||
obj.pk,
|
||||
obj.pk,
|
||||
obj.pk,
|
||||
obj.pk,
|
||||
)
|
||||
|
||||
|
||||
@@ -0,0 +1,101 @@
|
||||
# Generated by Django 6.0 on 2025-12-25 09:34
|
||||
|
||||
import archivebox.base_models.models
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
import uuid
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0025_allow_duplicate_urls_per_crawl'),
|
||||
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='output_dir',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='snapshot',
|
||||
name='output_dir',
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='extractor',
|
||||
field=models.CharField(db_index=True, max_length=32),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='id',
|
||||
field=models.AutoField(editable=False, primary_key=True, serialize=False),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='status',
|
||||
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='bookmarked_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='downloaded_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
# migrations.AlterField(
|
||||
# model_name='snapshot',
|
||||
# name='tags',
|
||||
# field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
|
||||
# ),
|
||||
migrations.AlterField(
|
||||
model_name='snapshottag',
|
||||
name='id',
|
||||
field=models.AutoField(primary_key=True, serialize=False),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='tag',
|
||||
name='created_by',
|
||||
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
|
||||
),
|
||||
migrations.AlterUniqueTogether(
|
||||
name='snapshottag',
|
||||
unique_together={('snapshot', 'tag')},
|
||||
),
|
||||
]
|
||||
@@ -59,7 +59,7 @@ INSTALLED_APPS = [
|
||||
"config", # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
|
||||
"machine", # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
|
||||
"workers", # handles starting and managing background workers and processes (orchestrators and actors)
|
||||
"crawls", # handles Seed, Crawl, and CrawlSchedule models and management
|
||||
"crawls", # handles Crawl and CrawlSchedule models and management
|
||||
"personas", # handles Persona and session management
|
||||
"core", # core django model with Snapshot, ArchiveResult, etc.
|
||||
"api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
|
||||
@@ -194,10 +194,6 @@ DATABASES = {
|
||||
"NAME": DATABASE_NAME,
|
||||
**SQLITE_CONNECTION_OPTIONS,
|
||||
},
|
||||
"queue": {
|
||||
"NAME": CONSTANTS.QUEUE_DATABASE_FILE,
|
||||
**SQLITE_CONNECTION_OPTIONS,
|
||||
},
|
||||
# "filestore": {
|
||||
# "NAME": CONSTANTS.FILESTORE_DATABASE_FILE,
|
||||
# **SQLITE_CONNECTION_OPTIONS,
|
||||
|
||||
@@ -2,8 +2,6 @@ __package__ = 'archivebox.core'
|
||||
|
||||
import re
|
||||
import os
|
||||
|
||||
import shutil
|
||||
import tempfile
|
||||
import logging
|
||||
|
||||
@@ -11,7 +9,6 @@ import pydantic
|
||||
import django.template
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.misc.logging import IS_TTY
|
||||
|
||||
|
||||
IGNORABLE_URL_PATTERNS = [
|
||||
@@ -79,7 +76,6 @@ SETTINGS_LOGGING = {
|
||||
"formatters": {
|
||||
"rich": {
|
||||
"datefmt": "[%Y-%m-%d %H:%M:%S]",
|
||||
# "format": "{asctime} {levelname} {module} {name} {message} {username}",
|
||||
"format": "%(name)s %(message)s",
|
||||
},
|
||||
"outbound_webhooks": {
|
||||
@@ -99,26 +95,13 @@ SETTINGS_LOGGING = {
|
||||
},
|
||||
},
|
||||
"handlers": {
|
||||
# "console": {
|
||||
# "level": "DEBUG",
|
||||
# 'formatter': 'simple',
|
||||
# "class": "logging.StreamHandler",
|
||||
# 'filters': ['noisyrequestsfilter', 'add_extra_logging_attrs'],
|
||||
# },
|
||||
"default": {
|
||||
"class": "rich.logging.RichHandler",
|
||||
"formatter": "rich",
|
||||
"level": "DEBUG",
|
||||
"markup": False,
|
||||
"rich_tracebacks": IS_TTY,
|
||||
"rich_tracebacks": False, # Use standard Python tracebacks (no frame/box)
|
||||
"filters": ["noisyrequestsfilter"],
|
||||
"tracebacks_suppress": [
|
||||
django,
|
||||
pydantic,
|
||||
],
|
||||
"tracebacks_width": shutil.get_terminal_size((100, 10)).columns - 1,
|
||||
"tracebacks_word_wrap": False,
|
||||
"tracebacks_show_locals": False,
|
||||
},
|
||||
"logfile": {
|
||||
"level": "INFO",
|
||||
@@ -132,7 +115,7 @@ SETTINGS_LOGGING = {
|
||||
"outbound_webhooks": {
|
||||
"class": "rich.logging.RichHandler",
|
||||
"markup": False,
|
||||
"rich_tracebacks": True,
|
||||
"rich_tracebacks": False, # Use standard Python tracebacks (no frame/box)
|
||||
"formatter": "outbound_webhooks",
|
||||
},
|
||||
# "mail_admins": {
|
||||
|
||||
@@ -15,7 +15,7 @@ from statemachine import State, StateMachine
|
||||
# from workers.actor import ActorType
|
||||
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
from crawls.models import Crawl, Seed
|
||||
from crawls.models import Crawl
|
||||
|
||||
|
||||
class SnapshotMachine(StateMachine, strict_states=True):
|
||||
@@ -247,17 +247,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
)
|
||||
self.archiveresult.save(write_indexes=True)
|
||||
|
||||
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed
|
||||
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
|
||||
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
||||
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
||||
|
||||
# Also update Crawl and Seed health stats if snapshot has a crawl
|
||||
# Also update Crawl health stats if snapshot has a crawl
|
||||
snapshot = self.archiveresult.snapshot
|
||||
if snapshot.crawl_id:
|
||||
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
||||
crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first()
|
||||
if crawl:
|
||||
Seed.objects.filter(pk=crawl).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
|
||||
|
||||
@failed.enter
|
||||
def enter_failed(self):
|
||||
@@ -268,17 +265,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
end_ts=timezone.now(),
|
||||
)
|
||||
|
||||
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed
|
||||
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
|
||||
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_failed=F('num_uses_failed') + 1)
|
||||
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_failed=F('num_uses_failed') + 1)
|
||||
|
||||
# Also update Crawl and Seed health stats if snapshot has a crawl
|
||||
# Also update Crawl health stats if snapshot has a crawl
|
||||
snapshot = self.archiveresult.snapshot
|
||||
if snapshot.crawl_id:
|
||||
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_failed=F('num_uses_failed') + 1)
|
||||
crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first()
|
||||
if crawl:
|
||||
Seed.objects.filter(pk=crawl).update(num_uses_failed=F('num_uses_failed') + 1)
|
||||
|
||||
@skipped.enter
|
||||
def enter_skipped(self):
|
||||
|
||||
@@ -33,7 +33,7 @@ from archivebox.search import query_search_index
|
||||
|
||||
from core.models import Snapshot
|
||||
from core.forms import AddLinkForm
|
||||
from crawls.models import Seed, Crawl
|
||||
from crawls.models import Crawl
|
||||
from archivebox.hooks import get_extractors, get_extractor_name
|
||||
|
||||
|
||||
@@ -119,7 +119,11 @@ class SnapshotView(View):
|
||||
if result_file.name in existing_files or result_file.name == 'index.html':
|
||||
continue
|
||||
|
||||
file_size = result_file.stat().st_size or 0
|
||||
# Skip circular symlinks and other stat() failures
|
||||
try:
|
||||
file_size = result_file.stat().st_size or 0
|
||||
except OSError:
|
||||
continue
|
||||
|
||||
if file_size > min_size_threshold:
|
||||
archiveresults[result_file.name] = {
|
||||
@@ -471,14 +475,16 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
|
||||
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
||||
|
||||
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
|
||||
# 2. create a new Crawl with the URLs from the file
|
||||
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||
seed = Seed.from_file(
|
||||
sources_file,
|
||||
urls_content = sources_file.read_text()
|
||||
crawl = Crawl.objects.create(
|
||||
urls=urls_content,
|
||||
extractor=parser,
|
||||
max_depth=depth,
|
||||
tags_str=tag,
|
||||
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
|
||||
parser=parser,
|
||||
tag=tag,
|
||||
created_by=self.request.user.pk,
|
||||
created_by_id=self.request.user.pk,
|
||||
config={
|
||||
# 'ONLY_NEW': not update,
|
||||
# 'INDEX_ONLY': index_only,
|
||||
@@ -486,9 +492,8 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
'DEPTH': depth,
|
||||
'EXTRACTORS': extractors or '',
|
||||
# 'DEFAULT_PERSONA': persona or 'Default',
|
||||
})
|
||||
# 3. create a new Crawl pointing to the Seed
|
||||
crawl = Crawl.from_seed(seed, max_depth=depth)
|
||||
}
|
||||
)
|
||||
|
||||
# 4. start the Orchestrator & wait until it completes
|
||||
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
|
||||
@@ -569,19 +574,7 @@ def live_progress_view(request):
|
||||
# Count URLs in the crawl (for when snapshots haven't been created yet)
|
||||
urls_count = 0
|
||||
if crawl.urls:
|
||||
urls_count = len([u for u in crawl.urls.split('\n') if u.strip()])
|
||||
elif crawl.seed and crawl.seed.uri:
|
||||
# Try to get URL count from seed
|
||||
if crawl.seed.uri.startswith('file:///'):
|
||||
try:
|
||||
from pathlib import Path
|
||||
seed_file = Path(crawl.seed.uri.replace('file://', ''))
|
||||
if seed_file.exists():
|
||||
urls_count = len([l for l in seed_file.read_text().split('\n') if l.strip() and not l.startswith('#')])
|
||||
except:
|
||||
pass
|
||||
else:
|
||||
urls_count = 1 # Single URL seed
|
||||
urls_count = len([u for u in crawl.urls.split('\n') if u.strip() and not u.startswith('#')])
|
||||
|
||||
# Calculate crawl progress
|
||||
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
|
||||
@@ -635,8 +628,8 @@ def live_progress_view(request):
|
||||
})
|
||||
|
||||
# Check if crawl can start (for debugging stuck crawls)
|
||||
can_start = bool(crawl.seed and crawl.seed.uri)
|
||||
seed_uri = crawl.seed.uri[:60] if crawl.seed and crawl.seed.uri else None
|
||||
can_start = bool(crawl.urls)
|
||||
urls_preview = crawl.urls[:60] if crawl.urls else None
|
||||
|
||||
# Check if retry_at is in the future (would prevent worker from claiming)
|
||||
retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
|
||||
@@ -657,7 +650,7 @@ def live_progress_view(request):
|
||||
'pending_snapshots': pending_snapshots,
|
||||
'active_snapshots': active_snapshots_for_crawl,
|
||||
'can_start': can_start,
|
||||
'seed_uri': seed_uri,
|
||||
'urls_preview': urls_preview,
|
||||
'retry_at_future': retry_at_future,
|
||||
'seconds_until_retry': seconds_until_retry,
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user