mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-04 23:07:56 +10:00
wip
This commit is contained in:
@@ -4,7 +4,7 @@ __order__ = 100
|
||||
|
||||
def register_admin(admin_site):
|
||||
"""Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
|
||||
from core.admin import register_admin as do_register
|
||||
from archivebox.core.admin import register_admin as do_register
|
||||
do_register(admin_site)
|
||||
|
||||
|
||||
|
||||
@@ -3,11 +3,11 @@ __package__ = 'archivebox.core'
|
||||
from django.contrib.auth import get_user_model
|
||||
|
||||
|
||||
from core.models import Snapshot, ArchiveResult, Tag
|
||||
from core.admin_tags import TagAdmin
|
||||
from core.admin_snapshots import SnapshotAdmin
|
||||
from core.admin_archiveresults import ArchiveResultAdmin
|
||||
from core.admin_users import UserAdmin
|
||||
from archivebox.core.models import Snapshot, ArchiveResult, Tag
|
||||
from archivebox.core.admin_tags import TagAdmin
|
||||
from archivebox.core.admin_snapshots import SnapshotAdmin
|
||||
from archivebox.core.admin_archiveresults import ArchiveResultAdmin
|
||||
from archivebox.core.admin_users import UserAdmin
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
|
||||
@@ -16,7 +16,7 @@ from archivebox.base_models.admin import BaseModelAdmin
|
||||
from archivebox.hooks import get_plugin_icon
|
||||
|
||||
|
||||
from core.models import ArchiveResult, Snapshot
|
||||
from archivebox.core.models import ArchiveResult, Snapshot
|
||||
|
||||
|
||||
def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
@@ -187,7 +187,7 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
extra = 0
|
||||
sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
|
||||
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
|
||||
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'retry_at', 'output_str')
|
||||
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'retry_at', 'output_str')
|
||||
# exclude = ('id',)
|
||||
ordering = ('end_ts',)
|
||||
show_change_link = True
|
||||
@@ -229,17 +229,15 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
formset.form.base_fields['end_ts'].initial = timezone.now()
|
||||
formset.form.base_fields['cmd_version'].initial = '-'
|
||||
formset.form.base_fields['pwd'].initial = str(snapshot.output_dir)
|
||||
formset.form.base_fields['created_by'].initial = request.user
|
||||
formset.form.base_fields['cmd'].initial = '["-"]'
|
||||
formset.form.base_fields['output_str'].initial = 'Manually recorded cmd output...'
|
||||
|
||||
|
||||
if obj is not None:
|
||||
# hidden values for existing entries and new entries
|
||||
formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
|
||||
formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
|
||||
formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
|
||||
formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
|
||||
formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
|
||||
formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
|
||||
return formset
|
||||
|
||||
@@ -252,8 +250,8 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
|
||||
|
||||
class ArchiveResultAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
|
||||
sort_fields = ('id', 'created_by', 'created_at', 'plugin', 'status')
|
||||
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
|
||||
sort_fields = ('id', 'created_at', 'plugin', 'status')
|
||||
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'iface')
|
||||
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
||||
autocomplete_fields = ['snapshot']
|
||||
@@ -279,10 +277,6 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
'fields': ('output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', 'output_summary'),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Metadata', {
|
||||
'fields': ('created_by',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('status', 'plugin', 'start_ts', 'cmd_version')
|
||||
|
||||
@@ -38,11 +38,11 @@ def register_admin_site():
|
||||
|
||||
# Register admin views for each app
|
||||
# (Previously handled by ABX plugin system, now called directly)
|
||||
from core.admin import register_admin as register_core_admin
|
||||
from crawls.admin import register_admin as register_crawls_admin
|
||||
from api.admin import register_admin as register_api_admin
|
||||
from machine.admin import register_admin as register_machine_admin
|
||||
from workers.admin import register_admin as register_workers_admin
|
||||
from archivebox.core.admin import register_admin as register_core_admin
|
||||
from archivebox.crawls.admin import register_admin as register_crawls_admin
|
||||
from archivebox.api.admin import register_admin as register_api_admin
|
||||
from archivebox.machine.admin import register_admin as register_machine_admin
|
||||
from archivebox.workers.admin import register_admin as register_workers_admin
|
||||
|
||||
register_core_admin(archivebox_admin)
|
||||
register_crawls_admin(archivebox_admin)
|
||||
|
||||
@@ -23,9 +23,9 @@ from archivebox.search.admin import SearchResultsAdminMixin
|
||||
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
|
||||
|
||||
from core.models import Tag, Snapshot
|
||||
from core.admin_tags import TagInline
|
||||
from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
|
||||
from archivebox.core.models import Tag, Snapshot
|
||||
from archivebox.core.admin_tags import TagInline
|
||||
from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
|
||||
|
||||
|
||||
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
|
||||
@@ -59,7 +59,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
|
||||
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
|
||||
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
|
||||
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
|
||||
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', 'tags__name')
|
||||
|
||||
fieldsets = (
|
||||
('URL', {
|
||||
@@ -75,7 +75,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Relations', {
|
||||
'fields': ('crawl', 'created_by', 'tags_str'),
|
||||
'fields': ('crawl', 'tags_str'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Config', {
|
||||
|
||||
@@ -6,7 +6,7 @@ from django.utils.html import format_html, mark_safe
|
||||
from archivebox.misc.paginators import AccelleratedPaginator
|
||||
from archivebox.base_models.admin import BaseModelAdmin
|
||||
|
||||
from core.models import Tag
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
|
||||
class TagInline(admin.TabularInline):
|
||||
|
||||
@@ -4,9 +4,9 @@ from django.apps import AppConfig
|
||||
|
||||
|
||||
class CoreConfig(AppConfig):
|
||||
name = 'core'
|
||||
name = 'archivebox.core'
|
||||
|
||||
def ready(self):
|
||||
"""Register the archivebox.core.admin_site as the main django admin site"""
|
||||
from core.admin_site import register_admin_site
|
||||
from archivebox.core.admin_site import register_admin_site
|
||||
register_admin_site()
|
||||
|
||||
@@ -20,7 +20,7 @@ application = get_asgi_application()
|
||||
# from channels.routing import ProtocolTypeRouter, URLRouter
|
||||
# from channels.auth import AuthMiddlewareStack
|
||||
# from channels.security.websocket import AllowedHostsOriginValidator
|
||||
# from core.routing import websocket_urlpatterns
|
||||
# from archivebox.core.routing import websocket_urlpatterns
|
||||
#
|
||||
# application = ProtocolTypeRouter({
|
||||
# "http": get_asgi_application(),
|
||||
|
||||
@@ -4,10 +4,14 @@ from django import forms
|
||||
|
||||
from archivebox.misc.util import URL_REGEX
|
||||
from taggit.utils import edit_string_for_tags, parse_tags
|
||||
from archivebox.base_models.admin import KeyValueWidget
|
||||
|
||||
DEPTH_CHOICES = (
|
||||
('0', 'depth = 0 (archive just these URLs)'),
|
||||
('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
|
||||
('1', 'depth = 1 (+ URLs one hop away)'),
|
||||
('2', 'depth = 2 (+ URLs two hops away)'),
|
||||
('3', 'depth = 3 (+ URLs three hops away)'),
|
||||
('4', 'depth = 4 (+ URLs four hops away)'),
|
||||
)
|
||||
|
||||
from archivebox.hooks import get_plugins
|
||||
@@ -18,39 +22,180 @@ def get_plugin_choices():
|
||||
|
||||
|
||||
class AddLinkForm(forms.Form):
|
||||
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
|
||||
tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
|
||||
depth = forms.ChoiceField(label="Archive depth", choices=DEPTH_CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
|
||||
plugins = forms.MultipleChoiceField(
|
||||
label="Plugins (select at least 1, otherwise all will be used by default)",
|
||||
# Basic fields
|
||||
url = forms.RegexField(
|
||||
label="URLs (one per line)",
|
||||
regex=URL_REGEX,
|
||||
min_length='6',
|
||||
strip=True,
|
||||
widget=forms.Textarea,
|
||||
required=True
|
||||
)
|
||||
tag = forms.CharField(
|
||||
label="Tags (comma separated tag1,tag2,tag3)",
|
||||
strip=True,
|
||||
required=False,
|
||||
widget=forms.TextInput(attrs={
|
||||
'list': 'tag-datalist',
|
||||
'autocomplete': 'off',
|
||||
})
|
||||
)
|
||||
depth = forms.ChoiceField(
|
||||
label="Archive depth",
|
||||
choices=DEPTH_CHOICES,
|
||||
initial='0',
|
||||
widget=forms.RadioSelect(attrs={"class": "depth-selection"})
|
||||
)
|
||||
notes = forms.CharField(
|
||||
label="Notes",
|
||||
strip=True,
|
||||
required=False,
|
||||
widget=forms.Textarea(attrs={
|
||||
'rows': 3,
|
||||
'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
|
||||
})
|
||||
)
|
||||
|
||||
# Plugin groups
|
||||
chrome_plugins = forms.MultipleChoiceField(
|
||||
label="Chrome-dependent plugins",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[], # populated in __init__
|
||||
)
|
||||
archiving_plugins = forms.MultipleChoiceField(
|
||||
label="Archiving",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[],
|
||||
)
|
||||
parsing_plugins = forms.MultipleChoiceField(
|
||||
label="Parsing",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[],
|
||||
)
|
||||
search_plugins = forms.MultipleChoiceField(
|
||||
label="Search",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[],
|
||||
)
|
||||
binary_plugins = forms.MultipleChoiceField(
|
||||
label="Binary providers",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[],
|
||||
)
|
||||
extension_plugins = forms.MultipleChoiceField(
|
||||
label="Browser extensions",
|
||||
required=False,
|
||||
widget=forms.CheckboxSelectMultiple,
|
||||
choices=[],
|
||||
)
|
||||
|
||||
# Advanced options
|
||||
schedule = forms.CharField(
|
||||
label="Repeat schedule",
|
||||
max_length=64,
|
||||
required=False,
|
||||
widget=forms.TextInput(attrs={
|
||||
'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
|
||||
})
|
||||
)
|
||||
persona = forms.CharField(
|
||||
label="Persona (authentication profile)",
|
||||
max_length=100,
|
||||
initial='Default',
|
||||
required=False,
|
||||
)
|
||||
overwrite = forms.BooleanField(
|
||||
label="Overwrite existing snapshots",
|
||||
initial=False,
|
||||
required=False,
|
||||
)
|
||||
update = forms.BooleanField(
|
||||
label="Update/retry previously failed URLs",
|
||||
initial=False,
|
||||
required=False,
|
||||
)
|
||||
index_only = forms.BooleanField(
|
||||
label="Index only (don't archive yet)",
|
||||
initial=False,
|
||||
required=False,
|
||||
)
|
||||
config = forms.JSONField(
|
||||
label="Custom config overrides",
|
||||
widget=KeyValueWidget(),
|
||||
initial=dict,
|
||||
required=False,
|
||||
widget=forms.SelectMultiple,
|
||||
choices=[], # populated dynamically in __init__
|
||||
)
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.fields['plugins'].choices = get_plugin_choices()
|
||||
# TODO: hook these up to the view and put them
|
||||
# in a collapsible UI section labeled "Advanced"
|
||||
#
|
||||
# exclude_patterns = forms.CharField(
|
||||
# label="Exclude patterns",
|
||||
# min_length='1',
|
||||
# required=False,
|
||||
# initial=URL_DENYLIST,
|
||||
# )
|
||||
# timeout = forms.IntegerField(
|
||||
# initial=TIMEOUT,
|
||||
# )
|
||||
# overwrite = forms.BooleanField(
|
||||
# label="Overwrite any existing Snapshots",
|
||||
# initial=False,
|
||||
# )
|
||||
# index_only = forms.BooleanField(
|
||||
# label="Add URLs to index without Snapshotting",
|
||||
# initial=False,
|
||||
# )
|
||||
|
||||
# Import at runtime to avoid circular imports
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
# Get all plugins
|
||||
all_plugins = get_plugins()
|
||||
|
||||
# Define plugin groups
|
||||
chrome_dependent = {
|
||||
'accessibility', 'chrome', 'consolelog', 'dom', 'headers',
|
||||
'parse_dom_outlinks', 'pdf', 'redirects', 'responses',
|
||||
'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
|
||||
}
|
||||
archiving = {
|
||||
'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
|
||||
'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget'
|
||||
}
|
||||
parsing = {
|
||||
'parse_html_urls', 'parse_jsonl_urls',
|
||||
'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls'
|
||||
}
|
||||
search = {
|
||||
'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
|
||||
}
|
||||
binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
|
||||
extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
|
||||
|
||||
# Populate plugin field choices
|
||||
self.fields['chrome_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in chrome_dependent
|
||||
]
|
||||
self.fields['archiving_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in archiving
|
||||
]
|
||||
self.fields['parsing_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in parsing
|
||||
]
|
||||
self.fields['search_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in search
|
||||
]
|
||||
self.fields['binary_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in binary
|
||||
]
|
||||
self.fields['extension_plugins'].choices = [
|
||||
(p, p) for p in sorted(all_plugins) if p in extensions
|
||||
]
|
||||
|
||||
# Set update default from config
|
||||
self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW
|
||||
|
||||
def clean(self):
|
||||
cleaned_data = super().clean()
|
||||
|
||||
# Combine all plugin groups into single list
|
||||
all_selected_plugins = []
|
||||
for field in ['chrome_plugins', 'archiving_plugins', 'parsing_plugins',
|
||||
'search_plugins', 'binary_plugins', 'extension_plugins']:
|
||||
all_selected_plugins.extend(cleaned_data.get(field, []))
|
||||
|
||||
# Store combined list for easy access
|
||||
cleaned_data['plugins'] = all_selected_plugins
|
||||
|
||||
return cleaned_data
|
||||
|
||||
class TagWidgetMixin:
|
||||
def format_value(self, value):
|
||||
|
||||
@@ -12,7 +12,7 @@ try:
|
||||
ARCHIVE_DIR = CONSTANTS.ARCHIVE_DIR
|
||||
except ImportError:
|
||||
try:
|
||||
from config import CONFIG
|
||||
from archivebox.config import CONFIG
|
||||
ARCHIVE_DIR = Path(CONFIG.get('ARCHIVE_DIR', './archive'))
|
||||
except ImportError:
|
||||
ARCHIVE_DIR = Path('./archive')
|
||||
|
||||
@@ -11,7 +11,7 @@ class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
('core', '0031_snapshot_parent_snapshot'),
|
||||
('crawls', '0004_alter_crawl_output_dir'),
|
||||
('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
|
||||
('machine', '0004_drop_dependency_table'), # Changed from 0003 - wait until Dependency is dropped
|
||||
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
|
||||
]
|
||||
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
# Generated migration
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
def create_catchall_crawls_and_assign_snapshots(apps, schema_editor):
|
||||
"""
|
||||
Create one catchall Crawl per user for all snapshots without a crawl.
|
||||
Assign those snapshots to their user's catchall crawl.
|
||||
"""
|
||||
Snapshot = apps.get_model('core', 'Snapshot')
|
||||
Crawl = apps.get_model('crawls', 'Crawl')
|
||||
User = apps.get_model(settings.AUTH_USER_MODEL)
|
||||
|
||||
# Get all snapshots without a crawl
|
||||
snapshots_without_crawl = Snapshot.objects.filter(crawl__isnull=True)
|
||||
|
||||
if not snapshots_without_crawl.exists():
|
||||
return
|
||||
|
||||
# Group by created_by_id
|
||||
snapshots_by_user = {}
|
||||
for snapshot in snapshots_without_crawl:
|
||||
user_id = snapshot.created_by_id
|
||||
if user_id not in snapshots_by_user:
|
||||
snapshots_by_user[user_id] = []
|
||||
snapshots_by_user[user_id].append(snapshot)
|
||||
|
||||
# Create one catchall crawl per user and assign snapshots
|
||||
for user_id, snapshots in snapshots_by_user.items():
|
||||
try:
|
||||
user = User.objects.get(pk=user_id)
|
||||
username = user.username
|
||||
except User.DoesNotExist:
|
||||
username = 'unknown'
|
||||
|
||||
# Create catchall crawl for this user
|
||||
crawl = Crawl.objects.create(
|
||||
urls=f'# Catchall crawl for {len(snapshots)} snapshots without a crawl',
|
||||
max_depth=0,
|
||||
label=f'[migration] catchall for user {username}',
|
||||
created_by_id=user_id,
|
||||
)
|
||||
|
||||
# Assign all snapshots to this crawl
|
||||
for snapshot in snapshots:
|
||||
snapshot.crawl = crawl
|
||||
snapshot.save(update_fields=['crawl'])
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0034_snapshot_current_step'),
|
||||
('crawls', '0004_alter_crawl_output_dir'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
# Step 1: Assign all snapshots without a crawl to catchall crawls
|
||||
migrations.RunPython(
|
||||
create_catchall_crawls_and_assign_snapshots,
|
||||
reverse_code=migrations.RunPython.noop,
|
||||
),
|
||||
|
||||
# Step 2: Make crawl non-nullable
|
||||
migrations.AlterField(
|
||||
model_name='snapshot',
|
||||
name='crawl',
|
||||
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
|
||||
),
|
||||
|
||||
# Step 3: Remove created_by field
|
||||
migrations.RemoveField(
|
||||
model_name='snapshot',
|
||||
name='created_by',
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,19 @@
|
||||
# Generated migration
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0035_snapshot_crawl_non_nullable_remove_created_by'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
# Remove created_by field from ArchiveResult
|
||||
# No data migration needed - created_by can be accessed via snapshot.crawl.created_by
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='created_by',
|
||||
),
|
||||
]
|
||||
@@ -9,6 +9,8 @@ import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from statemachine import State, registry
|
||||
|
||||
from django.db import models
|
||||
from django.db.models import QuerySet, Value, Case, When, IntegerField
|
||||
from django.utils.functional import cached_property
|
||||
@@ -33,10 +35,10 @@ from archivebox.base_models.models import (
|
||||
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
|
||||
get_or_create_system_user_pk,
|
||||
)
|
||||
from workers.models import ModelWithStateMachine
|
||||
from workers.tasks import bg_archive_snapshot
|
||||
from crawls.models import Crawl
|
||||
from machine.models import NetworkInterface, Binary
|
||||
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
|
||||
from archivebox.workers.tasks import bg_archive_snapshot
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.machine.models import NetworkInterface, Binary
|
||||
|
||||
|
||||
|
||||
@@ -53,6 +55,7 @@ class Tag(ModelWithSerializers):
|
||||
snapshot_set: models.Manager['Snapshot']
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
app_label = 'core'
|
||||
verbose_name = "Tag"
|
||||
verbose_name_plural = "Tags"
|
||||
|
||||
@@ -122,6 +125,7 @@ class SnapshotTag(models.Model):
|
||||
tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
|
||||
|
||||
class Meta:
|
||||
app_label = 'core'
|
||||
db_table = 'core_snapshot_tags'
|
||||
unique_together = [('snapshot', 'tag')]
|
||||
|
||||
@@ -263,52 +267,6 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
|
||||
# Import Methods
|
||||
# =========================================================================
|
||||
|
||||
def create_or_update_from_dict(self, link_dict: Dict[str, Any], created_by_id: Optional[int] = None) -> 'Snapshot':
|
||||
"""Create or update a Snapshot from a SnapshotDict (parser output)"""
|
||||
import re
|
||||
from archivebox.config.common import GENERAL_CONFIG
|
||||
|
||||
url = link_dict['url']
|
||||
timestamp = link_dict.get('timestamp')
|
||||
title = link_dict.get('title')
|
||||
tags_str = link_dict.get('tags')
|
||||
|
||||
tag_list = []
|
||||
if tags_str:
|
||||
tag_list = list(dict.fromkeys(
|
||||
tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
|
||||
if tag.strip()
|
||||
))
|
||||
|
||||
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
|
||||
snapshot = self.filter(url=url).order_by('-created_at').first()
|
||||
if snapshot:
|
||||
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
|
||||
snapshot.title = title
|
||||
snapshot.save(update_fields=['title', 'modified_at'])
|
||||
else:
|
||||
if timestamp:
|
||||
while self.filter(timestamp=timestamp).exists():
|
||||
timestamp = str(float(timestamp) + 1.0)
|
||||
|
||||
snapshot = self.create(
|
||||
url=url,
|
||||
timestamp=timestamp,
|
||||
title=title,
|
||||
created_by_id=created_by_id or get_or_create_system_user_pk(),
|
||||
)
|
||||
|
||||
if tag_list:
|
||||
existing_tags = set(snapshot.tags.values_list('name', flat=True))
|
||||
new_tags = set(tag_list) | existing_tags
|
||||
snapshot.save_tags(new_tags)
|
||||
|
||||
return snapshot
|
||||
|
||||
def create_from_dicts(self, link_dicts: List[Dict[str, Any]], created_by_id: Optional[int] = None) -> List['Snapshot']:
|
||||
"""Create or update multiple Snapshots from a list of SnapshotDicts"""
|
||||
return [self.create_or_update_from_dict(d, created_by_id=created_by_id) for d in link_dicts]
|
||||
|
||||
def remove(self, atomic: bool = False) -> tuple:
|
||||
"""Remove snapshots from the database"""
|
||||
from django.db import transaction
|
||||
@@ -320,14 +278,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
|
||||
|
||||
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='snapshot_set', db_index=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls
|
||||
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
|
||||
bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore
|
||||
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name='snapshot_set', db_index=True) # type: ignore[assignment]
|
||||
parent_snapshot = models.ForeignKey('self', on_delete=models.SET_NULL, null=True, blank=True, related_name='child_snapshots', db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)')
|
||||
|
||||
title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
|
||||
@@ -344,7 +301,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
|
||||
|
||||
state_machine_name = 'core.statemachines.SnapshotMachine'
|
||||
state_machine_name = 'core.models.SnapshotMachine'
|
||||
state_field_name = 'status'
|
||||
retry_at_field_name = 'retry_at'
|
||||
StatusChoices = ModelWithStateMachine.StatusChoices
|
||||
@@ -354,6 +311,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
archiveresult_set: models.Manager['ArchiveResult']
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
app_label = 'core'
|
||||
verbose_name = "Snapshot"
|
||||
verbose_name_plural = "Snapshots"
|
||||
constraints = [
|
||||
@@ -366,6 +324,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
def __str__(self):
|
||||
return f'[{self.id}] {self.url[:64]}'
|
||||
|
||||
@property
|
||||
def created_by(self):
|
||||
"""Convenience property to access the user who created this snapshot via its crawl."""
|
||||
return self.crawl.created_by
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
if not self.bookmarked_at:
|
||||
@@ -395,7 +358,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
self.fs_version = target
|
||||
|
||||
super().save(*args, **kwargs)
|
||||
if self.crawl and self.url not in self.crawl.urls:
|
||||
if self.url not in self.crawl.urls:
|
||||
self.crawl.urls += f'\n{self.url}'
|
||||
self.crawl.save()
|
||||
|
||||
@@ -408,7 +371,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
url=self.url,
|
||||
metadata={
|
||||
'id': str(self.id),
|
||||
'crawl_id': str(self.crawl_id) if self.crawl_id else None,
|
||||
'crawl_id': str(self.crawl_id),
|
||||
'depth': self.depth,
|
||||
'status': self.status,
|
||||
},
|
||||
@@ -437,20 +400,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
return self.fs_version != self._fs_current_version()
|
||||
|
||||
def _fs_next_version(self, version: str) -> str:
|
||||
"""Get next version in migration chain"""
|
||||
chain = ['0.7.0', '0.8.0', '0.9.0']
|
||||
try:
|
||||
idx = chain.index(version)
|
||||
return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
|
||||
except ValueError:
|
||||
# Unknown version - skip to current
|
||||
return self._fs_current_version()
|
||||
|
||||
def _fs_migrate_from_0_7_0_to_0_8_0(self):
|
||||
"""Migration from 0.7.0 to 0.8.0 layout (no-op)"""
|
||||
# 0.7 and 0.8 both used archive/<timestamp>
|
||||
# Nothing to do!
|
||||
pass
|
||||
"""Get next version in migration chain (0.7/0.8 had same layout, only 0.8→0.9 migration needed)"""
|
||||
# Treat 0.7.0 and 0.8.0 as equivalent (both used archive/{timestamp})
|
||||
if version in ('0.7.0', '0.8.0'):
|
||||
return '0.9.0'
|
||||
return self._fs_current_version()
|
||||
|
||||
def _fs_migrate_from_0_8_0_to_0_9_0(self):
|
||||
"""
|
||||
@@ -578,7 +532,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
return CONSTANTS.ARCHIVE_DIR / self.timestamp
|
||||
|
||||
elif version in ('0.9.0', '1.0.0'):
|
||||
username = self.created_by.username if self.created_by else 'unknown'
|
||||
username = self.created_by.username
|
||||
|
||||
# Use created_at for date grouping (fallback to timestamp)
|
||||
if self.created_at:
|
||||
@@ -875,7 +829,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
pwd=result_data.get('pwd', str(self.output_dir)),
|
||||
start_ts=start_ts,
|
||||
end_ts=end_ts,
|
||||
created_by=self.created_by,
|
||||
)
|
||||
except:
|
||||
pass
|
||||
@@ -1069,6 +1022,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
result = archive_results.get(plugin)
|
||||
existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
|
||||
icon = get_plugin_icon(plugin)
|
||||
|
||||
# Skip plugins with empty icons that have no output
|
||||
# (e.g., staticfile only shows when there's actual output)
|
||||
if not icon.strip() and not existing:
|
||||
continue
|
||||
|
||||
output += format_html(
|
||||
output_template,
|
||||
path,
|
||||
@@ -1139,9 +1098,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
def run(self) -> list['ArchiveResult']:
|
||||
"""
|
||||
Execute this Snapshot by creating ArchiveResults for all enabled extractors.
|
||||
Execute snapshot by creating pending ArchiveResults for all enabled hooks.
|
||||
|
||||
Called by the state machine when entering the 'started' state.
|
||||
Called by: SnapshotMachine.enter_started()
|
||||
|
||||
Hook Lifecycle:
|
||||
1. discover_hooks('Snapshot') → finds all plugin hooks
|
||||
2. For each hook:
|
||||
- Create ArchiveResult with status=QUEUED
|
||||
- Store hook_name (e.g., 'on_Snapshot__50_wget.py')
|
||||
3. ArchiveResults execute independently via ArchiveResultMachine
|
||||
4. Hook execution happens in ArchiveResult.run(), NOT here
|
||||
|
||||
Returns:
|
||||
list[ArchiveResult]: Newly created pending results
|
||||
"""
|
||||
return self.create_pending_archiveresults()
|
||||
|
||||
@@ -1152,28 +1122,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
Called by the state machine when entering the 'sealed' state.
|
||||
Kills any background hooks and finalizes their ArchiveResults.
|
||||
"""
|
||||
from pathlib import Path
|
||||
from archivebox.hooks import kill_process
|
||||
|
||||
# Kill any background ArchiveResult hooks
|
||||
if not self.OUTPUT_DIR.exists():
|
||||
return
|
||||
|
||||
for plugin_dir in self.OUTPUT_DIR.iterdir():
|
||||
if not plugin_dir.is_dir():
|
||||
continue
|
||||
pid_file = plugin_dir / 'hook.pid'
|
||||
if pid_file.exists():
|
||||
kill_process(pid_file, validate=True) # Use validation
|
||||
# Find all .pid files in this snapshot's output directory
|
||||
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
|
||||
kill_process(pid_file, validate=True)
|
||||
|
||||
# Update the ArchiveResult from filesystem
|
||||
plugin_name = plugin_dir.name
|
||||
results = self.archiveresult_set.filter(
|
||||
status=ArchiveResult.StatusChoices.STARTED,
|
||||
pwd__contains=plugin_name
|
||||
)
|
||||
for ar in results:
|
||||
ar.update_from_output()
|
||||
# Update all STARTED ArchiveResults from filesystem
|
||||
results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
|
||||
for ar in results:
|
||||
ar.update_from_output()
|
||||
|
||||
def has_running_background_hooks(self) -> bool:
|
||||
"""
|
||||
@@ -1196,51 +1158,156 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
|
||||
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
|
||||
"""
|
||||
Create/update Snapshot from JSONL record.
|
||||
Create/update Snapshot from JSONL record or dict.
|
||||
|
||||
Unified method that handles:
|
||||
- ID-based patching: {"id": "...", "title": "new title"}
|
||||
- URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
|
||||
- Auto-creates Crawl if not provided
|
||||
- Optionally queues for extraction
|
||||
|
||||
Args:
|
||||
record: JSONL record with 'url' field and optional metadata
|
||||
record: Dict with 'url' (for create) or 'id' (for patch), plus other fields
|
||||
overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
|
||||
queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)
|
||||
|
||||
Returns:
|
||||
Snapshot instance or None
|
||||
|
||||
Note:
|
||||
Filtering (depth, URL allowlist/denylist) should be done by caller
|
||||
BEFORE calling this method. This method just creates the snapshot.
|
||||
"""
|
||||
from archivebox.misc.jsonl import get_or_create_snapshot
|
||||
import re
|
||||
from django.utils import timezone
|
||||
from archivebox.misc.util import parse_date
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.config.common import GENERAL_CONFIG
|
||||
|
||||
overrides = overrides or {}
|
||||
|
||||
# If 'id' is provided, lookup and patch that specific snapshot
|
||||
snapshot_id = record.get('id')
|
||||
if snapshot_id:
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
|
||||
# Generically update all fields present in record
|
||||
update_fields = []
|
||||
for field_name, value in record.items():
|
||||
# Skip internal fields
|
||||
if field_name in ('id', 'type'):
|
||||
continue
|
||||
|
||||
# Skip if field doesn't exist on model
|
||||
if not hasattr(snapshot, field_name):
|
||||
continue
|
||||
|
||||
# Special parsing for date fields
|
||||
if field_name in ('bookmarked_at', 'retry_at', 'created_at', 'modified_at'):
|
||||
if value and isinstance(value, str):
|
||||
value = parse_date(value)
|
||||
|
||||
# Update field if value is provided and different
|
||||
if value is not None and getattr(snapshot, field_name) != value:
|
||||
setattr(snapshot, field_name, value)
|
||||
update_fields.append(field_name)
|
||||
|
||||
if update_fields:
|
||||
snapshot.save(update_fields=update_fields + ['modified_at'])
|
||||
|
||||
return snapshot
|
||||
except Snapshot.DoesNotExist:
|
||||
# ID not found, fall through to create-by-URL logic
|
||||
pass
|
||||
|
||||
url = record.get('url')
|
||||
if not url:
|
||||
return None
|
||||
|
||||
# Apply crawl context metadata
|
||||
# Determine or create crawl (every snapshot must have a crawl)
|
||||
crawl = overrides.get('crawl')
|
||||
snapshot = overrides.get('snapshot') # Parent snapshot
|
||||
parent_snapshot = overrides.get('snapshot') # Parent snapshot
|
||||
created_by_id = overrides.get('created_by_id') or (parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk())
|
||||
|
||||
if crawl:
|
||||
record.setdefault('crawl_id', str(crawl.id))
|
||||
record.setdefault('depth', (snapshot.depth + 1 if snapshot else 1))
|
||||
if snapshot:
|
||||
record.setdefault('parent_snapshot_id', str(snapshot.id))
|
||||
# If no crawl provided, inherit from parent or auto-create one
|
||||
if not crawl:
|
||||
if parent_snapshot:
|
||||
# Inherit crawl from parent snapshot
|
||||
crawl = parent_snapshot.crawl
|
||||
else:
|
||||
# Auto-create a single-URL crawl
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.config import CONSTANTS
|
||||
|
||||
try:
|
||||
created_by_id = overrides.get('created_by_id') or (snapshot.created_by_id if snapshot else None)
|
||||
new_snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
|
||||
timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timestamp_str}__auto_crawl.txt'
|
||||
sources_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
sources_file.write_text(url)
|
||||
|
||||
# Queue for extraction
|
||||
new_snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
new_snapshot.retry_at = timezone.now()
|
||||
new_snapshot.save()
|
||||
crawl = Crawl.objects.create(
|
||||
urls=url,
|
||||
max_depth=0,
|
||||
label=f'auto-created for {url[:50]}',
|
||||
created_by_id=created_by_id,
|
||||
)
|
||||
|
||||
return new_snapshot
|
||||
except ValueError:
|
||||
return None
|
||||
# Parse tags
|
||||
tags_str = record.get('tags', '')
|
||||
tag_list = []
|
||||
if tags_str:
|
||||
tag_list = list(dict.fromkeys(
|
||||
tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
|
||||
if tag.strip()
|
||||
))
|
||||
|
||||
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
|
||||
snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()
|
||||
|
||||
title = record.get('title')
|
||||
timestamp = record.get('timestamp')
|
||||
|
||||
if snapshot:
|
||||
# Update existing snapshot
|
||||
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
|
||||
snapshot.title = title
|
||||
snapshot.save(update_fields=['title', 'modified_at'])
|
||||
else:
|
||||
# Create new snapshot
|
||||
if timestamp:
|
||||
while Snapshot.objects.filter(timestamp=timestamp).exists():
|
||||
timestamp = str(float(timestamp) + 1.0)
|
||||
|
||||
snapshot = Snapshot.objects.create(
|
||||
url=url,
|
||||
timestamp=timestamp,
|
||||
title=title,
|
||||
crawl=crawl,
|
||||
)
|
||||
|
||||
# Update tags
|
||||
if tag_list:
|
||||
existing_tags = set(snapshot.tags.values_list('name', flat=True))
|
||||
new_tags = set(tag_list) | existing_tags
|
||||
snapshot.save_tags(new_tags)
|
||||
|
||||
# Queue for extraction and update additional fields
|
||||
update_fields = []
|
||||
|
||||
if queue_for_extraction:
|
||||
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||
snapshot.retry_at = timezone.now()
|
||||
update_fields.extend(['status', 'retry_at'])
|
||||
|
||||
# Update additional fields if provided
|
||||
for field_name in ('depth', 'parent_snapshot_id', 'crawl_id', 'bookmarked_at'):
|
||||
value = record.get(field_name)
|
||||
if value is not None and getattr(snapshot, field_name) != value:
|
||||
setattr(snapshot, field_name, value)
|
||||
update_fields.append(field_name)
|
||||
|
||||
if update_fields:
|
||||
snapshot.save(update_fields=update_fields + ['modified_at'])
|
||||
|
||||
return snapshot
|
||||
|
||||
def create_pending_archiveresults(self) -> list['ArchiveResult']:
|
||||
"""
|
||||
@@ -1273,7 +1340,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
'plugin': plugin,
|
||||
'status': ArchiveResult.INITIAL_STATE,
|
||||
'retry_at': timezone.now(),
|
||||
'created_by_id': self.created_by_id,
|
||||
},
|
||||
)
|
||||
if archiveresult.status == ArchiveResult.INITIAL_STATE:
|
||||
@@ -1329,6 +1395,36 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
self.save(update_fields=['current_step', 'modified_at'])
|
||||
return True
|
||||
|
||||
def is_finished_processing(self) -> bool:
|
||||
"""
|
||||
Check if this snapshot has finished processing.
|
||||
|
||||
Used by SnapshotMachine.is_finished() to determine if snapshot is complete.
|
||||
|
||||
Returns:
|
||||
True if all archiveresults are finished (or no work to do), False otherwise.
|
||||
"""
|
||||
# if no archiveresults exist yet, it's not finished
|
||||
if not self.archiveresult_set.exists():
|
||||
return False
|
||||
|
||||
# Try to advance step if ready (handles step-based hook execution)
|
||||
# This will increment current_step when all foreground hooks in current step are done
|
||||
while self.advance_step_if_ready():
|
||||
pass # Keep advancing until we can't anymore
|
||||
|
||||
# if archiveresults exist but are still pending, it's not finished
|
||||
if self.pending_archiveresults().exists():
|
||||
return False
|
||||
|
||||
# Don't wait for background hooks - they'll be cleaned up on entering sealed state
|
||||
# Background hooks in STARTED state are excluded by pending_archiveresults()
|
||||
# (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
|
||||
# we can transition to sealed and cleanup() will kill the background hooks
|
||||
|
||||
# otherwise archiveresults exist and are all finished, so it's finished
|
||||
return True
|
||||
|
||||
def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
|
||||
"""
|
||||
Reset failed/skipped ArchiveResults to queued for retry.
|
||||
@@ -1730,6 +1826,97 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Snapshot State Machine
|
||||
# =============================================================================
|
||||
|
||||
class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
"""
|
||||
State machine for managing Snapshot lifecycle.
|
||||
|
||||
Hook Lifecycle:
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ QUEUED State │
|
||||
│ • Waiting for snapshot to be ready │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
↓ tick() when can_start()
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ STARTED State → enter_started() │
|
||||
│ 1. snapshot.run() │
|
||||
│ • discover_hooks('Snapshot') → finds all plugin hooks │
|
||||
│ • create_pending_archiveresults() → creates ONE │
|
||||
│ ArchiveResult per hook (NO execution yet) │
|
||||
│ 2. ArchiveResults process independently with their own │
|
||||
│ state machines (see ArchiveResultMachine) │
|
||||
│ 3. Advance through steps 0-9 as foreground hooks complete │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
↓ tick() when is_finished()
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ SEALED State → enter_sealed() │
|
||||
│ • cleanup() → kills any background hooks still running │
|
||||
│ • Set retry_at=None (no more processing) │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
|
||||
https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
|
||||
"""
|
||||
|
||||
model_attr_name = 'snapshot'
|
||||
|
||||
# States
|
||||
queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
|
||||
started = State(value=Snapshot.StatusChoices.STARTED)
|
||||
sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
|
||||
|
||||
# Tick Event
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished') |
|
||||
started.to(sealed, cond='is_finished')
|
||||
)
|
||||
|
||||
def can_start(self) -> bool:
|
||||
can_start = bool(self.snapshot.url)
|
||||
return can_start
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
"""Check if snapshot processing is complete - delegates to model method."""
|
||||
return self.snapshot.is_finished_processing()
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now(),
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
)
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
# lock the snapshot while we create the pending archiveresults
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
|
||||
)
|
||||
|
||||
# Run the snapshot - creates pending archiveresults for all enabled plugins
|
||||
self.snapshot.run()
|
||||
|
||||
# unlock the snapshot after we're done + set status = started
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=5), # check again in 5s
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
|
||||
@sealed.enter
|
||||
def enter_sealed(self):
|
||||
# Clean up background hooks
|
||||
self.snapshot.cleanup()
|
||||
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
)
|
||||
|
||||
|
||||
class ArchiveResultManager(models.Manager):
|
||||
def indexable(self, sorted: bool = True):
|
||||
INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE]
|
||||
@@ -1761,7 +1948,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
# Note: unique constraint is added by migration 0027 - don't set unique=True here
|
||||
# or SQLite table recreation in earlier migrations will fail
|
||||
uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
|
||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
@@ -1782,7 +1968,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
# Binary FK (optional - set when hook reports cmd)
|
||||
binary = models.ForeignKey(
|
||||
'machine.Binary',
|
||||
Binary,
|
||||
on_delete=models.SET_NULL,
|
||||
null=True, blank=True,
|
||||
related_name='archiveresults',
|
||||
@@ -1798,7 +1984,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
|
||||
iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True)
|
||||
|
||||
state_machine_name = 'core.statemachines.ArchiveResultMachine'
|
||||
state_machine_name = 'core.models.ArchiveResultMachine'
|
||||
retry_at_field_name = 'retry_at'
|
||||
state_field_name = 'status'
|
||||
active_state = StatusChoices.STARTED
|
||||
@@ -1806,12 +1992,18 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
objects = ArchiveResultManager()
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
app_label = 'core'
|
||||
verbose_name = 'Archive Result'
|
||||
verbose_name_plural = 'Archive Results Log'
|
||||
|
||||
def __str__(self):
|
||||
return f'[{self.id}] {self.snapshot.url[:64]} -> {self.plugin}'
|
||||
|
||||
@property
|
||||
def created_by(self):
|
||||
"""Convenience property to access the user who created this archive result via its snapshot's crawl."""
|
||||
return self.snapshot.crawl.created_by
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
# Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
|
||||
@@ -1900,6 +2092,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
def save_search_index(self):
|
||||
pass
|
||||
|
||||
def cascade_health_update(self, success: bool):
|
||||
"""Update health stats for self, parent Snapshot, and grandparent Crawl."""
|
||||
self.increment_health_stats(success)
|
||||
self.snapshot.increment_health_stats(success)
|
||||
self.snapshot.crawl.increment_health_stats(success)
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Execute this ArchiveResult's hook and update status.
|
||||
@@ -1911,8 +2109,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
"""
|
||||
from django.utils import timezone
|
||||
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
|
||||
# Get merged config with proper context
|
||||
config = get_config(
|
||||
crawl=self.snapshot.crawl,
|
||||
snapshot=self.snapshot,
|
||||
)
|
||||
|
||||
# Determine which hook(s) to run
|
||||
hooks = []
|
||||
@@ -1962,10 +2165,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
result = run_hook(
|
||||
hook,
|
||||
output_dir=plugin_dir,
|
||||
config_objects=config_objects,
|
||||
config=config,
|
||||
url=self.snapshot.url,
|
||||
snapshot_id=str(self.snapshot.id),
|
||||
crawl_id=str(self.snapshot.crawl.id) if self.snapshot.crawl else None,
|
||||
crawl_id=str(self.snapshot.crawl.id),
|
||||
depth=self.snapshot.depth,
|
||||
)
|
||||
|
||||
@@ -2112,9 +2315,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
# Filter Snapshot records for depth/URL constraints
|
||||
if record_type == 'Snapshot':
|
||||
if not self.snapshot.crawl:
|
||||
continue
|
||||
|
||||
url = record.get('url')
|
||||
if not url:
|
||||
continue
|
||||
@@ -2132,19 +2332,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
overrides = {
|
||||
'snapshot': self.snapshot,
|
||||
'crawl': self.snapshot.crawl,
|
||||
'created_by_id': self.snapshot.created_by_id,
|
||||
'created_by_id': self.created_by.pk,
|
||||
}
|
||||
process_hook_records(filtered_records, overrides=overrides)
|
||||
|
||||
# Update snapshot title if this is the title plugin
|
||||
plugin_name = get_plugin_name(self.plugin)
|
||||
if self.status == self.StatusChoices.SUCCEEDED and plugin_name == 'title':
|
||||
self._update_snapshot_title(plugin_dir)
|
||||
|
||||
# Trigger search indexing if succeeded
|
||||
if self.status == self.StatusChoices.SUCCEEDED:
|
||||
self.trigger_search_indexing()
|
||||
|
||||
# Cleanup PID files and empty logs
|
||||
pid_file = plugin_dir / 'hook.pid'
|
||||
pid_file.unlink(missing_ok=True)
|
||||
@@ -2164,7 +2355,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
if not cmd:
|
||||
return
|
||||
|
||||
from machine.models import Machine
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
|
||||
machine = Machine.current()
|
||||
@@ -2189,23 +2380,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
if binary:
|
||||
self.binary = binary
|
||||
|
||||
def _update_snapshot_title(self, plugin_dir: Path):
|
||||
"""
|
||||
Update snapshot title from title plugin output.
|
||||
|
||||
The title plugin writes title.txt with the extracted page title.
|
||||
This updates the Snapshot.title field if the file exists and has content.
|
||||
"""
|
||||
title_file = plugin_dir / 'title.txt'
|
||||
if title_file.exists():
|
||||
try:
|
||||
title = title_file.read_text(encoding='utf-8').strip()
|
||||
if title and (not self.snapshot.title or len(title) > len(self.snapshot.title)):
|
||||
self.snapshot.title = title[:512] # Max length from model
|
||||
self.snapshot.save(update_fields=['title', 'modified_at'])
|
||||
except Exception:
|
||||
pass # Failed to read title, that's okay
|
||||
|
||||
def _url_passes_filters(self, url: str) -> bool:
|
||||
"""Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters.
|
||||
|
||||
@@ -2216,8 +2390,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
# Get merged config with proper hierarchy
|
||||
config = get_config(
|
||||
user=self.snapshot.created_by if self.snapshot else None,
|
||||
crawl=self.snapshot.crawl if self.snapshot else None,
|
||||
user=self.created_by,
|
||||
crawl=self.snapshot.crawl,
|
||||
snapshot=self.snapshot,
|
||||
)
|
||||
|
||||
@@ -2256,23 +2430,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
return False # No allowlist patterns matched
|
||||
|
||||
return True # No filters or passed filters
|
||||
|
||||
def trigger_search_indexing(self):
|
||||
"""Run any ArchiveResult__index hooks to update search indexes."""
|
||||
from archivebox.hooks import discover_hooks, run_hook
|
||||
|
||||
# Pass config objects in priority order (later overrides earlier)
|
||||
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
|
||||
|
||||
for hook in discover_hooks('ArchiveResult__index'):
|
||||
run_hook(
|
||||
hook,
|
||||
output_dir=self.output_dir,
|
||||
config_objects=config_objects,
|
||||
url=self.snapshot.url,
|
||||
snapshot_id=str(self.snapshot.id),
|
||||
plugin=self.plugin,
|
||||
)
|
||||
|
||||
@property
|
||||
def output_dir(self) -> Path:
|
||||
@@ -2285,4 +2442,185 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
if not plugin_dir:
|
||||
return False
|
||||
pid_file = plugin_dir / 'hook.pid'
|
||||
return pid_file.exists()
|
||||
return pid_file.exists()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# ArchiveResult State Machine
|
||||
# =============================================================================
|
||||
|
||||
class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
"""
|
||||
State machine for managing ArchiveResult (single plugin execution) lifecycle.
|
||||
|
||||
Hook Lifecycle:
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ QUEUED State │
|
||||
│ • Waiting for its turn to run │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
↓ tick() when can_start()
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ STARTED State → enter_started() │
|
||||
│ 1. archiveresult.run() │
|
||||
│ • Find specific hook by hook_name │
|
||||
│ • run_hook(script, output_dir, ...) → subprocess │
|
||||
│ │
|
||||
│ 2a. FOREGROUND hook (returns HookResult): │
|
||||
│ • update_from_output() immediately │
|
||||
│ - Read stdout.log │
|
||||
│ - Parse JSONL records │
|
||||
│ - Extract 'ArchiveResult' record → update status │
|
||||
│ - Walk output_dir → populate output_files │
|
||||
│ - Call process_hook_records() for side effects │
|
||||
│ │
|
||||
│ 2b. BACKGROUND hook (returns None): │
|
||||
│ • Status stays STARTED │
|
||||
│ • Continues running in background │
|
||||
│ • Killed by Snapshot.cleanup() when sealed │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
↓ tick() checks status
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ SUCCEEDED / FAILED / SKIPPED / BACKOFF │
|
||||
│ • Set by hook's JSONL output during update_from_output() │
|
||||
│ • Health stats incremented (num_uses_succeeded/failed) │
|
||||
│ • Parent Snapshot health stats also updated │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
|
||||
https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
|
||||
"""
|
||||
|
||||
model_attr_name = 'archiveresult'
|
||||
|
||||
# States
|
||||
queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
|
||||
started = State(value=ArchiveResult.StatusChoices.STARTED)
|
||||
backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
|
||||
succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
|
||||
failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
|
||||
skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
|
||||
|
||||
# Tick Event - transitions based on conditions
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished') |
|
||||
started.to(succeeded, cond='is_succeeded') |
|
||||
started.to(failed, cond='is_failed') |
|
||||
started.to(skipped, cond='is_skipped') |
|
||||
started.to(backoff, cond='is_backoff') |
|
||||
backoff.to.itself(unless='can_start') |
|
||||
backoff.to(started, cond='can_start') |
|
||||
backoff.to(succeeded, cond='is_succeeded') |
|
||||
backoff.to(failed, cond='is_failed') |
|
||||
backoff.to(skipped, cond='is_skipped')
|
||||
)
|
||||
|
||||
def can_start(self) -> bool:
|
||||
can_start = bool(self.archiveresult.snapshot.url)
|
||||
return can_start
|
||||
|
||||
def is_succeeded(self) -> bool:
|
||||
"""Check if extractor plugin succeeded (status was set by run())."""
|
||||
return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
|
||||
|
||||
def is_failed(self) -> bool:
|
||||
"""Check if extractor plugin failed (status was set by run())."""
|
||||
return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
|
||||
|
||||
def is_skipped(self) -> bool:
|
||||
"""Check if extractor plugin was skipped (status was set by run())."""
|
||||
return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
|
||||
|
||||
def is_backoff(self) -> bool:
|
||||
"""Check if we should backoff and retry later."""
|
||||
# Backoff if status is still started (plugin didn't complete) and output_str is empty
|
||||
return (
|
||||
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
|
||||
not self.archiveresult.output_str
|
||||
)
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
"""Check if extraction has completed (success, failure, or skipped)."""
|
||||
return self.archiveresult.status in (
|
||||
ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
)
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=timezone.now(),
|
||||
status=ArchiveResult.StatusChoices.QUEUED,
|
||||
start_ts=None,
|
||||
) # bump the snapshot's retry_at so they pickup any new changes
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
from archivebox.machine.models import NetworkInterface
|
||||
|
||||
# Lock the object and mark start time
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for plugin
|
||||
status=ArchiveResult.StatusChoices.STARTED,
|
||||
start_ts=timezone.now(),
|
||||
iface=NetworkInterface.current(),
|
||||
)
|
||||
|
||||
# Run the plugin - this updates status, output, timestamps, etc.
|
||||
self.archiveresult.run()
|
||||
|
||||
# Save the updated result
|
||||
self.archiveresult.save()
|
||||
|
||||
|
||||
@backoff.enter
|
||||
def enter_backoff(self):
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=60),
|
||||
status=ArchiveResult.StatusChoices.BACKOFF,
|
||||
end_ts=None,
|
||||
)
|
||||
|
||||
@succeeded.enter
|
||||
def enter_succeeded(self):
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
end_ts=timezone.now(),
|
||||
)
|
||||
|
||||
# Update health stats for ArchiveResult, Snapshot, and Crawl cascade
|
||||
self.archiveresult.cascade_health_update(success=True)
|
||||
|
||||
@failed.enter
|
||||
def enter_failed(self):
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.FAILED,
|
||||
end_ts=timezone.now(),
|
||||
)
|
||||
|
||||
# Update health stats for ArchiveResult, Snapshot, and Crawl cascade
|
||||
self.archiveresult.cascade_health_update(success=False)
|
||||
|
||||
@skipped.enter
|
||||
def enter_skipped(self):
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.SKIPPED,
|
||||
end_ts=timezone.now(),
|
||||
)
|
||||
|
||||
def after_transition(self, event: str, source: State, target: State):
|
||||
self.archiveresult.snapshot.update_and_requeue() # bump snapshot retry time so it picks up all the new changes
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# State Machine Registration
|
||||
# =============================================================================
|
||||
|
||||
# Manually register state machines with python-statemachine registry
|
||||
# (normally auto-discovered from statemachines.py, but we define them here for clarity)
|
||||
registry.register(SnapshotMachine)
|
||||
registry.register(ArchiveResultMachine)
|
||||
2638
archivebox/core/models.py.bak
Executable file
2638
archivebox/core/models.py.bak
Executable file
File diff suppressed because it is too large
Load Diff
@@ -30,9 +30,9 @@ LOADED_PLUGINS = archivebox.LOADED_PLUGINS
|
||||
### Django Core Settings
|
||||
################################################################################
|
||||
|
||||
WSGI_APPLICATION = "core.wsgi.application"
|
||||
ASGI_APPLICATION = "core.asgi.application"
|
||||
ROOT_URLCONF = "core.urls"
|
||||
WSGI_APPLICATION = "archivebox.core.wsgi.application"
|
||||
ASGI_APPLICATION = "archivebox.core.asgi.application"
|
||||
ROOT_URLCONF = "archivebox.core.urls"
|
||||
|
||||
LOGIN_URL = "/accounts/login/"
|
||||
LOGOUT_REDIRECT_URL = os.environ.get("LOGOUT_REDIRECT_URL", "/")
|
||||
@@ -55,14 +55,15 @@ INSTALLED_APPS = [
|
||||
# 3rd-party apps from PyPI
|
||||
"signal_webhooks", # handles REST API outbound webhooks https://github.com/MrThearMan/django-signal-webhooks
|
||||
"django_object_actions", # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
|
||||
# Our ArchiveBox-provided apps
|
||||
"config", # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
|
||||
"machine", # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
|
||||
"workers", # handles starting and managing background workers and processes (orchestrators and actors)
|
||||
"crawls", # handles Crawl and CrawlSchedule models and management
|
||||
"personas", # handles Persona and session management
|
||||
"core", # core django model with Snapshot, ArchiveResult, etc.
|
||||
"api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
|
||||
# Our ArchiveBox-provided apps (use fully qualified names)
|
||||
# NOTE: Order matters! Apps with migrations that depend on other apps must come AFTER their dependencies
|
||||
# "archivebox.config", # ArchiveBox config settings (no models, not a real Django app)
|
||||
"archivebox.machine", # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
|
||||
"archivebox.workers", # handles starting and managing background workers and processes (orchestrators and actors)
|
||||
"archivebox.personas", # handles Persona and session management
|
||||
"archivebox.core", # core django model with Snapshot, ArchiveResult, etc. (crawls depends on this)
|
||||
"archivebox.crawls", # handles Crawl and CrawlSchedule models and management (depends on core)
|
||||
"archivebox.api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
|
||||
# ArchiveBox plugins (hook-based plugins no longer add Django apps)
|
||||
# Use hooks.py discover_hooks() for plugin functionality
|
||||
# 3rd-party apps from PyPI that need to be loaded last
|
||||
@@ -72,15 +73,15 @@ INSTALLED_APPS = [
|
||||
|
||||
|
||||
# Request/response middleware stack, in processing order.
# NOTE: ArchiveBox middleware must use the fully-qualified "archivebox.core.*"
# dotted paths — the bare "core.middleware.*" entries were leftovers from
# before the package rename and would register the same middleware twice
# (and the bare paths no longer resolve as importable modules).
MIDDLEWARE = [
    "archivebox.core.middleware.TimezoneMiddleware",
    "django.middleware.security.SecurityMiddleware",
    "django.contrib.sessions.middleware.SessionMiddleware",
    "django.middleware.common.CommonMiddleware",
    "django.middleware.csrf.CsrfViewMiddleware",
    "django.contrib.auth.middleware.AuthenticationMiddleware",
    "archivebox.core.middleware.ReverseProxyAuthMiddleware",
    "django.contrib.messages.middleware.MessageMiddleware",
    "archivebox.core.middleware.CacheControlMiddleware",
    # Additional middlewares from plugins (if any)
]
|
||||
|
||||
@@ -370,15 +371,15 @@ LOGGING = SETTINGS_LOGGING
|
||||
################################################################################
|
||||
|
||||
# Add default webhook configuration to the User model
|
||||
# Outbound webhook configuration for django-signal-webhooks.
# Only the fully-qualified "archivebox.*" dotted paths are valid after the
# package rename; the bare "core.models.*" / "api.models.*" keys were stale
# duplicates and have been removed.
SIGNAL_WEBHOOKS_CUSTOM_MODEL = "archivebox.api.models.OutboundWebhook"
SIGNAL_WEBHOOKS = {
    "HOOKS": {
        # ... is a special sigil value that means "use the default autogenerated hooks"
        "django.contrib.auth.models.User": ...,
        "archivebox.core.models.Snapshot": ...,
        "archivebox.core.models.ArchiveResult": ...,
        "archivebox.core.models.Tag": ...,
        "archivebox.api.models.APIToken": ...,
    },
}
|
||||
|
||||
@@ -391,11 +392,11 @@ ADMIN_DATA_VIEWS = {
|
||||
"URLS": [
|
||||
{
|
||||
"route": "config/",
|
||||
"view": "core.views.live_config_list_view",
|
||||
"view": "archivebox.core.views.live_config_list_view",
|
||||
"name": "Configuration",
|
||||
"items": {
|
||||
"route": "<str:key>/",
|
||||
"view": "core.views.live_config_value_view",
|
||||
"view": "archivebox.core.views.live_config_value_view",
|
||||
"name": "config_val",
|
||||
},
|
||||
},
|
||||
|
||||
@@ -1,319 +0,0 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
import time
|
||||
import os
|
||||
from datetime import timedelta
|
||||
from typing import ClassVar
|
||||
|
||||
from django.db.models import F
|
||||
from django.utils import timezone
|
||||
|
||||
from rich import print
|
||||
|
||||
from statemachine import State, StateMachine
|
||||
|
||||
# from workers.actor import ActorType
|
||||
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
from crawls.models import Crawl
|
||||
|
||||
|
||||
class SnapshotMachine(StateMachine, strict_states=True):
    """
    State machine for managing Snapshot lifecycle.

    Drives a Snapshot through QUEUED -> STARTED -> SEALED via the `tick` event;
    each worker tick either holds the current state or advances when the
    condition methods below are satisfied.

    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
    """

    model: Snapshot

    # States (values mirror Snapshot.StatusChoices so DB status == machine state)
    queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
    started = State(value=Snapshot.StatusChoices.STARTED)
    sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)

    # Tick Event: each branch is guarded by a condition method below
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(sealed, cond='is_finished')
    )

    def __init__(self, snapshot, *args, **kwargs):
        # keep a direct handle on the model instance the machine is bound to
        self.snapshot = snapshot
        super().__init__(snapshot, *args, **kwargs)

    def __repr__(self) -> str:
        return f'Snapshot[{self.snapshot.id}]'

    def __str__(self) -> str:
        return self.__repr__()

    def can_start(self) -> bool:
        """A snapshot can start as soon as it has a URL to archive."""
        can_start = bool(self.snapshot.url)
        # Suppressed: queue waiting logs
        return can_start

    def is_finished(self) -> bool:
        """True once all ArchiveResults exist and none are still pending."""
        # if no archiveresults exist yet, it's not finished
        if not self.snapshot.archiveresult_set.exists():
            return False

        # Try to advance step if ready (handles step-based hook execution)
        # This will increment current_step when all foreground hooks in current step are done
        while self.snapshot.advance_step_if_ready():
            pass  # Keep advancing until we can't anymore

        # if archiveresults exist but are still pending, it's not finished
        if self.snapshot.pending_archiveresults().exists():
            return False

        # Don't wait for background hooks - they'll be cleaned up on entering sealed state
        # Background hooks in STARTED state are excluded by pending_archiveresults()
        # (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
        # we can transition to sealed and cleanup() will kill the background hooks

        # otherwise archiveresults exist and are all finished, so it's finished
        return True

    # def on_transition(self, event, state):
    #     print(f'{self}.on_transition() [blue]{str(state).upper()}[/blue] ➡️ ...')

    @queued.enter
    def enter_queued(self) -> None:
        """Mark the snapshot queued and eligible for immediate pickup."""
        # Suppressed: state transition logs
        self.snapshot.update_for_workers(
            retry_at=timezone.now(),
            status=Snapshot.StatusChoices.QUEUED,
        )

    @started.enter
    def enter_started(self) -> None:
        """Run the snapshot: create pending ArchiveResults for enabled plugins."""
        # Suppressed: state transition logs
        # lock the snapshot while we create the pending archiveresults
        self.snapshot.update_for_workers(
            retry_at=timezone.now() + timedelta(seconds=30),   # if failed, wait 30s before retrying
        )

        # Run the snapshot - creates pending archiveresults for all enabled plugins
        self.snapshot.run()

        # unlock the snapshot after we're done + set status = started
        self.snapshot.update_for_workers(
            retry_at=timezone.now() + timedelta(seconds=5),    # check again in 5s
            status=Snapshot.StatusChoices.STARTED,
        )

    @sealed.enter
    def enter_sealed(self) -> None:
        """Finalize the snapshot: kill background hooks and stop retrying."""
        # Clean up background hooks
        self.snapshot.cleanup()

        # Suppressed: state transition logs
        # retry_at=None removes the snapshot from the worker retry queue for good
        self.snapshot.update_for_workers(
            retry_at=None,
            status=Snapshot.StatusChoices.SEALED,
        )
|
||||
|
||||
|
||||
# class SnapshotWorker(ActorType[Snapshot]):
|
||||
# """
|
||||
# The primary actor for progressing Snapshot objects
|
||||
# through their lifecycle using the SnapshotMachine.
|
||||
# """
|
||||
# Model = Snapshot
|
||||
# StateMachineClass = SnapshotMachine
|
||||
|
||||
# ACTIVE_STATE: ClassVar[State] = SnapshotMachine.started # 'started'
|
||||
|
||||
# MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
|
||||
# MAX_TICK_TIME: ClassVar[int] = 10
|
||||
# CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class ArchiveResultMachine(StateMachine, strict_states=True):
    """
    State machine for managing ArchiveResult lifecycle.

    Drives a single extractor run: QUEUED -> STARTED -> (SUCCEEDED | FAILED |
    SKIPPED), with a BACKOFF state for retrying runs that did not complete.

    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
    """

    model: ArchiveResult

    # States (values mirror ArchiveResult.StatusChoices)
    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
    started = State(value=ArchiveResult.StatusChoices.STARTED)
    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
    skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)

    # Tick Event - transitions based on conditions
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(succeeded, cond='is_succeeded') |
        started.to(failed, cond='is_failed') |
        started.to(skipped, cond='is_skipped') |
        started.to(backoff, cond='is_backoff') |
        backoff.to.itself(unless='can_start') |
        backoff.to(started, cond='can_start') |
        backoff.to(succeeded, cond='is_succeeded') |
        backoff.to(failed, cond='is_failed') |
        backoff.to(skipped, cond='is_skipped')
    )

    def __init__(self, archiveresult, *args, **kwargs):
        # keep a direct handle on the model instance the machine is bound to
        self.archiveresult = archiveresult
        super().__init__(archiveresult, *args, **kwargs)

    def __repr__(self) -> str:
        return f'ArchiveResult[{self.archiveresult.id}]'

    def __str__(self) -> str:
        return self.__repr__()

    def can_start(self) -> bool:
        """An extractor run can start once its parent snapshot has a URL."""
        can_start = bool(self.archiveresult.snapshot.url)
        # Suppressed: queue waiting logs
        return can_start

    def is_succeeded(self) -> bool:
        """Check if extractor plugin succeeded (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED

    def is_failed(self) -> bool:
        """Check if extractor plugin failed (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED

    def is_skipped(self) -> bool:
        """Check if extractor plugin was skipped (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED

    def is_backoff(self) -> bool:
        """Check if we should backoff and retry later."""
        # Backoff if status is still started (plugin didn't complete) and output_str is empty
        return (
            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
            not self.archiveresult.output_str
        )

    def is_finished(self) -> bool:
        """Check if extraction has completed (success, failure, or skipped)."""
        return self.archiveresult.status in (
            ArchiveResult.StatusChoices.SUCCEEDED,
            ArchiveResult.StatusChoices.FAILED,
            ArchiveResult.StatusChoices.SKIPPED,
        )

    @queued.enter
    def enter_queued(self) -> None:
        """(Re)queue the result with no start time and immediate retry."""
        # Suppressed: state transition logs
        self.archiveresult.update_for_workers(
            retry_at=timezone.now(),
            status=ArchiveResult.StatusChoices.QUEUED,
            start_ts=None,
        )   # bump the snapshot's retry_at so they pickup any new changes

    @started.enter
    def enter_started(self) -> None:
        """Lock the result, record the network interface, and run the plugin."""
        from machine.models import NetworkInterface

        # Suppressed: state transition logs
        # Lock the object and mark start time
        self.archiveresult.update_for_workers(
            retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for plugin
            status=ArchiveResult.StatusChoices.STARTED,
            start_ts=timezone.now(),
            iface=NetworkInterface.current(),
        )

        # Run the plugin - this updates status, output, timestamps, etc.
        self.archiveresult.run()

        # Save the updated result
        self.archiveresult.save()

        # Suppressed: plugin result logs (already logged by worker)

    @backoff.enter
    def enter_backoff(self) -> None:
        """Schedule a retry in 60s after an incomplete plugin run."""
        # Suppressed: state transition logs
        self.archiveresult.update_for_workers(
            retry_at=timezone.now() + timedelta(seconds=60),
            status=ArchiveResult.StatusChoices.BACKOFF,
            end_ts=None,
            # retries=F('retries') + 1,  # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
        )
        self.archiveresult.save()

    @succeeded.enter
    def enter_succeeded(self) -> None:
        """Finalize a successful run and bump success counters up the chain."""
        # Suppressed: state transition logs
        self.archiveresult.update_for_workers(
            retry_at=None,
            status=ArchiveResult.StatusChoices.SUCCEEDED,
            end_ts=timezone.now(),
            # **self.archiveresult.get_output_dict(),  # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
        )
        self.archiveresult.save()

        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
        # (F() expressions avoid read-modify-write races between workers)
        ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
        Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)

        # Also update Crawl health stats if snapshot has a crawl
        snapshot = self.archiveresult.snapshot
        if snapshot.crawl_id:
            Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)

    @failed.enter
    def enter_failed(self) -> None:
        """Finalize a failed run and bump failure counters up the chain."""
        # Suppressed: state transition logs
        self.archiveresult.update_for_workers(
            retry_at=None,
            status=ArchiveResult.StatusChoices.FAILED,
            end_ts=timezone.now(),
        )

        # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
        ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_failed=F('num_uses_failed') + 1)
        Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_failed=F('num_uses_failed') + 1)

        # Also update Crawl health stats if snapshot has a crawl
        snapshot = self.archiveresult.snapshot
        if snapshot.crawl_id:
            Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_failed=F('num_uses_failed') + 1)

    @skipped.enter
    def enter_skipped(self) -> None:
        """Finalize a skipped run (no health counters are changed)."""
        # Suppressed: state transition logs
        self.archiveresult.update_for_workers(
            retry_at=None,
            status=ArchiveResult.StatusChoices.SKIPPED,
            end_ts=timezone.now(),
        )

    def after_transition(self, event: str, source: State, target: State):
        # print(f"after '{event}' from '{source.id}' to '{target.id}'")
        self.archiveresult.snapshot.update_for_workers()  # bump snapshot retry time so it picks up all the new changes
|
||||
|
||||
|
||||
# class ArchiveResultWorker(ActorType[ArchiveResult]):
|
||||
# """
|
||||
# The primary actor for progressing ArchiveResult objects
|
||||
# through their lifecycle using the ArchiveResultMachine.
|
||||
# """
|
||||
# Model = ArchiveResult
|
||||
# StateMachineClass = ArchiveResultMachine
|
||||
|
||||
# ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started # 'started'
|
||||
|
||||
# MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
|
||||
# MAX_TICK_TIME: ClassVar[int] = 60
|
||||
# CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
|
||||
20
archivebox/core/templatetags/config_tags.py
Normal file
20
archivebox/core/templatetags/config_tags.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""Template tags for accessing config values in templates."""
|
||||
|
||||
from django import template
|
||||
|
||||
from archivebox.config.configset import get_config as _get_config
|
||||
|
||||
register = template.Library()
|
||||
|
||||
|
||||
@register.simple_tag
def get_config(key: str) -> object | None:
    """
    Get a config value by key, or None if the key is unknown.

    Usage: {% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}

    NOTE: the return annotation was previously ``-> any``, which references the
    builtin ``any()`` function rather than a type; ``object | None`` is the
    honest annotation for "any config value, or None on lookup failure".
    """
    try:
        return _get_config(key)
    except (KeyError, AttributeError):
        # Unknown keys render as empty in templates rather than raising
        return None
|
||||
@@ -1,3 +1,319 @@
|
||||
#from django.test import TestCase
|
||||
"""Tests for the core views, especially AddView."""
|
||||
|
||||
# Create your tests here.
|
||||
import os
|
||||
import django
|
||||
|
||||
# Set up Django before importing any Django-dependent modules
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
|
||||
django.setup()
|
||||
|
||||
from django.test import TestCase, Client
|
||||
from django.contrib.auth.models import User
|
||||
from django.urls import reverse
|
||||
|
||||
from archivebox.crawls.models import Crawl, CrawlSchedule
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
|
||||
class AddViewTests(TestCase):
    """Tests for the AddView (crawl creation form)."""

    def setUp(self):
        """Set up test user and client."""
        self.client = Client()
        self.user = User.objects.create_user(
            username='testuser',
            password='testpass123',
            email='test@example.com'
        )
        self.client.login(username='testuser', password='testpass123')
        self.add_url = reverse('add')

    def test_add_view_get_requires_auth(self):
        """Test that GET /add requires authentication."""
        self.client.logout()
        response = self.client.get(self.add_url)
        # Should redirect to login or show 403/404
        self.assertIn(response.status_code, [302, 403, 404])

    def test_add_view_get_shows_form(self):
        """Test that GET /add shows the form with all fields."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        # Check that form fields are present
        self.assertContains(response, 'name="url"')
        self.assertContains(response, 'name="tag"')
        self.assertContains(response, 'name="depth"')
        self.assertContains(response, 'name="notes"')
        self.assertContains(response, 'name="schedule"')
        self.assertContains(response, 'name="persona"')
        self.assertContains(response, 'name="overwrite"')
        self.assertContains(response, 'name="update"')
        self.assertContains(response, 'name="index_only"')

        # Check for plugin groups
        self.assertContains(response, 'name="chrome_plugins"')
        self.assertContains(response, 'name="archiving_plugins"')
        self.assertContains(response, 'name="parsing_plugins"')

    def test_add_view_shows_tag_autocomplete(self):
        """Test that tag autocomplete datalist is rendered."""
        # Create some tags
        Tag.objects.create(name='test-tag-1')
        Tag.objects.create(name='test-tag-2')

        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        # Check for datalist with tags
        self.assertContains(response, 'id="tag-datalist"')
        self.assertContains(response, 'test-tag-1')
        self.assertContains(response, 'test-tag-2')

    def test_add_view_shows_plugin_presets(self):
        """Test that plugin preset buttons are rendered."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        self.assertContains(response, 'Quick Archive')
        self.assertContains(response, 'Full Chrome')
        self.assertContains(response, 'Text Only')
        self.assertContains(response, 'Select All')
        self.assertContains(response, 'Clear All')

    def test_add_view_shows_links_to_resources(self):
        """Test that helpful links are present."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)

        # Link to plugin documentation
        self.assertContains(response, '/admin/environment/plugins/')

        # Link to create new persona
        self.assertContains(response, '/admin/personas/persona/add/')

    def test_add_basic_crawl_without_schedule(self):
        """Test creating a basic crawl without a schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'tag': 'test-tag',
            'depth': '0',
            'notes': 'Test crawl notes',
        })

        # Should redirect to crawl admin page
        self.assertEqual(response.status_code, 302)

        # Check that crawl was created
        self.assertEqual(Crawl.objects.count(), 1)
        crawl = Crawl.objects.first()

        self.assertIn('https://example.com', crawl.urls)
        self.assertIn('https://example.org', crawl.urls)
        self.assertEqual(crawl.tags_str, 'test-tag')
        self.assertEqual(crawl.max_depth, 0)
        self.assertEqual(crawl.notes, 'Test crawl notes')
        self.assertEqual(crawl.created_by, self.user)

        # No schedule should be created
        self.assertIsNone(crawl.schedule)
        self.assertEqual(CrawlSchedule.objects.count(), 0)

    def test_add_crawl_with_schedule(self):
        """Test creating a crawl with a repeat schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'tag': 'scheduled',
            'depth': '1',
            'notes': 'Daily crawl',
            'schedule': 'daily',
        })

        self.assertEqual(response.status_code, 302)

        # Check that crawl and schedule were created
        self.assertEqual(Crawl.objects.count(), 1)
        self.assertEqual(CrawlSchedule.objects.count(), 1)

        crawl = Crawl.objects.first()
        schedule = CrawlSchedule.objects.first()

        self.assertEqual(crawl.schedule, schedule)
        self.assertEqual(schedule.template, crawl)
        self.assertEqual(schedule.schedule, 'daily')
        self.assertTrue(schedule.is_enabled)
        self.assertEqual(schedule.created_by, self.user)

    def test_add_crawl_with_cron_schedule(self):
        """Test creating a crawl with a cron format schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': '0 */6 * * *',  # Every 6 hours
        })

        self.assertEqual(response.status_code, 302)

        schedule = CrawlSchedule.objects.first()
        self.assertEqual(schedule.schedule, '0 */6 * * *')

    def test_add_crawl_with_plugins(self):
        """Test creating a crawl with specific plugins selected."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'chrome_plugins': ['screenshot', 'dom'],
            'archiving_plugins': ['wget'],
        })

        self.assertEqual(response.status_code, 302)

        crawl = Crawl.objects.first()
        plugins = crawl.config.get('PLUGINS', '')

        # Should contain the selected plugins
        self.assertIn('screenshot', plugins)
        self.assertIn('dom', plugins)
        self.assertIn('wget', plugins)

    def test_add_crawl_with_depth_range(self):
        """Test creating crawls with different depth values (0-4)."""
        for depth in range(5):
            response = self.client.post(self.add_url, {
                'url': f'https://example{depth}.com',
                'depth': str(depth),
            })

            self.assertEqual(response.status_code, 302)

        self.assertEqual(Crawl.objects.count(), 5)

        for i, crawl in enumerate(Crawl.objects.order_by('created_at')):
            self.assertEqual(crawl.max_depth, i)

    def test_add_crawl_with_advanced_options(self):
        """Test creating a crawl with advanced options."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'persona': 'CustomPersona',
            'overwrite': True,
            'update': True,
            'index_only': True,
        })

        self.assertEqual(response.status_code, 302)

        crawl = Crawl.objects.first()
        config = crawl.config

        self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
        self.assertEqual(config.get('OVERWRITE'), True)
        self.assertEqual(config.get('ONLY_NEW'), False)  # opposite of update
        self.assertEqual(config.get('INDEX_ONLY'), True)

    def test_add_crawl_with_custom_config(self):
        """Test creating a crawl with custom config overrides."""
        # Note: Django test client can't easily POST the KeyValueWidget format,
        # so this test would need to use the form directly or mock the cleaned_data.
        # skipTest() makes the gap visible in test output instead of silently passing.
        self.skipTest('TODO: POST KeyValueWidget format or exercise the form class directly')

    def test_add_empty_urls_fails(self):
        """Test that submitting without URLs fails validation."""
        response = self.client.post(self.add_url, {
            'url': '',
            'depth': '0',
        })

        # Should show form again with errors, not redirect
        self.assertEqual(response.status_code, 200)
        # Django 4.1+ signature: pass the bound form, not (response, 'form', ...)
        # (the old signature was removed in Django 5.0)
        self.assertFormError(response.context['form'], 'url', 'This field is required.')

    def test_add_invalid_urls_fails(self):
        """Test that invalid URLs fail validation."""
        response = self.client.post(self.add_url, {
            'url': 'not-a-url',
            'depth': '0',
        })

        # Should show form again with errors
        self.assertEqual(response.status_code, 200)
        # Check for validation error (URL regex should fail)
        self.assertContains(response, 'error')

    def test_add_success_message_without_schedule(self):
        """Test that success message is shown without schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'depth': '0',
        }, follow=True)

        # Check success message mentions crawl creation
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])

        self.assertIn('Created crawl with 2 starting URL', message_text)
        self.assertIn('View Crawl', message_text)
        self.assertNotIn('scheduled to repeat', message_text)

    def test_add_success_message_with_schedule(self):
        """Test that success message includes schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': 'weekly',
        }, follow=True)

        # Check success message mentions schedule
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])

        self.assertIn('Created crawl', message_text)
        self.assertIn('scheduled to repeat weekly', message_text)
        self.assertIn('View Crawl', message_text)

    def test_add_crawl_creates_source_file(self):
        """Test that crawl creation saves URLs to sources file."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })

        self.assertEqual(response.status_code, 302)

        # Check that source file was created in sources/ directory
        from archivebox.config import CONSTANTS
        sources_dir = CONSTANTS.SOURCES_DIR

        # Should have created a source file
        source_files = list(sources_dir.glob('*__web_ui_add_by_user_*.txt'))
        self.assertGreater(len(source_files), 0)

    def test_multiple_tags_are_saved(self):
        """Test that multiple comma-separated tags are saved."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'tag': 'tag1,tag2,tag3',
        })

        self.assertEqual(response.status_code, 302)

        crawl = Crawl.objects.first()
        self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')

    def test_crawl_redirects_to_admin_change_page(self):
        """Test that successful submission redirects to crawl admin page."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })

        crawl = Crawl.objects.first()
        expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'

        self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)
|
||||
|
||||
@@ -7,10 +7,10 @@ from django.views.generic.base import RedirectView
|
||||
|
||||
from archivebox.misc.serve_static import serve_static
|
||||
|
||||
from core.admin_site import archivebox_admin
|
||||
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
|
||||
from archivebox.core.admin_site import archivebox_admin
|
||||
from archivebox.core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
|
||||
|
||||
from workers.views import JobsDashboardView
|
||||
from archivebox.workers.views import JobsDashboardView
|
||||
|
||||
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
|
||||
# from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE
|
||||
|
||||
@@ -23,7 +23,7 @@ from admin_data_views.typing import TableContext, ItemContext
|
||||
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
|
||||
|
||||
import archivebox
|
||||
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION, SAVE_ARCHIVE_DOT_ORG
|
||||
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
|
||||
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
|
||||
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
|
||||
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
|
||||
@@ -31,9 +31,9 @@ from archivebox.misc.serve_static import serve_static_with_byterange_support
|
||||
from archivebox.misc.logging_util import printable_filesize
|
||||
from archivebox.search import query_search_index
|
||||
|
||||
from core.models import Snapshot
|
||||
from core.forms import AddLinkForm
|
||||
from crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.core.forms import AddLinkForm
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.hooks import get_extractors, get_extractor_name
|
||||
|
||||
|
||||
@@ -150,7 +150,6 @@ class SnapshotView(View):
|
||||
'status_color': 'success' if snapshot.is_archived else 'danger',
|
||||
'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date),
|
||||
'warc_path': warc_path,
|
||||
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
|
||||
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
|
||||
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
|
||||
'best_result': best_result,
|
||||
@@ -421,35 +420,34 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
return SERVER_CONFIG.PUBLIC_ADD_VIEW or self.request.user.is_authenticated
|
||||
|
||||
def get_context_data(self, **kwargs):
|
||||
from archivebox.core.models import Tag
|
||||
|
||||
return {
|
||||
**super().get_context_data(**kwargs),
|
||||
'title': "Add URLs",
|
||||
'title': "Create Crawl",
|
||||
# We can't just call request.build_absolute_uri in the template, because it would include query parameters
|
||||
'absolute_add_path': self.request.build_absolute_uri(self.request.path),
|
||||
'VERSION': VERSION,
|
||||
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
|
||||
'stdout': '',
|
||||
'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
|
||||
}
|
||||
|
||||
def form_valid(self, form):
|
||||
urls = form.cleaned_data["url"]
|
||||
print(f'[+] Adding URL: {urls}')
|
||||
parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser
|
||||
tag = form.cleaned_data["tag"]
|
||||
depth = 0 if form.cleaned_data["depth"] == "0" else 1
|
||||
plugins = ','.join(form.cleaned_data["archive_methods"])
|
||||
input_kwargs = {
|
||||
"urls": urls,
|
||||
"tag": tag,
|
||||
"depth": depth,
|
||||
"parser": parser,
|
||||
"update_all": False,
|
||||
"out_dir": DATA_DIR,
|
||||
"created_by_id": self.request.user.pk,
|
||||
}
|
||||
if plugins:
|
||||
input_kwargs.update({"plugins": plugins})
|
||||
|
||||
# Extract all form fields
|
||||
tag = form.cleaned_data["tag"]
|
||||
depth = int(form.cleaned_data["depth"])
|
||||
plugins = ','.join(form.cleaned_data.get("plugins", []))
|
||||
schedule = form.cleaned_data.get("schedule", "").strip()
|
||||
persona = form.cleaned_data.get("persona", "Default")
|
||||
overwrite = form.cleaned_data.get("overwrite", False)
|
||||
update = form.cleaned_data.get("update", False)
|
||||
index_only = form.cleaned_data.get("index_only", False)
|
||||
notes = form.cleaned_data.get("notes", "")
|
||||
custom_config = form.cleaned_data.get("config", {})
|
||||
|
||||
from archivebox.config.permissions import HOSTNAME
|
||||
|
||||
@@ -461,33 +459,59 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
# 2. create a new Crawl with the URLs from the file
|
||||
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||
urls_content = sources_file.read_text()
|
||||
# Build complete config
|
||||
config = {
|
||||
'ONLY_NEW': not update,
|
||||
'INDEX_ONLY': index_only,
|
||||
'OVERWRITE': overwrite,
|
||||
'DEPTH': depth,
|
||||
'PLUGINS': plugins or '',
|
||||
'DEFAULT_PERSONA': persona or 'Default',
|
||||
}
|
||||
|
||||
# Merge custom config overrides
|
||||
config.update(custom_config)
|
||||
|
||||
crawl = Crawl.objects.create(
|
||||
urls=urls_content,
|
||||
max_depth=depth,
|
||||
tags_str=tag,
|
||||
notes=notes,
|
||||
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
|
||||
created_by_id=self.request.user.pk,
|
||||
config={
|
||||
# 'ONLY_NEW': not update,
|
||||
# 'INDEX_ONLY': index_only,
|
||||
# 'OVERWRITE': False,
|
||||
'DEPTH': depth,
|
||||
'PLUGINS': plugins or '',
|
||||
# 'DEFAULT_PERSONA': persona or 'Default',
|
||||
}
|
||||
config=config
|
||||
)
|
||||
|
||||
|
||||
# 3. create a CrawlSchedule if schedule is provided
|
||||
if schedule:
|
||||
from crawls.models import CrawlSchedule
|
||||
crawl_schedule = CrawlSchedule.objects.create(
|
||||
template=crawl,
|
||||
schedule=schedule,
|
||||
is_enabled=True,
|
||||
label=crawl.label,
|
||||
notes=f"Auto-created from add page. {notes}".strip(),
|
||||
created_by_id=self.request.user.pk,
|
||||
)
|
||||
crawl.schedule = crawl_schedule
|
||||
crawl.save(update_fields=['schedule'])
|
||||
|
||||
# 4. start the Orchestrator & wait until it completes
|
||||
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
|
||||
# from crawls.actors import CrawlActor
|
||||
# from core.actors import SnapshotActor, ArchiveResultActor
|
||||
|
||||
# from archivebox.crawls.actors import CrawlActor
|
||||
# from archivebox.core.actors import SnapshotActor, ArchiveResultActor
|
||||
|
||||
|
||||
rough_url_count = urls.count('://')
|
||||
|
||||
# Build success message with schedule link if created
|
||||
schedule_msg = ""
|
||||
if schedule:
|
||||
schedule_msg = f" and <a href='{crawl.schedule.admin_change_url}'>scheduled to repeat {schedule}</a>"
|
||||
|
||||
messages.success(
|
||||
self.request,
|
||||
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
|
||||
mark_safe(f"Created crawl with {rough_url_count} starting URL(s){schedule_msg}. Snapshots will be created and archived in the background. <a href='{crawl.admin_change_url}'>View Crawl →</a>"),
|
||||
)
|
||||
|
||||
# Orchestrator (managed by supervisord) will pick up the queued crawl
|
||||
@@ -516,8 +540,8 @@ def live_progress_view(request):
|
||||
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
|
||||
try:
|
||||
from workers.orchestrator import Orchestrator
|
||||
from crawls.models import Crawl
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from django.db.models import Case, When, Value, IntegerField
|
||||
|
||||
# Get orchestrator status
|
||||
@@ -764,9 +788,9 @@ def key_is_safe(key: str) -> bool:
|
||||
def find_config_source(key: str, merged_config: dict) -> str:
|
||||
"""Determine where a config value comes from."""
|
||||
import os
|
||||
from machine.models import Machine
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
# Check if it's from machine config
|
||||
# Check if it's from archivebox.machine.config
|
||||
try:
|
||||
machine = Machine.current()
|
||||
if machine.config and key in machine.config:
|
||||
@@ -778,7 +802,7 @@ def find_config_source(key: str, merged_config: dict) -> str:
|
||||
if key in os.environ:
|
||||
return 'Environment'
|
||||
|
||||
# Check if it's from config file
|
||||
# Check if it's from archivebox.config.file
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
|
||||
if key in file_config:
|
||||
@@ -796,7 +820,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
|
||||
# Get merged config that includes Machine.config overrides
|
||||
try:
|
||||
from machine.models import Machine
|
||||
from archivebox.machine.models import Machine
|
||||
machine = Machine.current()
|
||||
merged_config = get_config()
|
||||
except Exception as e:
|
||||
@@ -859,7 +883,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
@render_with_item_view
|
||||
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
import os
|
||||
from machine.models import Machine
|
||||
from archivebox.machine.models import Machine
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
|
||||
CONFIGS = get_all_configs()
|
||||
|
||||
Reference in New Issue
Block a user