This commit is contained in:
Nick Sweeting
2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions

View File

@@ -4,7 +4,7 @@ __order__ = 100
def register_admin(admin_site):
"""Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
from core.admin import register_admin as do_register
from archivebox.core.admin import register_admin as do_register
do_register(admin_site)

View File

@@ -3,11 +3,11 @@ __package__ = 'archivebox.core'
from django.contrib.auth import get_user_model
from core.models import Snapshot, ArchiveResult, Tag
from core.admin_tags import TagAdmin
from core.admin_snapshots import SnapshotAdmin
from core.admin_archiveresults import ArchiveResultAdmin
from core.admin_users import UserAdmin
from archivebox.core.models import Snapshot, ArchiveResult, Tag
from archivebox.core.admin_tags import TagAdmin
from archivebox.core.admin_snapshots import SnapshotAdmin
from archivebox.core.admin_archiveresults import ArchiveResultAdmin
from archivebox.core.admin_users import UserAdmin
def register_admin(admin_site):

View File

@@ -16,7 +16,7 @@ from archivebox.base_models.admin import BaseModelAdmin
from archivebox.hooks import get_plugin_icon
from core.models import ArchiveResult, Snapshot
from archivebox.core.models import ArchiveResult, Snapshot
def render_archiveresults_list(archiveresults_qs, limit=50):
@@ -187,7 +187,7 @@ class ArchiveResultInline(admin.TabularInline):
extra = 0
sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'retry_at', 'output_str')
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'status', 'retry_at', 'output_str')
# exclude = ('id',)
ordering = ('end_ts',)
show_change_link = True
@@ -229,17 +229,15 @@ class ArchiveResultInline(admin.TabularInline):
formset.form.base_fields['end_ts'].initial = timezone.now()
formset.form.base_fields['cmd_version'].initial = '-'
formset.form.base_fields['pwd'].initial = str(snapshot.output_dir)
formset.form.base_fields['created_by'].initial = request.user
formset.form.base_fields['cmd'].initial = '["-"]'
formset.form.base_fields['output_str'].initial = 'Manually recorded cmd output...'
if obj is not None:
# hidden values for existing entries and new entries
formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
return formset
@@ -252,8 +250,8 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_by', 'created_at', 'plugin', 'status')
list_display = ('id', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_at', 'plugin', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'iface')
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
autocomplete_fields = ['snapshot']
@@ -279,10 +277,6 @@ class ArchiveResultAdmin(BaseModelAdmin):
'fields': ('output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', 'output_summary'),
'classes': ('card', 'wide'),
}),
('Metadata', {
'fields': ('created_by',),
'classes': ('card',),
}),
)
list_filter = ('status', 'plugin', 'start_ts', 'cmd_version')

View File

@@ -38,11 +38,11 @@ def register_admin_site():
# Register admin views for each app
# (Previously handled by ABX plugin system, now called directly)
from core.admin import register_admin as register_core_admin
from crawls.admin import register_admin as register_crawls_admin
from api.admin import register_admin as register_api_admin
from machine.admin import register_admin as register_machine_admin
from workers.admin import register_admin as register_workers_admin
from archivebox.core.admin import register_admin as register_core_admin
from archivebox.crawls.admin import register_admin as register_crawls_admin
from archivebox.api.admin import register_admin as register_api_admin
from archivebox.machine.admin import register_admin as register_machine_admin
from archivebox.workers.admin import register_admin as register_workers_admin
register_core_admin(archivebox_admin)
register_crawls_admin(archivebox_admin)

View File

@@ -23,9 +23,9 @@ from archivebox.search.admin import SearchResultsAdminMixin
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from core.models import Tag, Snapshot
from core.admin_tags import TagInline
from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
from archivebox.core.models import Tag, Snapshot
from archivebox.core.admin_tags import TagInline
from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -59,7 +59,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', 'tags__name')
fieldsets = (
('URL', {
@@ -75,7 +75,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
'classes': ('card',),
}),
('Relations', {
'fields': ('crawl', 'created_by', 'tags_str'),
'fields': ('crawl', 'tags_str'),
'classes': ('card',),
}),
('Config', {

View File

@@ -6,7 +6,7 @@ from django.utils.html import format_html, mark_safe
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from core.models import Tag
from archivebox.core.models import Tag
class TagInline(admin.TabularInline):

View File

@@ -4,9 +4,9 @@ from django.apps import AppConfig
class CoreConfig(AppConfig):
name = 'core'
name = 'archivebox.core'
def ready(self):
"""Register the archivebox.core.admin_site as the main django admin site"""
from core.admin_site import register_admin_site
from archivebox.core.admin_site import register_admin_site
register_admin_site()

View File

@@ -20,7 +20,7 @@ application = get_asgi_application()
# from channels.routing import ProtocolTypeRouter, URLRouter
# from channels.auth import AuthMiddlewareStack
# from channels.security.websocket import AllowedHostsOriginValidator
# from core.routing import websocket_urlpatterns
# from archivebox.core.routing import websocket_urlpatterns
#
# application = ProtocolTypeRouter({
# "http": get_asgi_application(),

View File

@@ -4,10 +4,14 @@ from django import forms
from archivebox.misc.util import URL_REGEX
from taggit.utils import edit_string_for_tags, parse_tags
from archivebox.base_models.admin import KeyValueWidget
DEPTH_CHOICES = (
('0', 'depth = 0 (archive just these URLs)'),
('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
('1', 'depth = 1 (+ URLs one hop away)'),
('2', 'depth = 2 (+ URLs two hops away)'),
('3', 'depth = 3 (+ URLs three hops away)'),
('4', 'depth = 4 (+ URLs four hops away)'),
)
from archivebox.hooks import get_plugins
@@ -18,39 +22,180 @@ def get_plugin_choices():
class AddLinkForm(forms.Form):
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
depth = forms.ChoiceField(label="Archive depth", choices=DEPTH_CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
plugins = forms.MultipleChoiceField(
label="Plugins (select at least 1, otherwise all will be used by default)",
# Basic fields
url = forms.RegexField(
label="URLs (one per line)",
regex=URL_REGEX,
min_length='6',
strip=True,
widget=forms.Textarea,
required=True
)
tag = forms.CharField(
label="Tags (comma separated tag1,tag2,tag3)",
strip=True,
required=False,
widget=forms.TextInput(attrs={
'list': 'tag-datalist',
'autocomplete': 'off',
})
)
depth = forms.ChoiceField(
label="Archive depth",
choices=DEPTH_CHOICES,
initial='0',
widget=forms.RadioSelect(attrs={"class": "depth-selection"})
)
notes = forms.CharField(
label="Notes",
strip=True,
required=False,
widget=forms.Textarea(attrs={
'rows': 3,
'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
})
)
# Plugin groups
chrome_plugins = forms.MultipleChoiceField(
label="Chrome-dependent plugins",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[], # populated in __init__
)
archiving_plugins = forms.MultipleChoiceField(
label="Archiving",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[],
)
parsing_plugins = forms.MultipleChoiceField(
label="Parsing",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[],
)
search_plugins = forms.MultipleChoiceField(
label="Search",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[],
)
binary_plugins = forms.MultipleChoiceField(
label="Binary providers",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[],
)
extension_plugins = forms.MultipleChoiceField(
label="Browser extensions",
required=False,
widget=forms.CheckboxSelectMultiple,
choices=[],
)
# Advanced options
schedule = forms.CharField(
label="Repeat schedule",
max_length=64,
required=False,
widget=forms.TextInput(attrs={
'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
})
)
persona = forms.CharField(
label="Persona (authentication profile)",
max_length=100,
initial='Default',
required=False,
)
overwrite = forms.BooleanField(
label="Overwrite existing snapshots",
initial=False,
required=False,
)
update = forms.BooleanField(
label="Update/retry previously failed URLs",
initial=False,
required=False,
)
index_only = forms.BooleanField(
label="Index only (don't archive yet)",
initial=False,
required=False,
)
config = forms.JSONField(
label="Custom config overrides",
widget=KeyValueWidget(),
initial=dict,
required=False,
widget=forms.SelectMultiple,
choices=[], # populated dynamically in __init__
)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.fields['plugins'].choices = get_plugin_choices()
# TODO: hook these up to the view and put them
# in a collapsible UI section labeled "Advanced"
#
# exclude_patterns = forms.CharField(
# label="Exclude patterns",
# min_length='1',
# required=False,
# initial=URL_DENYLIST,
# )
# timeout = forms.IntegerField(
# initial=TIMEOUT,
# )
# overwrite = forms.BooleanField(
# label="Overwrite any existing Snapshots",
# initial=False,
# )
# index_only = forms.BooleanField(
# label="Add URLs to index without Snapshotting",
# initial=False,
# )
# Import at runtime to avoid circular imports
from archivebox.config.common import ARCHIVING_CONFIG
# Get all plugins
all_plugins = get_plugins()
# Define plugin groups
chrome_dependent = {
'accessibility', 'chrome', 'consolelog', 'dom', 'headers',
'parse_dom_outlinks', 'pdf', 'redirects', 'responses',
'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
}
archiving = {
'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget'
}
parsing = {
'parse_html_urls', 'parse_jsonl_urls',
'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls'
}
search = {
'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
}
binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
# Populate plugin field choices
self.fields['chrome_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in chrome_dependent
]
self.fields['archiving_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in archiving
]
self.fields['parsing_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in parsing
]
self.fields['search_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in search
]
self.fields['binary_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in binary
]
self.fields['extension_plugins'].choices = [
(p, p) for p in sorted(all_plugins) if p in extensions
]
# Set update default from config
self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW
def clean(self):
cleaned_data = super().clean()
# Combine all plugin groups into single list
all_selected_plugins = []
for field in ['chrome_plugins', 'archiving_plugins', 'parsing_plugins',
'search_plugins', 'binary_plugins', 'extension_plugins']:
all_selected_plugins.extend(cleaned_data.get(field, []))
# Store combined list for easy access
cleaned_data['plugins'] = all_selected_plugins
return cleaned_data
class TagWidgetMixin:
def format_value(self, value):

View File

@@ -12,7 +12,7 @@ try:
ARCHIVE_DIR = CONSTANTS.ARCHIVE_DIR
except ImportError:
try:
from config import CONFIG
from archivebox.config import CONFIG
ARCHIVE_DIR = Path(CONFIG.get('ARCHIVE_DIR', './archive'))
except ImportError:
ARCHIVE_DIR = Path('./archive')

View File

@@ -11,7 +11,7 @@ class Migration(migrations.Migration):
dependencies = [
('core', '0031_snapshot_parent_snapshot'),
('crawls', '0004_alter_crawl_output_dir'),
('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
('machine', '0004_drop_dependency_table'), # Changed from 0003 - wait until Dependency is dropped
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]

View File

@@ -0,0 +1,79 @@
# Generated migration
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
def create_catchall_crawls_and_assign_snapshots(apps, schema_editor):
    """
    Backfill migration: ensure every Snapshot has a Crawl.

    For each user that owns "orphaned" snapshots (crawl IS NULL), create one
    catchall Crawl and point all of that user's orphaned snapshots at it.
    Runs before the NOT NULL constraint is applied to Snapshot.crawl.

    Args:
        apps: Django migration app registry (provides historical models).
        schema_editor: Django schema editor (unused; required by RunPython).
    """
    from collections import defaultdict

    Snapshot = apps.get_model('core', 'Snapshot')
    Crawl = apps.get_model('crawls', 'Crawl')
    User = apps.get_model(settings.AUTH_USER_MODEL)

    snapshots_without_crawl = Snapshot.objects.filter(crawl__isnull=True)
    if not snapshots_without_crawl.exists():
        return

    # Group orphaned snapshot pks by owner so we create exactly one catchall
    # crawl per user. values_list avoids loading full model instances.
    snapshot_pks_by_user = defaultdict(list)
    for pk, user_id in snapshots_without_crawl.values_list('pk', 'created_by_id'):
        snapshot_pks_by_user[user_id].append(pk)

    for user_id, snapshot_pks in snapshot_pks_by_user.items():
        # Username is informational only; tolerate dangling created_by ids.
        try:
            username = User.objects.get(pk=user_id).username
        except User.DoesNotExist:
            username = 'unknown'

        # Create catchall crawl for this user
        crawl = Crawl.objects.create(
            urls=f'# Catchall crawl for {len(snapshot_pks)} snapshots without a crawl',
            max_depth=0,
            label=f'[migration] catchall for user {username}',
            created_by_id=user_id,
        )

        # Single UPDATE per user instead of one save() per snapshot.
        # Equivalent to the per-row save(update_fields=['crawl']): only the
        # crawl column is written either way.
        Snapshot.objects.filter(pk__in=snapshot_pks).update(crawl=crawl)
class Migration(migrations.Migration):
    """Make Snapshot.crawl mandatory and drop Snapshot.created_by.

    Operation ordering matters: orphaned snapshots are first assigned to
    per-user catchall crawls so the NOT NULL constraint can be applied
    safely, and created_by is only removed afterwards because ownership
    then becomes reachable via snapshot.crawl.created_by.
    """

    dependencies = [
        ('core', '0034_snapshot_current_step'),
        ('crawls', '0004_alter_crawl_output_dir'),
    ]

    operations = [
        # Step 1: Assign all snapshots without a crawl to catchall crawls
        # (data migration; noop reverse leaves the catchall crawls in place)
        migrations.RunPython(
            create_catchall_crawls_and_assign_snapshots,
            reverse_code=migrations.RunPython.noop,
        ),
        # Step 2: Make crawl non-nullable
        migrations.AlterField(
            model_name='snapshot',
            name='crawl',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
        ),
        # Step 3: Remove created_by field
        migrations.RemoveField(
            model_name='snapshot',
            name='created_by',
        ),
    ]

View File

@@ -0,0 +1,19 @@
# Generated migration
from django.db import migrations
class Migration(migrations.Migration):
    """Drop ArchiveResult.created_by.

    Depends on 0035 (Snapshot.crawl made non-nullable) so that ownership
    is always reachable via snapshot.crawl.created_by instead.
    """

    dependencies = [
        ('core', '0035_snapshot_crawl_non_nullable_remove_created_by'),
    ]

    operations = [
        # Remove created_by field from ArchiveResult
        # No data migration needed - created_by can be accessed via snapshot.crawl.created_by
        migrations.RemoveField(
            model_name='archiveresult',
            name='created_by',
        ),
    ]

View File

@@ -9,6 +9,8 @@ import os
import json
from pathlib import Path
from statemachine import State, registry
from django.db import models
from django.db.models import QuerySet, Value, Case, When, IntegerField
from django.utils.functional import cached_property
@@ -33,10 +35,10 @@ from archivebox.base_models.models import (
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
get_or_create_system_user_pk,
)
from workers.models import ModelWithStateMachine
from workers.tasks import bg_archive_snapshot
from crawls.models import Crawl
from machine.models import NetworkInterface, Binary
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
from archivebox.workers.tasks import bg_archive_snapshot
from archivebox.crawls.models import Crawl
from archivebox.machine.models import NetworkInterface, Binary
@@ -53,6 +55,7 @@ class Tag(ModelWithSerializers):
snapshot_set: models.Manager['Snapshot']
class Meta(TypedModelMeta):
app_label = 'core'
verbose_name = "Tag"
verbose_name_plural = "Tags"
@@ -122,6 +125,7 @@ class SnapshotTag(models.Model):
tag = models.ForeignKey(Tag, db_column='tag_id', on_delete=models.CASCADE, to_field='id')
class Meta:
app_label = 'core'
db_table = 'core_snapshot_tags'
unique_together = [('snapshot', 'tag')]
@@ -263,52 +267,6 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
# Import Methods
# =========================================================================
def create_or_update_from_dict(self, link_dict: Dict[str, Any], created_by_id: Optional[int] = None) -> 'Snapshot':
"""Create or update a Snapshot from a SnapshotDict (parser output)"""
import re
from archivebox.config.common import GENERAL_CONFIG
url = link_dict['url']
timestamp = link_dict.get('timestamp')
title = link_dict.get('title')
tags_str = link_dict.get('tags')
tag_list = []
if tags_str:
tag_list = list(dict.fromkeys(
tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
if tag.strip()
))
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
snapshot = self.filter(url=url).order_by('-created_at').first()
if snapshot:
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
snapshot.title = title
snapshot.save(update_fields=['title', 'modified_at'])
else:
if timestamp:
while self.filter(timestamp=timestamp).exists():
timestamp = str(float(timestamp) + 1.0)
snapshot = self.create(
url=url,
timestamp=timestamp,
title=title,
created_by_id=created_by_id or get_or_create_system_user_pk(),
)
if tag_list:
existing_tags = set(snapshot.tags.values_list('name', flat=True))
new_tags = set(tag_list) | existing_tags
snapshot.save_tags(new_tags)
return snapshot
def create_from_dicts(self, link_dicts: List[Dict[str, Any]], created_by_id: Optional[int] = None) -> List['Snapshot']:
"""Create or update multiple Snapshots from a list of SnapshotDicts"""
return [self.create_or_update_from_dict(d, created_by_id=created_by_id) for d in link_dicts]
def remove(self, atomic: bool = False) -> tuple:
"""Remove snapshots from the database"""
from django.db import transaction
@@ -320,14 +278,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='snapshot_set', db_index=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, related_name='snapshot_set', db_index=True) # type: ignore[assignment]
parent_snapshot = models.ForeignKey('self', on_delete=models.SET_NULL, null=True, blank=True, related_name='child_snapshots', db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)')
title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
@@ -344,7 +301,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
state_machine_name = 'core.statemachines.SnapshotMachine'
state_machine_name = 'core.models.SnapshotMachine'
state_field_name = 'status'
retry_at_field_name = 'retry_at'
StatusChoices = ModelWithStateMachine.StatusChoices
@@ -354,6 +311,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
archiveresult_set: models.Manager['ArchiveResult']
class Meta(TypedModelMeta):
app_label = 'core'
verbose_name = "Snapshot"
verbose_name_plural = "Snapshots"
constraints = [
@@ -366,6 +324,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def __str__(self):
return f'[{self.id}] {self.url[:64]}'
@property
def created_by(self):
"""Convenience property to access the user who created this snapshot via its crawl."""
return self.crawl.created_by
def save(self, *args, **kwargs):
is_new = self._state.adding
if not self.bookmarked_at:
@@ -395,7 +358,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
self.fs_version = target
super().save(*args, **kwargs)
if self.crawl and self.url not in self.crawl.urls:
if self.url not in self.crawl.urls:
self.crawl.urls += f'\n{self.url}'
self.crawl.save()
@@ -408,7 +371,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
url=self.url,
metadata={
'id': str(self.id),
'crawl_id': str(self.crawl_id) if self.crawl_id else None,
'crawl_id': str(self.crawl_id),
'depth': self.depth,
'status': self.status,
},
@@ -437,20 +400,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return self.fs_version != self._fs_current_version()
def _fs_next_version(self, version: str) -> str:
"""Get next version in migration chain"""
chain = ['0.7.0', '0.8.0', '0.9.0']
try:
idx = chain.index(version)
return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
except ValueError:
# Unknown version - skip to current
return self._fs_current_version()
def _fs_migrate_from_0_7_0_to_0_8_0(self):
"""Migration from 0.7.0 to 0.8.0 layout (no-op)"""
# 0.7 and 0.8 both used archive/<timestamp>
# Nothing to do!
pass
"""Get next version in migration chain (0.7/0.8 had same layout, only 0.8→0.9 migration needed)"""
# Treat 0.7.0 and 0.8.0 as equivalent (both used archive/{timestamp})
if version in ('0.7.0', '0.8.0'):
return '0.9.0'
return self._fs_current_version()
def _fs_migrate_from_0_8_0_to_0_9_0(self):
"""
@@ -578,7 +532,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return CONSTANTS.ARCHIVE_DIR / self.timestamp
elif version in ('0.9.0', '1.0.0'):
username = self.created_by.username if self.created_by else 'unknown'
username = self.created_by.username
# Use created_at for date grouping (fallback to timestamp)
if self.created_at:
@@ -875,7 +829,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
pwd=result_data.get('pwd', str(self.output_dir)),
start_ts=start_ts,
end_ts=end_ts,
created_by=self.created_by,
)
except:
pass
@@ -1069,6 +1022,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
result = archive_results.get(plugin)
existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
icon = get_plugin_icon(plugin)
# Skip plugins with empty icons that have no output
# (e.g., staticfile only shows when there's actual output)
if not icon.strip() and not existing:
continue
output += format_html(
output_template,
path,
@@ -1139,9 +1098,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def run(self) -> list['ArchiveResult']:
"""
Execute this Snapshot by creating ArchiveResults for all enabled extractors.
Execute snapshot by creating pending ArchiveResults for all enabled hooks.
Called by the state machine when entering the 'started' state.
Called by: SnapshotMachine.enter_started()
Hook Lifecycle:
1. discover_hooks('Snapshot') → finds all plugin hooks
2. For each hook:
- Create ArchiveResult with status=QUEUED
- Store hook_name (e.g., 'on_Snapshot__50_wget.py')
3. ArchiveResults execute independently via ArchiveResultMachine
4. Hook execution happens in ArchiveResult.run(), NOT here
Returns:
list[ArchiveResult]: Newly created pending results
"""
return self.create_pending_archiveresults()
@@ -1152,28 +1122,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
Called by the state machine when entering the 'sealed' state.
Kills any background hooks and finalizes their ArchiveResults.
"""
from pathlib import Path
from archivebox.hooks import kill_process
# Kill any background ArchiveResult hooks
if not self.OUTPUT_DIR.exists():
return
for plugin_dir in self.OUTPUT_DIR.iterdir():
if not plugin_dir.is_dir():
continue
pid_file = plugin_dir / 'hook.pid'
if pid_file.exists():
kill_process(pid_file, validate=True) # Use validation
# Find all .pid files in this snapshot's output directory
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
kill_process(pid_file, validate=True)
# Update the ArchiveResult from filesystem
plugin_name = plugin_dir.name
results = self.archiveresult_set.filter(
status=ArchiveResult.StatusChoices.STARTED,
pwd__contains=plugin_name
)
for ar in results:
ar.update_from_output()
# Update all STARTED ArchiveResults from filesystem
results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
for ar in results:
ar.update_from_output()
def has_running_background_hooks(self) -> bool:
"""
@@ -1196,51 +1158,156 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return False
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
"""
Create/update Snapshot from JSONL record.
Create/update Snapshot from JSONL record or dict.
Unified method that handles:
- ID-based patching: {"id": "...", "title": "new title"}
- URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
- Auto-creates Crawl if not provided
- Optionally queues for extraction
Args:
record: JSONL record with 'url' field and optional metadata
record: Dict with 'url' (for create) or 'id' (for patch), plus other fields
overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)
Returns:
Snapshot instance or None
Note:
Filtering (depth, URL allowlist/denylist) should be done by caller
BEFORE calling this method. This method just creates the snapshot.
"""
from archivebox.misc.jsonl import get_or_create_snapshot
import re
from django.utils import timezone
from archivebox.misc.util import parse_date
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.config.common import GENERAL_CONFIG
overrides = overrides or {}
# If 'id' is provided, lookup and patch that specific snapshot
snapshot_id = record.get('id')
if snapshot_id:
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
# Generically update all fields present in record
update_fields = []
for field_name, value in record.items():
# Skip internal fields
if field_name in ('id', 'type'):
continue
# Skip if field doesn't exist on model
if not hasattr(snapshot, field_name):
continue
# Special parsing for date fields
if field_name in ('bookmarked_at', 'retry_at', 'created_at', 'modified_at'):
if value and isinstance(value, str):
value = parse_date(value)
# Update field if value is provided and different
if value is not None and getattr(snapshot, field_name) != value:
setattr(snapshot, field_name, value)
update_fields.append(field_name)
if update_fields:
snapshot.save(update_fields=update_fields + ['modified_at'])
return snapshot
except Snapshot.DoesNotExist:
# ID not found, fall through to create-by-URL logic
pass
url = record.get('url')
if not url:
return None
# Apply crawl context metadata
# Determine or create crawl (every snapshot must have a crawl)
crawl = overrides.get('crawl')
snapshot = overrides.get('snapshot') # Parent snapshot
parent_snapshot = overrides.get('snapshot') # Parent snapshot
created_by_id = overrides.get('created_by_id') or (parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk())
if crawl:
record.setdefault('crawl_id', str(crawl.id))
record.setdefault('depth', (snapshot.depth + 1 if snapshot else 1))
if snapshot:
record.setdefault('parent_snapshot_id', str(snapshot.id))
# If no crawl provided, inherit from parent or auto-create one
if not crawl:
if parent_snapshot:
# Inherit crawl from parent snapshot
crawl = parent_snapshot.crawl
else:
# Auto-create a single-URL crawl
from archivebox.crawls.models import Crawl
from archivebox.config import CONSTANTS
try:
created_by_id = overrides.get('created_by_id') or (snapshot.created_by_id if snapshot else None)
new_snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
timestamp_str = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
sources_file = CONSTANTS.SOURCES_DIR / f'{timestamp_str}__auto_crawl.txt'
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text(url)
# Queue for extraction
new_snapshot.status = Snapshot.StatusChoices.QUEUED
new_snapshot.retry_at = timezone.now()
new_snapshot.save()
crawl = Crawl.objects.create(
urls=url,
max_depth=0,
label=f'auto-created for {url[:50]}',
created_by_id=created_by_id,
)
return new_snapshot
except ValueError:
return None
# Parse tags
tags_str = record.get('tags', '')
tag_list = []
if tags_str:
tag_list = list(dict.fromkeys(
tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
if tag.strip()
))
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()
title = record.get('title')
timestamp = record.get('timestamp')
if snapshot:
# Update existing snapshot
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
snapshot.title = title
snapshot.save(update_fields=['title', 'modified_at'])
else:
# Create new snapshot
if timestamp:
while Snapshot.objects.filter(timestamp=timestamp).exists():
timestamp = str(float(timestamp) + 1.0)
snapshot = Snapshot.objects.create(
url=url,
timestamp=timestamp,
title=title,
crawl=crawl,
)
# Update tags
if tag_list:
existing_tags = set(snapshot.tags.values_list('name', flat=True))
new_tags = set(tag_list) | existing_tags
snapshot.save_tags(new_tags)
# Queue for extraction and update additional fields
update_fields = []
if queue_for_extraction:
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.retry_at = timezone.now()
update_fields.extend(['status', 'retry_at'])
# Update additional fields if provided
for field_name in ('depth', 'parent_snapshot_id', 'crawl_id', 'bookmarked_at'):
value = record.get(field_name)
if value is not None and getattr(snapshot, field_name) != value:
setattr(snapshot, field_name, value)
update_fields.append(field_name)
if update_fields:
snapshot.save(update_fields=update_fields + ['modified_at'])
return snapshot
def create_pending_archiveresults(self) -> list['ArchiveResult']:
"""
@@ -1273,7 +1340,6 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'plugin': plugin,
'status': ArchiveResult.INITIAL_STATE,
'retry_at': timezone.now(),
'created_by_id': self.created_by_id,
},
)
if archiveresult.status == ArchiveResult.INITIAL_STATE:
@@ -1329,6 +1395,36 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
self.save(update_fields=['current_step', 'modified_at'])
return True
def is_finished_processing(self) -> bool:
    """
    Check if this snapshot has finished processing.

    Used by SnapshotMachine.is_finished() to determine if snapshot is complete.

    NOTE: not a pure predicate — advance_step_if_ready() is called in a loop
    below and (per its implementation) increments self.current_step as a side
    effect, so calling this method can move the snapshot forward a step.

    Returns:
        True if all archiveresults are finished (or no work to do), False otherwise.
    """
    # if no archiveresults exist yet, it's not finished
    # (snapshot.run() has not yet created the pending ArchiveResults)
    if not self.archiveresult_set.exists():
        return False

    # Try to advance step if ready (handles step-based hook execution)
    # This will increment current_step when all foreground hooks in current step are done
    while self.advance_step_if_ready():
        pass  # Keep advancing until we can't anymore

    # if archiveresults exist but are still pending, it's not finished
    if self.pending_archiveresults().exists():
        return False

    # Don't wait for background hooks - they'll be cleaned up on entering sealed state
    # Background hooks in STARTED state are excluded by pending_archiveresults()
    # (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
    # we can transition to sealed and cleanup() will kill the background hooks

    # otherwise archiveresults exist and are all finished, so it's finished
    return True
def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
"""
Reset failed/skipped ArchiveResults to queued for retry.
@@ -1730,6 +1826,97 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None
# =============================================================================
# Snapshot State Machine
# =============================================================================
class SnapshotMachine(BaseStateMachine, strict_states=True):
    """
    State machine for managing Snapshot lifecycle.

    Hook Lifecycle:

    ┌─────────────────────────────────────────────────────────────┐
    │ QUEUED State                                                │
    │  • Waiting for snapshot to be ready                         │
    └─────────────────────────────────────────────────────────────┘
                  ↓ tick() when can_start()
    ┌─────────────────────────────────────────────────────────────┐
    │ STARTED State → enter_started()                             │
    │  1. snapshot.run()                                          │
    │     • discover_hooks('Snapshot') → finds all plugin hooks   │
    │     • create_pending_archiveresults() → creates ONE         │
    │       ArchiveResult per hook (NO execution yet)             │
    │  2. ArchiveResults process independently with their own     │
    │     state machines (see ArchiveResultMachine)               │
    │  3. Advance through steps 0-9 as foreground hooks complete  │
    └─────────────────────────────────────────────────────────────┘
                  ↓ tick() when is_finished()
    ┌─────────────────────────────────────────────────────────────┐
    │ SEALED State → enter_sealed()                               │
    │  • cleanup() → kills any background hooks still running     │
    │  • Set retry_at=None (no more processing)                   │
    └─────────────────────────────────────────────────────────────┘

    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
    """

    # Attribute name on this machine holding the bound model instance
    # (i.e. self.snapshot), consumed by BaseStateMachine.
    model_attr_name = 'snapshot'

    # States — values mirror Snapshot.StatusChoices so the DB status column
    # and the machine state stay in lockstep.
    queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
    started = State(value=Snapshot.StatusChoices.STARTED)
    sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)

    # Tick Event — each tick() either holds the current state (to.itself) or
    # advances it, guarded by the cond/unless predicate methods below.
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(sealed, cond='is_finished')
    )

    def can_start(self) -> bool:
        # A snapshot is only eligible to start once it has a URL to archive.
        can_start = bool(self.snapshot.url)
        return can_start

    def is_finished(self) -> bool:
        """Check if snapshot processing is complete - delegates to model method."""
        return self.snapshot.is_finished_processing()

    @queued.enter
    def enter_queued(self):
        # Persist QUEUED status and make the snapshot immediately eligible
        # for the next worker tick (retry_at=now).
        self.snapshot.update_and_requeue(
            retry_at=timezone.now(),
            status=Snapshot.StatusChoices.QUEUED,
        )

    @started.enter
    def enter_started(self):
        # lock the snapshot while we create the pending archiveresults
        self.snapshot.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=30),  # if failed, wait 30s before retrying
        )

        # Run the snapshot - creates pending archiveresults for all enabled plugins
        self.snapshot.run()

        # unlock the snapshot after we're done + set status = started
        self.snapshot.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=5),  # check again in 5s
            status=Snapshot.StatusChoices.STARTED,
        )

    @sealed.enter
    def enter_sealed(self):
        # Clean up background hooks
        self.snapshot.cleanup()
        # retry_at=None permanently removes this snapshot from the work queue.
        self.snapshot.update_and_requeue(
            retry_at=None,
            status=Snapshot.StatusChoices.SEALED,
        )
class ArchiveResultManager(models.Manager):
def indexable(self, sorted: bool = True):
INDEXABLE_METHODS = [r[0] for r in EXTRACTOR_INDEXING_PRECEDENCE]
@@ -1761,7 +1948,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Note: unique constraint is added by migration 0027 - don't set unique=True here
# or SQLite table recreation in earlier migrations will fail
uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='archiveresult_set', db_index=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -1782,7 +1968,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Binary FK (optional - set when hook reports cmd)
binary = models.ForeignKey(
'machine.Binary',
Binary,
on_delete=models.SET_NULL,
null=True, blank=True,
related_name='archiveresults',
@@ -1798,7 +1984,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
iface = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True)
state_machine_name = 'core.statemachines.ArchiveResultMachine'
state_machine_name = 'core.models.ArchiveResultMachine'
retry_at_field_name = 'retry_at'
state_field_name = 'status'
active_state = StatusChoices.STARTED
@@ -1806,12 +1992,18 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
objects = ArchiveResultManager()
class Meta(TypedModelMeta):
app_label = 'core'
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results Log'
def __str__(self):
    # Human-readable form: "[<id>] <url truncated to 64 chars> -> <plugin>"
    truncated_url = self.snapshot.url[:64]
    return '[{}] {} -> {}'.format(self.id, truncated_url, self.plugin)
@property
def created_by(self):
    """Convenience property to access the user who created this archive result via its snapshot's crawl.

    NOTE(review): assumes self.snapshot.crawl is always set; if crawl can be
    None this raises AttributeError — confirm crawl is a required FK on Snapshot.
    """
    return self.snapshot.crawl.created_by
def save(self, *args, **kwargs):
is_new = self._state.adding
# Skip ModelWithOutputDir.save() to avoid creating index.json in plugin directories
@@ -1900,6 +2092,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def save_search_index(self):
    # Intentional no-op override — presumably disables the inherited
    # per-object search-index write for ArchiveResults (indexing appears to
    # be handled elsewhere via hooks). TODO(review): confirm against the
    # ModelWith* base classes.
    pass
def cascade_health_update(self, success: bool):
    """Update health stats for self, parent Snapshot, and grandparent Crawl."""
    # Walk up the ownership chain, bumping each object's success/failure counters.
    for record in (self, self.snapshot, self.snapshot.crawl):
        record.increment_health_stats(success)
def run(self):
"""
Execute this ArchiveResult's hook and update status.
@@ -1911,8 +2109,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
"""
from django.utils import timezone
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
from archivebox.config.configset import get_config
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
# Get merged config with proper context
config = get_config(
crawl=self.snapshot.crawl,
snapshot=self.snapshot,
)
# Determine which hook(s) to run
hooks = []
@@ -1962,10 +2165,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
result = run_hook(
hook,
output_dir=plugin_dir,
config_objects=config_objects,
config=config,
url=self.snapshot.url,
snapshot_id=str(self.snapshot.id),
crawl_id=str(self.snapshot.crawl.id) if self.snapshot.crawl else None,
crawl_id=str(self.snapshot.crawl.id),
depth=self.snapshot.depth,
)
@@ -2112,9 +2315,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Filter Snapshot records for depth/URL constraints
if record_type == 'Snapshot':
if not self.snapshot.crawl:
continue
url = record.get('url')
if not url:
continue
@@ -2132,19 +2332,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
overrides = {
'snapshot': self.snapshot,
'crawl': self.snapshot.crawl,
'created_by_id': self.snapshot.created_by_id,
'created_by_id': self.created_by.pk,
}
process_hook_records(filtered_records, overrides=overrides)
# Update snapshot title if this is the title plugin
plugin_name = get_plugin_name(self.plugin)
if self.status == self.StatusChoices.SUCCEEDED and plugin_name == 'title':
self._update_snapshot_title(plugin_dir)
# Trigger search indexing if succeeded
if self.status == self.StatusChoices.SUCCEEDED:
self.trigger_search_indexing()
# Cleanup PID files and empty logs
pid_file = plugin_dir / 'hook.pid'
pid_file.unlink(missing_ok=True)
@@ -2164,7 +2355,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if not cmd:
return
from machine.models import Machine
from archivebox.machine.models import Machine
bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
machine = Machine.current()
@@ -2189,23 +2380,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if binary:
self.binary = binary
def _update_snapshot_title(self, plugin_dir: Path):
    """
    Update snapshot title from title plugin output.

    The title plugin writes title.txt with the extracted page title.
    This updates the Snapshot.title field if the file exists and has content,
    preferring the longer of the existing vs. extracted title.
    """
    title_file = plugin_dir / 'title.txt'
    if not title_file.exists():
        return
    try:
        extracted = title_file.read_text(encoding='utf-8').strip()
        current = self.snapshot.title
        # Only overwrite when we extracted something and it is longer
        # (or there is no title yet).
        if extracted and (not current or len(extracted) > len(current or '')):
            self.snapshot.title = extracted[:512]  # Max length from model
            self.snapshot.save(update_fields=['title', 'modified_at'])
    except Exception:
        pass  # Failed to read title, that's okay
def _url_passes_filters(self, url: str) -> bool:
"""Check if URL passes URL_ALLOWLIST and URL_DENYLIST config filters.
@@ -2216,8 +2390,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Get merged config with proper hierarchy
config = get_config(
user=self.snapshot.created_by if self.snapshot else None,
crawl=self.snapshot.crawl if self.snapshot else None,
user=self.created_by,
crawl=self.snapshot.crawl,
snapshot=self.snapshot,
)
@@ -2256,23 +2430,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
return False # No allowlist patterns matched
return True # No filters or passed filters
def trigger_search_indexing(self):
    """Invoke every registered ArchiveResult__index hook so search indexes pick up this result."""
    from archivebox.hooks import discover_hooks, run_hook

    snapshot = self.snapshot
    # Config objects are ordered lowest-to-highest priority (later overrides
    # earlier); the crawl layer is only present when this snapshot has a crawl.
    if snapshot.crawl:
        config_objects = [snapshot.crawl, snapshot]
    else:
        config_objects = [snapshot]

    for hook in discover_hooks('ArchiveResult__index'):
        run_hook(
            hook,
            output_dir=self.output_dir,
            config_objects=config_objects,
            url=snapshot.url,
            snapshot_id=str(snapshot.id),
            plugin=self.plugin,
        )
@property
def output_dir(self) -> Path:
@@ -2285,4 +2442,185 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if not plugin_dir:
return False
pid_file = plugin_dir / 'hook.pid'
return pid_file.exists()
return pid_file.exists()
# =============================================================================
# ArchiveResult State Machine
# =============================================================================
class ArchiveResultMachine(BaseStateMachine, strict_states=True):
    """
    State machine for managing ArchiveResult (single plugin execution) lifecycle.

    Hook Lifecycle:

    ┌─────────────────────────────────────────────────────────────┐
    │ QUEUED State                                                │
    │  • Waiting for its turn to run                              │
    └─────────────────────────────────────────────────────────────┘
                  ↓ tick() when can_start()
    ┌─────────────────────────────────────────────────────────────┐
    │ STARTED State → enter_started()                             │
    │  1. archiveresult.run()                                     │
    │     • Find specific hook by hook_name                       │
    │     • run_hook(script, output_dir, ...) → subprocess        │
    │                                                             │
    │  2a. FOREGROUND hook (returns HookResult):                  │
    │      • update_from_output() immediately                     │
    │        - Read stdout.log                                    │
    │        - Parse JSONL records                                │
    │        - Extract 'ArchiveResult' record → update status     │
    │        - Walk output_dir → populate output_files            │
    │        - Call process_hook_records() for side effects       │
    │                                                             │
    │  2b. BACKGROUND hook (returns None):                        │
    │      • Status stays STARTED                                 │
    │      • Continues running in background                      │
    │      • Killed by Snapshot.cleanup() when sealed             │
    └─────────────────────────────────────────────────────────────┘
                  ↓ tick() checks status
    ┌─────────────────────────────────────────────────────────────┐
    │ SUCCEEDED / FAILED / SKIPPED / BACKOFF                      │
    │  • Set by hook's JSONL output during update_from_output()   │
    │  • Health stats incremented (num_uses_succeeded/failed)     │
    │  • Parent Snapshot health stats also updated                │
    └─────────────────────────────────────────────────────────────┘

    https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
    """

    # Attribute name on this machine holding the bound model instance
    # (i.e. self.archiveresult), consumed by BaseStateMachine.
    model_attr_name = 'archiveresult'

    # States — values mirror ArchiveResult.StatusChoices so the DB status
    # column and the machine state stay in lockstep.
    queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
    started = State(value=ArchiveResult.StatusChoices.STARTED)
    backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
    succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
    failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
    skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)

    # Tick Event - transitions based on conditions
    # (the is_* predicates just read the status already set by run())
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(succeeded, cond='is_succeeded') |
        started.to(failed, cond='is_failed') |
        started.to(skipped, cond='is_skipped') |
        started.to(backoff, cond='is_backoff') |
        backoff.to.itself(unless='can_start') |
        backoff.to(started, cond='can_start') |
        backoff.to(succeeded, cond='is_succeeded') |
        backoff.to(failed, cond='is_failed') |
        backoff.to(skipped, cond='is_skipped')
    )

    def can_start(self) -> bool:
        # Can only run once the parent snapshot has a URL to operate on.
        can_start = bool(self.archiveresult.snapshot.url)
        return can_start

    def is_succeeded(self) -> bool:
        """Check if extractor plugin succeeded (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED

    def is_failed(self) -> bool:
        """Check if extractor plugin failed (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED

    def is_skipped(self) -> bool:
        """Check if extractor plugin was skipped (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED

    def is_backoff(self) -> bool:
        """Check if we should backoff and retry later."""
        # Backoff if status is still started (plugin didn't complete) and output_str is empty
        return (
            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
            not self.archiveresult.output_str
        )

    def is_finished(self) -> bool:
        """Check if extraction has completed (success, failure, or skipped)."""
        return self.archiveresult.status in (
            ArchiveResult.StatusChoices.SUCCEEDED,
            ArchiveResult.StatusChoices.FAILED,
            ArchiveResult.StatusChoices.SKIPPED,
        )

    @queued.enter
    def enter_queued(self):
        # Persist QUEUED status and make this result immediately eligible
        # for the next worker tick; also clears any stale start_ts.
        self.archiveresult.update_and_requeue(
            retry_at=timezone.now(),
            status=ArchiveResult.StatusChoices.QUEUED,
            start_ts=None,
        )  # bump the snapshot's retry_at so they pickup any new changes

    @started.enter
    def enter_started(self):
        from archivebox.machine.models import NetworkInterface

        # Lock the object and mark start time
        self.archiveresult.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=120),  # 2 min timeout for plugin
            status=ArchiveResult.StatusChoices.STARTED,
            start_ts=timezone.now(),
            iface=NetworkInterface.current(),
        )

        # Run the plugin - this updates status, output, timestamps, etc.
        self.archiveresult.run()

        # Save the updated result
        self.archiveresult.save()

    @backoff.enter
    def enter_backoff(self):
        # Plugin didn't complete in time — schedule a retry in 60s and
        # clear end_ts since the run is not actually over.
        self.archiveresult.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=60),
            status=ArchiveResult.StatusChoices.BACKOFF,
            end_ts=None,
        )

    @succeeded.enter
    def enter_succeeded(self):
        # retry_at=None removes this result from the work queue for good.
        self.archiveresult.update_and_requeue(
            retry_at=None,
            status=ArchiveResult.StatusChoices.SUCCEEDED,
            end_ts=timezone.now(),
        )

        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
        self.archiveresult.cascade_health_update(success=True)

    @failed.enter
    def enter_failed(self):
        self.archiveresult.update_and_requeue(
            retry_at=None,
            status=ArchiveResult.StatusChoices.FAILED,
            end_ts=timezone.now(),
        )

        # Update health stats for ArchiveResult, Snapshot, and Crawl cascade
        self.archiveresult.cascade_health_update(success=False)

    @skipped.enter
    def enter_skipped(self):
        # Skipped results are terminal but do not affect health stats.
        self.archiveresult.update_and_requeue(
            retry_at=None,
            status=ArchiveResult.StatusChoices.SKIPPED,
            end_ts=timezone.now(),
        )

    def after_transition(self, event: str, source: State, target: State):
        self.archiveresult.snapshot.update_and_requeue()  # bump snapshot retry time so it picks up all the new changes
# =============================================================================
# State Machine Registration
# =============================================================================

# Manually register state machines with python-statemachine registry
# (normally auto-discovered from statemachines.py, but we define them here for clarity)
# NOTE(review): presumably these registrations are what lets the models'
# state_machine_name strings resolve to these classes — confirm in workers.
registry.register(SnapshotMachine)
registry.register(ArchiveResultMachine)

2638
archivebox/core/models.py.bak Executable file

File diff suppressed because it is too large Load Diff

View File

@@ -30,9 +30,9 @@ LOADED_PLUGINS = archivebox.LOADED_PLUGINS
### Django Core Settings
################################################################################
WSGI_APPLICATION = "core.wsgi.application"
ASGI_APPLICATION = "core.asgi.application"
ROOT_URLCONF = "core.urls"
WSGI_APPLICATION = "archivebox.core.wsgi.application"
ASGI_APPLICATION = "archivebox.core.asgi.application"
ROOT_URLCONF = "archivebox.core.urls"
LOGIN_URL = "/accounts/login/"
LOGOUT_REDIRECT_URL = os.environ.get("LOGOUT_REDIRECT_URL", "/")
@@ -55,14 +55,15 @@ INSTALLED_APPS = [
# 3rd-party apps from PyPI
"signal_webhooks", # handles REST API outbound webhooks https://github.com/MrThearMan/django-signal-webhooks
"django_object_actions", # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
# Our ArchiveBox-provided apps
"config", # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
"machine", # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
"workers", # handles starting and managing background workers and processes (orchestrators and actors)
"crawls", # handles Crawl and CrawlSchedule models and management
"personas", # handles Persona and session management
"core", # core django model with Snapshot, ArchiveResult, etc.
"api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
# Our ArchiveBox-provided apps (use fully qualified names)
# NOTE: Order matters! Apps with migrations that depend on other apps must come AFTER their dependencies
# "archivebox.config", # ArchiveBox config settings (no models, not a real Django app)
"archivebox.machine", # handles collecting and storing information about the host machine, network interfaces, binaries, etc.
"archivebox.workers", # handles starting and managing background workers and processes (orchestrators and actors)
"archivebox.personas", # handles Persona and session management
"archivebox.core", # core django model with Snapshot, ArchiveResult, etc. (crawls depends on this)
"archivebox.crawls", # handles Crawl and CrawlSchedule models and management (depends on core)
"archivebox.api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
# ArchiveBox plugins (hook-based plugins no longer add Django apps)
# Use hooks.py discover_hooks() for plugin functionality
# 3rd-party apps from PyPI that need to be loaded last
@@ -72,15 +73,15 @@ INSTALLED_APPS = [
MIDDLEWARE = [
"core.middleware.TimezoneMiddleware",
"archivebox.core.middleware.TimezoneMiddleware",
"django.middleware.security.SecurityMiddleware",
"django.contrib.sessions.middleware.SessionMiddleware",
"django.middleware.common.CommonMiddleware",
"django.middleware.csrf.CsrfViewMiddleware",
"django.contrib.auth.middleware.AuthenticationMiddleware",
"core.middleware.ReverseProxyAuthMiddleware",
"archivebox.core.middleware.ReverseProxyAuthMiddleware",
"django.contrib.messages.middleware.MessageMiddleware",
"core.middleware.CacheControlMiddleware",
"archivebox.core.middleware.CacheControlMiddleware",
# Additional middlewares from plugins (if any)
]
@@ -370,15 +371,15 @@ LOGGING = SETTINGS_LOGGING
################################################################################
# Add default webhook configuration to the User model
SIGNAL_WEBHOOKS_CUSTOM_MODEL = "api.models.OutboundWebhook"
SIGNAL_WEBHOOKS_CUSTOM_MODEL = "archivebox.api.models.OutboundWebhook"
SIGNAL_WEBHOOKS = {
"HOOKS": {
# ... is a special sigil value that means "use the default autogenerated hooks"
"django.contrib.auth.models.User": ...,
"core.models.Snapshot": ...,
"core.models.ArchiveResult": ...,
"core.models.Tag": ...,
"api.models.APIToken": ...,
"archivebox.core.models.Snapshot": ...,
"archivebox.core.models.ArchiveResult": ...,
"archivebox.core.models.Tag": ...,
"archivebox.api.models.APIToken": ...,
},
}
@@ -391,11 +392,11 @@ ADMIN_DATA_VIEWS = {
"URLS": [
{
"route": "config/",
"view": "core.views.live_config_list_view",
"view": "archivebox.core.views.live_config_list_view",
"name": "Configuration",
"items": {
"route": "<str:key>/",
"view": "core.views.live_config_value_view",
"view": "archivebox.core.views.live_config_value_view",
"name": "config_val",
},
},

View File

@@ -1,319 +0,0 @@
__package__ = 'archivebox.core'
import time
import os
from datetime import timedelta
from typing import ClassVar
from django.db.models import F
from django.utils import timezone
from rich import print
from statemachine import State, StateMachine
# from workers.actor import ActorType
from core.models import Snapshot, ArchiveResult
from crawls.models import Crawl
class SnapshotMachine(StateMachine, strict_states=True):
"""
State machine for managing Snapshot lifecycle.
https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
"""
model: Snapshot
# States
queued = State(value=Snapshot.StatusChoices.QUEUED, initial=True)
started = State(value=Snapshot.StatusChoices.STARTED)
sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
# Tick Event
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(sealed, cond='is_finished')
)
def __init__(self, snapshot, *args, **kwargs):
self.snapshot = snapshot
super().__init__(snapshot, *args, **kwargs)
def __repr__(self) -> str:
return f'Snapshot[{self.snapshot.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
can_start = bool(self.snapshot.url)
# Suppressed: queue waiting logs
return can_start
def is_finished(self) -> bool:
# if no archiveresults exist yet, it's not finished
if not self.snapshot.archiveresult_set.exists():
return False
# Try to advance step if ready (handles step-based hook execution)
# This will increment current_step when all foreground hooks in current step are done
while self.snapshot.advance_step_if_ready():
pass # Keep advancing until we can't anymore
# if archiveresults exist but are still pending, it's not finished
if self.snapshot.pending_archiveresults().exists():
return False
# Don't wait for background hooks - they'll be cleaned up on entering sealed state
# Background hooks in STARTED state are excluded by pending_archiveresults()
# (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
# we can transition to sealed and cleanup() will kill the background hooks
# otherwise archiveresults exist and are all finished, so it's finished
return True
# def on_transition(self, event, state):
# print(f'{self}.on_transition() [blue]{str(state).upper()}[/blue] ➡️ ...')
@queued.enter
def enter_queued(self):
# Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=timezone.now(),
status=Snapshot.StatusChoices.QUEUED,
)
@started.enter
def enter_started(self):
# Suppressed: state transition logs
# lock the snapshot while we create the pending archiveresults
self.snapshot.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
)
# Run the snapshot - creates pending archiveresults for all enabled plugins
self.snapshot.run()
# unlock the snapshot after we're done + set status = started
self.snapshot.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5), # check again in 5s
status=Snapshot.StatusChoices.STARTED,
)
@sealed.enter
def enter_sealed(self):
# Clean up background hooks
self.snapshot.cleanup()
# Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=None,
status=Snapshot.StatusChoices.SEALED,
)
# class SnapshotWorker(ActorType[Snapshot]):
# """
# The primary actor for progressing Snapshot objects
# through their lifecycle using the SnapshotMachine.
# """
# Model = Snapshot
# StateMachineClass = SnapshotMachine
# ACTIVE_STATE: ClassVar[State] = SnapshotMachine.started # 'started'
# MAX_CONCURRENT_ACTORS: ClassVar[int] = 3
# MAX_TICK_TIME: ClassVar[int] = 10
# CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10
class ArchiveResultMachine(StateMachine, strict_states=True):
"""
State machine for managing ArchiveResult lifecycle.
https://github.com/ArchiveBox/ArchiveBox/wiki/ArchiveBox-Architecture-Diagrams
"""
model: ArchiveResult
# States
queued = State(value=ArchiveResult.StatusChoices.QUEUED, initial=True)
started = State(value=ArchiveResult.StatusChoices.STARTED)
backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
# Tick Event - transitions based on conditions
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(succeeded, cond='is_succeeded') |
started.to(failed, cond='is_failed') |
started.to(skipped, cond='is_skipped') |
started.to(backoff, cond='is_backoff') |
backoff.to.itself(unless='can_start') |
backoff.to(started, cond='can_start') |
backoff.to(succeeded, cond='is_succeeded') |
backoff.to(failed, cond='is_failed') |
backoff.to(skipped, cond='is_skipped')
)
def __init__(self, archiveresult, *args, **kwargs):
self.archiveresult = archiveresult
super().__init__(archiveresult, *args, **kwargs)
def __repr__(self) -> str:
return f'ArchiveResult[{self.archiveresult.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
can_start = bool(self.archiveresult.snapshot.url)
# Suppressed: queue waiting logs
return can_start
def is_succeeded(self) -> bool:
"""Check if extractor plugin succeeded (status was set by run())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
def is_failed(self) -> bool:
"""Check if extractor plugin failed (status was set by run())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
def is_skipped(self) -> bool:
"""Check if extractor plugin was skipped (status was set by run())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
def is_backoff(self) -> bool:
"""Check if we should backoff and retry later."""
# Backoff if status is still started (plugin didn't complete) and output_str is empty
return (
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
not self.archiveresult.output_str
)
def is_finished(self) -> bool:
"""Check if extraction has completed (success, failure, or skipped)."""
return self.archiveresult.status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
)
@queued.enter
def enter_queued(self):
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now(),
status=ArchiveResult.StatusChoices.QUEUED,
start_ts=None,
) # bump the snapshot's retry_at so they pickup any new changes
@started.enter
def enter_started(self):
from machine.models import NetworkInterface
# Suppressed: state transition logs
# Lock the object and mark start time
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for plugin
status=ArchiveResult.StatusChoices.STARTED,
start_ts=timezone.now(),
iface=NetworkInterface.current(),
)
# Run the plugin - this updates status, output, timestamps, etc.
self.archiveresult.run()
# Save the updated result
self.archiveresult.save()
# Suppressed: plugin result logs (already logged by worker)
@backoff.enter
def enter_backoff(self):
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=60),
status=ArchiveResult.StatusChoices.BACKOFF,
end_ts=None,
# retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
)
self.archiveresult.save()
@succeeded.enter
def enter_succeeded(self):
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SUCCEEDED,
end_ts=timezone.now(),
# **self.archiveresult.get_output_dict(), # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
)
self.archiveresult.save()
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
# Also update Crawl health stats if snapshot has a crawl
snapshot = self.archiveresult.snapshot
if snapshot.crawl_id:
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
@failed.enter
def enter_failed(self):
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
end_ts=timezone.now(),
)
# Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_failed=F('num_uses_failed') + 1)
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_failed=F('num_uses_failed') + 1)
# Also update Crawl health stats if snapshot has a crawl
snapshot = self.archiveresult.snapshot
if snapshot.crawl_id:
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_failed=F('num_uses_failed') + 1)
@skipped.enter
def enter_skipped(self):
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,
end_ts=timezone.now(),
)
def after_transition(self, event: str, source: State, target: State):
    """Hook run after every state transition: nudge the parent Snapshot so workers re-check it."""
    # print(f"after '{event}' from '{source.id}' to '{target.id}'")
    self.archiveresult.snapshot.update_for_workers() # bump snapshot retry time so it picks up all the new changes
# class ArchiveResultWorker(ActorType[ArchiveResult]):
# """
# The primary actor for progressing ArchiveResult objects
# through their lifecycle using the ArchiveResultMachine.
# """
# Model = ArchiveResult
# StateMachineClass = ArchiveResultMachine
# ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started # 'started'
# MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
# MAX_TICK_TIME: ClassVar[int] = 60
# CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10

View File

@@ -0,0 +1,20 @@
"""Template tags for accessing config values in templates."""
from typing import Any

from django import template

from archivebox.config.configset import get_config as _get_config
register = template.Library()
@register.simple_tag
def get_config(key: str) -> Any:
    """
    Get a config value by key, or None if the key is unknown.

    Usage: {% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
    """
    # NOTE: the original annotated the return as `any`, which is the builtin
    # function, not a type — `typing.Any` is the correct annotation.
    try:
        return _get_config(key)
    except (KeyError, AttributeError):
        # Unknown/missing keys render as empty in templates rather than raising.
        return None

View File

@@ -1,3 +1,319 @@
#from django.test import TestCase
"""Tests for the core views, especially AddView."""
# Create your tests here.
import os
import django
# Set up Django before importing any Django-dependent modules
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
django.setup()
from django.test import TestCase, Client
from django.contrib.auth.models import User
from django.urls import reverse
from archivebox.crawls.models import Crawl, CrawlSchedule
from archivebox.core.models import Tag
class AddViewTests(TestCase):
    """Tests for the AddView (crawl creation form)."""

    def setUp(self):
        """Set up test user and client."""
        self.client = Client()
        self.user = User.objects.create_user(
            username='testuser',
            password='testpass123',
            email='test@example.com'
        )
        self.client.login(username='testuser', password='testpass123')
        self.add_url = reverse('add')

    def test_add_view_get_requires_auth(self):
        """Test that GET /add requires authentication."""
        self.client.logout()
        response = self.client.get(self.add_url)
        # Should redirect to login or show 403/404
        self.assertIn(response.status_code, [302, 403, 404])

    def test_add_view_get_shows_form(self):
        """Test that GET /add shows the form with all fields."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Check that form fields are present
        self.assertContains(response, 'name="url"')
        self.assertContains(response, 'name="tag"')
        self.assertContains(response, 'name="depth"')
        self.assertContains(response, 'name="notes"')
        self.assertContains(response, 'name="schedule"')
        self.assertContains(response, 'name="persona"')
        self.assertContains(response, 'name="overwrite"')
        self.assertContains(response, 'name="update"')
        self.assertContains(response, 'name="index_only"')
        # Check for plugin groups
        self.assertContains(response, 'name="chrome_plugins"')
        self.assertContains(response, 'name="archiving_plugins"')
        self.assertContains(response, 'name="parsing_plugins"')

    def test_add_view_shows_tag_autocomplete(self):
        """Test that tag autocomplete datalist is rendered."""
        # Create some tags
        Tag.objects.create(name='test-tag-1')
        Tag.objects.create(name='test-tag-2')
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Check for datalist with tags
        self.assertContains(response, 'id="tag-datalist"')
        self.assertContains(response, 'test-tag-1')
        self.assertContains(response, 'test-tag-2')

    def test_add_view_shows_plugin_presets(self):
        """Test that plugin preset buttons are rendered."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        self.assertContains(response, 'Quick Archive')
        self.assertContains(response, 'Full Chrome')
        self.assertContains(response, 'Text Only')
        self.assertContains(response, 'Select All')
        self.assertContains(response, 'Clear All')

    def test_add_view_shows_links_to_resources(self):
        """Test that helpful links are present."""
        response = self.client.get(self.add_url)
        self.assertEqual(response.status_code, 200)
        # Link to plugin documentation
        self.assertContains(response, '/admin/environment/plugins/')
        # Link to create new persona
        self.assertContains(response, '/admin/personas/persona/add/')

    def test_add_basic_crawl_without_schedule(self):
        """Test creating a basic crawl without a schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'tag': 'test-tag',
            'depth': '0',
            'notes': 'Test crawl notes',
        })
        # Should redirect to crawl admin page
        self.assertEqual(response.status_code, 302)
        # Check that crawl was created
        self.assertEqual(Crawl.objects.count(), 1)
        crawl = Crawl.objects.first()
        self.assertIn('https://example.com', crawl.urls)
        self.assertIn('https://example.org', crawl.urls)
        self.assertEqual(crawl.tags_str, 'test-tag')
        self.assertEqual(crawl.max_depth, 0)
        self.assertEqual(crawl.notes, 'Test crawl notes')
        self.assertEqual(crawl.created_by, self.user)
        # No schedule should be created
        self.assertIsNone(crawl.schedule)
        self.assertEqual(CrawlSchedule.objects.count(), 0)

    def test_add_crawl_with_schedule(self):
        """Test creating a crawl with a repeat schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'tag': 'scheduled',
            'depth': '1',
            'notes': 'Daily crawl',
            'schedule': 'daily',
        })
        self.assertEqual(response.status_code, 302)
        # Check that crawl and schedule were created
        self.assertEqual(Crawl.objects.count(), 1)
        self.assertEqual(CrawlSchedule.objects.count(), 1)
        crawl = Crawl.objects.first()
        schedule = CrawlSchedule.objects.first()
        # Crawl and schedule should point at each other (schedule uses the crawl as its template)
        self.assertEqual(crawl.schedule, schedule)
        self.assertEqual(schedule.template, crawl)
        self.assertEqual(schedule.schedule, 'daily')
        self.assertTrue(schedule.is_enabled)
        self.assertEqual(schedule.created_by, self.user)

    def test_add_crawl_with_cron_schedule(self):
        """Test creating a crawl with a cron format schedule."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': '0 */6 * * *',  # Every 6 hours
        })
        self.assertEqual(response.status_code, 302)
        schedule = CrawlSchedule.objects.first()
        self.assertEqual(schedule.schedule, '0 */6 * * *')

    def test_add_crawl_with_plugins(self):
        """Test creating a crawl with specific plugins selected."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'chrome_plugins': ['screenshot', 'dom'],
            'archiving_plugins': ['wget'],
        })
        self.assertEqual(response.status_code, 302)
        crawl = Crawl.objects.first()
        plugins = crawl.config.get('PLUGINS', '')
        # Should contain the selected plugins
        self.assertIn('screenshot', plugins)
        self.assertIn('dom', plugins)
        self.assertIn('wget', plugins)

    def test_add_crawl_with_depth_range(self):
        """Test creating crawls with different depth values (0-4)."""
        for depth in range(5):
            response = self.client.post(self.add_url, {
                'url': f'https://example{depth}.com',
                'depth': str(depth),
            })
            self.assertEqual(response.status_code, 302)
        self.assertEqual(Crawl.objects.count(), 5)
        # Crawls ordered by creation time should line up with the submitted depths
        for i, crawl in enumerate(Crawl.objects.order_by('created_at')):
            self.assertEqual(crawl.max_depth, i)

    def test_add_crawl_with_advanced_options(self):
        """Test creating a crawl with advanced options."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'persona': 'CustomPersona',
            'overwrite': True,
            'update': True,
            'index_only': True,
        })
        self.assertEqual(response.status_code, 302)
        crawl = Crawl.objects.first()
        config = crawl.config
        self.assertEqual(config.get('DEFAULT_PERSONA'), 'CustomPersona')
        self.assertEqual(config.get('OVERWRITE'), True)
        self.assertEqual(config.get('ONLY_NEW'), False)  # opposite of update
        self.assertEqual(config.get('INDEX_ONLY'), True)

    def test_add_crawl_with_custom_config(self):
        """Test creating a crawl with custom config overrides."""
        # Note: Django test client can't easily POST the KeyValueWidget format,
        # so this test would need to use the form directly or mock the cleaned_data
        # For now, we'll skip this test or mark it as TODO
        pass

    def test_add_empty_urls_fails(self):
        """Test that submitting without URLs fails validation."""
        response = self.client.post(self.add_url, {
            'url': '',
            'depth': '0',
        })
        # Should show form again with errors, not redirect
        self.assertEqual(response.status_code, 200)
        # NOTE(review): this assertFormError(response, ...) signature was deprecated in
        # Django 4.1 and removed in Django 5.0 (pass the form instance instead) —
        # confirm the Django version this project targets.
        self.assertFormError(response, 'form', 'url', 'This field is required.')

    def test_add_invalid_urls_fails(self):
        """Test that invalid URLs fail validation."""
        response = self.client.post(self.add_url, {
            'url': 'not-a-url',
            'depth': '0',
        })
        # Should show form again with errors
        self.assertEqual(response.status_code, 200)
        # Check for validation error (URL regex should fail)
        self.assertContains(response, 'error')

    def test_add_success_message_without_schedule(self):
        """Test that success message is shown without schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com\nhttps://example.org',
            'depth': '0',
        }, follow=True)
        # Check success message mentions crawl creation
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])
        self.assertIn('Created crawl with 2 starting URL', message_text)
        self.assertIn('View Crawl', message_text)
        self.assertNotIn('scheduled to repeat', message_text)

    def test_add_success_message_with_schedule(self):
        """Test that success message includes schedule link."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'schedule': 'weekly',
        }, follow=True)
        # Check success message mentions schedule
        messages = list(response.context['messages'])
        self.assertEqual(len(messages), 1)
        message_text = str(messages[0])
        self.assertIn('Created crawl', message_text)
        self.assertIn('scheduled to repeat weekly', message_text)
        self.assertIn('View Crawl', message_text)

    def test_add_crawl_creates_source_file(self):
        """Test that crawl creation saves URLs to sources file."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })
        self.assertEqual(response.status_code, 302)
        # Check that source file was created in sources/ directory
        from archivebox.config import CONSTANTS
        sources_dir = CONSTANTS.SOURCES_DIR
        # Should have created a source file
        source_files = list(sources_dir.glob('*__web_ui_add_by_user_*.txt'))
        self.assertGreater(len(source_files), 0)

    def test_multiple_tags_are_saved(self):
        """Test that multiple comma-separated tags are saved."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
            'tag': 'tag1,tag2,tag3',
        })
        self.assertEqual(response.status_code, 302)
        crawl = Crawl.objects.first()
        self.assertEqual(crawl.tags_str, 'tag1,tag2,tag3')

    def test_crawl_redirects_to_admin_change_page(self):
        """Test that successful submission redirects to crawl admin page."""
        response = self.client.post(self.add_url, {
            'url': 'https://example.com',
            'depth': '0',
        })
        crawl = Crawl.objects.first()
        expected_redirect = f'/admin/crawls/crawl/{crawl.id}/change/'
        # fetch_redirect_response=False: the admin change page itself is not under test here
        self.assertRedirects(response, expected_redirect, fetch_redirect_response=False)

View File

@@ -7,10 +7,10 @@ from django.views.generic.base import RedirectView
from archivebox.misc.serve_static import serve_static
from core.admin_site import archivebox_admin
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
from archivebox.core.admin_site import archivebox_admin
from archivebox.core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
from workers.views import JobsDashboardView
from archivebox.workers.views import JobsDashboardView
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
# from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE

View File

@@ -23,7 +23,7 @@ from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
import archivebox
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION, SAVE_ARCHIVE_DOT_ORG
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
@@ -31,9 +31,9 @@ from archivebox.misc.serve_static import serve_static_with_byterange_support
from archivebox.misc.logging_util import printable_filesize
from archivebox.search import query_search_index
from core.models import Snapshot
from core.forms import AddLinkForm
from crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.core.forms import AddLinkForm
from archivebox.crawls.models import Crawl
from archivebox.hooks import get_extractors, get_extractor_name
@@ -150,7 +150,6 @@ class SnapshotView(View):
'status_color': 'success' if snapshot.is_archived else 'danger',
'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date),
'warc_path': warc_path,
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
'best_result': best_result,
@@ -421,35 +420,34 @@ class AddView(UserPassesTestMixin, FormView):
return SERVER_CONFIG.PUBLIC_ADD_VIEW or self.request.user.is_authenticated
def get_context_data(self, **kwargs):
from archivebox.core.models import Tag
return {
**super().get_context_data(**kwargs),
'title': "Add URLs",
'title': "Create Crawl",
# We can't just call request.build_absolute_uri in the template, because it would include query parameters
'absolute_add_path': self.request.build_absolute_uri(self.request.path),
'VERSION': VERSION,
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
'stdout': '',
'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
}
def form_valid(self, form):
urls = form.cleaned_data["url"]
print(f'[+] Adding URL: {urls}')
parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser
tag = form.cleaned_data["tag"]
depth = 0 if form.cleaned_data["depth"] == "0" else 1
plugins = ','.join(form.cleaned_data["archive_methods"])
input_kwargs = {
"urls": urls,
"tag": tag,
"depth": depth,
"parser": parser,
"update_all": False,
"out_dir": DATA_DIR,
"created_by_id": self.request.user.pk,
}
if plugins:
input_kwargs.update({"plugins": plugins})
# Extract all form fields
tag = form.cleaned_data["tag"]
depth = int(form.cleaned_data["depth"])
plugins = ','.join(form.cleaned_data.get("plugins", []))
schedule = form.cleaned_data.get("schedule", "").strip()
persona = form.cleaned_data.get("persona", "Default")
overwrite = form.cleaned_data.get("overwrite", False)
update = form.cleaned_data.get("update", False)
index_only = form.cleaned_data.get("index_only", False)
notes = form.cleaned_data.get("notes", "")
custom_config = form.cleaned_data.get("config", {})
from archivebox.config.permissions import HOSTNAME
@@ -461,33 +459,59 @@ class AddView(UserPassesTestMixin, FormView):
# 2. create a new Crawl with the URLs from the file
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
urls_content = sources_file.read_text()
# Build complete config
config = {
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
'OVERWRITE': overwrite,
'DEPTH': depth,
'PLUGINS': plugins or '',
'DEFAULT_PERSONA': persona or 'Default',
}
# Merge custom config overrides
config.update(custom_config)
crawl = Crawl.objects.create(
urls=urls_content,
max_depth=depth,
tags_str=tag,
notes=notes,
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
created_by_id=self.request.user.pk,
config={
# 'ONLY_NEW': not update,
# 'INDEX_ONLY': index_only,
# 'OVERWRITE': False,
'DEPTH': depth,
'PLUGINS': plugins or '',
# 'DEFAULT_PERSONA': persona or 'Default',
}
config=config
)
# 3. create a CrawlSchedule if schedule is provided
if schedule:
from crawls.models import CrawlSchedule
crawl_schedule = CrawlSchedule.objects.create(
template=crawl,
schedule=schedule,
is_enabled=True,
label=crawl.label,
notes=f"Auto-created from add page. {notes}".strip(),
created_by_id=self.request.user.pk,
)
crawl.schedule = crawl_schedule
crawl.save(update_fields=['schedule'])
# 4. start the Orchestrator & wait until it completes
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
# from crawls.actors import CrawlActor
# from core.actors import SnapshotActor, ArchiveResultActor
# from archivebox.crawls.actors import CrawlActor
# from archivebox.core.actors import SnapshotActor, ArchiveResultActor
rough_url_count = urls.count('://')
# Build success message with schedule link if created
schedule_msg = ""
if schedule:
schedule_msg = f" and <a href='{crawl.schedule.admin_change_url}'>scheduled to repeat {schedule}</a>"
messages.success(
self.request,
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
mark_safe(f"Created crawl with {rough_url_count} starting URL(s){schedule_msg}. Snapshots will be created and archived in the background. <a href='{crawl.admin_change_url}'>View Crawl →</a>"),
)
# Orchestrator (managed by supervisord) will pick up the queued crawl
@@ -516,8 +540,8 @@ def live_progress_view(request):
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
try:
from workers.orchestrator import Orchestrator
from crawls.models import Crawl
from core.models import Snapshot, ArchiveResult
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from django.db.models import Case, When, Value, IntegerField
# Get orchestrator status
@@ -764,9 +788,9 @@ def key_is_safe(key: str) -> bool:
def find_config_source(key: str, merged_config: dict) -> str:
"""Determine where a config value comes from."""
import os
from machine.models import Machine
from archivebox.machine.models import Machine
# Check if it's from machine config
# Check if it's from archivebox.machine.config
try:
machine = Machine.current()
if machine.config and key in machine.config:
@@ -778,7 +802,7 @@ def find_config_source(key: str, merged_config: dict) -> str:
if key in os.environ:
return 'Environment'
# Check if it's from config file
# Check if it's from archivebox.config.file
from archivebox.config.configset import BaseConfigSet
file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
if key in file_config:
@@ -796,7 +820,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
# Get merged config that includes Machine.config overrides
try:
from machine.models import Machine
from archivebox.machine.models import Machine
machine = Machine.current()
merged_config = get_config()
except Exception as e:
@@ -859,7 +883,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
import os
from machine.models import Machine
from archivebox.machine.models import Machine
from archivebox.config.configset import BaseConfigSet
CONFIGS = get_all_configs()