diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index 766ee9c6..f49f05af 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -300,3 +300,160 @@ def get_any(request, id: str): pass raise HttpError(404, 'Object with given ID not found') + + +### Tag Editor API Endpoints ######################################################################### + +class TagAutocompleteSchema(Schema): + tags: List[dict] + + +class TagCreateSchema(Schema): + name: str + + +class TagCreateResponseSchema(Schema): + success: bool + tag_id: int + tag_name: str + created: bool + + +class TagSnapshotRequestSchema(Schema): + snapshot_id: str + tag_name: Optional[str] = None + tag_id: Optional[int] = None + + +class TagSnapshotResponseSchema(Schema): + success: bool + tag_id: int + tag_name: str + + +@router.get("/tags/autocomplete/", response=TagAutocompleteSchema, url_name="tags_autocomplete") +def tags_autocomplete(request, q: str = ""): + """Return tags matching the query for autocomplete.""" + if not q: + # Return all tags if no query (limited to 50) + tags = Tag.objects.all().order_by('name')[:50] + else: + tags = Tag.objects.filter(name__icontains=q).order_by('name')[:20] + + return { + 'tags': [{'id': tag.pk, 'name': tag.name, 'slug': tag.slug} for tag in tags] + } + + +@router.post("/tags/create/", response=TagCreateResponseSchema, url_name="tags_create") +def tags_create(request, data: TagCreateSchema): + """Create a new tag or return existing one.""" + name = data.name.strip() + if not name: + raise HttpError(400, 'Tag name is required') + + tag, created = Tag.objects.get_or_create( + name__iexact=name, + defaults={ + 'name': name, + 'created_by': request.user if request.user.is_authenticated else None, + } + ) + + # If found by case-insensitive match, use that tag + if not created: + tag = Tag.objects.filter(name__iexact=name).first() + + return { + 'success': True, + 'tag_id': tag.pk, + 'tag_name': tag.name, + 'created': created, + } + + 
+@router.post("/tags/add-to-snapshot/", response=TagSnapshotResponseSchema, url_name="tags_add_to_snapshot") +def tags_add_to_snapshot(request, data: TagSnapshotRequestSchema): + """Add a tag to a snapshot. Creates the tag if it doesn't exist.""" + # Get the snapshot + try: + snapshot = Snapshot.objects.get( + Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id) + ) + except Snapshot.DoesNotExist: + raise HttpError(404, 'Snapshot not found') + except Snapshot.MultipleObjectsReturned: + snapshot = Snapshot.objects.filter( + Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id) + ).first() + + # Get or create the tag + if data.tag_name: + name = data.tag_name.strip() + if not name: + raise HttpError(400, 'Tag name is required') + + tag, _ = Tag.objects.get_or_create( + name__iexact=name, + defaults={ + 'name': name, + 'created_by': request.user if request.user.is_authenticated else None, + } + ) + # If found by case-insensitive match, use that tag + tag = Tag.objects.filter(name__iexact=name).first() or tag + elif data.tag_id: + try: + tag = Tag.objects.get(pk=data.tag_id) + except Tag.DoesNotExist: + raise HttpError(404, 'Tag not found') + else: + raise HttpError(400, 'Either tag_name or tag_id is required') + + # Add the tag to the snapshot + snapshot.tags.add(tag) + + return { + 'success': True, + 'tag_id': tag.pk, + 'tag_name': tag.name, + } + + +@router.post("/tags/remove-from-snapshot/", response=TagSnapshotResponseSchema, url_name="tags_remove_from_snapshot") +def tags_remove_from_snapshot(request, data: TagSnapshotRequestSchema): + """Remove a tag from a snapshot.""" + # Get the snapshot + try: + snapshot = Snapshot.objects.get( + Q(id__startswith=data.snapshot_id) | Q(timestamp__startswith=data.snapshot_id) + ) + except Snapshot.DoesNotExist: + raise HttpError(404, 'Snapshot not found') + except Snapshot.MultipleObjectsReturned: + snapshot = Snapshot.objects.filter( + Q(id__startswith=data.snapshot_id) | 
Q(timestamp__startswith=data.snapshot_id) + ).first() + + # Get the tag + if data.tag_id: + try: + tag = Tag.objects.get(pk=data.tag_id) + except Tag.DoesNotExist: + raise HttpError(404, 'Tag not found') + elif data.tag_name: + try: + tag = Tag.objects.get(name__iexact=data.tag_name.strip()) + except Tag.DoesNotExist: + raise HttpError(404, 'Tag not found') + else: + raise HttpError(400, 'Either tag_name or tag_id is required') + + # Remove the tag from the snapshot + snapshot.tags.remove(tag) + + return { + 'success': True, + 'tag_id': tag.pk, + 'tag_name': tag.name, + } diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py index f8662fc3..e5f972da 100644 --- a/archivebox/core/admin_snapshots.py +++ b/archivebox/core/admin_snapshots.py @@ -11,7 +11,6 @@ from django.utils import timezone from django import forms from django.template import Template, RequestContext from django.contrib.admin.helpers import ActionForm -from django.contrib.admin.widgets import FilteredSelectMultiple from archivebox.config import DATA_DIR from archivebox.config.common import SERVER_CONFIG @@ -24,8 +23,8 @@ from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin from archivebox.workers.tasks import bg_archive_snapshots, bg_add from archivebox.core.models import Tag, Snapshot -from archivebox.core.admin_tags import TagInline from archivebox.core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list +from archivebox.core.widgets import TagEditorWidget, InlineTagEditorWidget # GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False} @@ -36,16 +35,30 @@ class SnapshotActionForm(ActionForm): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Define tags field in __init__ to avoid database access during app initialization - self.fields['tags'] = forms.ModelMultipleChoiceField( + self.fields['tags'] = forms.CharField( label='Edit tags', - queryset=Tag.objects.all(), 
required=False, - widget=FilteredSelectMultiple( - 'core_tag__name', - False, - ), + widget=TagEditorWidget(), ) + def clean_tags(self): + """Parse comma-separated tag names into Tag objects.""" + tags_str = self.cleaned_data.get('tags', '') + if not tags_str: + return [] + + tag_names = [name.strip() for name in tags_str.split(',') if name.strip()] + tags = [] + for name in tag_names: + tag, _ = Tag.objects.get_or_create( + name__iexact=name, + defaults={'name': name} + ) + # Use the existing tag if found by case-insensitive match + tag = Tag.objects.filter(name__iexact=name).first() or tag + tags.append(tag) + return tags + # TODO: allow selecting actions for specific extractor plugins? is this useful? # plugin = forms.ChoiceField( # choices=ArchiveResult.PLUGIN_CHOICES, @@ -54,10 +67,59 @@ class SnapshotActionForm(ActionForm): # ) +class SnapshotAdminForm(forms.ModelForm): + """Custom form for Snapshot admin with tag editor widget.""" + tags_editor = forms.CharField( + label='Tags', + required=False, + widget=TagEditorWidget(), + help_text='Type tag names and press Enter or Space to add. 
Click × to remove.', + ) + + class Meta: + model = Snapshot + fields = '__all__' + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Initialize tags_editor with current tags + if self.instance and self.instance.pk: + self.initial['tags_editor'] = ','.join( + sorted(tag.name for tag in self.instance.tags.all()) + ) + + def save(self, commit=True): + instance = super().save(commit=False) + + # Handle tags_editor field + if commit: + instance.save() + self._save_m2m() + + # Parse and save tags from tags_editor + tags_str = self.cleaned_data.get('tags_editor', '') + if tags_str: + tag_names = [name.strip() for name in tags_str.split(',') if name.strip()] + tags = [] + for name in tag_names: + tag, _ = Tag.objects.get_or_create( + name__iexact=name, + defaults={'name': name} + ) + tag = Tag.objects.filter(name__iexact=name).first() or tag + tags.append(tag) + instance.tags.set(tags) + else: + instance.tags.clear() + + return instance + + class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): + form = SnapshotAdminForm list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str') sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl') - readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list') + readonly_fields = ('admin_actions', 'status_info', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list') search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name') list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'crawl__created_by', 'tags__name') @@ -66,6 +128,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): 'fields': ('url', 'title'), 'classes': ('card', 'wide'), }), + ('Tags', { + 'fields': ('tags_editor',), + 'classes': ('card',), + }), ('Status', { 'fields': 
('status', 'retry_at', 'status_info'), 'classes': ('card',), @@ -75,7 +141,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): 'classes': ('card',), }), ('Relations', { - 'fields': ('crawl', 'tags_str'), + 'fields': ('crawl',), 'classes': ('card',), }), ('Config', { @@ -98,7 +164,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): ordering = ['-created_at'] actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] - inlines = [TagInline] # Removed ArchiveResultInline, using custom renderer instead + inlines = [] # Removed TagInline, using TagEditorWidget instead list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000) action_form = SnapshotActionForm @@ -257,11 +323,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): ordering='title', ) def title_str(self, obj): - tags = ''.join( - format_html('{} ', tag.pk, tag.name) - for tag in obj.tags.all() - if str(tag.name).strip() + # Render inline tag editor widget + widget = InlineTagEditorWidget(snapshot_id=str(obj.pk)) + tags_html = widget.render( + name=f'tags_{obj.pk}', + value=obj.tags.all(), + attrs={'id': f'tags_{obj.pk}'}, + snapshot_id=str(obj.pk), ) + # Show title if available, otherwise show URL display_text = obj.title or obj.url css_class = 'fetched' if obj.title else 'pending' @@ -278,7 +348,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): obj.archive_path, css_class, urldecode(htmldecode(display_text))[:128] - ) + mark_safe(f' {tags}') + ) + mark_safe(f' {tags_html}') @admin.display( description='Files Saved', @@ -428,13 +498,41 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): description="+" ) def add_tags(self, request, queryset): - tags = request.POST.getlist('tags') - print('[+] Adding tags', tags, 'to Snapshots', queryset) - for obj in 
queryset: - obj.tags.add(*tags) + from archivebox.core.models import SnapshotTag + + # Get tags from the form - now comma-separated string + tags_str = request.POST.get('tags', '') + if not tags_str: + messages.warning(request, "No tags specified.") + return + + # Parse comma-separated tag names and get/create Tag objects + tag_names = [name.strip() for name in tags_str.split(',') if name.strip()] + tags = [] + for name in tag_names: + tag, _ = Tag.objects.get_or_create( + name__iexact=name, + defaults={'name': name} + ) + tag = Tag.objects.filter(name__iexact=name).first() or tag + tags.append(tag) + + # Get snapshot IDs efficiently (works with select_across for all pages) + snapshot_ids = list(queryset.values_list('id', flat=True)) + num_snapshots = len(snapshot_ids) + + print('[+] Adding tags', [t.name for t in tags], 'to', num_snapshots, 'Snapshots') + + # Bulk create M2M relationships (1 query per tag, not per snapshot) + for tag in tags: + SnapshotTag.objects.bulk_create( + [SnapshotTag(snapshot_id=sid, tag=tag) for sid in snapshot_ids], + ignore_conflicts=True # Skip if relationship already exists + ) + messages.success( request, - f"Added {len(tags)} tags to {queryset.count()} Snapshots.", + f"Added {len(tags)} tag(s) to {num_snapshots} Snapshot(s).", ) @@ -442,11 +540,40 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): description="–" ) def remove_tags(self, request, queryset): - tags = request.POST.getlist('tags') - print('[-] Removing tags', tags, 'to Snapshots', queryset) - for obj in queryset: - obj.tags.remove(*tags) + from archivebox.core.models import SnapshotTag + + # Get tags from the form - now comma-separated string + tags_str = request.POST.get('tags', '') + if not tags_str: + messages.warning(request, "No tags specified.") + return + + # Parse comma-separated tag names and find matching Tag objects (case-insensitive) + tag_names = [name.strip() for name in tags_str.split(',') if name.strip()] + tags = [] + 
for name in tag_names: + tag = Tag.objects.filter(name__iexact=name).first() + if tag: + tags.append(tag) + + if not tags: + messages.warning(request, "No matching tags found.") + return + + # Get snapshot IDs efficiently (works with select_across for all pages) + snapshot_ids = list(queryset.values_list('id', flat=True)) + num_snapshots = len(snapshot_ids) + tag_ids = [t.pk for t in tags] + + print('[-] Removing tags', [t.name for t in tags], 'from', num_snapshots, 'Snapshots') + + # Bulk delete M2M relationships (1 query total, not per snapshot) + deleted_count, _ = SnapshotTag.objects.filter( + snapshot_id__in=snapshot_ids, + tag_id__in=tag_ids + ).delete() + messages.success( request, - f"Removed {len(tags)} tags from {queryset.count()} Snapshots.", + f"Removed {len(tags)} tag(s) from {num_snapshots} Snapshot(s) ({deleted_count} associations deleted).", ) diff --git a/archivebox/core/migrations/0025_cleanup_schema.py b/archivebox/core/migrations/0025_cleanup_schema.py new file mode 100644 index 00000000..78057e4b --- /dev/null +++ b/archivebox/core/migrations/0025_cleanup_schema.py @@ -0,0 +1,380 @@ +# Generated by hand on 2025-12-29 +# Cleans up extra columns from raw SQL migrations and ensures schema matches models + +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone +from django.conf import settings +import archivebox.base_models.models + + +def cleanup_extra_columns(apps, schema_editor): + """ + Remove extra columns that were needed for v0.7.2/v0.8.6rc0 migration but don't exist in final models. + The actual models use @property methods to access these values from the process FK. 
+ """ + with schema_editor.connection.cursor() as cursor: + # Check if cmd column exists (means we came from v0.7.2/v0.8.6rc0) + cursor.execute("SELECT COUNT(*) FROM pragma_table_info('core_archiveresult') WHERE name='cmd'") + has_cmd = cursor.fetchone()[0] > 0 + + if has_cmd: + print(" Cleaning up temporary columns from core_archiveresult...") + # Rebuild table without the extra columns + cursor.execute(""" + CREATE TABLE core_archiveresult_final ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + uuid TEXT, + created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + + snapshot_id TEXT NOT NULL, + plugin VARCHAR(32) NOT NULL DEFAULT '', + hook_name VARCHAR(255) NOT NULL DEFAULT '', + + start_ts DATETIME, + end_ts DATETIME, + status VARCHAR(15) NOT NULL DEFAULT 'queued', + retry_at DATETIME, + + output_files TEXT NOT NULL DEFAULT '{}', + output_json TEXT, + output_str TEXT NOT NULL DEFAULT '', + output_size INTEGER NOT NULL DEFAULT 0, + output_mimetypes VARCHAR(512) NOT NULL DEFAULT '', + + config TEXT, + notes TEXT NOT NULL DEFAULT '', + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + + process_id TEXT, + + FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE, + FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT + ) + """) + + # Copy data (cmd, pwd, etc. 
are now accessed via process FK) + cursor.execute(""" + INSERT INTO core_archiveresult_final SELECT + id, uuid, created_at, modified_at, + snapshot_id, plugin, hook_name, + start_ts, end_ts, status, retry_at, + output_files, output_json, output_str, output_size, output_mimetypes, + config, notes, num_uses_succeeded, num_uses_failed, + process_id + FROM core_archiveresult + """) + + # Replace table + cursor.execute("DROP TABLE core_archiveresult") + cursor.execute("ALTER TABLE core_archiveresult_final RENAME TO core_archiveresult") + + # Recreate indexes + cursor.execute("CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)") + cursor.execute("CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)") + cursor.execute("CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)") + cursor.execute("CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)") + cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)") + cursor.execute("CREATE INDEX core_archiveresult_uuid_idx ON core_archiveresult(uuid)") + + print(" ✓ Cleaned up core_archiveresult schema") + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0024_assign_default_crawl'), + ('machine', '0001_initial'), + ('crawls', '0002_upgrade_to_0_9_0'), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunPython( + cleanup_extra_columns, + reverse_code=migrations.RunPython.noop, + ), + ], + state_operations=[ + # Tell Django about all the fields that exist after raw SQL migrations + # ArchiveResult model options + migrations.AlterModelOptions( + name='archiveresult', + options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'}, + ), + + # Remove old fields + migrations.RemoveField(model_name='archiveresult', name='cmd'), + 
migrations.RemoveField(model_name='archiveresult', name='pwd'), + migrations.RemoveField(model_name='archiveresult', name='cmd_version'), + migrations.RemoveField(model_name='archiveresult', name='extractor'), + migrations.RemoveField(model_name='archiveresult', name='output'), + migrations.RemoveField(model_name='snapshot', name='added'), + migrations.RemoveField(model_name='snapshot', name='updated'), + + # Add new ArchiveResult fields + migrations.AddField( + model_name='archiveresult', + name='plugin', + field=models.CharField(blank=True, default='', max_length=32), + ), + migrations.AddField( + model_name='archiveresult', + name='hook_name', + field=models.CharField(blank=True, default='', max_length=255), + ), + migrations.AddField( + model_name='archiveresult', + name='output_str', + field=models.TextField(blank=True, default=''), + ), + migrations.AddField( + model_name='archiveresult', + name='output_json', + field=models.JSONField(blank=True, default=dict, null=True), + ), + migrations.AddField( + model_name='archiveresult', + name='output_files', + field=models.JSONField(blank=True, default=dict), + ), + migrations.AddField( + model_name='archiveresult', + name='output_size', + field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name='archiveresult', + name='output_mimetypes', + field=models.CharField(blank=True, default='', max_length=512), + ), + migrations.AddField( + model_name='archiveresult', + name='config', + field=models.JSONField(blank=True, default=dict, null=True), + ), + migrations.AddField( + model_name='archiveresult', + name='notes', + field=models.TextField(blank=True, default=''), + ), + migrations.AddField( + model_name='archiveresult', + name='num_uses_succeeded', + field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name='archiveresult', + name='num_uses_failed', + field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name='archiveresult', + 
name='retry_at', + field=models.DateTimeField(blank=True, db_index=True, default=None, null=True), + ), + migrations.AddField( + model_name='archiveresult', + name='created_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AddField( + model_name='archiveresult', + name='modified_at', + field=models.DateTimeField(auto_now=True), + ), + migrations.AddField( + model_name='archiveresult', + name='process', + field=models.OneToOneField(null=True, on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'), + ), + + # Update Snapshot model + migrations.AlterModelOptions( + name='snapshot', + options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'}, + ), + migrations.AddField( + model_name='snapshot', + name='created_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AddField( + model_name='snapshot', + name='modified_at', + field=models.DateTimeField(auto_now=True), + ), + migrations.AddField( + model_name='snapshot', + name='bookmarked_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AddField( + model_name='snapshot', + name='downloaded_at', + field=models.DateTimeField(blank=True, null=True), + ), + migrations.AddField( + model_name='snapshot', + name='crawl', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='crawls.crawl'), + ), + migrations.AddField( + model_name='snapshot', + name='depth', + field=models.PositiveSmallIntegerField(default=0), + ), + migrations.AddField( + model_name='snapshot', + name='parent_snapshot', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'), + ), + migrations.AddField( + model_name='snapshot', + name='status', + field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], 
db_index=True, default='queued', max_length=15), + ), + migrations.AddField( + model_name='snapshot', + name='retry_at', + field=models.DateTimeField(blank=True, db_index=True, default=None, null=True), + ), + migrations.AddField( + model_name='snapshot', + name='current_step', + field=models.PositiveSmallIntegerField(default=0), + ), + migrations.AddField( + model_name='snapshot', + name='fs_version', + field=models.CharField(default='0.9.0', max_length=10), + ), + migrations.AddField( + model_name='snapshot', + name='config', + field=models.JSONField(blank=True, default=dict), + ), + migrations.AddField( + model_name='snapshot', + name='notes', + field=models.TextField(blank=True, default=''), + ), + migrations.AddField( + model_name='snapshot', + name='num_uses_succeeded', + field=models.PositiveIntegerField(default=0), + ), + migrations.AddField( + model_name='snapshot', + name='num_uses_failed', + field=models.PositiveIntegerField(default=0), + ), + + # Update Tag model + migrations.AlterModelOptions( + name='tag', + options={'verbose_name': 'Tag', 'verbose_name_plural': 'Tags'}, + ), + migrations.AddField( + model_name='tag', + name='created_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now, null=True), + ), + migrations.AddField( + model_name='tag', + name='modified_at', + field=models.DateTimeField(auto_now=True), + ), + migrations.AddField( + model_name='tag', + name='created_by', + field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL), + ), + + # Alter field types + migrations.AlterField( + model_name='archiveresult', + name='id', + field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'), + ), + migrations.AlterField( + model_name='archiveresult', + name='uuid', + field=models.UUIDField(blank=True, db_index=True, editable=False, null=True, unique=True), + ), 
+ migrations.AlterField( + model_name='archiveresult', + name='end_ts', + field=models.DateTimeField(blank=True, default=None, null=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='start_ts', + field=models.DateTimeField(blank=True, default=None, null=True), + ), + migrations.AlterField( + model_name='archiveresult', + name='status', + field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=15), + ), + migrations.AlterField( + model_name='snapshot', + name='id', + field=models.CharField(editable=False, max_length=32, primary_key=True, serialize=False, unique=True), + ), + migrations.AlterField( + model_name='snapshot', + name='timestamp', + field=models.CharField(db_index=True, max_length=32, unique=True), + ), + migrations.AlterField( + model_name='snapshot', + name='url', + field=models.URLField(max_length=2048), + ), + migrations.AlterField( + model_name='tag', + name='slug', + field=models.SlugField(editable=False, max_length=100, unique=True), + ), + + # Create M2M model for snapshot tags + migrations.CreateModel( + name='SnapshotTag', + fields=[ + ('id', models.AutoField(primary_key=True, serialize=False, verbose_name='ID')), + ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')), + ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.tag')), + ], + options={ + 'db_table': 'core_snapshot_tags', + }, + ), + migrations.AlterUniqueTogether( + name='snapshottag', + unique_together={('snapshot', 'tag')}, + ), + + # Update tags field on Snapshot to use the through model + migrations.AlterField( + model_name='snapshot', + name='tags', + field=models.ManyToManyField(related_name='snapshot_set', through='core.SnapshotTag', to='core.tag'), + ), + + # Add constraints + migrations.AddConstraint( + model_name='snapshot', + 
constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'), + ), + migrations.AddConstraint( + model_name='snapshot', + constraint=models.UniqueConstraint(fields=['timestamp'], name='unique_timestamp'), + ), + ], + ), + ] diff --git a/archivebox/core/migrations/0026_final_field_adjustments.py b/archivebox/core/migrations/0026_final_field_adjustments.py new file mode 100644 index 00000000..a7d16774 --- /dev/null +++ b/archivebox/core/migrations/0026_final_field_adjustments.py @@ -0,0 +1,76 @@ +# Generated by hand on 2025-12-30 +# Final field adjustments to match model definitions exactly + +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone +from archivebox.uuid_compat import uuid7 + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0025_cleanup_schema'), + ('crawls', '0002_upgrade_to_0_9_0'), + ] + + operations = [ + # Alter Snapshot fields to match model exactly + migrations.AlterField( + model_name='snapshot', + name='id', + field=models.UUIDField(default=uuid7, editable=False, primary_key=True, unique=True), + ), + migrations.AlterField( + model_name='snapshot', + name='timestamp', + field=models.CharField(db_index=True, editable=False, max_length=32, unique=True), + ), + migrations.AlterField( + model_name='snapshot', + name='url', + field=models.URLField(db_index=True, unique=False), + ), + migrations.AlterField( + model_name='snapshot', + name='downloaded_at', + field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True), + ), + migrations.AlterField( + model_name='snapshot', + name='parent_snapshot', + field=models.ForeignKey(blank=True, db_index=True, help_text='Parent snapshot that discovered this URL (for recursive crawling)', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='child_snapshots', to='core.snapshot'), + ), + migrations.AlterField( + model_name='snapshot', + name='retry_at', + 
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True), + ), + migrations.AlterField( + model_name='snapshot', + name='fs_version', + field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10), + ), + migrations.AlterField( + model_name='snapshot', + name='tags', + field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'), + ), + + # Alter SnapshotTag fields + migrations.AlterField( + model_name='snapshottag', + name='id', + field=models.AutoField(primary_key=True, serialize=False, verbose_name='ID'), + ), + migrations.AlterField( + model_name='snapshottag', + name='snapshot', + field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'), + ), + migrations.AlterField( + model_name='snapshottag', + name='tag', + field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'), + ), + ] diff --git a/archivebox/core/widgets.py b/archivebox/core/widgets.py new file mode 100644 index 00000000..433f5c93 --- /dev/null +++ b/archivebox/core/widgets.py @@ -0,0 +1,512 @@ +__package__ = 'archivebox.core' + +import json +from django import forms +from django.utils.html import escape + + +class TagEditorWidget(forms.Widget): + """ + A widget that renders tags as clickable pills with inline editing. 
+ - Displays existing tags alphabetically as styled pills with X remove button + - Text input with HTML5 datalist for autocomplete suggestions + - Press Enter or Space to create new tags (auto-creates if doesn't exist) + - Uses AJAX for autocomplete and tag creation + """ + template_name = None # We render manually + + class Media: + css = {'all': []} + js = [] + + def __init__(self, attrs=None, snapshot_id=None): + self.snapshot_id = snapshot_id + super().__init__(attrs) + + def _escape(self, value): + """Escape HTML entities in value.""" + return escape(str(value)) if value else '' + + def render(self, name, value, attrs=None, renderer=None): + """ + Render the tag editor widget. + + Args: + name: Field name + value: Can be: + - QuerySet of Tag objects (from M2M field) + - List of tag names + - Comma-separated string of tag names + - None + attrs: HTML attributes + renderer: Not used + """ + # Parse value to get list of tag names + tags = [] + if value: + if hasattr(value, 'all'): # QuerySet + tags = sorted([tag.name for tag in value.all()]) + elif isinstance(value, (list, tuple)): + if value and hasattr(value[0], 'name'): # List of Tag objects + tags = sorted([tag.name for tag in value]) + else: # List of strings or IDs + # Could be tag IDs from form submission + from archivebox.core.models import Tag + tag_names = [] + for v in value: + if isinstance(v, str) and not v.isdigit(): + tag_names.append(v) + else: + try: + tag = Tag.objects.get(pk=v) + tag_names.append(tag.name) + except (Tag.DoesNotExist, ValueError): + if isinstance(v, str): + tag_names.append(v) + tags = sorted(tag_names) + elif isinstance(value, str): + tags = sorted([t.strip() for t in value.split(',') if t.strip()]) + + widget_id = attrs.get('id', name) if attrs else name + + # Build pills HTML + pills_html = '' + for tag in tags: + pills_html += f''' + + {self._escape(tag)} + + + ''' + + # Build the widget HTML + html = f''' +
+
+ {pills_html} +
+ + + +
+ + + ''' + + return html + + +class InlineTagEditorWidget(TagEditorWidget): + """ + Inline version of TagEditorWidget for use in list views. + Includes AJAX save functionality for immediate persistence. + """ + + def __init__(self, attrs=None, snapshot_id=None): + super().__init__(attrs, snapshot_id) + self.snapshot_id = snapshot_id + + def render(self, name, value, attrs=None, renderer=None, snapshot_id=None): + """Render inline tag editor with AJAX save.""" + # Use snapshot_id from __init__ or from render call + snapshot_id = snapshot_id or self.snapshot_id + + # Parse value to get list of tag dicts with id and name + tags = [] + tag_data = [] + if value: + if hasattr(value, 'all'): # QuerySet + for tag in value.all(): + tag_data.append({'id': tag.pk, 'name': tag.name}) + tag_data.sort(key=lambda x: x['name'].lower()) + tags = [t['name'] for t in tag_data] + elif isinstance(value, (list, tuple)): + if value and hasattr(value[0], 'name'): + for tag in value: + tag_data.append({'id': tag.pk, 'name': tag.name}) + tag_data.sort(key=lambda x: x['name'].lower()) + tags = [t['name'] for t in tag_data] + + widget_id = f"inline_tags_{snapshot_id}" if snapshot_id else (attrs.get('id', name) if attrs else name) + + # Build pills HTML with filter links + pills_html = '' + for td in tag_data: + pills_html += f''' + + {self._escape(td['name'])} + + + ''' + + html = f''' + + + {pills_html} + + + + + + + ''' + + return html diff --git a/archivebox/crawls/migrations/0002_upgrade_to_0_9_0.py b/archivebox/crawls/migrations/0002_upgrade_to_0_9_0.py new file mode 100644 index 00000000..7afca909 --- /dev/null +++ b/archivebox/crawls/migrations/0002_upgrade_to_0_9_0.py @@ -0,0 +1,90 @@ +# Generated by hand on 2025-12-29 +# Upgrades crawls_crawl table from v0.8.6rc0 to v0.9.0 schema + +from django.db import migrations + + +def upgrade_crawl_schema_if_needed(apps, schema_editor): + """ + Upgrade crawls_crawl table if it has the old v0.8.6rc0 schema (no urls column). 
def upgrade_crawl_schema_if_needed(apps, schema_editor):
    """
    Bring crawls_crawl up to the v0.9.0 layout when it is still on v0.8.6rc0.

    The old layout is recognised by the absence of a ``urls`` column. When it
    is found, a replacement table is created, the existing rows are copied
    into it (``urls`` is filled with '[]'; columns missing from the copy list
    take their declared defaults, or NULL for persona_id), the old table is
    dropped, the replacement is renamed into place, and five single-column
    indexes are created. When the column already exists only a status line
    is printed.

    Uses SQLite's pragma_table_info() table-valued function for detection.
    """
    create_new_table_sql = """
        CREATE TABLE crawls_crawl_new (
            id TEXT PRIMARY KEY NOT NULL,
            created_at DATETIME NOT NULL,
            modified_at DATETIME NOT NULL,
            num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
            num_uses_failed INTEGER NOT NULL DEFAULT 0,

            urls TEXT NOT NULL DEFAULT '[]',
            config TEXT,
            max_depth INTEGER NOT NULL DEFAULT 0,
            tags_str VARCHAR(1024) NOT NULL DEFAULT '',
            persona_id TEXT,
            label VARCHAR(64) NOT NULL DEFAULT '',
            notes TEXT NOT NULL DEFAULT '',
            output_dir VARCHAR(512) NOT NULL DEFAULT '',

            status VARCHAR(15) NOT NULL DEFAULT 'queued',
            retry_at DATETIME,
            created_by_id INTEGER NOT NULL,
            schedule_id TEXT,

            FOREIGN KEY (created_by_id) REFERENCES auth_user(id) ON DELETE CASCADE,
            FOREIGN KEY (schedule_id) REFERENCES crawls_crawlschedule(id) ON DELETE SET NULL
        )
    """
    copy_rows_sql = """
        INSERT INTO crawls_crawl_new (
            id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
            urls, config, max_depth, tags_str, status, retry_at, created_by_id, schedule_id
        )
        SELECT
            id, created_at, modified_at, num_uses_succeeded, num_uses_failed,
            '[]' as urls, config, max_depth, tags_str, status, retry_at, created_by_id,
            CAST(schedule_id AS TEXT)
        FROM crawls_crawl
    """
    # Order matters only for parity with the original statement sequence
    indexed_columns = ("status", "retry_at", "created_at", "created_by_id", "schedule_id")

    with schema_editor.connection.cursor() as cursor:
        # A urls column means the table is already on the v0.9.0 layout
        cursor.execute("""
            SELECT COUNT(*) FROM pragma_table_info('crawls_crawl') WHERE name='urls'
        """)
        (urls_column_count,) = cursor.fetchone()
        if urls_column_count > 0:
            print(" ✓ crawls_crawl already has v0.9.0 schema")
            return

        print(" Upgrading crawls_crawl from v0.8.6rc0 to v0.9.0 schema...")

        # Build the replacement table and move the existing rows into it
        cursor.execute(create_new_table_sql)
        cursor.execute(copy_rows_sql)

        # Swap the replacement in under the original name
        cursor.execute("DROP TABLE crawls_crawl")
        cursor.execute("ALTER TABLE crawls_crawl_new RENAME TO crawls_crawl")

        # Dropping the old table removed its indexes, so create them afresh
        for column in indexed_columns:
            cursor.execute(f"CREATE INDEX crawls_crawl_{column}_idx ON crawls_crawl({column})")

        print(" ✓ Upgraded crawls_crawl to v0.9.0 schema")
class Migration(migrations.Migration):
    """Conditionally upgrade the crawls_crawl table to the v0.9.0 schema."""

    # Runs after the crawls app's initial migration. The auth dependency is
    # declared because the rebuilt table has a foreign key to auth_user.
    dependencies = [
        ('crawls', '0001_initial'),
        ('auth', '0012_alter_user_first_name_max_length'),
    ]

    operations = [
        # Reversing this migration is a no-op: the old table layout is not restored.
        migrations.RunPython(
            upgrade_crawl_schema_if_needed,
            reverse_code=migrations.RunPython.noop,
        ),
    ]
        /* ============================================
           Tag Editor Widget Styles
           ============================================ */

        /* Main container - acts as input field */
        .tag-editor-container {
            display: flex;
            flex-wrap: wrap;
            align-items: center;
            gap: 6px;
            padding: 8px 12px;
            min-height: 42px;
            background: #fff;
            border: 1px solid #d1d5db;
            border-radius: 8px;
            cursor: text;
            transition: border-color 0.15s ease, box-shadow 0.15s ease;
        }

        /* Focus ring is drawn on the container whenever any descendant has focus */
        .tag-editor-container:focus-within {
            border-color: #3b82f6;
            box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.15);
        }

        /* Pills container */
        .tag-pills {
            display: flex;
            flex-wrap: wrap;
            gap: 6px;
            align-items: center;
        }

        /* Individual tag pill */
        .tag-pill {
            display: inline-flex;
            align-items: center;
            gap: 4px;
            padding: 4px 8px 4px 10px;
            background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%);
            color: #fff;
            font-size: 13px;
            font-weight: 500;
            border-radius: 16px;
            white-space: nowrap;
            transition: all 0.15s ease;
            -webkit-font-smoothing: antialiased;
        }

        /* Darker gradient on hover */
        .tag-pill:hover {
            background: linear-gradient(135deg, #2563eb 0%, #1d4ed8 100%);
        }

        /* Links inside a pill inherit the pill's white text and only underline on hover */
        .tag-pill a.tag-link {
            color: #fff;
            text-decoration: none;
        }

        .tag-pill a.tag-link:hover {
            text-decoration: underline;
        }

        /* Remove button on pills */
        .tag-remove-btn {
            display: inline-flex;
            align-items: center;
            justify-content: center;
            width: 16px;
            height: 16px;
            padding: 0;
            margin: 0;
            background: rgba(255, 255, 255, 0.2);
            border: none;
            border-radius: 50%;
            color: #fff;
            font-size: 14px;
            font-weight: 600;
            line-height: 1;
            cursor: pointer;
            opacity: 0.7;
            transition: all 0.15s ease;
        }

        .tag-remove-btn:hover {
            background: rgba(255, 255, 255, 0.4);
            opacity: 1;
        }

        /* Inline input for adding tags */
        /* Borderless and outline-free so it blends into the surrounding container */
        .tag-inline-input {
            flex: 1;
            min-width: 120px;
            padding: 4px 0;
            border: none;
            outline: none;
            font-size: 14px;
            font-family: inherit;
            background: transparent;
            color: #1e293b;
        }

        .tag-inline-input::placeholder {
            color: #94a3b8;
        }

        /* Inline editor for list view - more compact */
        .tag-editor-inline {
            display: inline-flex;
            flex-wrap: wrap;
            align-items: center;
            gap: 4px;
            padding: 2px 4px;
            background: transparent;
            border-radius: 4px;
            cursor: text;
            vertical-align: middle;
        }

        .tag-pills-inline {
            display: inline-flex;
            flex-wrap: wrap;
            gap: 4px;
            align-items: center;
        }

        /* Compact overrides for pills and remove buttons inside the inline editor */
        .tag-editor-inline .tag-pill {
            padding: 2px 6px 2px 8px;
            font-size: 11px;
            border-radius: 12px;
        }

        .tag-editor-inline .tag-remove-btn {
            width: 14px;
            height: 14px;
            font-size: 12px;
        }

        /* Small input: starts 24px wide and grows to 80px on focus (animated via the width transition) */
        /* NOTE(review): outline is removed and this section defines no :focus-within style for
           .tag-editor-inline, so the width/color change is the only focus cue - confirm this is
           sufficient for keyboard users */
        .tag-inline-input-sm {
            width: 24px;
            min-width: 24px;
            max-width: 100px;
            padding: 2px 4px;
            border: none;
            outline: none;
            font-size: 11px;
            font-family: inherit;
            background: transparent;
            color: #64748b;
            transition: width 0.15s ease;
        }

        .tag-inline-input-sm:focus {
            width: 80px;
            color: #1e293b;
        }

        .tag-inline-input-sm::placeholder {
            color: #94a3b8;
        }

        /* Container in list view title column */
        .tags-inline-editor {
            display: inline;
            margin-left: 8px;
        }

        /* Existing tag styles (keep for backwards compat) */
        /* Uses the same gradient and hover colors as .tag-pill */
        .tags .tag {
            display: inline-block;
            padding: 2px 8px;
            margin: 1px 2px;
            background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%);
            color: #fff;
            font-size: 11px;
            font-weight: 500;
            border-radius: 12px;
            text-decoration: none;
            transition: all 0.15s ease;
        }

        .tags .tag:hover {
            background: linear-gradient(135deg, #2563eb 0%, #1d4ed8 100%);
        }