continue renaming extractor to plugin, add plan for hook concurrency, add chrome kill helper script

2026-04-05 07:17:52 +10:00 · 2025-12-28 05:29:24 -08:00
parent d2e65cfd38
commit 4ccb0863bb
53 changed files with 456 additions and 493 deletions
--- a/archivebox/core/admin_archiveresults.py
+++ b/archivebox/core/admin_archiveresults.py
@@ -252,9 +252,9 @@ class ArchiveResultInline(admin.TabularInline):


 class ArchiveResultAdmin(BaseModelAdmin):
-    list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
+    list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
    sort_fields = ('id', 'created_by', 'created_at', 'plugin', 'status')
-    readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon', 'iface')
+    readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'iface')
    search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
    autocomplete_fields = ['snapshot']

--- a/archivebox/core/admin_snapshots.py
+++ b/archivebox/core/admin_snapshots.py
@@ -46,9 +46,9 @@ class SnapshotActionForm(ActionForm):
            ),
        )

-    # TODO: allow selecting actions for specific extractors? is this useful?
-    # extractor = forms.ChoiceField(
-    #     choices=ArchiveResult.EXTRACTOR_CHOICES,
+    # TODO: allow selecting actions for specific extractor plugins? is this useful?
+    # plugin = forms.ChoiceField(
+    #     choices=ArchiveResult.PLUGIN_CHOICES,
    #     required=False,
    #     widget=forms.MultileChoiceField(attrs={'class': "form-control"})
    # )
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -1041,7 +1041,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
        return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()

    def icons(self) -> str:
-        """Generate HTML icons showing which extractors have succeeded for this snapshot"""
+        """Generate HTML icons showing which extractor plugins have succeeded for this snapshot"""
        from django.utils.html import format_html, mark_safe

        cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
@@ -1475,7 +1475,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                        priority = 50
                elif 'index' in name_lower:
                    priority = 100
-                elif name_lower.startswith(('output', 'content', extractor_name)):
+                elif name_lower.startswith(('output', 'content', plugin_name)):
                    priority = 50
                elif ext in ('html', 'htm', 'pdf'):
                    priority = 30
--- a/archivebox/core/statemachines.py
+++ b/archivebox/core/statemachines.py
@@ -91,7 +91,7 @@ class SnapshotMachine(StateMachine, strict_states=True):
            retry_at=timezone.now() + timedelta(seconds=30),  # if failed, wait 30s before retrying
        )

-        # Run the snapshot - creates pending archiveresults for all enabled extractors
+        # Run the snapshot - creates pending archiveresults for all enabled plugins
        self.snapshot.run()

        # unlock the snapshot after we're done + set status = started
@@ -179,15 +179,15 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
        return can_start
    
    def is_succeeded(self) -> bool:
-        """Check if extraction succeeded (status was set by run_extractor())."""
+        """Check if extractor plugin succeeded (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
-    
+
    def is_failed(self) -> bool:
-        """Check if extraction failed (status was set by run_extractor())."""
+        """Check if extractor plugin failed (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
-    
+
    def is_skipped(self) -> bool:
-        """Check if extraction was skipped (status was set by run_extractor())."""
+        """Check if extractor plugin was skipped (status was set by run())."""
        return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
    
    def is_backoff(self) -> bool:
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -96,8 +96,8 @@ class SnapshotView(View):
            if not key.endswith('_path') or not path or path.startswith('http'):
                continue

-            extractor_name = key.replace('_path', '')
-            if extractor_name in archiveresults:
+            plugin_name = key.replace('_path', '')
+            if plugin_name in archiveresults:
                continue  # Already have this from ArchiveResult

            file_path = snap_dir / path
@@ -107,8 +107,8 @@ class SnapshotView(View):
            try:
                file_size = file_path.stat().st_size
                if file_size >= 15_000:  # Only show files > 15KB
-                    archiveresults[extractor_name] = {
-                        'name': extractor_name,
+                    archiveresults[plugin_name] = {
+                        'name': plugin_name,
                        'path': path,
                        'ts': ts_to_date_str(file_path.stat().st_mtime or 0),
                        'size': file_size,
@@ -117,7 +117,7 @@ class SnapshotView(View):
            except OSError:
                continue

-        # Get available extractors from hooks (sorted by numeric prefix for ordering)
+        # Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
        # Convert to base names for display ordering
        all_plugins = [get_extractor_name(e) for e in get_extractors()]
        preferred_types = tuple(all_plugins)
@@ -437,7 +437,7 @@ class AddView(UserPassesTestMixin, FormView):
        parser = form.cleaned_data.get("parser", "auto")  # default to auto-detect parser
        tag = form.cleaned_data["tag"]
        depth = 0 if form.cleaned_data["depth"] == "0" else 1
-        extractors = ','.join(form.cleaned_data["archive_methods"])
+        plugins = ','.join(form.cleaned_data["archive_methods"])
        input_kwargs = {
            "urls": urls,
            "tag": tag,
@@ -447,8 +447,8 @@ class AddView(UserPassesTestMixin, FormView):
            "out_dir": DATA_DIR,
            "created_by_id": self.request.user.pk,
        }
-        if extractors:
-            input_kwargs.update({"extractors": extractors})
+        if plugins:
+            input_kwargs.update({"plugins": plugins})


        from archivebox.config.permissions import HOSTNAME
@@ -472,7 +472,7 @@ class AddView(UserPassesTestMixin, FormView):
                # 'INDEX_ONLY': index_only,
                # 'OVERWRITE': False,
                'DEPTH': depth,
-                'EXTRACTORS': extractors or '',
+                'PLUGINS': plugins or '',
                # 'DEFAULT_PERSONA': persona or 'Default',
            }
        )
@@ -580,17 +580,17 @@ def live_progress_view(request):
                snapshot_results = snapshot.archiveresult_set.all()

                # Count in memory instead of DB queries
-                total_extractors = len(snapshot_results)
-                completed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
-                failed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
-                pending_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
+                total_plugins = len(snapshot_results)
+                completed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
+                failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
+                pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)

                # Calculate snapshot progress
-                snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
+                snapshot_progress = int(((completed_plugins + failed_plugins) / total_plugins) * 100) if total_plugins > 0 else 0

-                # Get all extractors for this snapshot (already prefetched, sort in Python)
+                # Get all extractor plugins for this snapshot (already prefetched, sort in Python)
                # Order: started first, then queued, then completed
-                def extractor_sort_key(ar):
+                def plugin_sort_key(ar):
                    status_order = {
                        ArchiveResult.StatusChoices.STARTED: 0,
                        ArchiveResult.StatusChoices.QUEUED: 1,
@@ -605,7 +605,7 @@ def live_progress_view(request):
                        'plugin': ar.plugin,
                        'status': ar.status,
                    }
-                    for ar in sorted(snapshot_results, key=extractor_sort_key)
+                    for ar in sorted(snapshot_results, key=plugin_sort_key)
                ]

                active_snapshots_for_crawl.append({
@@ -614,10 +614,10 @@ def live_progress_view(request):
                    'status': snapshot.status,
                    'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None,
                    'progress': snapshot_progress,
-                    'total_extractors': total_extractors,
-                    'completed_extractors': completed_extractors,
-                    'failed_extractors': failed_extractors,
-                    'pending_extractors': pending_extractors,
+                    'total_plugins': total_plugins,
+                    'completed_plugins': completed_plugins,
+                    'failed_plugins': failed_plugins,
+                    'pending_plugins': pending_plugins,
                    'all_plugins': all_plugins,
                })