continue renaming extractor to plugin, add plan for hook concurrency, add chrome kill helper script

This commit is contained in:
Nick Sweeting
2025-12-28 05:29:24 -08:00
parent d2e65cfd38
commit 4ccb0863bb
53 changed files with 456 additions and 493 deletions

View File

@@ -252,9 +252,9 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_by', 'created_at', 'plugin', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon', 'iface')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'iface')
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
autocomplete_fields = ['snapshot']

View File

@@ -46,9 +46,9 @@ class SnapshotActionForm(ActionForm):
),
)
# TODO: allow selecting actions for specific extractors? is this useful?
# extractor = forms.ChoiceField(
# choices=ArchiveResult.EXTRACTOR_CHOICES,
# TODO: allow selecting actions for specific extractor plugins? is this useful?
# plugin = forms.ChoiceField(
# choices=ArchiveResult.PLUGIN_CHOICES,
# required=False,
# widget=forms.MultileChoiceField(attrs={'class': "form-control"})
# )

View File

@@ -1041,7 +1041,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()
def icons(self) -> str:
"""Generate HTML icons showing which extractors have succeeded for this snapshot"""
"""Generate HTML icons showing which extractor plugins have succeeded for this snapshot"""
from django.utils.html import format_html, mark_safe
cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
@@ -1475,7 +1475,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
priority = 50
elif 'index' in name_lower:
priority = 100
elif name_lower.startswith(('output', 'content', extractor_name)):
elif name_lower.startswith(('output', 'content', plugin_name)):
priority = 50
elif ext in ('html', 'htm', 'pdf'):
priority = 30

View File

@@ -91,7 +91,7 @@ class SnapshotMachine(StateMachine, strict_states=True):
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
)
# Run the snapshot - creates pending archiveresults for all enabled extractors
# Run the snapshot - creates pending archiveresults for all enabled plugins
self.snapshot.run()
# unlock the snapshot after we're done + set status = started
@@ -179,15 +179,15 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
return can_start
def is_succeeded(self) -> bool:
"""Check if extraction succeeded (status was set by run_extractor())."""
"""Check if extractor plugin succeeded (status was set by run())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
def is_failed(self) -> bool:
"""Check if extraction failed (status was set by run_extractor())."""
"""Check if extractor plugin failed (status was set by run())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
def is_skipped(self) -> bool:
"""Check if extraction was skipped (status was set by run_extractor())."""
"""Check if extractor plugin was skipped (status was set by run())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
def is_backoff(self) -> bool:

View File

@@ -96,8 +96,8 @@ class SnapshotView(View):
if not key.endswith('_path') or not path or path.startswith('http'):
continue
extractor_name = key.replace('_path', '')
if extractor_name in archiveresults:
plugin_name = key.replace('_path', '')
if plugin_name in archiveresults:
continue # Already have this from ArchiveResult
file_path = snap_dir / path
@@ -107,8 +107,8 @@ class SnapshotView(View):
try:
file_size = file_path.stat().st_size
if file_size >= 15_000: # Only show files > 15KB
archiveresults[extractor_name] = {
'name': extractor_name,
archiveresults[plugin_name] = {
'name': plugin_name,
'path': path,
'ts': ts_to_date_str(file_path.stat().st_mtime or 0),
'size': file_size,
@@ -117,7 +117,7 @@ class SnapshotView(View):
except OSError:
continue
# Get available extractors from hooks (sorted by numeric prefix for ordering)
# Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
# Convert to base names for display ordering
all_plugins = [get_extractor_name(e) for e in get_extractors()]
preferred_types = tuple(all_plugins)
@@ -437,7 +437,7 @@ class AddView(UserPassesTestMixin, FormView):
parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser
tag = form.cleaned_data["tag"]
depth = 0 if form.cleaned_data["depth"] == "0" else 1
extractors = ','.join(form.cleaned_data["archive_methods"])
plugins = ','.join(form.cleaned_data["archive_methods"])
input_kwargs = {
"urls": urls,
"tag": tag,
@@ -447,8 +447,8 @@ class AddView(UserPassesTestMixin, FormView):
"out_dir": DATA_DIR,
"created_by_id": self.request.user.pk,
}
if extractors:
input_kwargs.update({"extractors": extractors})
if plugins:
input_kwargs.update({"plugins": plugins})
from archivebox.config.permissions import HOSTNAME
@@ -472,7 +472,7 @@ class AddView(UserPassesTestMixin, FormView):
# 'INDEX_ONLY': index_only,
# 'OVERWRITE': False,
'DEPTH': depth,
'EXTRACTORS': extractors or '',
'PLUGINS': plugins or '',
# 'DEFAULT_PERSONA': persona or 'Default',
}
)
@@ -580,17 +580,17 @@ def live_progress_view(request):
snapshot_results = snapshot.archiveresult_set.all()
# Count in memory instead of DB queries
total_extractors = len(snapshot_results)
completed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
failed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
pending_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
total_plugins = len(snapshot_results)
completed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
# Calculate snapshot progress
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
snapshot_progress = int(((completed_plugins + failed_plugins) / total_plugins) * 100) if total_plugins > 0 else 0
# Get all extractors for this snapshot (already prefetched, sort in Python)
# Get all extractor plugins for this snapshot (already prefetched, sort in Python)
# Order: started first, then queued, then completed
def extractor_sort_key(ar):
def plugin_sort_key(ar):
status_order = {
ArchiveResult.StatusChoices.STARTED: 0,
ArchiveResult.StatusChoices.QUEUED: 1,
@@ -605,7 +605,7 @@ def live_progress_view(request):
'plugin': ar.plugin,
'status': ar.status,
}
for ar in sorted(snapshot_results, key=extractor_sort_key)
for ar in sorted(snapshot_results, key=plugin_sort_key)
]
active_snapshots_for_crawl.append({
@@ -614,10 +614,10 @@ def live_progress_view(request):
'status': snapshot.status,
'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None,
'progress': snapshot_progress,
'total_extractors': total_extractors,
'completed_extractors': completed_extractors,
'failed_extractors': failed_extractors,
'pending_extractors': pending_extractors,
'total_plugins': total_plugins,
'completed_plugins': completed_plugins,
'failed_plugins': failed_plugins,
'pending_plugins': pending_plugins,
'all_plugins': all_plugins,
})