mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 07:17:52 +10:00
continue renaming extractor to plugin, add plan for hook concurrency, add chrome kill helper script
This commit is contained in:
@@ -252,9 +252,9 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
|
||||
|
||||
class ArchiveResultAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
|
||||
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'plugin_with_icon', 'cmd_str', 'output_str')
|
||||
sort_fields = ('id', 'created_by', 'created_at', 'plugin', 'status')
|
||||
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon', 'iface')
|
||||
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'plugin_with_icon', 'iface')
|
||||
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
||||
autocomplete_fields = ['snapshot']
|
||||
|
||||
|
||||
@@ -46,9 +46,9 @@ class SnapshotActionForm(ActionForm):
|
||||
),
|
||||
)
|
||||
|
||||
# TODO: allow selecting actions for specific extractors? is this useful?
|
||||
# extractor = forms.ChoiceField(
|
||||
# choices=ArchiveResult.EXTRACTOR_CHOICES,
|
||||
# TODO: allow selecting actions for specific extractor plugins? is this useful?
|
||||
# plugin = forms.ChoiceField(
|
||||
# choices=ArchiveResult.PLUGIN_CHOICES,
|
||||
# required=False,
|
||||
# widget=forms.MultipleChoiceField(attrs={'class': "form-control"})
|
||||
# )
|
||||
|
||||
@@ -1041,7 +1041,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()
|
||||
|
||||
def icons(self) -> str:
|
||||
"""Generate HTML icons showing which extractors have succeeded for this snapshot"""
|
||||
"""Generate HTML icons showing which extractor plugins have succeeded for this snapshot"""
|
||||
from django.utils.html import format_html, mark_safe
|
||||
|
||||
cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
|
||||
@@ -1475,7 +1475,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
priority = 50
|
||||
elif 'index' in name_lower:
|
||||
priority = 100
|
||||
elif name_lower.startswith(('output', 'content', extractor_name)):
|
||||
elif name_lower.startswith(('output', 'content', plugin_name)):
|
||||
priority = 50
|
||||
elif ext in ('html', 'htm', 'pdf'):
|
||||
priority = 30
|
||||
|
||||
@@ -91,7 +91,7 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
|
||||
)
|
||||
|
||||
# Run the snapshot - creates pending archiveresults for all enabled extractors
|
||||
# Run the snapshot - creates pending archiveresults for all enabled plugins
|
||||
self.snapshot.run()
|
||||
|
||||
# unlock the snapshot after we're done + set status = started
|
||||
@@ -179,15 +179,15 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
return can_start
|
||||
|
||||
def is_succeeded(self) -> bool:
|
||||
"""Check if extraction succeeded (status was set by run_extractor())."""
|
||||
"""Check if extractor plugin succeeded (status was set by run())."""
|
||||
return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
|
||||
|
||||
|
||||
def is_failed(self) -> bool:
|
||||
"""Check if extraction failed (status was set by run_extractor())."""
|
||||
"""Check if extractor plugin failed (status was set by run())."""
|
||||
return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
|
||||
|
||||
|
||||
def is_skipped(self) -> bool:
|
||||
"""Check if extraction was skipped (status was set by run_extractor())."""
|
||||
"""Check if extractor plugin was skipped (status was set by run())."""
|
||||
return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
|
||||
|
||||
def is_backoff(self) -> bool:
|
||||
|
||||
@@ -96,8 +96,8 @@ class SnapshotView(View):
|
||||
if not key.endswith('_path') or not path or path.startswith('http'):
|
||||
continue
|
||||
|
||||
extractor_name = key.replace('_path', '')
|
||||
if extractor_name in archiveresults:
|
||||
plugin_name = key.replace('_path', '')
|
||||
if plugin_name in archiveresults:
|
||||
continue # Already have this from ArchiveResult
|
||||
|
||||
file_path = snap_dir / path
|
||||
@@ -107,8 +107,8 @@ class SnapshotView(View):
|
||||
try:
|
||||
file_size = file_path.stat().st_size
|
||||
if file_size >= 15_000: # Only show files >= 15KB
|
||||
archiveresults[extractor_name] = {
|
||||
'name': extractor_name,
|
||||
archiveresults[plugin_name] = {
|
||||
'name': plugin_name,
|
||||
'path': path,
|
||||
'ts': ts_to_date_str(file_path.stat().st_mtime or 0),
|
||||
'size': file_size,
|
||||
@@ -117,7 +117,7 @@ class SnapshotView(View):
|
||||
except OSError:
|
||||
continue
|
||||
|
||||
# Get available extractors from hooks (sorted by numeric prefix for ordering)
|
||||
# Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
|
||||
# Convert to base names for display ordering
|
||||
all_plugins = [get_extractor_name(e) for e in get_extractors()]
|
||||
preferred_types = tuple(all_plugins)
|
||||
@@ -437,7 +437,7 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser
|
||||
tag = form.cleaned_data["tag"]
|
||||
depth = 0 if form.cleaned_data["depth"] == "0" else 1
|
||||
extractors = ','.join(form.cleaned_data["archive_methods"])
|
||||
plugins = ','.join(form.cleaned_data["archive_methods"])
|
||||
input_kwargs = {
|
||||
"urls": urls,
|
||||
"tag": tag,
|
||||
@@ -447,8 +447,8 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
"out_dir": DATA_DIR,
|
||||
"created_by_id": self.request.user.pk,
|
||||
}
|
||||
if extractors:
|
||||
input_kwargs.update({"extractors": extractors})
|
||||
if plugins:
|
||||
input_kwargs.update({"plugins": plugins})
|
||||
|
||||
|
||||
from archivebox.config.permissions import HOSTNAME
|
||||
@@ -472,7 +472,7 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
# 'INDEX_ONLY': index_only,
|
||||
# 'OVERWRITE': False,
|
||||
'DEPTH': depth,
|
||||
'EXTRACTORS': extractors or '',
|
||||
'PLUGINS': plugins or '',
|
||||
# 'DEFAULT_PERSONA': persona or 'Default',
|
||||
}
|
||||
)
|
||||
@@ -580,17 +580,17 @@ def live_progress_view(request):
|
||||
snapshot_results = snapshot.archiveresult_set.all()
|
||||
|
||||
# Count in memory instead of DB queries
|
||||
total_extractors = len(snapshot_results)
|
||||
completed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
|
||||
failed_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
|
||||
pending_extractors = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
|
||||
total_plugins = len(snapshot_results)
|
||||
completed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.SUCCEEDED)
|
||||
failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
|
||||
pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
|
||||
|
||||
# Calculate snapshot progress
|
||||
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
|
||||
snapshot_progress = int(((completed_plugins + failed_plugins) / total_plugins) * 100) if total_plugins > 0 else 0
|
||||
|
||||
# Get all extractors for this snapshot (already prefetched, sort in Python)
|
||||
# Get all extractor plugins for this snapshot (already prefetched, sort in Python)
|
||||
# Order: started first, then queued, then completed
|
||||
def extractor_sort_key(ar):
|
||||
def plugin_sort_key(ar):
|
||||
status_order = {
|
||||
ArchiveResult.StatusChoices.STARTED: 0,
|
||||
ArchiveResult.StatusChoices.QUEUED: 1,
|
||||
@@ -605,7 +605,7 @@ def live_progress_view(request):
|
||||
'plugin': ar.plugin,
|
||||
'status': ar.status,
|
||||
}
|
||||
for ar in sorted(snapshot_results, key=extractor_sort_key)
|
||||
for ar in sorted(snapshot_results, key=plugin_sort_key)
|
||||
]
|
||||
|
||||
active_snapshots_for_crawl.append({
|
||||
@@ -614,10 +614,10 @@ def live_progress_view(request):
|
||||
'status': snapshot.status,
|
||||
'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None,
|
||||
'progress': snapshot_progress,
|
||||
'total_extractors': total_extractors,
|
||||
'completed_extractors': completed_extractors,
|
||||
'failed_extractors': failed_extractors,
|
||||
'pending_extractors': pending_extractors,
|
||||
'total_plugins': total_plugins,
|
||||
'completed_plugins': completed_plugins,
|
||||
'failed_plugins': failed_plugins,
|
||||
'pending_plugins': pending_plugins,
|
||||
'all_plugins': all_plugins,
|
||||
})
|
||||
|
||||
|
||||
Reference in New Issue
Block a user