mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 07:17:52 +10:00
rename extractor to plugin everywhere
This commit is contained in:
@@ -54,7 +54,7 @@ class AddCommandSchema(Schema):
|
||||
tag: str = ""
|
||||
depth: int = 0
|
||||
parser: str = "auto"
|
||||
extract: str = ""
|
||||
plugins: str = ""
|
||||
update: bool = not ARCHIVING_CONFIG.ONLY_NEW # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW
|
||||
overwrite: bool = False
|
||||
index_only: bool = False
|
||||
@@ -69,7 +69,7 @@ class UpdateCommandSchema(Schema):
|
||||
status: Optional[StatusChoices] = StatusChoices.unarchived
|
||||
filter_type: Optional[str] = FilterTypeChoices.substring
|
||||
filter_patterns: Optional[List[str]] = ['https://example.com']
|
||||
extractors: Optional[str] = ""
|
||||
plugins: Optional[str] = ""
|
||||
|
||||
class ScheduleCommandSchema(Schema):
|
||||
import_path: Optional[str] = None
|
||||
@@ -115,7 +115,7 @@ def cli_add(request, args: AddCommandSchema):
|
||||
update=args.update,
|
||||
index_only=args.index_only,
|
||||
overwrite=args.overwrite,
|
||||
plugins=args.extract, # extract in API maps to plugins param
|
||||
plugins=args.plugins,
|
||||
parser=args.parser,
|
||||
bg=True, # Always run in background for API calls
|
||||
)
|
||||
@@ -143,7 +143,7 @@ def cli_update(request, args: UpdateCommandSchema):
|
||||
status=args.status,
|
||||
filter_type=args.filter_type,
|
||||
filter_patterns=args.filter_patterns,
|
||||
extractors=args.extractors,
|
||||
plugins=args.plugins,
|
||||
)
|
||||
return {
|
||||
"success": True,
|
||||
|
||||
@@ -65,7 +65,8 @@ class MinimalArchiveResultSchema(Schema):
|
||||
created_by_username: str
|
||||
status: str
|
||||
retry_at: datetime | None
|
||||
extractor: str
|
||||
plugin: str
|
||||
hook_name: str
|
||||
cmd_version: str | None
|
||||
cmd: list[str] | None
|
||||
pwd: str | None
|
||||
@@ -113,13 +114,14 @@ class ArchiveResultSchema(MinimalArchiveResultSchema):
|
||||
|
||||
class ArchiveResultFilterSchema(FilterSchema):
|
||||
id: Optional[str] = Field(None, q=['id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])
|
||||
search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output_str__icontains', 'id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])
|
||||
search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'plugin', 'output_str__icontains', 'id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])
|
||||
snapshot_id: Optional[str] = Field(None, q=['snapshot__id__startswith', 'snapshot__timestamp__startswith'])
|
||||
snapshot_url: Optional[str] = Field(None, q='snapshot__url__icontains')
|
||||
snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name__icontains')
|
||||
status: Optional[str] = Field(None, q='status')
|
||||
output_str: Optional[str] = Field(None, q='output_str__icontains')
|
||||
extractor: Optional[str] = Field(None, q='extractor__icontains')
|
||||
plugin: Optional[str] = Field(None, q='plugin__icontains')
|
||||
hook_name: Optional[str] = Field(None, q='hook_name__icontains')
|
||||
cmd: Optional[str] = Field(None, q='cmd__0__icontains')
|
||||
pwd: Optional[str] = Field(None, q='pwd__icontains')
|
||||
cmd_version: Optional[str] = Field(None, q='cmd_version')
|
||||
|
||||
@@ -86,7 +86,7 @@ def add(urls: str | list[str],
|
||||
'ONLY_NEW': not update,
|
||||
'INDEX_ONLY': index_only,
|
||||
'OVERWRITE': overwrite,
|
||||
'EXTRACTORS': plugins,
|
||||
'PLUGINS': plugins,
|
||||
'DEFAULT_PERSONA': persona or 'Default',
|
||||
'PARSER': parser,
|
||||
}
|
||||
|
||||
@@ -40,7 +40,7 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
||||
"""
|
||||
Run extraction for a single ArchiveResult by ID (used by workers).
|
||||
|
||||
Triggers the ArchiveResult's state machine tick() to run the extractor.
|
||||
Triggers the ArchiveResult's state machine tick() to run the extractor plugin.
|
||||
"""
|
||||
from rich import print as rprint
|
||||
from core.models import ArchiveResult
|
||||
@@ -51,7 +51,7 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
||||
rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr)
|
||||
return 1
|
||||
|
||||
rprint(f'[blue]Extracting {archiveresult.extractor} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
|
||||
rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)
|
||||
|
||||
try:
|
||||
# Trigger state machine tick - this runs the actual extraction
|
||||
@@ -151,7 +151,7 @@ def run_plugins(
|
||||
# Only create for specific plugin
|
||||
result, created = ArchiveResult.objects.get_or_create(
|
||||
snapshot=snapshot,
|
||||
extractor=plugin,
|
||||
plugin=plugin,
|
||||
defaults={
|
||||
'status': ArchiveResult.StatusChoices.QUEUED,
|
||||
'retry_at': timezone.now(),
|
||||
@@ -193,7 +193,7 @@ def run_plugins(
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
results = snapshot.archiveresult_set.all()
|
||||
if plugin:
|
||||
results = results.filter(extractor=plugin)
|
||||
results = results.filter(plugin=plugin)
|
||||
|
||||
for result in results:
|
||||
if is_tty:
|
||||
@@ -202,7 +202,7 @@ def run_plugins(
|
||||
'failed': 'red',
|
||||
'skipped': 'yellow',
|
||||
}.get(result.status, 'dim')
|
||||
rprint(f' [{status_color}]{result.status}[/{status_color}] {result.extractor} → {result.output_str or ""}', file=sys.stderr)
|
||||
rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ""}', file=sys.stderr)
|
||||
else:
|
||||
write_record(archiveresult_to_jsonl(result))
|
||||
except Snapshot.DoesNotExist:
|
||||
|
||||
@@ -13,16 +13,16 @@ from archivebox.config import DATA_DIR
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.misc.paginators import AccelleratedPaginator
|
||||
from archivebox.base_models.admin import BaseModelAdmin
|
||||
from archivebox.hooks import get_extractor_icon
|
||||
from archivebox.hooks import get_plugin_icon
|
||||
|
||||
|
||||
from core.models import ArchiveResult, Snapshot
|
||||
|
||||
|
||||
def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
"""Render a nice inline list view of archive results with status, extractor, output, and actions."""
|
||||
"""Render a nice inline list view of archive results with status, plugin, output, and actions."""
|
||||
|
||||
results = list(archiveresults_qs.order_by('extractor').select_related('snapshot')[:limit])
|
||||
results = list(archiveresults_qs.order_by('plugin').select_related('snapshot')[:limit])
|
||||
|
||||
if not results:
|
||||
return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')
|
||||
@@ -40,8 +40,8 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
status = result.status or 'queued'
|
||||
color, bg = status_colors.get(status, ('#6b7280', '#f3f4f6'))
|
||||
|
||||
# Get extractor icon
|
||||
icon = get_extractor_icon(result.extractor)
|
||||
# Get plugin icon
|
||||
icon = get_plugin_icon(result.plugin)
|
||||
|
||||
# Format timestamp
|
||||
end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
|
||||
@@ -79,7 +79,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
font-size: 11px; font-weight: 600; text-transform: uppercase;
|
||||
color: {color}; background: {bg};">{status}</span>
|
||||
</td>
|
||||
<td style="padding: 10px 12px; white-space: nowrap; font-size: 20px;" title="{result.extractor}">
|
||||
<td style="padding: 10px 12px; white-space: nowrap; font-size: 20px;" title="{result.plugin}">
|
||||
{icon}
|
||||
</td>
|
||||
<td style="padding: 10px 12px; font-weight: 500; color: #334155;">
|
||||
@@ -88,7 +88,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
title="View output fullscreen"
|
||||
onmouseover="this.style.color='#2563eb'; this.style.textDecoration='underline';"
|
||||
onmouseout="this.style.color='#334155'; this.style.textDecoration='none';">
|
||||
{result.extractor}
|
||||
{result.plugin}
|
||||
</a>
|
||||
</td>
|
||||
<td style="padding: 10px 12px; max-width: 280px;">
|
||||
@@ -162,7 +162,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">ID</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Extractor</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Plugin</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Output</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Completed</th>
|
||||
<th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Version</th>
|
||||
@@ -185,9 +185,9 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
parent_model = Snapshot
|
||||
# fk_name = 'snapshot'
|
||||
extra = 0
|
||||
sort_fields = ('end_ts', 'extractor', 'output_str', 'status', 'cmd_version')
|
||||
sort_fields = ('end_ts', 'plugin', 'output_str', 'status', 'cmd_version')
|
||||
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
|
||||
fields = ('start_ts', 'end_ts', *readonly_fields, 'extractor', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'retry_at', 'output_str')
|
||||
fields = ('start_ts', 'end_ts', *readonly_fields, 'plugin', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'retry_at', 'output_str')
|
||||
# exclude = ('id',)
|
||||
ordering = ('end_ts',)
|
||||
show_change_link = True
|
||||
@@ -253,9 +253,9 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
|
||||
class ArchiveResultAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
|
||||
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
|
||||
sort_fields = ('id', 'created_by', 'created_at', 'plugin', 'status')
|
||||
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon', 'iface')
|
||||
search_fields = ('id', 'snapshot__url', 'extractor', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
||||
search_fields = ('id', 'snapshot__url', 'plugin', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
||||
autocomplete_fields = ['snapshot']
|
||||
|
||||
fieldsets = (
|
||||
@@ -263,8 +263,8 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
'fields': ('snapshot', 'snapshot_info', 'tags_str'),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Extractor', {
|
||||
'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at', 'iface'),
|
||||
('Plugin', {
|
||||
'fields': ('plugin', 'plugin_with_icon', 'status', 'retry_at', 'iface'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timing', {
|
||||
@@ -285,7 +285,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
|
||||
list_filter = ('status', 'plugin', 'start_ts', 'cmd_version')
|
||||
ordering = ['-start_ts']
|
||||
list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
|
||||
|
||||
@@ -321,14 +321,14 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
def tags_str(self, result):
|
||||
return result.snapshot.tags_str()
|
||||
|
||||
@admin.display(description='Extractor', ordering='extractor')
|
||||
def extractor_with_icon(self, result):
|
||||
icon = get_extractor_icon(result.extractor)
|
||||
@admin.display(description='Plugin', ordering='plugin')
|
||||
def plugin_with_icon(self, result):
|
||||
icon = get_plugin_icon(result.plugin)
|
||||
return format_html(
|
||||
'<span title="{}">{}</span> {}',
|
||||
result.extractor,
|
||||
result.plugin,
|
||||
icon,
|
||||
result.extractor,
|
||||
result.plugin,
|
||||
)
|
||||
|
||||
def cmd_str(self, result):
|
||||
|
||||
@@ -10,19 +10,19 @@ DEPTH_CHOICES = (
|
||||
('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
|
||||
)
|
||||
|
||||
from archivebox.hooks import get_extractors
|
||||
from archivebox.hooks import get_plugins
|
||||
|
||||
def get_archive_methods():
|
||||
"""Get available archive methods from discovered hooks."""
|
||||
return [(name, name) for name in get_extractors()]
|
||||
def get_plugin_choices():
|
||||
"""Get available extractor plugins from discovered hooks."""
|
||||
return [(name, name) for name in get_plugins()]
|
||||
|
||||
|
||||
class AddLinkForm(forms.Form):
|
||||
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
|
||||
tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
|
||||
depth = forms.ChoiceField(label="Archive depth", choices=DEPTH_CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
|
||||
archive_methods = forms.MultipleChoiceField(
|
||||
label="Archive methods (select at least 1, otherwise all will be used by default)",
|
||||
plugins = forms.MultipleChoiceField(
|
||||
label="Plugins (select at least 1, otherwise all will be used by default)",
|
||||
required=False,
|
||||
widget=forms.SelectMultiple,
|
||||
choices=[], # populated dynamically in __init__
|
||||
@@ -30,7 +30,7 @@ class AddLinkForm(forms.Form):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.fields['archive_methods'].choices = get_archive_methods()
|
||||
self.fields['plugins'].choices = get_plugin_choices()
|
||||
# TODO: hook these up to the view and put them
|
||||
# in a collapsible UI section labeled "Advanced"
|
||||
#
|
||||
|
||||
@@ -867,6 +867,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
ArchiveResult.objects.create(
|
||||
snapshot=self,
|
||||
plugin=plugin,
|
||||
hook_name=result_data.get('hook_name', ''),
|
||||
status=result_data.get('status', 'failed'),
|
||||
output_str=result_data.get('output', ''),
|
||||
cmd=result_data.get('cmd', []),
|
||||
@@ -1162,7 +1163,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
continue
|
||||
pid_file = plugin_dir / 'hook.pid'
|
||||
if pid_file.exists():
|
||||
kill_process(pid_file)
|
||||
kill_process(pid_file, validate=True) # Use validation
|
||||
|
||||
# Update the ArchiveResult from filesystem
|
||||
plugin_name = plugin_dir.name
|
||||
|
||||
@@ -192,7 +192,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
|
||||
def is_backoff(self) -> bool:
|
||||
"""Check if we should backoff and retry later."""
|
||||
# Backoff if status is still started (extractor didn't complete) and output_str is empty
|
||||
# Backoff if status is still started (plugin didn't complete) and output_str is empty
|
||||
return (
|
||||
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
|
||||
not self.archiveresult.output_str
|
||||
@@ -222,19 +222,19 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
# Suppressed: state transition logs
|
||||
# Lock the object and mark start time
|
||||
self.archiveresult.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for extractor
|
||||
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for plugin
|
||||
status=ArchiveResult.StatusChoices.STARTED,
|
||||
start_ts=timezone.now(),
|
||||
iface=NetworkInterface.current(),
|
||||
)
|
||||
|
||||
# Run the extractor - this updates status, output, timestamps, etc.
|
||||
# Run the plugin - this updates status, output, timestamps, etc.
|
||||
self.archiveresult.run()
|
||||
|
||||
# Save the updated result
|
||||
self.archiveresult.save()
|
||||
|
||||
# Suppressed: extractor result logs (already logged by worker)
|
||||
# Suppressed: plugin result logs (already logged by worker)
|
||||
|
||||
@backoff.enter
|
||||
def enter_backoff(self):
|
||||
|
||||
@@ -5,7 +5,7 @@ from django.utils.safestring import mark_safe
|
||||
from typing import Union
|
||||
|
||||
from archivebox.hooks import (
|
||||
get_extractor_icon, get_extractor_template, get_extractor_name,
|
||||
get_plugin_icon, get_plugin_template, get_plugin_name,
|
||||
)
|
||||
|
||||
|
||||
@@ -51,30 +51,30 @@ def url_replace(context, **kwargs):
|
||||
|
||||
|
||||
@register.simple_tag
|
||||
def extractor_icon(extractor: str) -> str:
|
||||
def plugin_icon(plugin: str) -> str:
|
||||
"""
|
||||
Render the icon for an extractor.
|
||||
Render the icon for a plugin.
|
||||
|
||||
Usage: {% extractor_icon "screenshot" %}
|
||||
Usage: {% plugin_icon "screenshot" %}
|
||||
"""
|
||||
return mark_safe(get_extractor_icon(extractor))
|
||||
return mark_safe(get_plugin_icon(plugin))
|
||||
|
||||
|
||||
@register.simple_tag(takes_context=True)
|
||||
def extractor_thumbnail(context, result) -> str:
|
||||
def plugin_thumbnail(context, result) -> str:
|
||||
"""
|
||||
Render the thumbnail template for an archive result.
|
||||
|
||||
Usage: {% extractor_thumbnail result %}
|
||||
Usage: {% plugin_thumbnail result %}
|
||||
|
||||
Context variables passed to template:
|
||||
- result: ArchiveResult object
|
||||
- snapshot: Parent Snapshot object
|
||||
- output_path: Path to output relative to snapshot dir (from embed_path())
|
||||
- extractor: Extractor base name
|
||||
- plugin: Plugin base name
|
||||
"""
|
||||
extractor = get_extractor_name(result.extractor)
|
||||
template_str = get_extractor_template(extractor, 'thumbnail')
|
||||
plugin = get_plugin_name(result.plugin)
|
||||
template_str = get_plugin_template(plugin, 'thumbnail')
|
||||
|
||||
if not template_str:
|
||||
return ''
|
||||
@@ -89,7 +89,7 @@ def extractor_thumbnail(context, result) -> str:
|
||||
'result': result,
|
||||
'snapshot': result.snapshot,
|
||||
'output_path': output_path,
|
||||
'extractor': extractor,
|
||||
'plugin': plugin,
|
||||
})
|
||||
return mark_safe(tpl.render(ctx))
|
||||
except Exception:
|
||||
@@ -97,14 +97,14 @@ def extractor_thumbnail(context, result) -> str:
|
||||
|
||||
|
||||
@register.simple_tag(takes_context=True)
|
||||
def extractor_embed(context, result) -> str:
|
||||
def plugin_embed(context, result) -> str:
|
||||
"""
|
||||
Render the embed iframe template for an archive result.
|
||||
|
||||
Usage: {% extractor_embed result %}
|
||||
Usage: {% plugin_embed result %}
|
||||
"""
|
||||
extractor = get_extractor_name(result.extractor)
|
||||
template_str = get_extractor_template(extractor, 'embed')
|
||||
plugin = get_plugin_name(result.plugin)
|
||||
template_str = get_plugin_template(plugin, 'embed')
|
||||
|
||||
if not template_str:
|
||||
return ''
|
||||
@@ -117,7 +117,7 @@ def extractor_embed(context, result) -> str:
|
||||
'result': result,
|
||||
'snapshot': result.snapshot,
|
||||
'output_path': output_path,
|
||||
'extractor': extractor,
|
||||
'plugin': plugin,
|
||||
})
|
||||
return mark_safe(tpl.render(ctx))
|
||||
except Exception:
|
||||
@@ -125,14 +125,14 @@ def extractor_embed(context, result) -> str:
|
||||
|
||||
|
||||
@register.simple_tag(takes_context=True)
|
||||
def extractor_fullscreen(context, result) -> str:
|
||||
def plugin_fullscreen(context, result) -> str:
|
||||
"""
|
||||
Render the fullscreen template for an archive result.
|
||||
|
||||
Usage: {% extractor_fullscreen result %}
|
||||
Usage: {% plugin_fullscreen result %}
|
||||
"""
|
||||
extractor = get_extractor_name(result.extractor)
|
||||
template_str = get_extractor_template(extractor, 'fullscreen')
|
||||
plugin = get_plugin_name(result.plugin)
|
||||
template_str = get_plugin_template(plugin, 'fullscreen')
|
||||
|
||||
if not template_str:
|
||||
return ''
|
||||
@@ -145,7 +145,7 @@ def extractor_fullscreen(context, result) -> str:
|
||||
'result': result,
|
||||
'snapshot': result.snapshot,
|
||||
'output_path': output_path,
|
||||
'extractor': extractor,
|
||||
'plugin': plugin,
|
||||
})
|
||||
return mark_safe(tpl.render(ctx))
|
||||
except Exception:
|
||||
@@ -153,10 +153,10 @@ def extractor_fullscreen(context, result) -> str:
|
||||
|
||||
|
||||
@register.filter
|
||||
def extractor_name(value: str) -> str:
|
||||
def plugin_name(value: str) -> str:
|
||||
"""
|
||||
Get the base name of an extractor (strips numeric prefix).
|
||||
Get the base name of a plugin (strips numeric prefix).
|
||||
|
||||
Usage: {{ result.extractor|extractor_name }}
|
||||
Usage: {{ result.plugin|plugin_name }}
|
||||
"""
|
||||
return get_extractor_name(value)
|
||||
return get_plugin_name(value)
|
||||
|
||||
@@ -56,9 +56,9 @@ class SnapshotView(View):
|
||||
def render_live_index(request, snapshot):
|
||||
TITLE_LOADING_MSG = 'Not yet archived...'
|
||||
|
||||
# Dict of extractor -> ArchiveResult object
|
||||
# Dict of plugin -> ArchiveResult object
|
||||
archiveresult_objects = {}
|
||||
# Dict of extractor -> result info dict (for template compatibility)
|
||||
# Dict of plugin -> result info dict (for template compatibility)
|
||||
archiveresults = {}
|
||||
|
||||
results = snapshot.archiveresult_set.all()
|
||||
@@ -75,16 +75,16 @@ class SnapshotView(View):
|
||||
continue
|
||||
|
||||
# Store the full ArchiveResult object for template tags
|
||||
archiveresult_objects[result.extractor] = result
|
||||
archiveresult_objects[result.plugin] = result
|
||||
|
||||
result_info = {
|
||||
'name': result.extractor,
|
||||
'name': result.plugin,
|
||||
'path': embed_path,
|
||||
'ts': ts_to_date_str(result.end_ts),
|
||||
'size': abs_path.stat().st_size or '?',
|
||||
'result': result, # Include the full object for template tags
|
||||
}
|
||||
archiveresults[result.extractor] = result_info
|
||||
archiveresults[result.plugin] = result_info
|
||||
|
||||
# Use canonical_outputs for intelligent discovery
|
||||
# This method now scans ArchiveResults and uses smart heuristics
|
||||
@@ -119,8 +119,8 @@ class SnapshotView(View):
|
||||
|
||||
# Get available extractors from hooks (sorted by numeric prefix for ordering)
|
||||
# Convert to base names for display ordering
|
||||
all_extractors = [get_extractor_name(e) for e in get_extractors()]
|
||||
preferred_types = tuple(all_extractors)
|
||||
all_plugins = [get_extractor_name(e) for e in get_extractors()]
|
||||
preferred_types = tuple(all_plugins)
|
||||
all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
|
||||
|
||||
best_result = {'path': 'None', 'result': None}
|
||||
@@ -463,7 +463,6 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
urls_content = sources_file.read_text()
|
||||
crawl = Crawl.objects.create(
|
||||
urls=urls_content,
|
||||
extractor=parser,
|
||||
max_depth=depth,
|
||||
tags_str=tag,
|
||||
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
|
||||
@@ -598,12 +597,12 @@ def live_progress_view(request):
|
||||
ArchiveResult.StatusChoices.SUCCEEDED: 2,
|
||||
ArchiveResult.StatusChoices.FAILED: 3,
|
||||
}
|
||||
return (status_order.get(ar.status, 4), ar.extractor)
|
||||
return (status_order.get(ar.status, 4), ar.plugin)
|
||||
|
||||
all_extractors = [
|
||||
all_plugins = [
|
||||
{
|
||||
'id': str(ar.id),
|
||||
'extractor': ar.extractor,
|
||||
'plugin': ar.plugin,
|
||||
'status': ar.status,
|
||||
}
|
||||
for ar in sorted(snapshot_results, key=extractor_sort_key)
|
||||
@@ -619,7 +618,7 @@ def live_progress_view(request):
|
||||
'completed_extractors': completed_extractors,
|
||||
'failed_extractors': failed_extractors,
|
||||
'pending_extractors': pending_extractors,
|
||||
'all_extractors': all_extractors,
|
||||
'all_plugins': all_plugins,
|
||||
})
|
||||
|
||||
# Check if crawl can start (for debugging stuck crawls)
|
||||
|
||||
@@ -353,10 +353,18 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
import signal
|
||||
from pathlib import Path
|
||||
from archivebox.hooks import run_hook, discover_hooks
|
||||
from archivebox.misc.process_utils import validate_pid_file
|
||||
|
||||
# Kill any background processes by scanning for all .pid files
|
||||
if self.OUTPUT_DIR.exists():
|
||||
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
|
||||
# Validate PID before killing to avoid killing unrelated processes
|
||||
cmd_file = pid_file.parent / 'cmd.sh'
|
||||
if not validate_pid_file(pid_file, cmd_file):
|
||||
# PID reused by different process or process dead
|
||||
pid_file.unlink(missing_ok=True)
|
||||
continue
|
||||
|
||||
try:
|
||||
pid = int(pid_file.read_text().strip())
|
||||
try:
|
||||
|
||||
@@ -292,8 +292,13 @@ def run_hook(
|
||||
stdout_file = output_dir / 'stdout.log'
|
||||
stderr_file = output_dir / 'stderr.log'
|
||||
pid_file = output_dir / 'hook.pid'
|
||||
cmd_file = output_dir / 'cmd.sh'
|
||||
|
||||
try:
|
||||
# Write command script for validation
|
||||
from archivebox.misc.process_utils import write_cmd_file
|
||||
write_cmd_file(cmd_file, cmd)
|
||||
|
||||
# Open log files for writing
|
||||
with open(stdout_file, 'w') as out, open(stderr_file, 'w') as err:
|
||||
process = subprocess.Popen(
|
||||
@@ -304,8 +309,10 @@ def run_hook(
|
||||
env=env,
|
||||
)
|
||||
|
||||
# Write PID for all hooks (useful for debugging/cleanup)
|
||||
pid_file.write_text(str(process.pid))
|
||||
# Write PID with mtime set to process start time for validation
|
||||
from archivebox.misc.process_utils import write_pid_file_with_mtime
|
||||
process_start_time = time.time()
|
||||
write_pid_file_with_mtime(pid_file, process.pid, process_start_time)
|
||||
|
||||
if is_background:
|
||||
# Background hook - return None immediately, don't wait
|
||||
@@ -1327,21 +1334,29 @@ def process_is_alive(pid_file: Path) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def kill_process(pid_file: Path, sig: int = signal.SIGTERM):
|
||||
def kill_process(pid_file: Path, sig: int = signal.SIGTERM, validate: bool = True):
|
||||
"""
|
||||
Kill process in PID file.
|
||||
Kill process in PID file with optional validation.
|
||||
|
||||
Args:
|
||||
pid_file: Path to hook.pid file
|
||||
sig: Signal to send (default SIGTERM)
|
||||
validate: If True, validate process identity before killing (default: True)
|
||||
"""
|
||||
if not pid_file.exists():
|
||||
return
|
||||
|
||||
try:
|
||||
pid = int(pid_file.read_text().strip())
|
||||
os.kill(pid, sig)
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
from archivebox.misc.process_utils import safe_kill_process
|
||||
|
||||
if validate:
|
||||
# Use safe kill with validation
|
||||
cmd_file = pid_file.parent / 'cmd.sh'
|
||||
safe_kill_process(pid_file, cmd_file, signal_num=sig, validate=True)
|
||||
else:
|
||||
# Legacy behavior - kill without validation
|
||||
if not pid_file.exists():
|
||||
return
|
||||
try:
|
||||
pid = int(pid_file.read_text().strip())
|
||||
os.kill(pid, sig)
|
||||
except (OSError, ValueError):
|
||||
pass
|
||||
|
||||
|
||||
|
||||
@@ -178,7 +178,8 @@ def archiveresult_to_jsonl(result) -> Dict[str, Any]:
|
||||
'type': TYPE_ARCHIVERESULT,
|
||||
'id': str(result.id),
|
||||
'snapshot_id': str(result.snapshot_id),
|
||||
'extractor': result.extractor,
|
||||
'plugin': result.plugin,
|
||||
'hook_name': result.hook_name,
|
||||
'status': result.status,
|
||||
'output_str': result.output_str,
|
||||
'start_ts': result.start_ts.isoformat() if result.start_ts else None,
|
||||
|
||||
@@ -29,6 +29,98 @@ const http = require('http');
|
||||
const EXTRACTOR_NAME = 'chrome_launch';
|
||||
const OUTPUT_DIR = 'chrome';
|
||||
|
||||
// Helper: Write PID file with mtime set to process start time
|
||||
function writePidWithMtime(filePath, pid, startTimeSeconds) {
|
||||
fs.writeFileSync(filePath, String(pid));
|
||||
// Set both atime and mtime to process start time for validation
|
||||
const startTimeMs = startTimeSeconds * 1000;
|
||||
fs.utimesSync(filePath, new Date(startTimeMs), new Date(startTimeMs));
|
||||
}
|
||||
|
||||
// Helper: Write command script for validation
|
||||
function writeCmdScript(filePath, binary, args) {
|
||||
// Shell escape arguments containing spaces or special characters
|
||||
const escapedArgs = args.map(arg => {
|
||||
if (arg.includes(' ') || arg.includes('"') || arg.includes('$')) {
|
||||
return `"${arg.replace(/"/g, '\\"')}"`;
|
||||
}
|
||||
return arg;
|
||||
});
|
||||
const script = `#!/bin/bash\n${binary} ${escapedArgs.join(' ')}\n`;
|
||||
fs.writeFileSync(filePath, script);
|
||||
fs.chmodSync(filePath, 0o755);
|
||||
}
|
||||
|
||||
// Helper: Get process start time (cross-platform)
|
||||
function getProcessStartTime(pid) {
|
||||
try {
|
||||
const { execSync } = require('child_process');
|
||||
if (process.platform === 'darwin') {
|
||||
// macOS: ps -p PID -o lstart= gives start time
|
||||
const output = execSync(`ps -p ${pid} -o lstart=`, { encoding: 'utf8', timeout: 1000 });
|
||||
return Date.parse(output.trim()) / 1000; // Convert to epoch seconds
|
||||
} else {
|
||||
// Linux: read /proc/PID/stat field 22 (starttime in clock ticks)
|
||||
const stat = fs.readFileSync(`/proc/${pid}/stat`, 'utf8');
|
||||
const match = stat.match(/\) \w+ (\d+)/);
|
||||
if (match) {
|
||||
const startTicks = parseInt(match[1], 10);
|
||||
// Convert clock ticks to seconds (assuming 100 ticks/sec)
|
||||
const uptimeSeconds = parseFloat(fs.readFileSync('/proc/uptime', 'utf8').split(' ')[0]);
|
||||
const bootTime = Date.now() / 1000 - uptimeSeconds;
|
||||
return bootTime + (startTicks / 100);
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
// Can't get start time
|
||||
return null;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// Helper: Validate PID using mtime and command
|
||||
function validatePid(pid, pidFile, cmdFile) {
|
||||
try {
|
||||
// Check process exists
|
||||
try {
|
||||
process.kill(pid, 0); // Signal 0 = check existence
|
||||
} catch (e) {
|
||||
return false; // Process doesn't exist
|
||||
}
|
||||
|
||||
// Check mtime matches process start time (within 5 sec tolerance)
|
||||
const fileStat = fs.statSync(pidFile);
|
||||
const fileMtime = fileStat.mtimeMs / 1000; // Convert to seconds
|
||||
const procStartTime = getProcessStartTime(pid);
|
||||
|
||||
if (procStartTime === null) {
|
||||
// Can't validate - fall back to basic existence check
|
||||
return true;
|
||||
}
|
||||
|
||||
if (Math.abs(fileMtime - procStartTime) > 5) {
|
||||
// PID was reused by different process
|
||||
return false;
|
||||
}
|
||||
|
||||
// Validate command if available
|
||||
if (fs.existsSync(cmdFile)) {
|
||||
const cmd = fs.readFileSync(cmdFile, 'utf8');
|
||||
// Check for Chrome/Chromium and debug port
|
||||
if (!cmd.includes('chrome') && !cmd.includes('chromium')) {
|
||||
return false;
|
||||
}
|
||||
if (!cmd.includes('--remote-debugging-port')) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
} catch (e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Global state for cleanup
|
||||
let chromePid = null;
|
||||
|
||||
@@ -240,17 +332,17 @@ function killZombieChrome() {
|
||||
const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
|
||||
if (isNaN(pid) || pid <= 0) continue;
|
||||
|
||||
// Check if process exists
|
||||
try {
|
||||
process.kill(pid, 0);
|
||||
} catch (e) {
|
||||
// Process dead, remove stale PID file
|
||||
// Validate PID before killing
|
||||
const cmdFile = path.join(chromeDir, 'cmd.sh');
|
||||
if (!validatePid(pid, pidFile, cmdFile)) {
|
||||
// PID reused or validation failed
|
||||
console.error(`[!] PID ${pid} failed validation (reused or wrong process) - cleaning up`);
|
||||
try { fs.unlinkSync(pidFile); } catch (e) {}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Process alive but crawl is stale - zombie!
|
||||
console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${crawl.name}`);
|
||||
// Process alive, validated, and crawl is stale - zombie!
|
||||
console.error(`[!] Found validated zombie (PID ${pid}) from stale crawl ${crawl.name}`);
|
||||
|
||||
try {
|
||||
// Kill process group first
|
||||
@@ -386,15 +478,20 @@ async function launchChrome(binary) {
|
||||
chromeProcess.unref(); // Don't keep Node.js process running
|
||||
|
||||
chromePid = chromeProcess.pid;
|
||||
const chromeStartTime = Date.now() / 1000; // Unix epoch seconds
|
||||
console.error(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);
|
||||
|
||||
// Write Chrome PID for backup cleanup (named .pid so Crawl.cleanup() finds it)
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(chromePid));
|
||||
// Write Chrome PID with mtime set to start time for validation
|
||||
writePidWithMtime(path.join(OUTPUT_DIR, 'chrome.pid'), chromePid, chromeStartTime);
|
||||
|
||||
// Write command script for validation
|
||||
writeCmdScript(path.join(OUTPUT_DIR, 'cmd.sh'), binary, chromeArgs);
|
||||
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, 'port.txt'), String(debugPort));
|
||||
|
||||
// Write hook's own PID so Crawl.cleanup() can kill this hook process
|
||||
// (which will trigger our SIGTERM handler to kill Chrome)
|
||||
fs.writeFileSync(path.join(OUTPUT_DIR, 'hook.pid'), String(process.pid));
|
||||
// Write hook's own PID with mtime for validation
|
||||
const hookStartTime = Date.now() / 1000;
|
||||
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
|
||||
|
||||
try {
|
||||
// Wait for Chrome to be ready
|
||||
|
||||
@@ -488,7 +488,7 @@
|
||||
<span class="progress-fill"></span>
|
||||
<span class="badge-content">
|
||||
<span class="badge-icon">${icon}</span>
|
||||
<span>${extractor.extractor}</span>
|
||||
<span>${extractor.plugin}</span>
|
||||
</span>
|
||||
</span>
|
||||
`;
|
||||
@@ -499,10 +499,10 @@
|
||||
const adminUrl = `/admin/core/snapshot/${snapshot.id}/change/`;
|
||||
|
||||
let extractorHtml = '';
|
||||
if (snapshot.all_extractors && snapshot.all_extractors.length > 0) {
|
||||
// Sort extractors alphabetically by name to prevent reordering on updates
|
||||
const sortedExtractors = [...snapshot.all_extractors].sort((a, b) =>
|
||||
a.extractor.localeCompare(b.extractor)
|
||||
if (snapshot.all_plugins && snapshot.all_plugins.length > 0) {
|
||||
// Sort plugins alphabetically by name to prevent reordering on updates
|
||||
const sortedExtractors = [...snapshot.all_plugins].sort((a, b) =>
|
||||
a.plugin.localeCompare(b.plugin)
|
||||
);
|
||||
extractorHtml = `
|
||||
<div class="extractor-list">
|
||||
|
||||
@@ -158,7 +158,7 @@ CREATE TABLE IF NOT EXISTS core_snapshot_tags (
|
||||
CREATE TABLE IF NOT EXISTS core_archiveresult (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
snapshot_id CHAR(32) NOT NULL REFERENCES core_snapshot(id),
|
||||
extractor VARCHAR(32) NOT NULL,
|
||||
plugin VARCHAR(32) NOT NULL,
|
||||
cmd TEXT,
|
||||
pwd VARCHAR(256),
|
||||
cmd_version VARCHAR(128),
|
||||
@@ -379,7 +379,7 @@ CREATE TABLE IF NOT EXISTS crawls_seed (
|
||||
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
|
||||
modified_at DATETIME,
|
||||
uri VARCHAR(2048) NOT NULL,
|
||||
extractor VARCHAR(32) NOT NULL DEFAULT 'auto',
|
||||
plugin VARCHAR(32) NOT NULL DEFAULT 'auto',
|
||||
tags_str VARCHAR(255) NOT NULL DEFAULT '',
|
||||
label VARCHAR(255) NOT NULL DEFAULT '',
|
||||
config TEXT DEFAULT '{}',
|
||||
@@ -465,7 +465,7 @@ CREATE TABLE IF NOT EXISTS core_archiveresult (
|
||||
created_at DATETIME NOT NULL,
|
||||
modified_at DATETIME,
|
||||
snapshot_id CHAR(36) NOT NULL REFERENCES core_snapshot(id),
|
||||
extractor VARCHAR(32) NOT NULL,
|
||||
plugin VARCHAR(32) NOT NULL,
|
||||
pwd VARCHAR(256),
|
||||
cmd TEXT,
|
||||
cmd_version VARCHAR(128),
|
||||
|
||||
@@ -227,10 +227,10 @@ class Worker:
|
||||
urls = obj.get_urls_list()
|
||||
url = urls[0] if urls else None
|
||||
|
||||
extractor = None
|
||||
if hasattr(obj, 'extractor'):
|
||||
plugin = None
|
||||
if hasattr(obj, 'plugin'):
|
||||
# ArchiveResultWorker, Crawl
|
||||
extractor = obj.extractor
|
||||
plugin = obj.plugin
|
||||
|
||||
log_worker_event(
|
||||
worker_type=worker_type_name,
|
||||
@@ -239,7 +239,7 @@ class Worker:
|
||||
pid=self.pid,
|
||||
worker_id=str(self.worker_id),
|
||||
url=url,
|
||||
extractor=extractor,
|
||||
plugin=plugin,
|
||||
metadata=start_metadata if start_metadata else None,
|
||||
)
|
||||
|
||||
@@ -262,7 +262,7 @@ class Worker:
|
||||
pid=self.pid,
|
||||
worker_id=str(self.worker_id),
|
||||
url=url,
|
||||
extractor=extractor,
|
||||
plugin=plugin,
|
||||
metadata=complete_metadata,
|
||||
)
|
||||
else:
|
||||
@@ -345,9 +345,9 @@ class ArchiveResultWorker(Worker):
|
||||
name: ClassVar[str] = 'archiveresult'
|
||||
MAX_TICK_TIME: ClassVar[int] = 120
|
||||
|
||||
def __init__(self, extractor: str | None = None, **kwargs: Any):
|
||||
def __init__(self, plugin: str | None = None, **kwargs: Any):
|
||||
super().__init__(**kwargs)
|
||||
self.extractor = extractor
|
||||
self.plugin = plugin
|
||||
|
||||
def get_model(self):
|
||||
from core.models import ArchiveResult
|
||||
@@ -359,16 +359,16 @@ class ArchiveResultWorker(Worker):
|
||||
|
||||
qs = super().get_queue()
|
||||
|
||||
if self.extractor:
|
||||
qs = qs.filter(extractor=self.extractor)
|
||||
if self.plugin:
|
||||
qs = qs.filter(plugin=self.plugin)
|
||||
|
||||
# Note: Removed blocking logic since plugins have separate output directories
|
||||
# and don't interfere with each other. Each plugin (extractor) runs independently.
|
||||
# and don't interfere with each other. Each plugin runs independently.
|
||||
|
||||
return qs
|
||||
|
||||
def process_item(self, obj) -> bool:
|
||||
"""Process an ArchiveResult by running its extractor."""
|
||||
"""Process an ArchiveResult by running its plugin."""
|
||||
try:
|
||||
obj.sm.tick()
|
||||
return True
|
||||
@@ -378,8 +378,8 @@ class ArchiveResultWorker(Worker):
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def start(cls, worker_id: int | None = None, daemon: bool = False, extractor: str | None = None, **kwargs: Any) -> int:
|
||||
"""Fork a new worker as subprocess with optional extractor filter."""
|
||||
def start(cls, worker_id: int | None = None, daemon: bool = False, plugin: str | None = None, **kwargs: Any) -> int:
|
||||
"""Fork a new worker as subprocess with optional plugin filter."""
|
||||
if worker_id is None:
|
||||
worker_id = get_next_worker_id(cls.name)
|
||||
|
||||
@@ -387,7 +387,7 @@ class ArchiveResultWorker(Worker):
|
||||
proc = Process(
|
||||
target=_run_worker,
|
||||
args=(cls.name, worker_id, daemon),
|
||||
kwargs={'extractor': extractor, **kwargs},
|
||||
kwargs={'plugin': plugin, **kwargs},
|
||||
name=f'{cls.name}_worker_{worker_id}',
|
||||
)
|
||||
proc.start()
|
||||
|
||||
Reference in New Issue
Block a user