remove huey

This commit is contained in:
Nick Sweeting
2025-12-24 23:40:18 -08:00
parent 6c769d831c
commit d95f0dc186
105 changed files with 3635 additions and 1402 deletions

View File

@@ -42,6 +42,7 @@ def register_urls(api: NinjaAPI) -> NinjaAPI:
api.add_router('/crawls/', 'api.v1_crawls.router')
api.add_router('/cli/', 'api.v1_cli.router')
api.add_router('/workers/', 'api.v1_workers.router')
api.add_router('/machine/', 'api.v1_machine.router')
return api

View File

@@ -107,7 +107,7 @@ class RemoveCommandSchema(Schema):
@router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
def cli_add(request, args: AddCommandSchema):
from archivebox.cli.archivebox_add import add
result = add(
urls=args.urls,
tag=args.tag,
@@ -115,8 +115,9 @@ def cli_add(request, args: AddCommandSchema):
update=args.update,
index_only=args.index_only,
overwrite=args.overwrite,
extract=args.extract,
plugins=args.extract, # extract in API maps to plugins param
parser=args.parser,
bg=True, # Always run in background for API calls
)
return {

View File

@@ -0,0 +1,206 @@
__package__ = 'archivebox.api'
from uuid import UUID
from typing import List, Optional
from datetime import datetime
from ninja import Router, Schema, FilterSchema, Field, Query
from ninja.pagination import paginate
from api.v1_core import CustomPagination
router = Router(tags=['Machine and Dependencies'])
# ============================================================================
# Machine Schemas
# ============================================================================
class MachineSchema(Schema):
    """Serialized representation of a machine.Machine record."""
    # Static type tag included in every serialized payload.
    TYPE: str = 'machine.Machine'
    id: UUID
    created_at: datetime
    modified_at: datetime
    # Host identity
    guid: str
    hostname: str
    # Hardware / virtualization info
    hw_in_docker: bool
    hw_in_vm: bool
    hw_manufacturer: str
    hw_product: str
    hw_uuid: str
    # Operating system info
    os_arch: str
    os_family: str
    os_platform: str
    os_release: str
    os_kernel: str
    # Aggregated stats blob (shape defined by the Machine model — not visible here)
    stats: dict
    # Usage counters maintained by the model
    num_uses_succeeded: int
    num_uses_failed: int
class MachineFilterSchema(FilterSchema):
    """Query-string filters for the machine list endpoint.

    Each `q` maps the query parameter to the ORM lookup it applies.
    """
    id: Optional[str] = Field(None, q='id__startswith')
    hostname: Optional[str] = Field(None, q='hostname__icontains')
    os_platform: Optional[str] = Field(None, q='os_platform__icontains')
    os_arch: Optional[str] = Field(None, q='os_arch')
    hw_in_docker: Optional[bool] = Field(None, q='hw_in_docker')
    hw_in_vm: Optional[bool] = Field(None, q='hw_in_vm')
# ============================================================================
# Dependency Schemas
# ============================================================================
class DependencySchema(Schema):
    """Serialized representation of a machine.Dependency record."""
    # Static type tag included in every serialized payload.
    TYPE: str = 'machine.Dependency'
    id: UUID
    created_at: datetime
    modified_at: datetime
    bin_name: str
    bin_providers: str
    custom_cmds: dict
    config: dict
    # Computed fields — populated by the resolve_* staticmethods below.
    is_installed: bool
    installed_count: int

    @staticmethod
    def resolve_is_installed(obj) -> bool:
        # Delegates to the model's own is_installed attribute/property.
        return obj.is_installed

    @staticmethod
    def resolve_installed_count(obj) -> int:
        # Count of InstalledBinary rows via the reverse relation.
        return obj.installed_binaries.count()
class DependencyFilterSchema(FilterSchema):
    """Query-string filters for the dependency list endpoint."""
    id: Optional[str] = Field(None, q='id__startswith')
    bin_name: Optional[str] = Field(None, q='bin_name__icontains')
    bin_providers: Optional[str] = Field(None, q='bin_providers__icontains')
# ============================================================================
# InstalledBinary Schemas
# ============================================================================
class InstalledBinarySchema(Schema):
    """Serialized representation of a machine.InstalledBinary record."""
    # Static type tag included in every serialized payload.
    TYPE: str = 'machine.InstalledBinary'
    id: UUID
    created_at: datetime
    modified_at: datetime
    # Owning machine (FK) plus a denormalized hostname for convenience.
    machine_id: UUID
    machine_hostname: str
    # Optional link to the Dependency this binary satisfies.
    dependency_id: Optional[UUID]
    dependency_bin_name: Optional[str]
    name: str
    binprovider: str
    abspath: str
    version: str
    sha256: str
    # Computed field — resolved below from the model.
    is_valid: bool
    num_uses_succeeded: int
    num_uses_failed: int

    @staticmethod
    def resolve_machine_hostname(obj) -> str:
        # Follows the machine FK; callers should select_related('machine').
        return obj.machine.hostname

    @staticmethod
    def resolve_dependency_id(obj) -> Optional[UUID]:
        # Raw FK value — avoids a query when the dependency row isn't needed.
        return obj.dependency_id

    @staticmethod
    def resolve_dependency_bin_name(obj) -> Optional[str]:
        # None when this binary isn't linked to any Dependency.
        return obj.dependency.bin_name if obj.dependency else None

    @staticmethod
    def resolve_is_valid(obj) -> bool:
        # Delegates to the model's own is_valid attribute/property.
        return obj.is_valid
class InstalledBinaryFilterSchema(FilterSchema):
    """Query-string filters for the installed-binaries list endpoint."""
    id: Optional[str] = Field(None, q='id__startswith')
    name: Optional[str] = Field(None, q='name__icontains')
    binprovider: Optional[str] = Field(None, q='binprovider')
    machine_id: Optional[str] = Field(None, q='machine_id__startswith')
    dependency_id: Optional[str] = Field(None, q='dependency_id__startswith')
    version: Optional[str] = Field(None, q='version__icontains')
# ============================================================================
# Machine Endpoints
# ============================================================================
@router.get("/machines", response=List[MachineSchema], url_name="get_machines")
@paginate(CustomPagination)
def get_machines(request, filters: MachineFilterSchema = Query(...)):
    """Return the paginated set of machines matching the query filters."""
    from machine.models import Machine

    queryset = Machine.objects.all()
    matching = filters.filter(queryset)
    return matching.distinct()
@router.get("/machine/{machine_id}", response=MachineSchema, url_name="get_machine")
def get_machine(request, machine_id: str):
    """
    Get a specific machine by ID prefix or (case-insensitive) hostname.

    Because this parameterized route is registered before the dedicated
    `/machine/current` route below, a request for `/machine/current` is
    matched here with machine_id='current'; we resolve that alias
    explicitly instead of failing the DB lookup.

    Raises:
        HttpError(404): when no machine matches machine_id.
    """
    from machine.models import Machine
    from django.db.models import Q

    # Alias captured from /machine/current due to route registration order.
    if machine_id == 'current':
        return Machine.current()

    try:
        return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id))
    except Machine.DoesNotExist:
        from ninja.errors import HttpError
        # 404 instead of letting DoesNotExist bubble up as an HTTP 500.
        raise HttpError(404, f'No machine found matching: {machine_id}')
@router.get("/machine/current", response=MachineSchema, url_name="get_current_machine")
def get_current_machine(request):
    """Get the machine record for the host currently running ArchiveBox.

    NOTE(review): the parameterized `/machine/{machine_id}` route above is
    registered first, so `/machine/current` is likely captured there with
    machine_id='current' and never reaches this handler — confirm the
    router's match order or register this route before the parameterized one.
    """
    from machine.models import Machine
    return Machine.current()
# ============================================================================
# Dependency Endpoints
# ============================================================================
@router.get("/dependencies", response=List[DependencySchema], url_name="get_dependencies")
@paginate(CustomPagination)
def get_dependencies(request, filters: DependencyFilterSchema = Query(...)):
    """Return the paginated set of dependencies matching the query filters."""
    from machine.models import Dependency

    matching = filters.filter(Dependency.objects.all())
    return matching.distinct()
@router.get("/dependency/{dependency_id}", response=DependencySchema, url_name="get_dependency")
def get_dependency(request, dependency_id: str):
    """Get a specific dependency by ID or bin_name.

    Tries an id-prefix match first, then falls back to a case-insensitive
    bin_name match. If neither lookup finds a row, Dependency.DoesNotExist
    propagates out of the second .get() call.
    """
    from machine.models import Dependency
    from django.db.models import Q
    try:
        # Primary lookup: treat the path param as a (possibly truncated) id.
        return Dependency.objects.get(Q(id__startswith=dependency_id))
    except Dependency.DoesNotExist:
        # Fallback: treat the path param as a binary name.
        return Dependency.objects.get(bin_name__iexact=dependency_id)
# ============================================================================
# InstalledBinary Endpoints
# ============================================================================
@router.get("/binaries", response=List[InstalledBinarySchema], url_name="get_binaries")
@paginate(CustomPagination)
def get_binaries(request, filters: InstalledBinaryFilterSchema = Query(...)):
    """Return the paginated set of installed binaries matching the filters."""
    from machine.models import InstalledBinary

    # Join the FK targets used by the schema resolvers in a single query.
    base_qs = InstalledBinary.objects.all().select_related('machine', 'dependency')
    return filters.filter(base_qs).distinct()
@router.get("/binary/{binary_id}", response=InstalledBinarySchema, url_name="get_binary")
def get_binary(request, binary_id: str):
    """Look up a single installed binary by (a prefix of) its ID."""
    from machine.models import InstalledBinary

    qs = InstalledBinary.objects.select_related('machine', 'dependency')
    return qs.get(id__startswith=binary_id)
@router.get("/binary/by-name/{name}", response=List[InstalledBinarySchema], url_name="get_binaries_by_name")
def get_binaries_by_name(request, name: str):
    """Return every installed binary whose name matches (case-insensitive)."""
    from machine.models import InstalledBinary

    matches = InstalledBinary.objects.filter(name__iexact=name)
    return list(matches.select_related('machine', 'dependency'))

View File

@@ -4,125 +4,157 @@ from uuid import UUID
from typing import List, Any
from datetime import datetime
from ninja import Router, Schema
router = Router(tags=['Workers and Tasks'])
class TaskSchema(Schema):
class QueueItemSchema(Schema):
"""Schema for a single item in a worker's queue."""
TYPE: str
id: UUID
description: str
status: str
retry_at: datetime | None
created_at: datetime
modified_at: datetime
created_by_id: int
description: str
@staticmethod
def resolve_TYPE(obj) -> str:
return f'{obj._meta.app_label}.{obj._meta.model_name}'
@staticmethod
def resolve_description(obj) -> str:
return str(obj)
class ActorSchema(Schema):
# TYPE: str = 'workers.actor.ActorType'
# name: str
#pid: int | None
idle_count: int
launch_kwargs: dict[str, Any]
mode: str
class WorkerSchema(Schema):
"""Schema for a Worker type."""
name: str
model: str
statemachine: str
ACTIVE_STATE: str
EVENT_NAME: str
CLAIM_ORDER: list[str]
CLAIM_FROM_TOP_N: int
CLAIM_ATOMIC: bool
MAX_TICK_TIME: int
MAX_CONCURRENT_ACTORS: int
future: list[TaskSchema]
pending: list[TaskSchema]
stalled: list[TaskSchema]
active: list[TaskSchema]
past: list[TaskSchema]
max_tick_time: int
max_concurrent_tasks: int
poll_interval: float
idle_timeout: int
running_count: int
running_workers: List[dict[str, Any]]
queue_count: int
queue: List[QueueItemSchema]
@staticmethod
def resolve_model(obj) -> str:
return obj.Model.__name__
@staticmethod
def resolve_statemachine(obj) -> str:
return obj.StateMachineClass.__name__
@staticmethod
def resolve_name(obj) -> str:
return str(obj)
Model = obj.get_model()
return f'{Model._meta.app_label}.{Model._meta.model_name}'
@staticmethod
def resolve_ACTIVE_STATE(obj) -> str:
return str(obj.ACTIVE_STATE)
@staticmethod
def resolve_FINAL_STATES(obj) -> list[str]:
return [str(state) for state in obj.FINAL_STATES]
@staticmethod
def resolve_future(obj) -> list[TaskSchema]:
return [obj for obj in obj.qs.filter(obj.future_q).order_by('-retry_at')]
@staticmethod
def resolve_pending(obj) -> list[TaskSchema]:
return [obj for obj in obj.qs.filter(obj.pending_q).order_by('-retry_at')]
@staticmethod
def resolve_stalled(obj) -> list[TaskSchema]:
return [obj for obj in obj.qs.filter(obj.stalled_q).order_by('-retry_at')]
@staticmethod
def resolve_active(obj) -> list[TaskSchema]:
return [obj for obj in obj.qs.filter(obj.active_q).order_by('-retry_at')]
def resolve_max_tick_time(obj) -> int:
return obj.MAX_TICK_TIME
@staticmethod
def resolve_past(obj) -> list[TaskSchema]:
return [obj for obj in obj.qs.filter(obj.final_q).order_by('-modified_at')]
def resolve_max_concurrent_tasks(obj) -> int:
return obj.MAX_CONCURRENT_TASKS
@staticmethod
def resolve_poll_interval(obj) -> float:
return obj.POLL_INTERVAL
@staticmethod
def resolve_idle_timeout(obj) -> int:
return obj.IDLE_TIMEOUT
@staticmethod
def resolve_running_count(obj) -> int:
return len(obj.get_running_workers())
@staticmethod
def resolve_running_workers(obj) -> List[dict[str, Any]]:
return obj.get_running_workers()
@staticmethod
def resolve_queue_count(obj) -> int:
return obj.get_queue().count()
@staticmethod
def resolve_queue(obj) -> List[QueueItemSchema]:
return list(obj.get_queue()[:50]) # Limit to 50 items
class OrchestratorSchema(Schema):
# TYPE: str = 'workers.orchestrator.Orchestrator'
#pid: int | None
exit_on_idle: bool
mode: str
actors: list[ActorSchema]
@staticmethod
def resolve_actors(obj) -> list[ActorSchema]:
return [actor() for actor in obj.actor_types.values()]
"""Schema for the Orchestrator."""
is_running: bool
poll_interval: float
idle_timeout: int
max_workers_per_type: int
max_total_workers: int
total_worker_count: int
workers: List[WorkerSchema]
@router.get("/orchestrators", response=List[OrchestratorSchema], url_name="get_orchestrators")
def get_orchestrators(request):
"""List all the task orchestrators (aka Orchestrators) that are currently running"""
@router.get("/orchestrator", response=OrchestratorSchema, url_name="get_orchestrator")
def get_orchestrator(request):
"""Get the orchestrator status and all worker queues."""
from workers.orchestrator import Orchestrator
from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
orchestrator = Orchestrator()
return [orchestrator]
# Create temporary worker instances to query their queues
workers = [
CrawlWorker(worker_id=-1),
SnapshotWorker(worker_id=-1),
ArchiveResultWorker(worker_id=-1),
]
return {
'is_running': orchestrator.is_running(),
'poll_interval': orchestrator.POLL_INTERVAL,
'idle_timeout': orchestrator.IDLE_TIMEOUT,
'max_workers_per_type': orchestrator.MAX_WORKERS_PER_TYPE,
'max_total_workers': orchestrator.MAX_TOTAL_WORKERS,
'total_worker_count': orchestrator.get_total_worker_count(),
'workers': workers,
}
@router.get("/actors", response=List[ActorSchema], url_name="get_actors")
def get_actors(request):
"""List all the task consumer workers (aka Actors) that are currently running"""
@router.get("/workers", response=List[WorkerSchema], url_name="get_workers")
def get_workers(request):
"""List all worker types and their current status."""
from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
from workers.orchestrator import Orchestrator
orchestrator = Orchestrator()
return orchestrator.actor_types.values()
# Create temporary instances to query their queues
return [
CrawlWorker(worker_id=-1),
SnapshotWorker(worker_id=-1),
ArchiveResultWorker(worker_id=-1),
]
@router.get("/worker/{worker_name}", response=WorkerSchema, url_name="get_worker")
def get_worker(request, worker_name: str):
"""Get status and queue for a specific worker type."""
from workers.worker import WORKER_TYPES
if worker_name not in WORKER_TYPES:
from ninja.errors import HttpError
raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}")
WorkerClass = WORKER_TYPES[worker_name]
return WorkerClass(worker_id=-1)
@router.get("/worker/{worker_name}/queue", response=List[QueueItemSchema], url_name="get_worker_queue")
def get_worker_queue(request, worker_name: str, limit: int = 100):
"""Get the current queue for a specific worker type."""
from workers.worker import WORKER_TYPES
if worker_name not in WORKER_TYPES:
from ninja.errors import HttpError
raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}")
WorkerClass = WORKER_TYPES[worker_name]
worker = WorkerClass(worker_id=-1)
return list(worker.get_queue()[:limit])
# Progress endpoint moved to core.views.live_progress_view for simplicity

View File

@@ -2,76 +2,226 @@
__package__ = 'archivebox.base_models'
import json
from django import forms
from django.contrib import admin
from django.utils.html import format_html, mark_safe
from django_object_actions import DjangoObjectActions
class KeyValueWidget(forms.Widget):
"""
A widget that renders JSON dict as editable key-value input fields
with + and - buttons to add/remove rows.
Includes autocomplete for available config keys from the plugin system.
"""
template_name = None # We render manually
class Media:
css = {
'all': []
}
js = []
def _get_config_options(self):
"""Get available config options from plugins."""
try:
from archivebox.hooks import discover_plugin_configs
plugin_configs = discover_plugin_configs()
options = {}
for plugin_name, schema in plugin_configs.items():
for key, prop in schema.get('properties', {}).items():
options[key] = {
'plugin': plugin_name,
'type': prop.get('type', 'string'),
'default': prop.get('default', ''),
'description': prop.get('description', ''),
}
return options
except Exception:
return {}
def render(self, name, value, attrs=None, renderer=None):
# Parse JSON value to dict
if value is None:
data = {}
elif isinstance(value, str):
try:
data = json.loads(value) if value else {}
except json.JSONDecodeError:
data = {}
elif isinstance(value, dict):
data = value
else:
data = {}
widget_id = attrs.get('id', name) if attrs else name
config_options = self._get_config_options()
# Build datalist options
datalist_options = '\n'.join(
f'<option value="{self._escape(key)}">{self._escape(opt["description"][:60] or opt["type"])}</option>'
for key, opt in sorted(config_options.items())
)
# Build config metadata as JSON for JS
config_meta_json = json.dumps(config_options)
html = f'''
<div id="{widget_id}_container" class="key-value-editor" style="max-width: 700px;">
<datalist id="{widget_id}_keys">
{datalist_options}
</datalist>
<div id="{widget_id}_rows" class="key-value-rows">
'''
# Render existing key-value pairs
row_idx = 0
for key, val in data.items():
val_str = json.dumps(val) if not isinstance(val, str) else val
html += self._render_row(widget_id, row_idx, key, val_str)
row_idx += 1
# Always add one empty row for new entries
html += self._render_row(widget_id, row_idx, '', '')
html += f'''
</div>
<div style="display: flex; gap: 8px; align-items: center; margin-top: 8px;">
<button type="button" onclick="addKeyValueRow_{widget_id}()"
style="padding: 4px 12px; cursor: pointer; background: #417690; color: white; border: none; border-radius: 4px;">
+ Add Row
</button>
<span id="{widget_id}_hint" style="font-size: 11px; color: #666; font-style: italic;"></span>
</div>
<input type="hidden" name="{name}" id="{widget_id}" value="">
<script>
(function() {{
var configMeta_{widget_id} = {config_meta_json};
function showKeyHint_{widget_id}(key) {{
var hint = document.getElementById('{widget_id}_hint');
var meta = configMeta_{widget_id}[key];
if (meta) {{
hint.innerHTML = '<b>' + key + '</b>: ' + (meta.description || meta.type) +
(meta.default !== '' ? ' <span style="color:#888">(default: ' + meta.default + ')</span>' : '');
}} else {{
hint.textContent = key ? 'Custom key: ' + key : '';
}}
}}
function updateHiddenField_{widget_id}() {{
var container = document.getElementById('{widget_id}_rows');
var rows = container.querySelectorAll('.key-value-row');
var result = {{}};
rows.forEach(function(row) {{
var keyInput = row.querySelector('.kv-key');
var valInput = row.querySelector('.kv-value');
if (keyInput && valInput && keyInput.value.trim()) {{
var key = keyInput.value.trim();
var val = valInput.value.trim();
// Try to parse as JSON (for booleans, numbers, etc)
try {{
if (val === 'true') result[key] = true;
else if (val === 'false') result[key] = false;
else if (val === 'null') result[key] = null;
else if (!isNaN(val) && val !== '') result[key] = Number(val);
else if ((val.startsWith('{{') && val.endsWith('}}')) ||
(val.startsWith('[') && val.endsWith(']')) ||
(val.startsWith('"') && val.endsWith('"')))
result[key] = JSON.parse(val);
else result[key] = val;
}} catch(e) {{
result[key] = val;
}}
}}
}});
document.getElementById('{widget_id}').value = JSON.stringify(result);
}}
window.addKeyValueRow_{widget_id} = function() {{
var container = document.getElementById('{widget_id}_rows');
var rows = container.querySelectorAll('.key-value-row');
var newIdx = rows.length;
var newRow = document.createElement('div');
newRow.className = 'key-value-row';
newRow.style.cssText = 'display: flex; gap: 8px; margin-bottom: 6px; align-items: center;';
newRow.innerHTML = '<input type="text" class="kv-key" placeholder="KEY" list="{widget_id}_keys" ' +
'style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;" ' +
'onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}(); showKeyHint_{widget_id}(this.value)" onfocus="showKeyHint_{widget_id}(this.value)">' +
'<input type="text" class="kv-value" placeholder="value" ' +
'style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;" ' +
'onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}()">' +
'<button type="button" onclick="removeKeyValueRow_{widget_id}(this)" ' +
'style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;"></button>';
container.appendChild(newRow);
newRow.querySelector('.kv-key').focus();
}};
window.removeKeyValueRow_{widget_id} = function(btn) {{
var row = btn.parentElement;
row.remove();
updateHiddenField_{widget_id}();
}};
window.showKeyHint_{widget_id} = showKeyHint_{widget_id};
window.updateHiddenField_{widget_id} = updateHiddenField_{widget_id};
// Initialize on load
document.addEventListener('DOMContentLoaded', function() {{
updateHiddenField_{widget_id}();
}});
// Also run immediately in case DOM is already ready
if (document.readyState !== 'loading') {{
updateHiddenField_{widget_id}();
}}
// Update on any input change
document.getElementById('{widget_id}_rows').addEventListener('input', updateHiddenField_{widget_id});
}})();
</script>
</div>
'''
return mark_safe(html)
def _render_row(self, widget_id, idx, key, value):
return f'''
<div class="key-value-row" style="display: flex; gap: 8px; margin-bottom: 6px; align-items: center;">
<input type="text" class="kv-key" value="{self._escape(key)}" placeholder="KEY" list="{widget_id}_keys"
style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;"
onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}(); showKeyHint_{widget_id}(this.value)" onfocus="showKeyHint_{widget_id}(this.value)">
<input type="text" class="kv-value" value="{self._escape(value)}" placeholder="value"
style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;"
onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}()">
<button type="button" onclick="removeKeyValueRow_{widget_id}(this)"
style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;"></button>
</div>
'''
def _escape(self, s):
"""Escape HTML special chars in attribute values."""
if not s:
return ''
return str(s).replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
def value_from_datadict(self, data, files, name):
value = data.get(name, '{}')
return value
class ConfigEditorMixin:
"""
Mixin for admin classes with a config JSON field.
Provides a readonly field that shows available config options
from all discovered plugin schemas.
Provides a key-value editor widget with autocomplete for available config keys.
"""
@admin.display(description='Available Config Options')
def available_config_options(self, obj):
"""Show documentation for available config keys."""
try:
from archivebox.hooks import discover_plugin_configs
plugin_configs = discover_plugin_configs()
except ImportError:
return format_html('<i>Plugin config system not available</i>')
html_parts = [
'<details>',
'<summary style="cursor: pointer; font-weight: bold; padding: 4px;">',
'Click to see available config keys ({})</summary>'.format(
sum(len(s.get('properties', {})) for s in plugin_configs.values())
),
'<div style="max-height: 400px; overflow-y: auto; padding: 8px; background: #f8f8f8; border-radius: 4px; font-family: monospace; font-size: 11px;">',
]
for plugin_name, schema in sorted(plugin_configs.items()):
properties = schema.get('properties', {})
if not properties:
continue
html_parts.append(f'<div style="margin: 8px 0;"><strong style="color: #333;">{plugin_name}</strong></div>')
html_parts.append('<table style="width: 100%; border-collapse: collapse; margin-bottom: 12px;">')
html_parts.append('<tr style="background: #eee;"><th style="text-align: left; padding: 4px;">Key</th><th style="text-align: left; padding: 4px;">Type</th><th style="text-align: left; padding: 4px;">Default</th><th style="text-align: left; padding: 4px;">Description</th></tr>')
for key, prop in sorted(properties.items()):
prop_type = prop.get('type', 'string')
default = prop.get('default', '')
description = prop.get('description', '')
# Truncate long defaults
default_str = str(default)
if len(default_str) > 30:
default_str = default_str[:27] + '...'
html_parts.append(
f'<tr style="border-bottom: 1px solid #ddd;">'
f'<td style="padding: 4px; font-weight: bold;">{key}</td>'
f'<td style="padding: 4px; color: #666;">{prop_type}</td>'
f'<td style="padding: 4px; color: #666;">{default_str}</td>'
f'<td style="padding: 4px;">{description}</td>'
f'</tr>'
)
html_parts.append('</table>')
html_parts.append('</div></details>')
html_parts.append(
'<p style="margin-top: 8px; color: #666; font-size: 11px;">'
'<strong>Usage:</strong> Add key-value pairs in JSON format, e.g., '
'<code>{"SAVE_WGET": false, "WGET_TIMEOUT": 120}</code>'
'</p>'
)
return mark_safe(''.join(html_parts))
def formfield_for_dbfield(self, db_field, request, **kwargs):
"""Use KeyValueWidget for the config JSON field."""
if db_field.name == 'config':
kwargs['widget'] = KeyValueWidget()
return super().formfield_for_dbfield(db_field, request, **kwargs)
class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin):

View File

@@ -72,9 +72,10 @@ def add(urls: str | list[str],
cli_args[0] = 'archivebox'
cmd_str = ' '.join(cli_args)
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
seed = Seed.from_file(
sources_file,
label=f'{USER}@{HOSTNAME} $ {cmd_str}',
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
parser=parser,
tag=tag,
created_by=created_by_id,

View File

@@ -11,21 +11,53 @@ __package__ = "archivebox.config"
import os
import json
from pathlib import Path
from typing import Any, Dict, Optional, List, Type, TYPE_CHECKING, cast
from typing import Any, Dict, Optional, List, Type, Tuple, TYPE_CHECKING, cast
from configparser import ConfigParser
from pydantic import Field
from pydantic_settings import BaseSettings
from pydantic_settings import BaseSettings, PydanticBaseSettingsSource
class IniConfigSettingsSource(PydanticBaseSettingsSource):
    """
    Custom settings source that reads from ArchiveBox.conf (INI format).
    Flattens all sections into a single namespace.
    """

    def get_field_value(self, field: Any, field_name: str) -> Tuple[Any, str, bool]:
        """Return (value, key, is_complex) for one field, per the source API.

        NOTE(review): this re-reads and re-parses the config file on every
        field — consider caching _load_config_file() if it becomes hot.
        """
        config_vals = self._load_config_file()
        # Keys are normalized to upper-case at load time, so match on upper.
        field_value = config_vals.get(field_name.upper())
        return field_value, field_name, False

    def __call__(self) -> Dict[str, Any]:
        # pydantic-settings invokes the source itself to get all values at once.
        return self._load_config_file()

    def _load_config_file(self) -> Dict[str, Any]:
        """Parse ArchiveBox.conf into a flat {UPPER_KEY: value} dict.

        Returns {} when constants can't be imported yet or the file is absent,
        so config loading degrades gracefully during early bootstrap.
        """
        try:
            from archivebox.config.constants import CONSTANTS
            config_path = CONSTANTS.CONFIG_FILE
        except ImportError:
            return {}
        if not config_path.exists():
            return {}
        parser = ConfigParser()
        parser.optionxform = lambda x: x  # preserve case
        parser.read(config_path)
        # Flatten all sections into single namespace (ignore section headers)
        return {key.upper(): value for section in parser.sections() for key, value in parser.items(section)}
class BaseConfigSet(BaseSettings):
"""
Base class for config sections.
Automatically loads values from:
1. Environment variables (highest priority)
2. ArchiveBox.conf file (if exists)
3. Default values (lowest priority)
Automatically loads values from (highest to lowest priority):
1. Environment variables
2. ArchiveBox.conf file (INI format, flattened)
3. Default values
Subclasses define fields with defaults and types:
@@ -35,11 +67,30 @@ class BaseConfigSet(BaseSettings):
"""
class Config:
# Use env vars with ARCHIVEBOX_ prefix or raw name
env_prefix = ""
extra = "ignore"
validate_default = True
    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: Type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> Tuple[PydanticBaseSettingsSource, ...]:
        """
        Define the order of settings sources (first = highest priority).

        dotenv and secrets-file sources are deliberately omitted — ArchiveBox
        only layers init kwargs over env vars over ArchiveBox.conf.
        """
        return (
            init_settings,                           # 1. Passed to __init__
            env_settings,                            # 2. Environment variables
            IniConfigSettingsSource(settings_cls),   # 3. ArchiveBox.conf file
            # dotenv_settings,        # Skip .env files
            # file_secret_settings,   # Skip secrets files
        )
@classmethod
def load_from_file(cls, config_path: Path) -> Dict[str, str]:
"""Load config values from INI file."""
@@ -47,7 +98,7 @@ class BaseConfigSet(BaseSettings):
return {}
parser = ConfigParser()
parser.optionxform = lambda x: x # type: ignore # preserve case
parser.optionxform = lambda x: x # preserve case
parser.read(config_path)
# Flatten all sections into single namespace

View File

@@ -256,7 +256,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
# Show a helpful message when no plugins found
rows['Name'].append('(no plugins found)')
rows['Source'].append('-')
rows['Path'].append(format_html('<code>archivebox/plugins/</code> or <code>data/plugins/</code>'))
rows['Path'].append(mark_safe('<code>archivebox/plugins/</code> or <code>data/plugins/</code>'))
rows['Hooks'].append('-')
return TableContext(

View File

@@ -9,25 +9,17 @@ from django.core.exceptions import ValidationError
from django.urls import reverse, resolve
from django.utils import timezone
from huey_monitor.admin import TaskModel
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from archivebox.hooks import get_extractor_icon
from core.models import ArchiveResult, Snapshot
def result_url(result: TaskModel) -> str:
url = reverse("admin:huey_monitor_taskmodel_change", args=[str(result.id)])
return format_html('<a href="{url}" class="fade-in-progress-url">See progress...</a>'.format(url=url))
class ArchiveResultInline(admin.TabularInline):
name = 'Archive Results Log'
model = ArchiveResult
@@ -101,9 +93,9 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor', 'cmd_str', 'output_str')
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
autocomplete_fields = ['snapshot']
@@ -144,17 +136,29 @@ class ArchiveResultAdmin(BaseModelAdmin):
def tags_str(self, result):
return result.snapshot.tags_str()
@admin.display(description='Extractor', ordering='extractor')
def extractor_with_icon(self, result):
icon = get_extractor_icon(result.extractor)
return format_html(
'<span title="{}">{}</span> {}',
result.extractor,
icon,
result.extractor,
)
def cmd_str(self, result):
return format_html(
'<pre>{}</pre>',
' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
)
def output_str(self, result):
# Determine output link path - use output if file exists, otherwise link to index
output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html'
return format_html(
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
result.snapshot.timestamp,
result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html',
output_path,
result.output,
)
@@ -185,7 +189,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
is_hidden = filename.startswith('.')
output_str += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
return output_str + format_html('</code></pre>')
return output_str + mark_safe('</code></pre>')

View File

@@ -35,8 +35,19 @@ def register_admin_site():
admin.site = archivebox_admin
sites.site = archivebox_admin
# Plugin admin registration is now handled by individual app admins
# No longer using archivebox.pm.hook.register_admin()
# Register admin views for each app
# (Previously handled by ABX plugin system, now called directly)
from core.admin import register_admin as register_core_admin
from crawls.admin import register_admin as register_crawls_admin
from api.admin import register_admin as register_api_admin
from machine.admin import register_admin as register_machine_admin
from workers.admin import register_admin as register_workers_admin
register_core_admin(archivebox_admin)
register_crawls_admin(archivebox_admin)
register_api_admin(archivebox_admin)
register_machine_admin(archivebox_admin)
register_workers_admin(archivebox_admin)
return archivebox_admin

View File

@@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from core.models import Tag
from core.admin_tags import TagInline
from core.admin_archiveresults import ArchiveResultInline, result_url
from core.admin_archiveresults import ArchiveResultInline
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -54,10 +54,10 @@ class SnapshotActionForm(ActionForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'link_dir', 'available_config_options')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', 'available_config_options', *readonly_fields[:-1])
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields)
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [TagInline, ArchiveResultInline]
@@ -93,12 +93,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# self.request = request
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
@admin.action(
description="Imported Timestamp"
)
@admin.display(description="Imported Timestamp")
def imported_timestamp(self, obj):
context = RequestContext(self.request, {
'bookmarked_date': obj.bookmarked,
'bookmarked_date': obj.bookmarked_at,
'timestamp': obj.timestamp,
})
@@ -145,22 +143,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def status_info(self, obj):
return format_html(
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
'''
Archived: {} ({} files {}) &nbsp; &nbsp;
Favicon: <img src="{}" style="height: 20px"/> &nbsp; &nbsp;
Status code: {} &nbsp; &nbsp;<br/>
Server: {} &nbsp; &nbsp;
Content type: {} &nbsp; &nbsp;
Extension: {} &nbsp; &nbsp;
''',
'' if obj.is_archived else '',
obj.num_outputs,
self.size(obj) or '0kb',
f'/archive/{obj.timestamp}/favicon.ico',
obj.status_code or '-',
obj.headers and obj.headers.get('Server') or '-',
obj.headers and obj.headers.get('Content-Type') or '-',
obj.extension or '-',
)
@@ -184,8 +175,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
obj.archive_path,
obj.archive_path,
obj.archive_path,
'fetched' if obj.latest_title or obj.title else 'pending',
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
'fetched' if obj.title else 'pending',
urldecode(htmldecode(obj.title or ''))[:128] or 'Pending...'
) + mark_safe(f' <span class="tags">{tags}</span>')
@admin.display(
@@ -259,14 +250,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
description=" Get Title"
)
def update_titles(self, request, queryset):
from core.models import Snapshot
count = queryset.count()
# Queue snapshots for archiving via the state machine system
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Title and favicon are updating in the background for {count} URLs. {result_url(result)}"),
f"Queued {queued} snapshots for title/favicon update. The orchestrator will process them in the background.",
)
@admin.action(
@@ -275,11 +265,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def update_snapshots(self, request, queryset):
count = queryset.count()
result = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Re-trying any previously failed methods for {count} URLs in the background. {result_url(result)}"),
f"Queued {queued} snapshots for re-archiving. The orchestrator will process them in the background.",
)
@@ -291,11 +281,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
timestamp = timezone.now().isoformat('T', 'seconds')
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
messages.success(
request,
mark_safe(f"Creating new fresh snapshots for {queryset.count()} URLs in the background. {result_url(result)}"),
f"Creating {queryset.count()} new fresh snapshots. The orchestrator will process them in the background.",
)
@admin.action(
@@ -304,11 +294,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def overwrite_snapshots(self, request, queryset):
count = queryset.count()
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Clearing all previous results and re-downloading {count} URLs in the background. {result_url(result)}"),
f"Queued {queued} snapshots for full re-archive (overwriting existing). The orchestrator will process them in the background.",
)
@admin.action(

View File

@@ -1,5 +1,7 @@
__package__ = 'archivebox.core'
import sys
from django.apps import AppConfig
@@ -10,6 +12,41 @@ class CoreConfig(AppConfig):
"""Register the archivebox.core.admin_site as the main django admin site"""
from core.admin_site import register_admin_site
register_admin_site()
# Auto-start the orchestrator when running the web server
self._maybe_start_orchestrator()
def _maybe_start_orchestrator(self):
"""Start the background orchestrator, but only when this process is a web server.

Skips silently (returns None) when not a server command, when explicitly
disabled via RUN_ORCHESTRATOR, or in the runserver autoreload supervisor.
Any startup failure is logged as a warning rather than raised, so the
server never crashes because of the orchestrator.
"""
import os
# Don't start orchestrator during migrations, shell, tests, etc.
# Only start when running: runserver, daphne, gunicorn, uwsgi
if not self._is_web_server():
return
# Don't start if RUN_ORCHESTRATOR env var is explicitly set to false
if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'):
return
# Under `runserver` autoreload, RUN_MAIN == 'true' only in the reloaded worker
# process; skip in the supervising parent so we don't start the orchestrator twice.
if os.environ.get('RUN_MAIN') != 'true' and 'runserver' in sys.argv:
return
try:
# Imported lazily: the workers app may not be ready at module import time.
from workers.orchestrator import Orchestrator
if not Orchestrator.is_running():
# Start orchestrator as daemon (won't exit on idle when started by server)
orchestrator = Orchestrator(exit_on_idle=False)
orchestrator.start()
except Exception as e:
# Don't crash the server if orchestrator fails to start
import logging
logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}')
def _is_web_server(self) -> bool:
    """Return True when the current process command line looks like a web server.

    Matches by substring against the joined, lowercased ``sys.argv``, so any
    of runserver/daphne/gunicorn/uwsgi/server appearing anywhere counts.
    """
    cmdline = ' '.join(sys.argv).lower()
    markers = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server')
    return any(marker in cmdline for marker in markers)

View File

@@ -23,7 +23,11 @@ from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size, atomic_write
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
from archivebox.misc.hashing import get_dir_info
from archivebox.hooks import ARCHIVE_METHODS_INDEXING_PRECEDENCE
from archivebox.hooks import (
ARCHIVE_METHODS_INDEXING_PRECEDENCE,
get_extractors, get_extractor_name, get_extractor_icon,
DEFAULT_EXTRACTOR_ICONS,
)
from archivebox.base_models.models import (
ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
@@ -343,45 +347,37 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def icons(self) -> str:
"""Generate HTML icons showing which extractors have succeeded for this snapshot"""
from django.utils.html import format_html, mark_safe
from collections import defaultdict
cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
def calc_icons():
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
archive_results = [r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output]
archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output}
else:
archive_results = self.archiveresult_set.filter(status="succeeded", output__isnull=False)
archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)}
path = self.archive_path
canon = self.canonical_outputs()
output = ""
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
icons = {
"singlefile": "", "wget": "🆆", "dom": "🅷", "pdf": "📄",
"screenshot": "💻", "media": "📼", "git": "🅶", "archive_org": "🏛",
"readability": "🆁", "mercury": "🅼", "warc": "📦"
}
exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]
extractor_outputs = defaultdict(lambda: None)
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
for result in archive_results:
if result.extractor == extractor:
extractor_outputs[extractor] = result
# Get all extractors from hooks system (sorted by numeric prefix)
all_extractors = [get_extractor_name(e) for e in get_extractors()]
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
if extractor not in exclude:
existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
output += format_html(output_template, path, canon.get(extractor, ''), str(bool(existing)), extractor, icons.get(extractor, "?"))
if extractor == "wget":
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
output += format_html(output_template, path, canon.get("warc", "warc/"), str(bool(exists)), "warc", icons.get("warc", "?"))
if extractor == "archive_org":
exists = extractor in extractor_outputs and extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon.get("archive_org", ""), str(exists), "archive_org", icons.get("archive_org", "?"))
for extractor in all_extractors:
result = archive_results.get(extractor)
existing = result and result.status == 'succeeded' and result.output
icon = get_extractor_icon(extractor)
output += format_html(
output_template,
path,
canon.get(extractor, extractor + '/'),
str(bool(existing)),
extractor,
icon
)
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
cache_result = cache.get(cache_key)
if cache_result:
@@ -767,12 +763,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
FAILED = 'failed', 'Failed'
SKIPPED = 'skipped', 'Skipped'
EXTRACTOR_CHOICES = (
('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'),
('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'),
('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'),
('dom', 'dom'), ('title', 'title'), ('wget', 'wget'),
)
@classmethod
def get_extractor_choices(cls):
    """Build (value, label) choice pairs from hook-discovered extractors (for forms/admin)."""
    names = [get_extractor_name(entry) for entry in get_extractors()]
    return tuple((name, name) for name in names)
# Keep AutoField for backward compatibility with 0.7.x databases
# UUID field is added separately by migration for new records
@@ -783,7 +778,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
modified_at = models.DateTimeField(auto_now=True)
snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore
extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32, blank=False, null=False, db_index=True)
# No choices= constraint - extractor names come from plugin system and can be any string
extractor = models.CharField(max_length=32, blank=False, null=False, db_index=True)
pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
cmd = models.JSONField(default=None, null=True, blank=True)
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
@@ -835,6 +831,25 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def output_exists(self) -> bool:
return os.path.exists(Path(self.snapshot_dir) / self.extractor)
def embed_path(self) -> Optional[str]:
    """
    Return the relative path of the embeddable output file for this result.

    Prefers the explicit ``output`` field when set; otherwise falls back to
    the snapshot's canonical output path for this extractor, and finally to
    the extractor's own output directory.
    """
    if self.output:
        return self.output
    # No explicit output recorded: consult the snapshot's canonical output paths.
    canonical_paths = self.snapshot.canonical_outputs()
    lookup_key = f'{self.extractor}_path'
    if lookup_key in canonical_paths:
        return canonical_paths[lookup_key]
    # Last resort: the directory this extractor writes into.
    return f'{self.extractor}/'
def create_output_dir(self):
output_dir = Path(self.snapshot_dir) / self.extractor
output_dir.mkdir(parents=True, exist_ok=True)
@@ -891,6 +906,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
output_dir=extractor_dir,
config_objects=config_objects,
url=self.snapshot.url,
snapshot_id=str(self.snapshot.id),
)
end_ts = timezone.now()
@@ -1000,6 +1016,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
hook,
output_dir=self.output_dir,
config_objects=config_objects,
url=self.snapshot.url,
snapshot_id=str(self.snapshot.id),
extractor=self.extractor,
)

View File

@@ -68,9 +68,6 @@ INSTALLED_APPS = [
# 3rd-party apps from PyPI that need to be loaded last
"admin_data_views", # handles rendering some convenient automatic read-only views of data in Django admin
"django_extensions", # provides Django Debug Toolbar (and other non-debug helpers)
"django_huey", # provides multi-queue support for django huey https://github.com/gaiacoop/django-huey
"bx_django_utils", # needed for huey_monitor https://github.com/boxine/bx_django_utils
"huey_monitor", # adds an admin UI for monitoring background huey tasks https://github.com/boxine/django-huey-monitor
]
@@ -215,70 +212,6 @@ MIGRATION_MODULES = {"signal_webhooks": None}
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
HUEY = {
"huey_class": "huey.SqliteHuey",
"filename": CONSTANTS.QUEUE_DATABASE_FILENAME,
"name": "commands",
"results": True,
"store_none": True,
"immediate": False,
"utc": True,
"consumer": {
"workers": 1,
"worker_type": "thread",
"initial_delay": 0.1, # Smallest polling interval, same as -d.
"backoff": 1.15, # Exponential backoff using this rate, -b.
"max_delay": 10.0, # Max possible polling interval, -m.
"scheduler_interval": 1, # Check schedule every second, -s.
"periodic": True, # Enable crontab feature.
"check_worker_health": True, # Enable worker health checks.
"health_check_interval": 1, # Check worker health every second.
},
}
# https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up
# https://github.com/gaiacoop/django-huey
DJANGO_HUEY = {
"default": "commands",
"queues": {
HUEY["name"]: HUEY.copy(),
# more registered here at plugin import-time by BaseQueue.register()
# Additional huey queues configured via settings
},
}
class HueyDBRouter:
"""
A router to store all the Huey result k:v / Huey Monitor models in the queue.sqlite3 database.
We keep the databases separate because the queue database receives many more reads/writes per second
and we want to avoid single-write lock contention with the main database. Also all the in-progress task
data is ephemeral/not-important-long-term. This makes it easier to for the user to clear non-critical
temp data by just deleting queue.sqlite3 and leaving index.sqlite3.
"""
route_app_labels = {"huey_monitor", "django_huey", "djhuey"}
db_name = "queue"
def db_for_read(self, model, **hints):
if model._meta.app_label in self.route_app_labels:
return self.db_name
return "default"
def db_for_write(self, model, **hints):
if model._meta.app_label in self.route_app_labels:
return self.db_name
return "default"
def allow_relation(self, obj1, obj2, **hints):
if obj1._meta.app_label in self.route_app_labels or obj2._meta.app_label in self.route_app_labels:
return obj1._meta.app_label == obj2._meta.app_label
return None
def allow_migrate(self, db, app_label, model_name=None, **hints):
if app_label in self.route_app_labels:
return db == self.db_name
return db == "default"
# class FilestoreDBRouter:
@@ -311,7 +244,7 @@ class HueyDBRouter:
# return db == self.db_name
# return db == "default"
DATABASE_ROUTERS = ["core.settings.HueyDBRouter"]
DATABASE_ROUTERS = []
CACHES = {
"default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"},

View File

@@ -1,9 +1,13 @@
from django import template
from django.contrib.admin.templatetags.base import InclusionAdminNode
from django.utils.safestring import mark_safe
from typing import Union
from archivebox.hooks import (
get_extractor_icon, get_extractor_template, get_extractor_name,
)
register = template.Library()
@@ -44,3 +48,115 @@ def url_replace(context, **kwargs):
dict_ = context['request'].GET.copy()
dict_.update(**kwargs)
return dict_.urlencode()
@register.simple_tag
def extractor_icon(extractor: str) -> str:
    """
    Template tag: return the icon markup for *extractor*, marked safe for HTML.

    Usage: {% extractor_icon "screenshot" %}
    """
    icon_html = get_extractor_icon(extractor)
    return mark_safe(icon_html)
@register.simple_tag(takes_context=True)
def extractor_thumbnail(context, result) -> str:
    """
    Template tag: render the thumbnail snippet for an archive result.

    Usage: {% extractor_thumbnail result %}

    The snippet template receives: result (the ArchiveResult), snapshot
    (its parent), output_path (from result.embed_path() when available,
    else result.output), and extractor (the base extractor name). Returns
    '' when no thumbnail template is registered or rendering fails.
    """
    base_name = get_extractor_name(result.extractor)
    snippet = get_extractor_template(base_name, 'thumbnail')
    if not snippet:
        return ''
    if hasattr(result, 'embed_path'):
        output_path = result.embed_path()
    else:
        output_path = result.output or ''
    render_vars = {
        'result': result,
        'snapshot': result.snapshot,
        'output_path': output_path,
        'extractor': base_name,
    }
    try:
        rendered = template.Template(snippet).render(template.Context(render_vars))
    except Exception:
        # Best-effort: a broken plugin template must never break the page.
        return ''
    return mark_safe(rendered)
@register.simple_tag(takes_context=True)
def extractor_embed(context, result) -> str:
    """
    Template tag: render the embed-iframe snippet for an archive result.

    Usage: {% extractor_embed result %}

    The snippet template receives result, snapshot, output_path, and
    extractor. Returns '' when no embed template is registered or
    rendering fails.
    """
    base_name = get_extractor_name(result.extractor)
    snippet = get_extractor_template(base_name, 'embed')
    if not snippet:
        return ''
    if hasattr(result, 'embed_path'):
        output_path = result.embed_path()
    else:
        output_path = result.output or ''
    render_vars = {
        'result': result,
        'snapshot': result.snapshot,
        'output_path': output_path,
        'extractor': base_name,
    }
    try:
        rendered = template.Template(snippet).render(template.Context(render_vars))
    except Exception:
        # Best-effort: a broken plugin template must never break the page.
        return ''
    return mark_safe(rendered)
@register.simple_tag(takes_context=True)
def extractor_fullscreen(context, result) -> str:
    """
    Template tag: render the fullscreen snippet for an archive result.

    Usage: {% extractor_fullscreen result %}

    The snippet template receives result, snapshot, output_path, and
    extractor. Returns '' when no fullscreen template is registered or
    rendering fails.
    """
    base_name = get_extractor_name(result.extractor)
    snippet = get_extractor_template(base_name, 'fullscreen')
    if not snippet:
        return ''
    if hasattr(result, 'embed_path'):
        output_path = result.embed_path()
    else:
        output_path = result.output or ''
    render_vars = {
        'result': result,
        'snapshot': result.snapshot,
        'output_path': output_path,
        'extractor': base_name,
    }
    try:
        rendered = template.Template(snippet).render(template.Context(render_vars))
    except Exception:
        # Best-effort: a broken plugin template must never break the page.
        return ''
    return mark_safe(rendered)
@register.filter
def extractor_name(value: str) -> str:
    """
    Filter: return an extractor's base name with any numeric prefix stripped.

    Usage: {{ result.extractor|extractor_name }}
    """
    base_name = get_extractor_name(value)
    return base_name

View File

@@ -8,7 +8,7 @@ from django.views.generic.base import RedirectView
from archivebox.misc.serve_static import serve_static
from core.admin_site import archivebox_admin
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
from workers.views import JobsDashboardView
@@ -43,8 +43,10 @@ urlpatterns = [
path('accounts/', include('django.contrib.auth.urls')),
path('admin/live-progress/', live_progress_view, name='live_progress'),
path('admin/', archivebox_admin.urls),
path("api/", include('api.urls'), name='api'),
path('health/', HealthCheckView.as_view(), name='healthcheck'),

View File

@@ -34,6 +34,7 @@ from archivebox.search import query_search_index
from core.models import Snapshot
from core.forms import AddLinkForm
from crawls.models import Seed, Crawl
from archivebox.hooks import get_extractors, get_extractor_name
@@ -54,8 +55,10 @@ class SnapshotView(View):
@staticmethod
def render_live_index(request, snapshot):
TITLE_LOADING_MSG = 'Not yet archived...'
HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org')
# Dict of extractor -> ArchiveResult object
archiveresult_objects = {}
# Dict of extractor -> result info dict (for template compatibility)
archiveresults = {}
results = snapshot.archiveresult_set.all()
@@ -65,18 +68,21 @@ class SnapshotView(View):
abs_path = result.snapshot_dir / (embed_path or 'None')
if (result.status == 'succeeded'
and (result.extractor not in HIDDEN_RESULTS)
and embed_path
and os.access(abs_path, os.R_OK)
and abs_path.exists()):
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
continue
# Store the full ArchiveResult object for template tags
archiveresult_objects[result.extractor] = result
result_info = {
'name': result.extractor,
'path': embed_path,
'ts': ts_to_date_str(result.end_ts),
'size': abs_path.stat().st_size or '?',
'result': result, # Include the full object for template tags
}
archiveresults[result.extractor] = result_info
@@ -101,11 +107,11 @@ class SnapshotView(View):
}
# iterate through all the files in the snapshot dir and add the biggest ones to1 the result list
# iterate through all the files in the snapshot dir and add the biggest ones to the result list
snap_dir = Path(snapshot.output_dir)
if not os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK):
return {}
for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
extension = result_file.suffix.lstrip('.').lower()
if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions:
@@ -121,12 +127,16 @@ class SnapshotView(View):
'path': result_file.relative_to(snap_dir),
'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
'size': file_size,
'result': None, # No ArchiveResult object for filesystem-discovered files
}
preferred_types = ('singlefile', 'screenshot', 'wget', 'dom', 'media', 'pdf', 'readability', 'mercury')
# Get available extractors from hooks (sorted by numeric prefix for ordering)
# Convert to base names for display ordering
all_extractors = [get_extractor_name(e) for e in get_extractors()]
preferred_types = tuple(all_extractors)
all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
best_result = {'path': 'None'}
best_result = {'path': 'None', 'result': None}
for result_type in preferred_types:
if result_type in archiveresults:
best_result = archiveresults[result_type]
@@ -157,6 +167,7 @@ class SnapshotView(View):
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
'best_result': best_result,
'snapshot': snapshot, # Pass the snapshot object for template tags
}
return render(template_name='core/snapshot_live.html', request=request, context=context)
@@ -436,7 +447,7 @@ class AddView(UserPassesTestMixin, FormView):
def form_valid(self, form):
urls = form.cleaned_data["url"]
print(f'[+] Adding URL: {urls}')
parser = form.cleaned_data["parser"]
parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser
tag = form.cleaned_data["tag"]
depth = 0 if form.cleaned_data["depth"] == "0" else 1
extractors = ','.join(form.cleaned_data["archive_methods"])
@@ -452,18 +463,19 @@ class AddView(UserPassesTestMixin, FormView):
if extractors:
input_kwargs.update({"extractors": extractors})
from archivebox.config.permissions import HOSTNAME
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
seed = Seed.from_file(
sources_file,
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path}',
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
parser=parser,
tag=tag,
created_by=self.request.user.pk,
@@ -472,7 +484,7 @@ class AddView(UserPassesTestMixin, FormView):
# 'INDEX_ONLY': index_only,
# 'OVERWRITE': False,
'DEPTH': depth,
'EXTRACTORS': parser,
'EXTRACTORS': extractors or '',
# 'DEFAULT_PERSONA': persona or 'Default',
})
# 3. create a new Crawl pointing to the Seed
@@ -490,10 +502,15 @@ class AddView(UserPassesTestMixin, FormView):
self.request,
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
)
# if not bg:
# from workers.orchestrator import Orchestrator
# orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
# orchestrator.start()
# Start orchestrator in background to process the queued crawl
try:
from archivebox.workers.tasks import ensure_orchestrator_running
ensure_orchestrator_running()
except Exception as e:
# Orchestrator may already be running via supervisord, or fail to start
# This is not fatal - the crawl will be processed when orchestrator runs
print(f'[!] Failed to start orchestrator: {e}')
return redirect(crawl.admin_change_url)
@@ -513,6 +530,141 @@ class HealthCheckView(View):
)
import json
from django.http import JsonResponse
def live_progress_view(request):
"""JSON status endpoint polled by the admin live-progress monitor.

Returns orchestrator liveness, queue counts for Crawls/Snapshots/
ArchiveResults, and a hierarchical sample of active crawls (up to 10,
each with up to 5 active snapshots, each with up to 5 running
extractors). On any failure it degrades to an HTTP 500 JSON payload
with zeroed counts plus the error and traceback, so the polling UI
never breaks.
"""
try:
from workers.orchestrator import Orchestrator
from crawls.models import Crawl
from core.models import Snapshot, ArchiveResult
# Get orchestrator status
orchestrator_running = Orchestrator.is_running()
# NOTE(review): instantiates a fresh Orchestrator() just to count workers -
# assumed cheap and side-effect free; confirm against Orchestrator.__init__.
total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0
# Get model counts by status
crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
crawls_started = Crawl.objects.filter(status=Crawl.StatusChoices.STARTED).count()
# Get recent crawls (last 24 hours)
from datetime import timedelta
one_day_ago = timezone.now() - timedelta(days=1)
crawls_recent = Crawl.objects.filter(created_at__gte=one_day_ago).count()
snapshots_pending = Snapshot.objects.filter(status=Snapshot.StatusChoices.QUEUED).count()
snapshots_started = Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED).count()
archiveresults_pending = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
archiveresults_started = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.STARTED).count()
archiveresults_succeeded = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
archiveresults_failed = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.FAILED).count()
# Build hierarchical active crawls with nested snapshots and archive results
active_crawls = []
for crawl in Crawl.objects.filter(
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
).order_by('-modified_at')[:10]:
# Get snapshots for this crawl
crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
total_snapshots = crawl_snapshots.count()
completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()
# Calculate crawl progress
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
# Get active snapshots for this crawl
active_snapshots_for_crawl = []
for snapshot in crawl_snapshots.filter(
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
).order_by('-modified_at')[:5]:
# Get archive results for this snapshot
snapshot_results = ArchiveResult.objects.filter(snapshot=snapshot)
total_extractors = snapshot_results.count()
completed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
failed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.FAILED).count()
pending_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
# Calculate snapshot progress (failed results count as "done" for the percentage)
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
# Get active extractors for this snapshot
active_extractors = [
{
'id': str(ar.id),
'extractor': ar.extractor,
'status': ar.status,
'started': ar.start_ts.isoformat() if ar.start_ts else None,
# NOTE(review): hard-coded 50% placeholder - no per-extractor progress metric is tracked yet
'progress': 50,
}
for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
]
active_snapshots_for_crawl.append({
'id': str(snapshot.id),
'url': snapshot.url[:80],
'status': snapshot.status,
'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None,
'progress': snapshot_progress,
'total_extractors': total_extractors,
'completed_extractors': completed_extractors,
'failed_extractors': failed_extractors,
'pending_extractors': pending_extractors,
'active_extractors': active_extractors,
})
active_crawls.append({
'id': str(crawl.id),
'label': str(crawl)[:60],
'status': crawl.status,
'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
'progress': crawl_progress,
'max_depth': crawl.max_depth,
'total_snapshots': total_snapshots,
'completed_snapshots': completed_snapshots,
# placeholder - failed-snapshot count is not computed here yet
'failed_snapshots': 0,
'pending_snapshots': pending_snapshots,
'active_snapshots': active_snapshots_for_crawl,
})
return JsonResponse({
'orchestrator_running': orchestrator_running,
'total_workers': total_workers,
'crawls_pending': crawls_pending,
'crawls_started': crawls_started,
'crawls_recent': crawls_recent,
'snapshots_pending': snapshots_pending,
'snapshots_started': snapshots_started,
'archiveresults_pending': archiveresults_pending,
'archiveresults_started': archiveresults_started,
'archiveresults_succeeded': archiveresults_succeeded,
'archiveresults_failed': archiveresults_failed,
'active_crawls': active_crawls,
'server_time': timezone.now().isoformat(),
})
# Degrade to a 500 JSON payload with zeroed counts so the polling UI never breaks.
except Exception as e:
import traceback
return JsonResponse({
'error': str(e),
'traceback': traceback.format_exc(),
'orchestrator_running': False,
'total_workers': 0,
'crawls_pending': 0,
'crawls_started': 0,
'crawls_recent': 0,
'snapshots_pending': 0,
'snapshots_started': 0,
'archiveresults_pending': 0,
'archiveresults_started': 0,
'archiveresults_succeeded': 0,
'archiveresults_failed': 0,
'active_crawls': [],
'server_time': timezone.now().isoformat(),
}, status=500)
def find_config_section(key: str) -> str:
CONFIGS = get_all_configs()

View File

@@ -1,10 +1,18 @@
__package__ = 'archivebox.crawls'
from django.utils.html import format_html, format_html_join
from django.contrib import admin
import json
from pathlib import Path
from django.utils.html import format_html, format_html_join, mark_safe
from django.contrib import admin, messages
from django.urls import path
from django.http import JsonResponse
from django.views.decorators.http import require_POST
from archivebox import DATA_DIR
from django_object_actions import action
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from core.models import Snapshot
@@ -16,8 +24,8 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents', 'available_config_options')
fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'available_config_options', 'created_by', *readonly_fields[:-1])
readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
list_filter = ('extractor', 'created_by')
ordering = ['-created_at']
@@ -34,19 +42,19 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
(scheduledcrawl.admin_change_url, scheduledcrawl)
for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
)) or format_html('<i>No Scheduled Crawls yet...</i>')
)) or mark_safe('<i>No Scheduled Crawls yet...</i>')
def crawls(self, obj):
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
(crawl.admin_change_url, crawl)
for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
)) or format_html('<i>No Crawls yet...</i>')
)) or mark_safe('<i>No Crawls yet...</i>')
def snapshots(self, obj):
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
(snapshot.admin_change_url, snapshot)
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
)) or format_html('<i>No Snapshots yet...</i>')
)) or mark_safe('<i>No Snapshots yet...</i>')
def contents(self, obj):
if obj.uri.startswith('file:///data/'):
@@ -69,14 +77,81 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_contents', 'available_config_options')
fields = ('label', 'notes', 'urls', 'config', 'available_config_options', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields[:-1])
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
fields = ('label', 'notes', 'seed_urls_editor', 'config', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', 'created_at', 'modified_at', 'snapshots')
list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
ordering = ['-created_at', '-retry_at']
list_per_page = 100
actions = ["delete_selected"]
change_actions = ['recrawl']
@action(label='Recrawl', description='Create a new crawl with the same settings')
def recrawl(self, request, obj):
    """Clone this crawl into a fresh QUEUED crawl reusing its seed and settings."""
    from django.utils import timezone
    from django.shortcuts import redirect
    duplicate = Crawl.objects.create(
        created_by=request.user,
        seed=obj.seed,
        urls=obj.urls,
        max_depth=obj.max_depth,
        config=obj.config,
        schedule=obj.schedule,
        notes=obj.notes,
        label=f"{obj.label} (recrawl)" if obj.label else "",
        status=Crawl.StatusChoices.QUEUED,
        retry_at=timezone.now(),  # eligible for pickup immediately
    )
    messages.success(
        request,
        f'Created new crawl {duplicate.id} with the same settings. '
        f'It will start processing shortly.'
    )
    # Send the admin straight to the newly created crawl's change page.
    return redirect('admin:crawls_crawl_change', duplicate.id)
def get_urls(self):
    """Prepend the AJAX endpoint for saving seed file contents to the default admin URLs."""
    extra = [
        path(
            '<path:object_id>/save_seed_contents/',
            self.admin_site.admin_view(self.save_seed_contents_view),
            name='crawls_crawl_save_seed_contents',
        ),
    ]
    # Custom routes must come first so they are matched before the catch-all admin patterns.
    return extra + super().get_urls()
def save_seed_contents_view(self, request, object_id):
    """Handle saving seed file contents via AJAX.

    Expects a POST with JSON body {"contents": "..."} and writes the text to
    the local file referenced by the crawl's seed URI (file:///data/...).
    Returns JsonResponse({'success': bool, ...}) with an appropriate status.
    """
    if request.method != 'POST':
        return JsonResponse({'success': False, 'error': 'POST required'}, status=405)
    try:
        crawl = Crawl.objects.get(pk=object_id)
    except Crawl.DoesNotExist:
        return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
    if not (crawl.seed and crawl.seed.uri and crawl.seed.uri.startswith('file:///data/')):
        return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
    try:
        data = json.loads(request.body)
        contents = data.get('contents', '')
    except json.JSONDecodeError:
        return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
    source_file = DATA_DIR / crawl.seed.uri.replace('file:///data/', '', 1)
    # Security: the 'file:///data/' prefix alone does not prevent '..' path
    # components from escaping DATA_DIR; reject any resolved path outside it.
    try:
        source_file.resolve().relative_to(Path(DATA_DIR).resolve())
    except ValueError:
        return JsonResponse({'success': False, 'error': 'Seed path is outside the data directory'}, status=400)
    try:
        # Ensure parent directory exists
        source_file.parent.mkdir(parents=True, exist_ok=True)
        source_file.write_text(contents)
        return JsonResponse({'success': True, 'message': f'Saved {len(contents)} bytes to {source_file.name}'})
    except Exception as e:
        return JsonResponse({'success': False, 'error': str(e)}, status=500)
def num_snapshots(self, obj):
    # Snapshot count for this crawl; rendered as a column in the admin list view.
    return obj.snapshot_set.count()
@@ -84,35 +159,175 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
return format_html_join('<br/>', '<a href="{}">{}</a>', (
(snapshot.admin_change_url, snapshot)
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
)) or format_html('<i>No Snapshots yet...</i>')
)) or mark_safe('<i>No Snapshots yet...</i>')
@admin.display(description='Schedule', ordering='schedule')
def schedule_str(self, obj):
if not obj.schedule:
return format_html('<i>None</i>')
return mark_safe('<i>None</i>')
return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
@admin.display(description='Seed', ordering='seed')
def seed_str(self, obj):
if not obj.seed:
return format_html('<i>None</i>')
return mark_safe('<i>None</i>')
return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)
def seed_contents(self, obj):
if not (obj.seed and obj.seed.uri):
return format_html('<i>None</i>')
if obj.seed.uri.startswith('file:///data/'):
source_file = DATA_DIR / obj.seed.uri.replace('file:///data/', '', 1)
contents = ""
@admin.display(description='URLs')
def seed_urls_editor(self, obj):
"""Combined editor showing seed URL and file contents."""
widget_id = f'seed_urls_{obj.pk}'
# Get the seed URI (or use urls field if no seed)
seed_uri = ''
if obj.seed and obj.seed.uri:
seed_uri = obj.seed.uri
elif obj.urls:
seed_uri = obj.urls
# Check if it's a local file we can edit
is_file = seed_uri.startswith('file:///data/')
contents = ""
error = None
source_file = None
if is_file:
source_file = DATA_DIR / seed_uri.replace('file:///data/', '', 1)
try:
contents = source_file.read_text().strip()[:14_000]
contents = source_file.read_text().strip()
except Exception as e:
contents = f'Error reading {source_file}: {e}'
return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
return format_html('See URLs here: <a href="{}">{}</a>', obj.seed.uri, obj.seed.uri)
error = f'Error reading {source_file}: {e}'
# Escape for safe HTML embedding
escaped_uri = seed_uri.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
escaped_contents = (contents or '').replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
# Count lines for auto-expand logic
line_count = len(contents.split('\n')) if contents else 0
uri_rows = min(max(1, seed_uri.count('\n') + 1), 3)
html = f'''
<div id="{widget_id}_container" style="max-width: 900px;">
<!-- Seed URL input (auto-expands) -->
<div style="margin-bottom: 12px;">
<label style="font-weight: bold; display: block; margin-bottom: 4px;">Seed URL:</label>
<textarea id="{widget_id}_uri"
style="width: 100%; font-family: monospace; font-size: 13px;
padding: 8px; border: 1px solid #ccc; border-radius: 4px;
resize: vertical; min-height: 32px; overflow: hidden;"
rows="{uri_rows}"
placeholder="file:///data/sources/... or https://..."
{"readonly" if not obj.pk else ""}>{escaped_uri}</textarea>
</div>
{"" if not is_file else f'''
<!-- File contents editor -->
<div style="margin-bottom: 8px;">
<label style="font-weight: bold; display: block; margin-bottom: 4px;">
File Contents: <code style="font-weight: normal; color: #666;">{source_file}</code>
</label>
{"<div style='color: #dc3545; margin-bottom: 8px;'>" + error + "</div>" if error else ""}
<textarea id="{widget_id}_contents"
style="width: 100%; height: {min(400, max(150, line_count * 18))}px; font-family: monospace; font-size: 12px;
padding: 8px; border: 1px solid #ccc; border-radius: 4px; resize: vertical;"
placeholder="Enter URLs, one per line...">{escaped_contents}</textarea>
</div>
<div style="display: flex; gap: 12px; align-items: center; flex-wrap: wrap;">
<button type="button" id="{widget_id}_save_btn"
onclick="saveSeedUrls_{widget_id}()"
style="padding: 8px 20px; background: #417690; color: white; border: none;
border-radius: 4px; cursor: pointer; font-weight: bold;">
Save URLs
</button>
<span id="{widget_id}_line_count" style="color: #666; font-size: 12px;"></span>
<span id="{widget_id}_status" style="color: #666; font-size: 12px;"></span>
</div>
'''}
{"" if is_file else f'''
<div style="margin-top: 8px; color: #666;">
<a href="{seed_uri}" target="_blank">{seed_uri}</a>
</div>
'''}
<script>
(function() {{
var uriInput = document.getElementById('{widget_id}_uri');
var contentsInput = document.getElementById('{widget_id}_contents');
var status = document.getElementById('{widget_id}_status');
var lineCount = document.getElementById('{widget_id}_line_count');
var saveBtn = document.getElementById('{widget_id}_save_btn');
// Auto-resize URI input
function autoResizeUri() {{
uriInput.style.height = 'auto';
uriInput.style.height = Math.min(100, uriInput.scrollHeight) + 'px';
}}
uriInput.addEventListener('input', autoResizeUri);
autoResizeUri();
if (contentsInput) {{
function updateLineCount() {{
var lines = contentsInput.value.split('\\n').filter(function(l) {{ return l.trim(); }});
lineCount.textContent = lines.length + ' URLs';
}}
contentsInput.addEventListener('input', function() {{
updateLineCount();
if (status) {{
status.textContent = '(unsaved changes)';
status.style.color = '#c4820e';
}}
}});
updateLineCount();
}}
window.saveSeedUrls_{widget_id} = function() {{
if (!saveBtn) return;
saveBtn.disabled = true;
saveBtn.textContent = 'Saving...';
if (status) status.textContent = '';
fetch(window.location.pathname + 'save_seed_contents/', {{
method: 'POST',
headers: {{
'Content-Type': 'application/json',
'X-CSRFToken': document.querySelector('[name=csrfmiddlewaretoken]').value
}},
body: JSON.stringify({{ contents: contentsInput ? contentsInput.value : '' }})
}})
.then(function(response) {{ return response.json(); }})
.then(function(data) {{
if (data.success) {{
if (status) {{
status.textContent = '' + data.message;
status.style.color = '#28a745';
}}
}} else {{
if (status) {{
status.textContent = '' + data.error;
status.style.color = '#dc3545';
}}
}}
}})
.catch(function(err) {{
if (status) {{
status.textContent = '✗ Error: ' + err;
status.style.color = '#dc3545';
}}
}})
.finally(function() {{
saveBtn.disabled = false;
saveBtn.textContent = 'Save URLs';
}});
}};
}})();
</script>
</div>
'''
return mark_safe(html)
@@ -143,14 +358,14 @@ class CrawlScheduleAdmin(BaseModelAdmin):
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
(crawl.admin_change_url, crawl)
for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
)) or format_html('<i>No Crawls yet...</i>')
)) or mark_safe('<i>No Crawls yet...</i>')
def snapshots(self, obj):
crawl_ids = obj.crawl_set.values_list('pk', flat=True)
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
(snapshot.admin_change_url, snapshot)
for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
)) or format_html('<i>No Snapshots yet...</i>')
)) or mark_safe('<i>No Snapshots yet...</i>')
def register_admin(admin_site):

View File

@@ -865,3 +865,189 @@ def export_plugin_config_to_env(
return env
# =============================================================================
# Plugin Template Discovery
# =============================================================================
#
# Plugins can provide custom templates for rendering their output in the UI.
# Templates are discovered by filename convention inside each plugin's templates/ dir:
#
# archivebox/plugins/<plugin_name>/
# templates/
# icon.html # Icon for admin table view (small inline HTML)
# thumbnail.html # Preview thumbnail for snapshot cards
# embed.html # Iframe embed content for main preview
# fullscreen.html # Fullscreen view template
#
# Template context variables available:
# {{ result }} - ArchiveResult object
# {{ snapshot }} - Parent Snapshot object
# {{ output_path }} - Path to output file/dir relative to snapshot dir
# {{ extractor }} - Extractor name (e.g., 'screenshot', 'singlefile')
#
# Default templates used when plugin doesn't provide one
# Built-in fallback templates, used when a plugin ships no template of its own.
# Placeholders ({{ result }}, {{ snapshot }}, {{ output_path }}, {{ extractor }})
# are documented in the block comment above.
DEFAULT_TEMPLATES = {
    'icon': '''<span title="{{ extractor }}">{{ icon }}</span>''',
    'thumbnail': '''
<img src="{{ output_path }}"
alt="{{ extractor }} output"
style="max-width: 100%; max-height: 100px; object-fit: cover;"
onerror="this.style.display='none'">
''',
    'embed': '''
<iframe src="{{ output_path }}"
style="width: 100%; height: 100%; border: none;"
sandbox="allow-same-origin allow-scripts">
</iframe>
''',
    'fullscreen': '''
<iframe src="{{ output_path }}"
style="width: 100%; height: 100vh; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms">
</iframe>
''',
}
# Default icons for known extractors (emoji or short HTML)
DEFAULT_EXTRACTOR_ICONS = {
    'screenshot': '📷',
    'pdf': '📄',
    'singlefile': '📦',
    'dom': '🌐',
    'wget': '📥',
    'media': '🎬',
    'git': '📂',
    'readability': '📖',
    'mercury': '☿️',
    'favicon': '',  # NOTE(review): value appears empty — possibly a lost emoji; confirm
    'title': '📝',
    'headers': '📋',
    'archive_org': '🏛️',
    'htmltotext': '📃',
    'warc': '🗄️',
}
def get_plugin_template(extractor: str, template_name: str) -> Optional[str]:
    """
    Look up a plugin-provided template for the given extractor.

    Args:
        extractor: Extractor name (e.g., 'screenshot', '15_singlefile')
        template_name: One of 'icon', 'thumbnail', 'embed', 'fullscreen'

    Returns:
        Template content as string, or None if no plugin provides one.
    """
    base_name = get_extractor_name(extractor)
    for root in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not root.exists():
            continue
        # Scan plugin directories whose name matches the extractor
        # (either exactly or with a numeric ordering prefix like '15_<name>').
        for candidate in (entry for entry in root.iterdir() if entry.is_dir()):
            name_matches = candidate.name == base_name or candidate.name.endswith(f'_{base_name}')
            if not name_matches:
                continue
            template_file = candidate / 'templates' / f'{template_name}.html'
            if template_file.exists():
                return template_file.read_text()
    return None
def get_extractor_template(extractor: str, template_name: str) -> str:
    """
    Return the template for an extractor, preferring a plugin-provided one.

    Args:
        extractor: Extractor name (e.g., 'screenshot', '15_singlefile')
        template_name: One of 'icon', 'thumbnail', 'embed', 'fullscreen'

    Returns:
        The plugin's template if one exists, otherwise the built-in default
        (empty string for unknown template names).
    """
    # Falsy plugin result (None/empty) falls through to the default.
    return get_plugin_template(extractor, template_name) or DEFAULT_TEMPLATES.get(template_name, '')
def get_extractor_icon(extractor: str) -> str:
    """
    Return the display icon for an extractor.

    A plugin-provided templates/icon.html takes precedence; otherwise the
    entry from DEFAULT_EXTRACTOR_ICONS is used, with a generic folder icon
    as the last resort.

    Args:
        extractor: Extractor name (e.g., 'screenshot', '15_singlefile')

    Returns:
        Icon HTML/emoji string.
    """
    plugin_icon = get_plugin_template(extractor, 'icon')
    if plugin_icon:
        return plugin_icon.strip()
    return DEFAULT_EXTRACTOR_ICONS.get(get_extractor_name(extractor), '📁')
def get_all_extractor_icons() -> Dict[str, str]:
    """
    Return icons for all discovered extractors.

    Returns:
        Dict mapping extractor base names to their icons; extractors that
        share a base name collapse to a single entry (last one wins).
    """
    return {
        get_extractor_name(extractor): get_extractor_icon(extractor)
        for extractor in get_extractors()
    }
def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
    """
    Discover all plugin templates organized by plugin directory name.

    Returns:
        Dict mapping plugin dir names to dicts of template_name -> template_path,
        e.g. {'screenshot': {'icon': '/path/to/icon.html', 'thumbnail': '...'}}.
        Plugins without a templates/ dir (or with no *.html files) are omitted.
    """
    discovered: Dict[str, Dict[str, str]] = {}
    for root in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        if not root.exists():
            continue
        for plugin_dir in (entry for entry in root.iterdir() if entry.is_dir()):
            templates_dir = plugin_dir / 'templates'
            if not templates_dir.exists():
                continue
            # Template type is the file stem: icon, thumbnail, embed, fullscreen.
            found = {tpl.stem: str(tpl) for tpl in templates_dir.glob('*.html')}
            if found:
                discovered[plugin_dir.name] = found
    return discovered

View File

@@ -3,16 +3,16 @@ __package__ = 'archivebox.machine'
from django.contrib import admin
from django.utils.html import format_html
from archivebox.base_models.admin import BaseModelAdmin
from machine.models import Machine, NetworkInterface, InstalledBinary
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from machine.models import Machine, NetworkInterface, InstalledBinary, Dependency
class MachineAdmin(BaseModelAdmin):
class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health')
sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
readonly_fields = ('guid', 'created_at', 'modified_at', 'ips')
fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'num_uses_succeeded', 'num_uses_failed')
fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'config', 'num_uses_succeeded', 'num_uses_failed')
list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform')
ordering = ['-created_at']
@@ -48,15 +48,43 @@ class NetworkInterfaceAdmin(BaseModelAdmin):
)
class DependencyAdmin(ConfigEditorMixin, BaseModelAdmin):
list_display = ('id', 'created_at', 'bin_name', 'bin_providers', 'is_installed', 'installed_count')
sort_fields = ('id', 'created_at', 'bin_name', 'bin_providers')
search_fields = ('id', 'bin_name', 'bin_providers')
readonly_fields = ('id', 'created_at', 'modified_at', 'is_installed', 'installed_count')
fields = ('bin_name', 'bin_providers', 'custom_cmds', 'config', *readonly_fields)
list_filter = ('bin_providers', 'created_at')
ordering = ['-created_at']
list_per_page = 100
actions = ["delete_selected"]
@admin.display(description='Installed', boolean=True)
def is_installed(self, dependency):
return dependency.is_installed
@admin.display(description='# Binaries')
def installed_count(self, dependency):
count = dependency.installed_binaries.count()
if count:
return format_html(
'<a href="/admin/machine/installedbinary/?dependency__id__exact={}">{}</a>',
dependency.id, count,
)
return '0'
class InstalledBinaryAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'health')
list_display = ('id', 'created_at', 'machine_info', 'name', 'dependency_link', 'binprovider', 'version', 'abspath', 'sha256', 'health')
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256')
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256')
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'dependency__bin_name')
readonly_fields = ('created_at', 'modified_at')
fields = ('machine', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed')
fields = ('machine', 'dependency', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed')
list_filter = ('name', 'binprovider', 'machine_id')
list_filter = ('name', 'binprovider', 'machine_id', 'dependency')
ordering = ['-created_at']
list_per_page = 100
actions = ["delete_selected"]
@@ -68,8 +96,18 @@ class InstalledBinaryAdmin(BaseModelAdmin):
installed_binary.machine.id, str(installed_binary.machine.id)[:8], installed_binary.machine.hostname,
)
@admin.display(description='Dependency', ordering='dependency__bin_name')
def dependency_link(self, installed_binary):
if installed_binary.dependency:
return format_html(
'<a href="/admin/machine/dependency/{}/change">{}</a>',
installed_binary.dependency.id, installed_binary.dependency.bin_name,
)
return '-'
def register_admin(admin_site):
admin_site.register(Machine, MachineAdmin)
admin_site.register(NetworkInterface, NetworkInterfaceAdmin)
admin_site.register(Dependency, DependencyAdmin)
admin_site.register(InstalledBinary, InstalledBinaryAdmin)

View File

@@ -37,15 +37,13 @@ def apply_migrations(out_dir: Path = DATA_DIR) -> List[str]:
"""Apply pending Django migrations"""
from django.core.management import call_command
out1, out2 = StringIO(), StringIO()
out1 = StringIO()
call_command("migrate", interactive=False, database='default', stdout=out1)
out1.seek(0)
call_command("migrate", "huey_monitor", interactive=False, database='queue', stdout=out2)
out2.seek(0)
return [
line.strip() for line in out1.readlines() + out2.readlines() if line.strip()
line.strip() for line in out1.readlines() if line.strip()
]

View File

@@ -480,6 +480,138 @@ def printable_filesize(num_bytes: Union[int, float]) -> str:
return '%3.1f %s' % (num_bytes, 'TB')
@enforce_types
def format_duration(seconds: float) -> str:
    """Render a duration in seconds as a short human-readable string."""
    if seconds < 1:
        # Sub-second durations are shown as whole milliseconds.
        return format(seconds * 1000, '.0f') + 'ms'
    if seconds < 60:
        return format(seconds, '.1f') + 's'
    if seconds < 3600:
        minutes, leftover_secs = divmod(int(seconds), 60)
        return f'{minutes}min {leftover_secs}s' if leftover_secs else f'{minutes}min'
    hours, leftover = divmod(int(seconds), 3600)
    minutes = leftover // 60
    return f'{hours}hr {minutes}min' if minutes else f'{hours}hr'
@enforce_types
def truncate_url(url: str, max_length: int = 60) -> str:
    """Truncate a URL to at most max_length chars, keeping the domain.

    Tries to preserve 'protocol://domain/' and as much of the path as fits,
    appending '...' to signal truncation; otherwise falls back to a plain
    right-truncation. Returns the URL unchanged when it already fits.
    """
    if len(url) <= max_length:
        return url
    # Try to keep the domain and beginning of path
    if '://' in url:
        protocol, rest = url.split('://', 1)
        if '/' in rest:
            domain, path = rest.split('/', 1)
            # Fixed overhead: '://' (3) + '/' (1) + '...' (3) = 7 chars.
            # (Subtracting only 6 here made results one char over max_length.)
            available = max_length - len(protocol) - len(domain) - 7
            if available > 10:
                return f'{protocol}://{domain}/{path[:available]}...'
    # Fallback: just truncate
    return url[:max_length - 3] + '...'
@enforce_types
def log_worker_event(
    worker_type: str,
    event: str,
    indent_level: int = 0,
    pid: Optional[int] = None,
    worker_id: Optional[str] = None,
    url: Optional[str] = None,
    extractor: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    error: Optional[Exception] = None,
) -> None:
    """
    Log a worker event with structured metadata and indentation.

    Args:
        worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker, etc.)
        event: Event name (Starting, Completed, Failed, etc.)
        indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker, 3=ArchiveResultWorker)
        pid: Process ID
        worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, extractor for ArchiveResultWorker)
        url: URL being processed (for SnapshotWorker/ArchiveResultWorker)
        extractor: Extractor name (for ArchiveResultWorker)
        metadata: Dict of metadata to show in curly braces
        error: Exception if event is an error
    """
    # NOTE(review): indent unit width was garbled in transit — confirm against git history.
    indent = '  ' * indent_level
    # Build worker identifier, e.g. 'CrawlWorker[pid=123, id=abcd]'.
    # Which optional fields are shown depends on worker_type, so kwargs
    # irrelevant to this worker type are silently ignored.
    worker_parts = [worker_type]
    if pid:
        worker_parts.append(f'pid={pid}')
    if worker_id and worker_type in ('CrawlWorker', 'Orchestrator'):
        worker_parts.append(f'id={worker_id}')
    if url and worker_type == 'SnapshotWorker':
        worker_parts.append(f'url={truncate_url(url)}')
    if extractor and worker_type == 'ArchiveResultWorker':
        worker_parts.append(f'extractor={extractor}')
    worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
    # Build metadata string
    metadata_str = ''
    if metadata:
        # Format metadata nicely: keys containing 'duration' get duration
        # formatting, keys containing 'size'/'bytes' get filesize formatting,
        # lists/tuples are rendered as their lengths.
        meta_parts = []
        for k, v in metadata.items():
            if isinstance(v, float):
                # Format floats nicely (durations, sizes)
                if 'duration' in k.lower():
                    meta_parts.append(f'{k}: {format_duration(v)}')
                elif 'size' in k.lower():
                    meta_parts.append(f'{k}: {printable_filesize(int(v))}')
                else:
                    meta_parts.append(f'{k}: {v:.2f}')
            elif isinstance(v, int):
                # Format integers - check if it's a size
                if 'size' in k.lower() or 'bytes' in k.lower():
                    meta_parts.append(f'{k}: {printable_filesize(v)}')
                else:
                    meta_parts.append(f'{k}: {v}')
            elif isinstance(v, (list, tuple)):
                meta_parts.append(f'{k}: {len(v)}')
            else:
                meta_parts.append(f'{k}: {v}')
        metadata_str = ' {' + ', '.join(meta_parts) + '}'
    # Determine color based on event
    # NOTE(review): matching is on exact event strings; any new event name
    # falls back to 'white' unless added to one of these tuples.
    color = 'white'
    if event in ('Starting...', 'Started', 'STARTED', 'Started in background'):
        color = 'green'
    elif event in ('Processing...', 'PROCESSING'):
        color = 'blue'
    elif event in ('Completed', 'COMPLETED', 'All work complete'):
        color = 'blue'
    elif event in ('Failed', 'ERROR', 'Failed to spawn worker'):
        color = 'red'
    elif event in ('Shutting down', 'SHUTDOWN'):
        color = 'grey53'
    # Build final message
    error_str = f' {type(error).__name__}: {error}' if error else ''
    # Build colored message - worker_label needs to be inside color tags
    # But first we need to format the color tags separately from the worker label
    from archivebox.misc.logging import CONSOLE
    from rich.text import Text
    # Create a Rich Text object for proper formatting
    text = Text()
    text.append(indent)  # Indentation
    # Append worker label and event with color
    text.append(f'{worker_label} {event}{error_str}', style=color)
    # Append metadata without color
    text.append(metadata_str)
    CONSOLE.print(text)
@enforce_types
def printable_folders(folders: Dict[str, Optional["Snapshot"]], with_headers: bool=False) -> str:
return '\n'.join(

View File

@@ -0,0 +1 @@
🏛️

View File

@@ -7,7 +7,7 @@ new plugin-based output structure to the legacy canonical output paths that
ArchiveBox has historically used. This maintains backward compatibility with
existing tools and scripts that expect outputs at specific locations.
Canonical output paths (from Snapshot.canonical_outputs()):
Canonical output paths:
- favicon.ico → favicon/favicon.ico
- singlefile.html → singlefile/singlefile.html
- readability/content.html → readability/content.html
@@ -27,27 +27,20 @@ New plugin outputs:
- redirects.json → redirects/redirects.json
- console.jsonl → consolelog/console.jsonl
Usage: on_Snapshot__91_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
Usage: on_Snapshot__92_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SAVE_CANONICAL_SYMLINKS: Enable canonical symlinks (default: true)
DATA_DIR: ArchiveBox data directory
ARCHIVE_DIR: Archive output directory
"""
__package__ = 'archivebox.plugins.canonical_outputs'
import os
import sys
import json
from pathlib import Path
from typing import Dict, Optional
# Configure Django if running standalone
if __name__ == '__main__':
parent_dir = str(Path(__file__).resolve().parent.parent.parent)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
import django
django.setup()
from datetime import datetime, timezone
from typing import Dict
import rich_click as click
@@ -150,10 +143,7 @@ def create_canonical_symlinks(snapshot_dir: Path) -> Dict[str, bool]:
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
"""Create symlinks from plugin outputs to canonical legacy locations."""
from datetime import datetime
from archivebox.core.models import Snapshot
start_ts = datetime.now()
start_ts = datetime.now(timezone.utc)
status = 'failed'
output = None
error = ''
@@ -161,31 +151,20 @@ def main(url: str, snapshot_id: str):
try:
# Check if enabled
from archivebox.config import CONSTANTS
save_canonical = os.getenv('SAVE_CANONICAL_SYMLINKS', 'true').lower() in ('true', '1', 'yes', 'on')
if not save_canonical:
click.echo('Skipping canonical symlinks (SAVE_CANONICAL_SYMLINKS=False)')
status = 'skipped'
end_ts = datetime.now()
click.echo(f'START_TS={start_ts.isoformat()}')
click.echo(f'END_TS={end_ts.isoformat()}')
click.echo(f'STATUS={status}')
click.echo(f'RESULT_JSON={{"extractor": "canonical_outputs", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
click.echo(json.dumps({'status': status, 'output': 'SAVE_CANONICAL_SYMLINKS=false'}))
sys.exit(0)
# Get snapshot
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
except Snapshot.DoesNotExist:
error = f'Snapshot {snapshot_id} not found'
raise ValueError(error)
# Working directory is the extractor output dir (e.g., <snapshot>/canonical_outputs/)
# Parent is the snapshot directory
output_dir = Path.cwd()
snapshot_dir = output_dir.parent
# Get snapshot directory
snapshot_dir = Path(snapshot.output_dir)
if not snapshot_dir.exists():
error = f'Snapshot directory not found: {snapshot_dir}'
raise FileNotFoundError(error)
raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')
# Create canonical symlinks
results = create_canonical_symlinks(snapshot_dir)
@@ -203,37 +182,18 @@ def main(url: str, snapshot_id: str):
status = 'failed'
click.echo(f'Error: {error}', err=True)
end_ts = datetime.now()
duration = (end_ts - start_ts).total_seconds()
end_ts = datetime.now(timezone.utc)
# Print results
click.echo(f'START_TS={start_ts.isoformat()}')
click.echo(f'END_TS={end_ts.isoformat()}')
click.echo(f'DURATION={duration:.2f}')
if output:
click.echo(f'OUTPUT={output}')
click.echo(f'STATUS={status}')
if error:
click.echo(f'ERROR={error}', err=True)
# Print JSON result
import json
result_json = {
'extractor': 'canonical_outputs',
'url': url,
'snapshot_id': snapshot_id,
# Print JSON result for hook runner
result = {
'status': status,
'start_ts': start_ts.isoformat(),
'end_ts': end_ts.isoformat(),
'duration': round(duration, 2),
'output': output,
'symlinks_created': symlinks_created,
'error': error or None,
'symlinks_created': symlinks_created,
}
click.echo(f'RESULT_JSON={json.dumps(result_json)}')
click.echo(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
if __name__ == '__main__':

View File

@@ -1,149 +0,0 @@
#!/usr/bin/env python3
"""
Install Chrome/Chromium if not already available.
Runs at crawl start to ensure Chrome is installed.
Uses playwright to install chromium if no system Chrome found.
Outputs JSONL for InstalledBinary.
"""
import json
import sys
import os
import shutil
from pathlib import Path
def find_chrome():
    """Locate a Chrome/Chromium executable, or return None if absent.

    Search order: the CHROME_BINARY env var (must point at an existing
    file), `shutil.which()` over known executable names (Chrome variants
    first, then Chromium variants), then common absolute install paths
    (macOS app bundles, then Linux locations).
    """
    # Explicit override via environment variable wins.
    override = os.environ.get('CHROME_BINARY', '')
    if override and Path(override).is_file():
        return override

    # Known executable names, Chrome variants before Chromium variants.
    candidate_names = [
        'google-chrome',
        'google-chrome-stable',
        'google-chrome-beta',
        'google-chrome-canary',
        'google-chrome-unstable',
        'google-chrome-dev',
        'chrome',
        'chromium',
        'chromium-browser',
        'chromium-browser-beta',
        'chromium-browser-unstable',
        'chromium-browser-canary',
        'chromium-browser-dev',
    ]
    for candidate in candidate_names:
        resolved = shutil.which(candidate)
        if resolved:
            return resolved

    # Common absolute install locations: macOS app bundles, then Linux paths.
    candidate_paths = [
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
        '/Applications/Chromium.app/Contents/MacOS/Chromium',
        '/usr/bin/google-chrome',
        '/usr/bin/google-chrome-stable',
        '/usr/bin/chromium',
        '/usr/bin/chromium-browser',
        '/snap/bin/chromium',
        '/opt/google/chrome/chrome',
    ]
    for candidate in candidate_paths:
        if Path(candidate).is_file():
            return candidate

    return None
def main():
    """Find or install Chrome/Chromium and report it as JSONL.

    Exits 0 with an `InstalledBinary` record when a browser is available
    (system-installed, or freshly installed via apt/brew); exits 1 with a
    `Dependency` record when every attempt fails.
    """
    try:
        # First try to find system Chrome
        system_chrome = find_chrome()
        if system_chrome:
            # System browser found -- version/sha256 are deliberately not probed here.
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'chrome',
                'abspath': str(system_chrome),
                'version': None,
                'sha256': None,
                'binprovider': 'env',
            }))
            sys.exit(0)

        # If not found in system, try to install chromium via apt/brew
        from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
        AptProvider.model_rebuild()
        BrewProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # Try chromium-browser or chromium via system package managers
        for binary_name in ['chromium', 'chromium-browser', 'google-chrome']:
            try:
                chrome_binary = Binary(
                    name=binary_name,
                    binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
                )
                # Try to load, install if not found
                try:
                    loaded = chrome_binary.load()
                    if not loaded or not loaded.abspath:
                        raise Exception("Not loaded")
                except Exception:
                    # Install via system package manager
                    loaded = chrome_binary.install()
                if loaded and loaded.abspath:
                    # Output InstalledBinary JSONL
                    # (sys.exit raises SystemExit, which the except clauses below do not catch)
                    print(json.dumps({
                        'type': 'InstalledBinary',
                        'name': 'chrome',
                        'abspath': str(loaded.abspath),
                        'version': str(loaded.version) if loaded.version else None,
                        'sha256': loaded.sha256,
                        'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
                    }))
                    sys.exit(0)
            except Exception:
                # This candidate package name failed -- try the next one
                continue

        # If all attempts failed
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'chrome',
            'bin_providers': 'apt,brew,env',
        }))
        print("Failed to install Chrome/Chromium", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Unexpected failure (e.g. abx_pkg not importable) -- still emit a Dependency record
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'chrome',
            'bin_providers': 'apt,brew,env',
        }))
        print(f"Error installing Chrome: {e}", file=sys.stderr)
        sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -2,7 +2,7 @@
Integration tests for chrome_session plugin
Tests verify:
1. Install hook finds system Chrome or installs chromium
1. Validate hook checks for Chrome/Chromium binary
2. Verify deps with abx-pkg
3. Chrome session script exists
"""
@@ -14,7 +14,7 @@ from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
CHROME_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_chrome.py'
CHROME_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_chrome.py'
CHROME_SESSION_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_session.js'
@@ -23,37 +23,50 @@ def test_hook_script_exists():
assert CHROME_SESSION_HOOK.exists(), f"Hook not found: {CHROME_SESSION_HOOK}"
def test_chrome_install_hook():
"""Test chrome install hook to find or install Chrome/Chromium."""
def test_chrome_validate_hook():
"""Test chrome validate hook checks for Chrome/Chromium binary."""
result = subprocess.run(
[sys.executable, str(CHROME_INSTALL_HOOK)],
[sys.executable, str(CHROME_VALIDATE_HOOK)],
capture_output=True,
text=True,
timeout=600
timeout=30
)
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
# Verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'chrome'
assert record['abspath']
assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}"
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record"
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
if result.returncode == 0:
# Binary found - verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'chrome'
assert record['abspath']
assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}"
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record when binary found"
else:
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
assert record['bin_name'] == 'chrome'
found_dependency = True
break
except json.JSONDecodeError:
pass
assert found_dependency, "Should output Dependency record when binary not found"
def test_verify_deps_with_abx_pkg():
"""Verify chrome is available via abx-pkg after hook installation."""
"""Verify chrome is available via abx-pkg."""
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
AptProvider.model_rebuild()
@@ -75,10 +88,10 @@ def test_verify_deps_with_abx_pkg():
except Exception:
continue
# If we get here, chrome should still be available from system
# If we get here, chrome not available
import shutil
assert shutil.which('chromium') or shutil.which('chrome') or shutil.which('google-chrome'), \
"Chrome should be available after install hook"
if not (shutil.which('chromium') or shutil.which('chrome') or shutil.which('google-chrome')):
pytest.skip("Chrome/Chromium not available - Dependency record should have been emitted")
if __name__ == '__main__':

View File

@@ -0,0 +1,6 @@
<!-- DOM embed - full iframe of captured DOM HTML -->
<iframe src="{{ output_path }}"
class="extractor-embed dom-embed"
style="width: 100%; height: 100%; min-height: 500px; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms">
</iframe>

View File

@@ -0,0 +1,6 @@
<!-- DOM fullscreen - full page iframe -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen dom-fullscreen"
style="width: 100%; height: 100vh; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
</iframe>

View File

@@ -0,0 +1 @@
🌐

View File

@@ -0,0 +1,8 @@
<!-- DOM thumbnail - scaled down iframe preview of captured DOM HTML -->
<div class="extractor-thumbnail dom-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
<iframe src="{{ output_path }}"
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
loading="lazy"
sandbox="allow-same-origin">
</iframe>
</div>

View File

@@ -0,0 +1 @@

View File

@@ -1,68 +0,0 @@
#!/usr/bin/env python3
"""
Install git if not already available.
Runs at crawl start to ensure git is installed.
Outputs JSONL for InstalledBinary.
"""
import json
import sys
from pathlib import Path
def main():
    """Find or install git and report the result as JSONL.

    Exits 0 with an `InstalledBinary` record when git is available
    (pre-existing, or installed via apt/brew); exits 1 with a
    `Dependency` record otherwise.
    """
    try:
        from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
        AptProvider.model_rebuild()
        BrewProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # git binary and package have same name
        git_binary = Binary(
            name='git',
            binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
        )

        # Try to load, install if not found
        try:
            loaded = git_binary.load()
            if not loaded or not loaded.abspath:
                raise Exception("Not loaded")
        except Exception:
            # Install via system package manager
            loaded = git_binary.install()

        if loaded and loaded.abspath:
            # Output InstalledBinary JSONL
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'git',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256,
                'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
            }))
            sys.exit(0)
        else:
            print(json.dumps({
                'type': 'Dependency',
                'bin_name': 'git',
                'bin_providers': 'apt,brew,env',
            }))
            print("Failed to install git", file=sys.stderr)
            sys.exit(1)
    except Exception as e:
        # Unexpected failure (e.g. abx_pkg not importable) -- still emit a Dependency record
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'git',
            'bin_providers': 'apt,brew,env',
        }))
        print(f"Error installing git: {e}", file=sys.stderr)
        sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,6 @@
<!-- Git embed - directory listing of cloned repo -->
<iframe src="{{ output_path }}"
class="extractor-embed git-embed"
style="width: 100%; height: 100%; min-height: 400px; border: none; background: #fff;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -0,0 +1,6 @@
<!-- Git fullscreen - full directory listing -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen git-fullscreen"
style="width: 100%; height: 100vh; border: none; background: #fff;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -0,0 +1 @@
📂

View File

@@ -0,0 +1,5 @@
<!-- Git thumbnail - shows git repository icon and info -->
<div class="extractor-thumbnail git-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #f6f8fa; display: flex; flex-direction: column; align-items: center; justify-content: center; padding: 10px;">
<span style="font-size: 32px;">📂</span>
<span style="font-size: 11px; color: #586069; margin-top: 4px;">Git Repository</span>
</div>

View File

@@ -2,7 +2,7 @@
Integration tests for git plugin
Tests verify:
1. Install hook installs git via abx-pkg
1. Validate hook checks for git binary
2. Verify deps with abx-pkg
3. Standalone git extractor execution
"""
@@ -17,50 +17,64 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
GIT_HOOK = PLUGIN_DIR / 'on_Snapshot__12_git.py'
GIT_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_git.py'
GIT_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_git.py'
TEST_URL = 'https://github.com/example/repo.git'
def test_hook_script_exists():
assert GIT_HOOK.exists()
def test_git_install_hook():
"""Test git install hook to install git if needed."""
def test_git_validate_hook():
"""Test git validate hook checks for git binary."""
result = subprocess.run(
[sys.executable, str(GIT_INSTALL_HOOK)],
[sys.executable, str(GIT_VALIDATE_HOOK)],
capture_output=True,
text=True,
timeout=600
timeout=30
)
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
# Verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'git'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record"
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
if result.returncode == 0:
# Binary found - verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'git'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record when binary found"
else:
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
assert record['bin_name'] == 'git'
assert 'env' in record['bin_providers']
found_dependency = True
break
except json.JSONDecodeError:
pass
assert found_dependency, "Should output Dependency record when binary not found"
def test_verify_deps_with_abx_pkg():
"""Verify git is available via abx-pkg after hook installation."""
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
AptProvider.model_rebuild()
BrewProvider.model_rebuild()
EnvProvider.model_rebuild()
"""Verify git is available via abx-pkg."""
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
git_loaded = git_binary.load()
assert git_loaded and git_loaded.abspath, "git should be available after install hook"
if git_loaded and git_loaded.abspath:
assert True, "git is available"
else:
pytest.skip("git not available - Dependency record should have been emitted")
def test_reports_missing_git():
with tempfile.TemporaryDirectory() as tmpdir:

View File

@@ -0,0 +1 @@
📋

View File

@@ -0,0 +1 @@
📃

View File

@@ -1,67 +0,0 @@
#!/usr/bin/env python3
"""
Install yt-dlp if not already available.
Runs at crawl start to ensure yt-dlp is installed.
Outputs JSONL for InstalledBinary.
"""
import json
import sys
from pathlib import Path
def main():
    """Find or install yt-dlp and report the result as JSONL.

    Exits 0 with an `InstalledBinary` record when yt-dlp is available
    (pre-existing, or installed via pip); exits 1 with a `Dependency`
    record otherwise.
    """
    try:
        from abx_pkg import Binary, PipProvider, EnvProvider
        PipProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # yt-dlp binary and pip package have the same name
        ytdlp_binary = Binary(
            name='yt-dlp',
            binproviders=[PipProvider(), EnvProvider()]
        )

        # Try to load an existing install; fall back to installing via pip
        try:
            loaded = ytdlp_binary.load()
            if not loaded or not loaded.abspath:
                raise Exception("Not loaded")
        except Exception:
            loaded = ytdlp_binary.install()

        if loaded and loaded.abspath:
            # Output InstalledBinary JSONL
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'yt-dlp',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256,
                'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
            }))
            sys.exit(0)
        else:
            # Fixed: record previously claimed 'pip,brew,env' even though only
            # the pip and env providers are attempted above.
            print(json.dumps({
                'type': 'Dependency',
                'bin_name': 'yt-dlp',
                'bin_providers': 'pip,env',
            }))
            print("Failed to install yt-dlp", file=sys.stderr)
            sys.exit(1)
    except Exception as e:
        # Unexpected failure (e.g. abx_pkg not importable) -- still emit a Dependency record
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'yt-dlp',
            'bin_providers': 'pip,env',
        }))
        print(f"Error installing yt-dlp: {e}", file=sys.stderr)
        sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,278 @@
#!/usr/bin/env python3
"""
Validation hook for yt-dlp and its dependencies (node, ffmpeg).
Runs at crawl start to verify yt-dlp and required binaries are available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, version_flag],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
first_line = result.stdout.strip().split('\n')[0]
return first_line[:64]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_ytdlp() -> dict | None:
    """Locate the yt-dlp binary.

    Tries abx-pkg first (pip/env providers); on any failure falls back to
    `shutil.which('yt-dlp')` or the YTDLP_BINARY env var.

    Returns a dict with keys name/abspath/version/sha256/binprovider,
    or None if no binary was found.
    """
    try:
        from abx_pkg import Binary, PipProvider, EnvProvider

        class YtdlpBinary(Binary):
            name: str = 'yt-dlp'
            binproviders_supported = [PipProvider(), EnvProvider()]

        binary = YtdlpBinary()
        loaded = binary.load()
        if loaded and loaded.abspath:
            return {
                'name': 'yt-dlp',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
                # NOTE(review): sibling install hooks read `loaded_binprovider`;
                # confirm `binprovider` is the right attribute here -- an
                # AttributeError would be silently swallowed by the except below.
                'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
            }
    except ImportError:
        # abx_pkg not installed -- use the PATH/env fallback below
        pass
    except Exception:
        pass

    # Fallback to shutil.which
    abspath = shutil.which('yt-dlp') or os.environ.get('YTDLP_BINARY', '')
    if abspath and Path(abspath).is_file():
        return {
            'name': 'yt-dlp',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': 'env',
        }
    return None
def find_node() -> dict | None:
    """Locate the node binary (needed by yt-dlp for JS-based extraction).

    Tries abx-pkg first (apt/brew/env providers, apt package `nodejs`); on
    any failure falls back to `shutil.which('node')` or the NODE_BINARY
    env var.

    Returns a dict with keys name/abspath/version/sha256/binprovider,
    or None if node cannot be found.
    """
    try:
        from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider

        class NodeBinary(Binary):
            name: str = 'node'
            binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
            # apt ships node as the `nodejs` package
            overrides: dict = {'apt': {'packages': ['nodejs']}}

        binary = NodeBinary()
        loaded = binary.load()
        if loaded and loaded.abspath:
            return {
                'name': 'node',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
                # NOTE(review): sibling install hooks read `loaded_binprovider`;
                # confirm `binprovider` is the right attribute here -- an
                # AttributeError would be silently swallowed by the except below.
                'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
            }
    except ImportError:
        # abx_pkg not installed -- use the PATH/env fallback below
        pass
    except Exception:
        pass

    # Fallback to shutil.which
    abspath = shutil.which('node') or os.environ.get('NODE_BINARY', '')
    if abspath and Path(abspath).is_file():
        return {
            'name': 'node',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': 'env',
        }
    return None
def find_ffmpeg() -> dict | None:
    """Locate the ffmpeg binary (needed by yt-dlp for media conversion).

    Tries abx-pkg first (apt/brew/env providers); on any failure falls back
    to `shutil.which('ffmpeg')` or the FFMPEG_BINARY env var.

    Returns a dict with keys name/abspath/version/sha256/binprovider,
    or None if ffmpeg cannot be found.
    """
    try:
        from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider

        class FfmpegBinary(Binary):
            name: str = 'ffmpeg'
            binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]

        binary = FfmpegBinary()
        loaded = binary.load()
        if loaded and loaded.abspath:
            return {
                'name': 'ffmpeg',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
                # NOTE(review): sibling install hooks read `loaded_binprovider`;
                # confirm `binprovider` is the right attribute here -- an
                # AttributeError would be silently swallowed by the except below.
                'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
            }
    except ImportError:
        # abx_pkg not installed -- use the PATH/env fallback below
        pass
    except Exception:
        pass

    # Fallback to shutil.which
    abspath = shutil.which('ffmpeg') or os.environ.get('FFMPEG_BINARY', '')
    if abspath and Path(abspath).is_file():
        return {
            'name': 'ffmpeg',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': 'env',
        }
    return None
def _report_binary(result, bin_name, config_prefix, bin_providers, missing_deps):
    """Emit the JSONL records for one dependency check.

    If `result` describes a found binary: print an `InstalledBinary` record
    plus `Machine` config updates (<config_prefix>_BINARY and, when known,
    <config_prefix>_VERSION). Otherwise print a `Dependency` record listing
    `bin_providers` and append `bin_name` to `missing_deps`.
    """
    if result and result.get('abspath'):
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': f'config/{config_prefix}_BINARY',
            'value': result['abspath'],
        }))
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': f'config/{config_prefix}_VERSION',
                'value': result['version'],
            }))
    else:
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': bin_name,
            'bin_providers': bin_providers,
        }))
        missing_deps.append(bin_name)


def main():
    """Validate that yt-dlp and its runtime dependencies (node, ffmpeg) exist.

    Emits one JSONL stanza per binary (InstalledBinary + Machine config
    updates, or a Dependency record when missing). Exits 1 if any binary
    is missing, 0 otherwise.
    """
    missing_deps = []

    # yt-dlp itself (required)
    _report_binary(find_ytdlp(), 'yt-dlp', 'YTDLP', 'pip,env', missing_deps)
    # node (required for JS extraction)
    _report_binary(find_node(), 'node', 'NODE', 'apt,brew,env', missing_deps)
    # ffmpeg (required for video conversion)
    _report_binary(find_ffmpeg(), 'ffmpeg', 'FFMPEG', 'apt,brew,env', missing_deps)

    if missing_deps:
        print(f"Missing dependencies: {', '.join(missing_deps)}", file=sys.stderr)
        sys.exit(1)
    sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,9 @@
<!-- Media embed - video/audio player -->
<div class="extractor-embed media-embed" style="width: 100%; height: 100%; min-height: 400px; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
<video src="{{ output_path }}"
style="max-width: 100%; max-height: 100%;"
controls
preload="metadata">
Your browser does not support the video tag.
</video>
</div>

View File

@@ -0,0 +1,10 @@
<!-- Media fullscreen - full video/audio player -->
<div class="extractor-fullscreen media-fullscreen" style="width: 100%; height: 100vh; background: #000; display: flex; align-items: center; justify-content: center;">
<video src="{{ output_path }}"
style="max-width: 100%; max-height: 100%;"
controls
autoplay
preload="auto">
Your browser does not support the video tag.
</video>
</div>

View File

@@ -0,0 +1 @@
🎬

View File

@@ -0,0 +1,14 @@
<!-- Media thumbnail - shows video/audio player or placeholder -->
<div class="extractor-thumbnail media-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
<video src="{{ output_path }}"
style="width: 100%; height: 100px; object-fit: contain;"
poster=""
preload="metadata"
muted
onerror="this.style.display='none'; this.nextElementSibling.style.display='flex';">
</video>
<div style="display: none; flex-direction: column; align-items: center; color: #888; font-size: 12px;">
<span style="font-size: 32px;">🎬</span>
<span>Media</span>
</div>
</div>

View File

@@ -21,7 +21,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
MEDIA_HOOK = PLUGIN_DIR / 'on_Snapshot__51_media.py'
MEDIA_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_ytdlp.py'
MEDIA_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_ytdlp.py'
TEST_URL = 'https://example.com/video.mp4'
def test_hook_script_exists():
@@ -29,46 +29,72 @@ def test_hook_script_exists():
assert MEDIA_HOOK.exists(), f"Hook not found: {MEDIA_HOOK}"
def test_ytdlp_install_hook():
"""Test yt-dlp install hook to install yt-dlp if needed."""
# Run yt-dlp install hook
def test_ytdlp_validate_hook():
"""Test yt-dlp validate hook checks for yt-dlp and dependencies (node, ffmpeg)."""
# Run yt-dlp validate hook
result = subprocess.run(
[sys.executable, str(MEDIA_INSTALL_HOOK)],
[sys.executable, str(MEDIA_VALIDATE_HOOK)],
capture_output=True,
text=True,
timeout=600
timeout=30
)
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
# Hook exits 0 if all binaries found, 1 if any not found
# Parse output for InstalledBinary and Dependency records
found_binaries = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
found_dependencies = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
# Verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'yt-dlp'
assert record['abspath']
found_binary = True
break
name = record['name']
if name in found_binaries:
assert record['abspath'], f"{name} should have abspath"
found_binaries[name] = True
elif record.get('type') == 'Dependency':
name = record['bin_name']
if name in found_dependencies:
found_dependencies[name] = True
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record"
# Each binary should either be found (InstalledBinary) or missing (Dependency)
for binary_name in ['yt-dlp', 'node', 'ffmpeg']:
assert found_binaries[binary_name] or found_dependencies[binary_name], \
f"{binary_name} should have either InstalledBinary or Dependency record"
def test_verify_deps_with_abx_pkg():
"""Verify yt-dlp is available via abx-pkg after hook installation."""
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
"""Verify yt-dlp, node, and ffmpeg are available via abx-pkg."""
from abx_pkg import Binary, PipProvider, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
PipProvider.model_rebuild()
EnvProvider.model_rebuild()
missing_binaries = []
# Verify yt-dlp is available
ytdlp_binary = Binary(name='yt-dlp', binproviders=[PipProvider(), EnvProvider()])
ytdlp_loaded = ytdlp_binary.load()
assert ytdlp_loaded and ytdlp_loaded.abspath, "yt-dlp should be available after install hook"
if not (ytdlp_loaded and ytdlp_loaded.abspath):
missing_binaries.append('yt-dlp')
# Verify node is available (yt-dlp needs it for JS extraction)
node_binary = Binary(
name='node',
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
)
node_loaded = node_binary.load()
if not (node_loaded and node_loaded.abspath):
missing_binaries.append('node')
# Verify ffmpeg is available (yt-dlp needs it for video conversion)
ffmpeg_binary = Binary(name='ffmpeg', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
ffmpeg_loaded = ffmpeg_binary.load()
if not (ffmpeg_loaded and ffmpeg_loaded.abspath):
missing_binaries.append('ffmpeg')
if missing_binaries:
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
def test_handles_non_media_url():
"""Test that media extractor handles non-media URLs gracefully via hook."""

View File

@@ -1,68 +0,0 @@
#!/usr/bin/env python3
"""
Install mercury-parser if not already available.
Runs at crawl start to ensure mercury-parser is installed.
Outputs JSONL for InstalledBinary.
"""
import json
import sys
from pathlib import Path
def main():
    """Find or install mercury-parser and report the result as JSONL.

    Exits 0 with an `InstalledBinary` record when mercury-parser is
    available (pre-existing, or installed via npm); exits 1 with a
    `Dependency` record otherwise.
    """
    try:
        from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
        NpmProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # Note: npm package is @postlight/mercury-parser, binary is mercury-parser
        mercury_binary = Binary(
            name='mercury-parser',
            binproviders=[NpmProvider(), EnvProvider()],
            overrides={'npm': {'packages': ['@postlight/mercury-parser']}}
        )

        # Try to load, install if not found
        try:
            loaded = mercury_binary.load()
            if not loaded or not loaded.abspath:
                raise Exception("Not loaded")
        except Exception:
            # Install via npm
            loaded = mercury_binary.install()

        if loaded and loaded.abspath:
            # Output InstalledBinary JSONL
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'mercury-parser',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256,
                'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
            }))
            sys.exit(0)
        else:
            print(json.dumps({
                'type': 'Dependency',
                'bin_name': 'mercury-parser',
                'bin_providers': 'npm,env',
            }))
            print("Failed to install mercury-parser", file=sys.stderr)
            sys.exit(1)
    except Exception as e:
        # Unexpected failure (e.g. abx_pkg not importable) -- still emit a Dependency record
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'mercury-parser',
            'bin_providers': 'npm,env',
        }))
        print(f"Error installing mercury-parser: {e}", file=sys.stderr)
        sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
Validation hook for postlight-parser binary.
Runs at crawl start to verify postlight-parser is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
first_line = result.stdout.strip().split('\n')[0]
return first_line[:64]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_mercury() -> dict | None:
    """Locate the postlight-parser binary.

    Tries abx-pkg first (npm/env providers, npm package `@postlight/parser`);
    on any failure falls back to `shutil.which('postlight-parser')` or the
    MERCURY_BINARY env var.

    Returns a dict with keys name/abspath/version/sha256/binprovider,
    or None if no binary was found.
    """
    try:
        from abx_pkg import Binary, NpmProvider, EnvProvider

        class MercuryBinary(Binary):
            name: str = 'postlight-parser'
            binproviders_supported = [NpmProvider(), EnvProvider()]
            # npm package name differs from the installed binary name
            overrides: dict = {'npm': {'packages': ['@postlight/parser']}}

        binary = MercuryBinary()
        loaded = binary.load()
        if loaded and loaded.abspath:
            return {
                'name': 'postlight-parser',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
                # NOTE(review): sibling install hooks read `loaded_binprovider`;
                # confirm `binprovider` is the right attribute here -- an
                # AttributeError would be silently swallowed by the except below.
                'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
            }
    except ImportError:
        # abx_pkg not installed -- use the PATH/env fallback below
        pass
    except Exception:
        pass

    # Fallback to shutil.which
    abspath = shutil.which('postlight-parser') or os.environ.get('MERCURY_BINARY', '')
    if abspath and Path(abspath).is_file():
        return {
            'name': 'postlight-parser',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': 'env',
        }
    return None
def main():
    """Check for the postlight-parser binary and emit JSONL status records.

    On success: prints an `InstalledBinary` record plus `Machine` config
    updates (MERCURY_BINARY, and MERCURY_VERSION when the version is known),
    then exits 0. On failure: prints a `Dependency` record and exits 1.
    """
    result = find_mercury()

    # Guard clause: binary missing -> emit Dependency record and bail out.
    if not (result and result.get('abspath')):
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'postlight-parser',
            'bin_providers': 'npm,env',
        }))
        # Fixed: was an f-string with no placeholders (F541)
        print("postlight-parser binary not found", file=sys.stderr)
        sys.exit(1)

    print(json.dumps({
        'type': 'InstalledBinary',
        'name': result['name'],
        'abspath': result['abspath'],
        'version': result['version'],
        'sha256': result['sha256'],
        'binprovider': result['binprovider'],
    }))
    print(json.dumps({
        'type': 'Machine',
        '_method': 'update',
        'key': 'config/MERCURY_BINARY',
        'value': result['abspath'],
    }))
    if result['version']:
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/MERCURY_VERSION',
            'value': result['version'],
        }))
    sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -6,10 +6,10 @@ Usage: on_Snapshot__mercury.py --url=<url> --snapshot-id=<uuid>
Output: Creates mercury/ directory with content.html, content.txt, article.json
Environment variables:
MERCURY_BINARY: Path to mercury-parser binary
MERCURY_BINARY: Path to postlight-parser binary
TIMEOUT: Timeout in seconds (default: 60)
Note: Requires mercury-parser: npm install -g @postlight/mercury-parser
Note: Requires postlight-parser: npm install -g @postlight/parser
"""
import json
@@ -25,7 +25,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'mercury'
BIN_NAME = 'mercury-parser'
BIN_NAME = 'postlight-parser'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'mercury'
@@ -42,12 +42,12 @@ def get_env_int(name: str, default: int = 0) -> int:
def find_mercury() -> str | None:
"""Find mercury-parser binary."""
"""Find postlight-parser binary."""
mercury = get_env('MERCURY_BINARY')
if mercury and os.path.isfile(mercury):
return mercury
for name in ['mercury-parser', 'mercury']:
for name in ['postlight-parser']:
binary = shutil.which(name)
if binary:
return binary
@@ -56,7 +56,7 @@ def find_mercury() -> str | None:
def get_version(binary: str) -> str:
"""Get mercury-parser version."""
"""Get postlight-parser version."""
try:
result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
return result.stdout.strip()[:64]
@@ -83,12 +83,12 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
if result_text.returncode != 0:
stderr = result_text.stderr.decode('utf-8', errors='replace')
return False, None, f'mercury-parser failed: {stderr[:200]}'
return False, None, f'postlight-parser failed: {stderr[:200]}'
try:
text_json = json.loads(result_text.stdout)
except json.JSONDecodeError:
return False, None, 'mercury-parser returned invalid JSON'
return False, None, 'postlight-parser returned invalid JSON'
if text_json.get('failed'):
return False, None, 'Mercury was not able to extract article'
@@ -139,7 +139,7 @@ def main(url: str, snapshot_id: str):
# Find binary
binary = find_mercury()
if not binary:
print(f'ERROR: mercury-parser binary not found', file=sys.stderr)
print(f'ERROR: postlight-parser binary not found', file=sys.stderr)
print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
sys.exit(1)

View File

@@ -0,0 +1,6 @@
<!-- Mercury embed - Mercury parser article view -->
<iframe src="{{ output_path }}"
class="extractor-embed mercury-embed"
style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -0,0 +1,6 @@
<!-- Mercury fullscreen - full Mercury parser article -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen mercury-fullscreen"
style="width: 100%; height: 100vh; border: none; background: #fefefe;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -0,0 +1 @@
☿️

View File

@@ -0,0 +1,8 @@
<!-- Mercury thumbnail - shows Mercury parser extracted article content -->
<div class="extractor-thumbnail mercury-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fefefe; padding: 8px; font-family: Georgia, serif; font-size: 11px; line-height: 1.4; color: #333;">
<iframe src="{{ output_path }}"
style="width: 100%; height: 300px; border: none; pointer-events: none;"
loading="lazy"
sandbox="allow-same-origin">
</iframe>
</div>

View File

@@ -21,7 +21,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
MERCURY_HOOK = PLUGIN_DIR / 'on_Snapshot__53_mercury.py'
MERCURY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_mercury.py'
MERCURY_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_mercury.py'
TEST_URL = 'https://example.com'
def test_hook_script_exists():
@@ -29,53 +29,70 @@ def test_hook_script_exists():
assert MERCURY_HOOK.exists(), f"Hook not found: {MERCURY_HOOK}"
def test_mercury_install_hook():
"""Test mercury install hook to install mercury-parser if needed."""
# Run mercury install hook
def test_mercury_validate_hook():
"""Test mercury validate hook checks for postlight-parser."""
# Run mercury validate hook
result = subprocess.run(
[sys.executable, str(MERCURY_INSTALL_HOOK)],
[sys.executable, str(MERCURY_VALIDATE_HOOK)],
capture_output=True,
text=True,
timeout=600
timeout=30
)
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
# Verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'mercury-parser'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record"
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
if result.returncode == 0:
# Binary found - verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'postlight-parser'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record when binary found"
else:
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
assert record['bin_name'] == 'postlight-parser'
assert 'npm' in record['bin_providers']
found_dependency = True
break
except json.JSONDecodeError:
pass
assert found_dependency, "Should output Dependency record when binary not found"
def test_verify_deps_with_abx_pkg():
"""Verify mercury-parser is available via abx-pkg after hook installation."""
"""Verify postlight-parser is available via abx-pkg."""
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
NpmProvider.model_rebuild()
EnvProvider.model_rebuild()
# Verify mercury-parser is available
# Verify postlight-parser is available
mercury_binary = Binary(
name='mercury-parser',
name='postlight-parser',
binproviders=[NpmProvider(), EnvProvider()],
overrides={'npm': {'packages': ['@postlight/mercury-parser']}}
overrides={'npm': {'packages': ['@postlight/parser']}}
)
mercury_loaded = mercury_binary.load()
assert mercury_loaded and mercury_loaded.abspath, "mercury-parser should be available after install hook"
# If validate hook found it (exit 0), this should succeed
# If validate hook didn't find it (exit 1), this may fail unless binprovider installed it
if mercury_loaded and mercury_loaded.abspath:
assert True, "postlight-parser is available"
else:
pytest.skip("postlight-parser not available - Dependency record should have been emitted")
def test_extracts_with_mercury_parser():
"""Test full workflow: extract with mercury-parser from real HTML via hook."""
"""Test full workflow: extract with postlight-parser from real HTML via hook."""
# Prerequisites checked by earlier test
with tempfile.TemporaryDirectory() as tmpdir:

View File

@@ -2,46 +2,28 @@
"""
Create a Merkle tree of all archived outputs.
This plugin runs after all extractors and post-processing complete (priority 92)
and generates a cryptographic Merkle tree of all files in the snapshot directory.
This provides:
- Tamper detection: verify archive integrity
- Efficient updates: only re-hash changed files
- Compact proofs: prove file inclusion without sending all files
- Deduplication: identify identical content across snapshots
This plugin runs after all extractors complete (priority 93) and generates
a cryptographic Merkle tree of all files in the snapshot directory.
Output: merkletree/merkletree.json containing:
- root_hash: SHA256 hash of the Merkle root
- tree: Full tree structure with internal nodes
- files: List of all files with their hashes
- metadata: Timestamp, file count, total size
Output: merkletree.json containing root_hash, tree structure, file list, metadata
Usage: on_Snapshot__92_merkletree.py --url=<url> --snapshot-id=<uuid>
Usage: on_Snapshot__93_merkletree.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SAVE_MERKLETREE: Enable merkle tree generation (default: true)
DATA_DIR: ArchiveBox data directory
ARCHIVE_DIR: Archive output directory
"""
__package__ = 'archivebox.plugins.merkletree'
import os
import sys
import json
import hashlib
from pathlib import Path
from datetime import datetime
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple, Any
# Configure Django if running standalone
if __name__ == '__main__':
parent_dir = str(Path(__file__).resolve().parent.parent.parent)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
import django
django.setup()
import rich_click as click
import click
def sha256_file(filepath: Path) -> str:
@@ -49,12 +31,10 @@ def sha256_file(filepath: Path) -> str:
h = hashlib.sha256()
try:
with open(filepath, 'rb') as f:
# Read in 64kb chunks
while chunk := f.read(65536):
h.update(chunk)
return h.hexdigest()
except (OSError, PermissionError):
# If we can't read the file, return a null hash
return '0' * 64
@@ -64,74 +44,45 @@ def sha256_data(data: bytes) -> str:
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
"""
Recursively collect all files in snapshot directory.
Args:
snapshot_dir: Root directory to scan
exclude_dirs: Directory names to exclude (e.g., ['merkletree', '.git'])
Returns:
List of (relative_path, sha256_hash, file_size) tuples
"""
"""Recursively collect all files in snapshot directory."""
exclude_dirs = exclude_dirs or ['merkletree', '.git', '__pycache__']
files = []
for root, dirs, filenames in os.walk(snapshot_dir):
# Filter out excluded directories
dirs[:] = [d for d in dirs if d not in exclude_dirs]
for filename in filenames:
filepath = Path(root) / filename
rel_path = filepath.relative_to(snapshot_dir)
# Skip symlinks (we hash the target, not the link)
if filepath.is_symlink():
continue
# Compute hash and size
file_hash = sha256_file(filepath)
file_size = filepath.stat().st_size if filepath.exists() else 0
files.append((rel_path, file_hash, file_size))
# Sort by path for deterministic tree
files.sort(key=lambda x: str(x[0]))
return files
def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
"""
Build a Merkle tree from a list of leaf hashes.
Args:
file_hashes: List of SHA256 hashes (leaves)
Returns:
(root_hash, tree_levels) where tree_levels is a list of hash lists per level
"""
"""Build a Merkle tree from a list of leaf hashes."""
if not file_hashes:
# Empty tree
return sha256_data(b''), [[]]
# Initialize with leaf level
tree_levels = [file_hashes.copy()]
# Build tree bottom-up
while len(tree_levels[-1]) > 1:
current_level = tree_levels[-1]
next_level = []
# Process pairs
for i in range(0, len(current_level), 2):
left = current_level[i]
if i + 1 < len(current_level):
# Combine left + right
right = current_level[i + 1]
combined = left + right
else:
# Odd number of nodes: duplicate the last one
combined = left + left
parent_hash = sha256_data(combined.encode('utf-8'))
@@ -139,67 +90,41 @@ def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
tree_levels.append(next_level)
# Root is the single hash at the top level
root_hash = tree_levels[-1][0]
return root_hash, tree_levels
def create_merkle_tree(snapshot_dir: Path) -> Dict[str, Any]:
"""
Create a complete Merkle tree of all files in snapshot directory.
Args:
snapshot_dir: The snapshot directory to scan
Returns:
Dict containing root_hash, tree structure, file list, and metadata
"""
# Collect all files
"""Create a complete Merkle tree of all files in snapshot directory."""
files = collect_files(snapshot_dir)
# Extract just the hashes for tree building
file_hashes = [file_hash for _, file_hash, _ in files]
# Build Merkle tree
root_hash, tree_levels = build_merkle_tree(file_hashes)
# Calculate total size
total_size = sum(size for _, _, size in files)
# Prepare file list with metadata
file_list = [
{
'path': str(path),
'hash': file_hash,
'size': size,
}
{'path': str(path), 'hash': file_hash, 'size': size}
for path, file_hash, size in files
]
# Prepare result
result = {
return {
'root_hash': root_hash,
'tree_levels': tree_levels,
'files': file_list,
'metadata': {
'timestamp': datetime.now().isoformat(),
'timestamp': datetime.now(timezone.utc).isoformat(),
'file_count': len(files),
'total_size': total_size,
'tree_depth': len(tree_levels),
},
}
return result
@click.command()
@click.option('--url', required=True, help='URL being archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
"""Generate Merkle tree of all archived outputs."""
from archivebox.core.models import Snapshot
start_ts = datetime.now()
start_ts = datetime.now(timezone.utc)
status = 'failed'
output = None
error = ''
@@ -211,30 +136,19 @@ def main(url: str, snapshot_id: str):
save_merkletree = os.getenv('SAVE_MERKLETREE', 'true').lower() in ('true', '1', 'yes', 'on')
if not save_merkletree:
click.echo('Skipping merkle tree (SAVE_MERKLETREE=False)')
status = 'skipped'
end_ts = datetime.now()
click.echo(f'START_TS={start_ts.isoformat()}')
click.echo(f'END_TS={end_ts.isoformat()}')
click.echo(f'STATUS={status}')
click.echo(f'RESULT_JSON={{"extractor": "merkletree", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
click.echo(json.dumps({'status': status, 'output': 'SAVE_MERKLETREE=false'}))
sys.exit(0)
# Get snapshot
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
except Snapshot.DoesNotExist:
error = f'Snapshot {snapshot_id} not found'
raise ValueError(error)
# Working directory is the extractor output dir (e.g., <snapshot>/merkletree/)
# Parent is the snapshot directory
output_dir = Path.cwd()
snapshot_dir = output_dir.parent
# Get snapshot directory
snapshot_dir = Path(snapshot.output_dir)
if not snapshot_dir.exists():
error = f'Snapshot directory not found: {snapshot_dir}'
raise FileNotFoundError(error)
raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')
# Create output directory
output_dir = snapshot_dir / 'merkletree'
# Ensure output directory exists
output_dir.mkdir(exist_ok=True)
output_path = output_dir / 'merkletree.json'
@@ -246,49 +160,31 @@ def main(url: str, snapshot_id: str):
json.dump(merkle_data, f, indent=2)
status = 'succeeded'
output = str(output_path)
output = 'merkletree.json'
root_hash = merkle_data['root_hash']
file_count = merkle_data['metadata']['file_count']
total_size = merkle_data['metadata']['total_size']
click.echo(f'Merkle tree created: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes')
click.echo(f'Merkle tree: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes')
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
click.echo(f'Error: {error}', err=True)
end_ts = datetime.now()
duration = (end_ts - start_ts).total_seconds()
end_ts = datetime.now(timezone.utc)
# Print results
click.echo(f'START_TS={start_ts.isoformat()}')
click.echo(f'END_TS={end_ts.isoformat()}')
click.echo(f'DURATION={duration:.2f}')
if output:
click.echo(f'OUTPUT={output}')
click.echo(f'STATUS={status}')
if error:
click.echo(f'ERROR={error}', err=True)
# Print JSON result
result_json = {
'extractor': 'merkletree',
'url': url,
'snapshot_id': snapshot_id,
# Print JSON result for hook runner
result = {
'status': status,
'start_ts': start_ts.isoformat(),
'end_ts': end_ts.isoformat(),
'duration': round(duration, 2),
'output': output,
'error': error or None,
'root_hash': root_hash,
'file_count': file_count,
'error': error or None,
}
click.echo(f'RESULT_JSON={json.dumps(result_json)}')
click.echo(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
if __name__ == '__main__':

View File

@@ -0,0 +1 @@
🔗

View File

@@ -133,7 +133,8 @@ def fetch_content(url: str) -> str:
@click.command()
@click.option('--url', required=True, help='HTML URL to parse')
def main(url: str):
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
def main(url: str, snapshot_id: str = None):
"""Parse HTML and extract href URLs."""
# Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage)

View File

@@ -0,0 +1 @@
🔗

View File

@@ -127,7 +127,8 @@ def fetch_content(url: str) -> str:
@click.command()
@click.option('--url', required=True, help='JSONL file URL to parse')
def main(url: str):
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
def main(url: str, snapshot_id: str = None):
"""Parse JSONL bookmark file and extract URLs."""
try:

View File

@@ -0,0 +1 @@
📋

View File

@@ -52,7 +52,8 @@ def fetch_content(url: str) -> str:
@click.command()
@click.option('--url', required=True, help='Netscape bookmark file URL to parse')
def main(url: str):
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
def main(url: str, snapshot_id: str = None):
"""Parse Netscape bookmark HTML and extract URLs."""
try:

View File

@@ -0,0 +1 @@
🔖

View File

@@ -51,7 +51,8 @@ def fetch_content(url: str) -> str:
@click.command()
@click.option('--url', required=True, help='RSS/Atom feed URL to parse')
def main(url: str):
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
def main(url: str, snapshot_id: str = None):
"""Parse RSS/Atom feed and extract article URLs."""
if feedparser is None:

View File

@@ -0,0 +1 @@
📡

View File

@@ -100,7 +100,8 @@ def fetch_content(url: str) -> str:
@click.command()
@click.option('--url', required=True, help='URL to parse (file:// or https://)')
def main(url: str):
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
def main(url: str, snapshot_id: str = None):
"""Parse plain text and extract URLs."""
try:

View File

@@ -0,0 +1 @@
📃

View File

@@ -0,0 +1,5 @@
<!-- PDF embed - full PDF viewer -->
<embed src="{{ output_path }}#toolbar=1&navpanes=1"
type="application/pdf"
class="extractor-embed pdf-embed"
style="width: 100%; height: 100%; min-height: 500px;">

View File

@@ -0,0 +1,5 @@
<!-- PDF fullscreen - full PDF viewer -->
<embed src="{{ output_path }}#toolbar=1&navpanes=1&view=FitH"
type="application/pdf"
class="extractor-fullscreen pdf-fullscreen"
style="width: 100%; height: 100vh;">

View File

@@ -0,0 +1 @@
📄

View File

@@ -0,0 +1,6 @@
<!-- PDF thumbnail - shows first page preview -->
<div class="extractor-thumbnail pdf-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #f5f5f5;">
<embed src="{{ output_path }}#toolbar=0&navpanes=0&scrollbar=0&page=1&view=FitH"
type="application/pdf"
style="width: 100%; height: 200px; margin-top: -20px; pointer-events: none;">
</div>

View File

@@ -1,68 +0,0 @@
#!/usr/bin/env python3
"""
Install readability-extractor if not already available.
Runs at crawl start to ensure readability-extractor is installed.
Outputs JSONL for InstalledBinary.
"""
import json
import sys
from pathlib import Path
def main():
    """Ensure readability-extractor is installed, emitting JSONL status records.

    Exits 0 with an InstalledBinary record on success, 1 with a Dependency
    record when the binary cannot be loaded or installed.
    """
    try:
        from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
        NpmProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # The npm package lives at github:ArchiveBox/readability-extractor.
        binary_spec = Binary(
            name='readability-extractor',
            binproviders=[NpmProvider(), EnvProvider()],
            overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
        )

        # First try to resolve an existing install; fall back to installing it.
        resolved = None
        try:
            resolved = binary_spec.load()
        except Exception:
            resolved = None
        if not (resolved and resolved.abspath):
            resolved = binary_spec.install()

        if resolved and resolved.abspath:
            # Report the binary we ended up with as an InstalledBinary JSONL record.
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'readability-extractor',
                'abspath': str(resolved.abspath),
                'version': str(resolved.version) if resolved.version else None,
                'sha256': resolved.sha256,
                'binprovider': resolved.loaded_binprovider.name if resolved.loaded_binprovider else 'unknown',
            }))
            sys.exit(0)

        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'readability-extractor',
            'bin_providers': 'npm,env',
        }))
        print("Failed to install readability-extractor", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # abx_pkg missing or the provider machinery blew up -- still emit a
        # Dependency record so the caller knows what was needed.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'readability-extractor',
            'bin_providers': 'npm,env',
        }))
        print(f"Error installing readability-extractor: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
Validation hook for readability-extractor binary.
Runs at crawl start to verify readability-extractor is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
first_line = result.stdout.strip().split('\n')[0]
return first_line[:64]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_readability() -> dict | None:
"""Find readability-extractor binary."""
try:
from abx_pkg import Binary, NpmProvider, EnvProvider
class ReadabilityBinary(Binary):
name: str = 'readability-extractor'
binproviders_supported = [NpmProvider(), EnvProvider()]
overrides: dict = {'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
binary = ReadabilityBinary()
loaded = binary.load()
if loaded and loaded.abspath:
return {
'name': 'readability-extractor',
'abspath': str(loaded.abspath),
'version': str(loaded.version) if loaded.version else None,
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
}
except ImportError:
pass
except Exception:
pass
# Fallback to shutil.which
abspath = shutil.which('readability-extractor') or os.environ.get('READABILITY_BINARY', '')
if abspath and Path(abspath).is_file():
return {
'name': 'readability-extractor',
'abspath': abspath,
'version': get_binary_version(abspath),
'sha256': get_binary_hash(abspath),
'binprovider': 'env',
}
return None
def main():
    """Emit JSONL records describing the readability-extractor binary, or a Dependency request.

    Exits 0 when the binary is found (InstalledBinary + Machine config records),
    exits 1 when it is missing (Dependency record so a provider can install it).
    """
    result = find_readability()
    if result and result.get('abspath'):
        # Binary located: report it so the hook runner records an InstalledBinary.
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        # Persist the resolved path in per-machine config for the extractor hooks.
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/READABILITY_BINARY',
            'value': result['abspath'],
        }))
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/READABILITY_VERSION',
                'value': result['version'],
            }))
        sys.exit(0)
    else:
        # Binary missing: emit a Dependency record so npm/env providers can supply it.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'readability-extractor',
            'bin_providers': 'npm,env',
        }))
        # was an f-string with no placeholders (lint F541) -- plain literal is correct
        print("readability-extractor binary not found", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,6 @@
<!-- Readability embed - reader-mode article view -->
<iframe src="{{ output_path }}"
class="extractor-embed readability-embed"
style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -0,0 +1,6 @@
<!-- Readability fullscreen - full reader-mode article -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen readability-fullscreen"
style="width: 100%; height: 100vh; border: none; background: #fefefe;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -0,0 +1 @@
📖

View File

@@ -0,0 +1,8 @@
<!-- Readability thumbnail - shows reader-mode extracted article content -->
<div class="extractor-thumbnail readability-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fefefe; padding: 8px; font-family: Georgia, serif; font-size: 11px; line-height: 1.4; color: #333;">
<iframe src="{{ output_path }}"
style="width: 100%; height: 300px; border: none; pointer-events: none;"
loading="lazy"
sandbox="allow-same-origin">
</iframe>
</div>

View File

@@ -2,7 +2,7 @@
Integration tests for readability plugin
Tests verify:
1. Install hook installs readability-extractor via abx-pkg
1. Validate hook checks for readability-extractor binary
2. Verify deps with abx-pkg
3. Plugin reports missing dependency correctly
4. Extraction works against real example.com content
@@ -21,7 +21,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py'))
READABILITY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_readability.py'
READABILITY_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_readability.py'
TEST_URL = 'https://example.com'
@@ -101,48 +101,63 @@ def test_reports_missing_dependency_when_not_installed():
assert 'readability-extractor' in combined or 'BIN_NAME' in combined, "Should mention readability-extractor"
def test_readability_install_hook():
"""Test readability install hook to install readability-extractor if needed."""
def test_readability_validate_hook():
"""Test readability validate hook checks for readability-extractor binary."""
result = subprocess.run(
[sys.executable, str(READABILITY_INSTALL_HOOK)],
[sys.executable, str(READABILITY_VALIDATE_HOOK)],
capture_output=True,
text=True,
timeout=600
timeout=30
)
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
# Verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'readability-extractor'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record"
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
if result.returncode == 0:
# Binary found - verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'readability-extractor'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record when binary found"
else:
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
assert record['bin_name'] == 'readability-extractor'
assert 'npm' in record['bin_providers']
found_dependency = True
break
except json.JSONDecodeError:
pass
assert found_dependency, "Should output Dependency record when binary not found"
def test_verify_deps_with_abx_pkg():
"""Verify readability-extractor is available via abx-pkg after hook installation."""
"""Verify readability-extractor is available via abx-pkg."""
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
NpmProvider.model_rebuild()
EnvProvider.model_rebuild()
readability_binary = Binary(
name='readability-extractor',
binproviders=[NpmProvider(), EnvProvider()],
overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
)
readability_loaded = readability_binary.load()
assert readability_loaded and readability_loaded.abspath, "readability-extractor should be available after install hook"
if readability_loaded and readability_loaded.abspath:
assert True, "readability-extractor is available"
else:
pytest.skip("readability-extractor not available - Dependency record should have been emitted")
def test_extracts_article_after_installation():

View File

@@ -0,0 +1,5 @@
<!-- Screenshot embed - full image view -->
<img src="{{ output_path }}"
alt="Screenshot of page"
class="extractor-embed screenshot-embed"
style="max-width: 100%; height: auto;">

View File

@@ -0,0 +1,8 @@
<!-- Screenshot fullscreen - zoomable image -->
<div style="width: 100%; height: 100vh; overflow: auto; background: #222; display: flex; align-items: start; justify-content: center;">
<img src="{{ output_path }}"
alt="Screenshot of page"
class="extractor-fullscreen screenshot-fullscreen"
style="max-width: 100%; cursor: zoom-in;"
onclick="this.style.maxWidth = this.style.maxWidth === 'none' ? '100%' : 'none'; this.style.cursor = this.style.maxWidth === 'none' ? 'zoom-out' : 'zoom-in';">
</div>

View File

@@ -0,0 +1 @@
📷

View File

@@ -0,0 +1,8 @@
<!-- Screenshot thumbnail - shows the captured screenshot image -->
<img src="{{ output_path }}"
alt="Screenshot of page"
class="extractor-thumbnail screenshot-thumbnail"
style="width: 100%; height: 100px; object-fit: cover; object-position: top center; background: #333;"
loading="lazy"
onerror="this.style.display='none'; this.nextElementSibling.style.display='block';">
<div style="display: none; text-align: center; padding: 20px; color: #999;">📷 Screenshot</div>

View File

@@ -0,0 +1,6 @@
<!-- Singlefile embed - full iframe of archived HTML -->
<iframe src="{{ output_path }}"
class="extractor-embed singlefile-embed"
style="width: 100%; height: 100%; min-height: 500px; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms">
</iframe>

View File

@@ -0,0 +1,6 @@
<!-- Singlefile fullscreen - full page iframe -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen singlefile-fullscreen"
style="width: 100%; height: 100vh; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
</iframe>

View File

@@ -0,0 +1 @@
📦

View File

@@ -0,0 +1,8 @@
<!-- Singlefile thumbnail - scaled down iframe preview of archived HTML -->
<div class="extractor-thumbnail singlefile-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
<iframe src="{{ output_path }}"
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
loading="lazy"
sandbox="allow-same-origin">
</iframe>
</div>

View File

@@ -0,0 +1 @@
📁

View File

@@ -0,0 +1 @@
📝

View File

@@ -1,68 +0,0 @@
#!/usr/bin/env python3
"""
Install wget if not already available.
Runs at crawl start to ensure wget is installed.
Outputs JSONL for InstalledBinary.
"""
import json
import sys
from pathlib import Path
def main():
    """Ensure wget is installed, emitting JSONL status records.

    Exits 0 with an InstalledBinary record on success, 1 with a Dependency
    record when the binary cannot be loaded or installed.
    """
    try:
        from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
        AptProvider.model_rebuild()
        BrewProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # The wget binary and its apt/brew package share the same name.
        binary_spec = Binary(
            name='wget',
            binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
        )

        # First try to resolve an existing install; fall back to installing it.
        resolved = None
        try:
            resolved = binary_spec.load()
        except Exception:
            resolved = None
        if not (resolved and resolved.abspath):
            resolved = binary_spec.install()

        if resolved and resolved.abspath:
            # Report the binary we ended up with as an InstalledBinary JSONL record.
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'wget',
                'abspath': str(resolved.abspath),
                'version': str(resolved.version) if resolved.version else None,
                'sha256': resolved.sha256,
                'binprovider': resolved.loaded_binprovider.name if resolved.loaded_binprovider else 'unknown',
            }))
            sys.exit(0)

        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'wget',
            'bin_providers': 'apt,brew,env',
        }))
        print("Failed to install wget", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # abx_pkg missing or the provider machinery blew up -- still emit a
        # Dependency record so the caller knows what was needed.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'wget',
            'bin_providers': 'apt,brew,env',
        }))
        print(f"Error installing wget: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,6 @@
<!-- Wget embed - full iframe of mirrored site -->
<iframe src="{{ output_path }}"
class="extractor-embed wget-embed"
style="width: 100%; height: 100%; min-height: 500px; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms">
</iframe>

View File

@@ -0,0 +1,6 @@
<!-- Wget fullscreen - full page iframe of mirrored site -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen wget-fullscreen"
style="width: 100%; height: 100vh; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
</iframe>

View File

@@ -0,0 +1 @@
📥

View File

@@ -0,0 +1,8 @@
<!-- Wget thumbnail - scaled down iframe preview of mirrored site.
     Renders the page at 4x size then scales to 25% for a crisp preview;
     pointer-events:none + minimal sandbox keep it display-only. -->
<div class="extractor-thumbnail wget-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
    <iframe src="{{ output_path }}"
            style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
            loading="lazy"
            sandbox="allow-same-origin">
    </iframe>
</div>

View File

@@ -2,8 +2,8 @@
Integration tests for wget plugin
Tests verify:
1. Plugin reports missing dependency correctly
2. wget can be installed via brew/apt provider hooks
1. Validate hook checks for wget binary
2. Verify deps with abx-pkg
3. Config options work (SAVE_WGET, SAVE_WARC, etc.)
4. Extraction works against real example.com
5. Output files contain actual page content
@@ -26,7 +26,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
WGET_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_wget.py'
WGET_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_wget.py'
BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
TEST_URL = 'https://example.com'
@@ -37,45 +37,59 @@ def test_hook_script_exists():
assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"
def test_wget_install_hook():
"""Test wget install hook to install wget if needed."""
def test_wget_validate_hook():
"""Test wget validate hook checks for wget binary."""
result = subprocess.run(
[sys.executable, str(WGET_INSTALL_HOOK)],
[sys.executable, str(WGET_VALIDATE_HOOK)],
capture_output=True,
text=True,
timeout=600
timeout=30
)
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
# Verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'wget'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record"
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
if result.returncode == 0:
# Binary found - verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'wget'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record when binary found"
else:
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
assert record['bin_name'] == 'wget'
assert 'env' in record['bin_providers']
found_dependency = True
break
except json.JSONDecodeError:
pass
assert found_dependency, "Should output Dependency record when binary not found"
def test_verify_deps_with_abx_pkg():
"""Verify wget is available via abx-pkg after hook installation."""
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
AptProvider.model_rebuild()
BrewProvider.model_rebuild()
EnvProvider.model_rebuild()
"""Verify wget is available via abx-pkg."""
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
wget_binary = Binary(name='wget', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
wget_loaded = wget_binary.load()
assert wget_loaded and wget_loaded.abspath, "wget should be available after install hook"
if wget_loaded and wget_loaded.abspath:
assert True, "wget is available"
else:
pytest.skip("wget not available - Dependency record should have been emitted")
def test_reports_missing_dependency_when_not_installed():

View File

@@ -110,6 +110,10 @@
{% block nav-global %}{% endblock %}
</div>
{% if has_permission %}
{% include 'admin/progress_monitor.html' %}
{% endif %}
{% block breadcrumbs %}
<div class="breadcrumbs">
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>

View File

@@ -0,0 +1,648 @@
<style>
/* Progress Monitor Container */
#progress-monitor {
background: linear-gradient(135deg, #0d1117 0%, #161b22 100%);
color: #c9d1d9;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Noto Sans', Helvetica, Arial, sans-serif;
font-size: 12px;
border-bottom: 1px solid #30363d;
position: relative;
z-index: 100;
}
#progress-monitor.hidden {
display: none;
}
#progress-monitor .tree-container {
max-height: 350px;
overflow-y: auto;
}
/* Header Bar */
#progress-monitor .header-bar {
display: flex;
justify-content: space-between;
align-items: center;
padding: 8px 16px;
background: rgba(0,0,0,0.2);
border-bottom: 1px solid #30363d;
position: sticky;
top: 0;
z-index: 10;
}
#progress-monitor .header-left {
display: flex;
align-items: center;
gap: 16px;
}
#progress-monitor .header-right {
display: flex;
align-items: center;
gap: 12px;
}
/* Orchestrator Status */
#progress-monitor .orchestrator-status {
display: flex;
align-items: center;
gap: 6px;
}
#progress-monitor .status-dot {
width: 8px;
height: 8px;
border-radius: 50%;
flex-shrink: 0;
}
#progress-monitor .status-dot.running {
background: #3fb950;
box-shadow: 0 0 8px #3fb950;
animation: pulse 2s infinite;
}
#progress-monitor .status-dot.stopped {
background: #f85149;
}
@keyframes pulse {
0%, 100% { opacity: 1; box-shadow: 0 0 8px #3fb950; }
50% { opacity: 0.6; box-shadow: 0 0 4px #3fb950; }
}
/* Stats */
#progress-monitor .stats {
display: flex;
gap: 16px;
}
#progress-monitor .stat {
display: flex;
align-items: center;
gap: 4px;
}
#progress-monitor .stat-label {
color: #8b949e;
font-size: 10px;
text-transform: uppercase;
letter-spacing: 0.5px;
}
#progress-monitor .stat-value {
font-weight: 600;
font-variant-numeric: tabular-nums;
}
#progress-monitor .stat-value.success { color: #3fb950; }
#progress-monitor .stat-value.error { color: #f85149; }
#progress-monitor .stat-value.warning { color: #d29922; }
#progress-monitor .stat-value.info { color: #58a6ff; }
/* Toggle Button */
#progress-monitor .toggle-btn {
background: transparent;
border: 1px solid #30363d;
color: #8b949e;
cursor: pointer;
padding: 4px 8px;
border-radius: 6px;
font-size: 11px;
transition: all 0.2s;
}
#progress-monitor .toggle-btn:hover {
background: #21262d;
color: #c9d1d9;
border-color: #8b949e;
}
/* Tree Container */
#progress-monitor .tree-container {
padding: 12px 16px;
}
#progress-monitor.collapsed .tree-container {
display: none;
}
/* Idle Message */
#progress-monitor .idle-message {
color: #8b949e;
font-style: italic;
padding: 8px 0;
text-align: center;
}
/* Crawl Item */
#progress-monitor .crawl-item {
background: #161b22;
border: 1px solid #30363d;
border-radius: 8px;
margin-bottom: 12px;
overflow: hidden;
}
#progress-monitor .crawl-header {
display: flex;
align-items: center;
gap: 12px;
padding: 10px 14px;
background: rgba(0,0,0,0.2);
cursor: pointer;
}
#progress-monitor .crawl-header:hover {
background: rgba(88, 166, 255, 0.1);
}
#progress-monitor .crawl-icon {
font-size: 16px;
width: 20px;
text-align: center;
}
#progress-monitor .crawl-info {
flex: 1;
min-width: 0;
}
#progress-monitor .crawl-label {
font-weight: 600;
color: #58a6ff;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}
#progress-monitor .crawl-meta {
font-size: 11px;
color: #8b949e;
margin-top: 2px;
}
#progress-monitor .crawl-stats {
display: flex;
gap: 12px;
font-size: 11px;
}
/* Progress Bar */
#progress-monitor .progress-bar-container {
height: 4px;
background: #21262d;
border-radius: 2px;
overflow: hidden;
position: relative;
}
#progress-monitor .progress-bar {
height: 100%;
border-radius: 2px;
transition: width 0.5s ease-out;
position: relative;
}
#progress-monitor .progress-bar.crawl {
background: linear-gradient(90deg, #238636 0%, #3fb950 100%);
}
#progress-monitor .progress-bar.snapshot {
background: linear-gradient(90deg, #1f6feb 0%, #58a6ff 100%);
}
#progress-monitor .progress-bar.extractor {
background: linear-gradient(90deg, #8957e5 0%, #a371f7 100%);
}
#progress-monitor .progress-bar.indeterminate {
background: linear-gradient(90deg, transparent 0%, #58a6ff 50%, transparent 100%);
animation: indeterminate 1.5s infinite linear;
width: 30% !important;
}
@keyframes indeterminate {
0% { transform: translateX(-100%); }
100% { transform: translateX(400%); }
}
/* Crawl Body */
#progress-monitor .crawl-body {
padding: 0 14px 14px;
}
#progress-monitor .crawl-progress {
padding: 10px 14px;
border-bottom: 1px solid #21262d;
}
/* Snapshot List */
#progress-monitor .snapshot-list {
margin-top: 8px;
}
#progress-monitor .snapshot-item {
background: #0d1117;
border: 1px solid #21262d;
border-radius: 6px;
margin-bottom: 8px;
overflow: hidden;
}
#progress-monitor .snapshot-header {
display: flex;
align-items: center;
gap: 10px;
padding: 8px 12px;
cursor: pointer;
}
#progress-monitor .snapshot-header:hover {
background: rgba(88, 166, 255, 0.05);
}
#progress-monitor .snapshot-icon {
font-size: 14px;
width: 18px;
text-align: center;
color: #58a6ff;
}
#progress-monitor .snapshot-info {
flex: 1;
min-width: 0;
}
#progress-monitor .snapshot-url {
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 11px;
color: #c9d1d9;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}
#progress-monitor .snapshot-meta {
font-size: 10px;
color: #8b949e;
margin-top: 2px;
}
#progress-monitor .snapshot-progress {
padding: 0 12px 8px;
}
/* Extractor List */
#progress-monitor .extractor-list {
padding: 8px 12px;
background: rgba(0,0,0,0.2);
border-top: 1px solid #21262d;
}
#progress-monitor .extractor-item {
display: flex;
align-items: center;
gap: 8px;
padding: 4px 0;
}
#progress-monitor .extractor-icon {
font-size: 12px;
width: 16px;
text-align: center;
}
#progress-monitor .extractor-icon.running {
color: #d29922;
animation: spin 1s linear infinite;
}
#progress-monitor .extractor-icon.success {
color: #3fb950;
}
#progress-monitor .extractor-icon.failed {
color: #f85149;
}
#progress-monitor .extractor-icon.pending {
color: #8b949e;
}
@keyframes spin {
from { transform: rotate(0deg); }
to { transform: rotate(360deg); }
}
#progress-monitor .extractor-name {
flex: 1;
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
font-size: 11px;
}
#progress-monitor .extractor-progress {
width: 60px;
}
/* Status Badge */
#progress-monitor .status-badge {
font-size: 10px;
padding: 2px 6px;
border-radius: 10px;
font-weight: 500;
text-transform: uppercase;
letter-spacing: 0.3px;
}
#progress-monitor .status-badge.queued {
background: #21262d;
color: #8b949e;
}
#progress-monitor .status-badge.started {
background: rgba(210, 153, 34, 0.2);
color: #d29922;
}
#progress-monitor .status-badge.sealed,
#progress-monitor .status-badge.succeeded {
background: rgba(63, 185, 80, 0.2);
color: #3fb950;
}
#progress-monitor .status-badge.failed {
background: rgba(248, 81, 73, 0.2);
color: #f85149;
}
/* Expand/Collapse Icons */
#progress-monitor .expand-icon {
color: #8b949e;
font-size: 10px;
transition: transform 0.2s;
}
#progress-monitor .expand-icon.expanded {
transform: rotate(90deg);
}
</style>
<div id="progress-monitor">
<div class="header-bar">
<div class="header-left">
<div class="orchestrator-status">
<span class="status-dot stopped" id="orchestrator-dot"></span>
<span id="orchestrator-text">Stopped</span>
</div>
<div class="stats">
<div class="stat">
<span class="stat-label">Workers</span>
<span class="stat-value info" id="worker-count">0</span>
</div>
<div class="stat">
<span class="stat-label">Queued</span>
<span class="stat-value warning" id="total-queued">0</span>
</div>
<div class="stat">
<span class="stat-label">Done</span>
<span class="stat-value success" id="total-succeeded">0</span>
</div>
<div class="stat">
<span class="stat-label">Failed</span>
<span class="stat-value error" id="total-failed">0</span>
</div>
</div>
</div>
<div class="header-right">
<button class="toggle-btn" id="progress-collapse" title="Toggle details">Details</button>
</div>
</div>
<div class="tree-container" id="tree-container">
<div class="idle-message" id="idle-message">No active crawls</div>
<div id="crawl-tree"></div>
</div>
</div>
<script>
(function() {
const monitor = document.getElementById('progress-monitor');
const collapseBtn = document.getElementById('progress-collapse');
const treeContainer = document.getElementById('tree-container');
const crawlTree = document.getElementById('crawl-tree');
const idleMessage = document.getElementById('idle-message');
let pollInterval = null;
let isCollapsed = localStorage.getItem('progress-monitor-collapsed') === 'true';
let expandedCrawls = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-crawls') || '[]'));
let expandedSnapshots = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-snapshots') || '[]'));
// Produce a short display label for a URL: hostname plus a truncated path.
// Falls back to truncating the raw string when it isn't a parseable URL.
function formatUrl(url) {
    const truncate = (text, max) => text.length > max ? text.substring(0, max) + '...' : text;
    let parsed;
    try {
        parsed = new URL(url);
    } catch {
        return truncate(url, 50);
    }
    return parsed.hostname + truncate(parsed.pathname, 30);
}
// Render one extractor row (status icon + name + mini progress bar) as HTML.
// Unknown/queued statuses fall through to the hollow-circle 'pending' icon.
function renderExtractor(extractor) {
    const iconClass = extractor.status === 'started' ? 'running' :
                      extractor.status === 'succeeded' ? 'success' :
                      extractor.status === 'failed' ? 'failed' : 'pending';
    const icon = extractor.status === 'started' ? '&#8635;' :
                 extractor.status === 'succeeded' ? '&#10003;' :
                 extractor.status === 'failed' ? '&#10007;' : '&#9675;';
    // Finished extractors (succeeded OR failed) show a full-width bar;
    // a 'started' one gets the CSS indeterminate animation instead.
    return `
        <div class="extractor-item">
            <span class="extractor-icon ${iconClass}">${icon}</span>
            <span class="extractor-name">${extractor.extractor}</span>
            <div class="extractor-progress">
                <div class="progress-bar-container">
                    <div class="progress-bar extractor ${extractor.status === 'started' ? 'indeterminate' : ''}"
                        style="width: ${extractor.status === 'succeeded' ? '100' : extractor.status === 'failed' ? '100' : extractor.progress}%"></div>
                </div>
            </div>
        </div>
    `;
}
// Render one snapshot row (URL, extractor counts, progress bar) as HTML.
// snapshotKey combines crawl id + snapshot id so expansion state survives
// re-renders; the extractor sub-list is only emitted when extractors exist.
function renderSnapshot(snapshot, crawlId) {
    const snapshotKey = `${crawlId}-${snapshot.id}`;
    const isExpanded = expandedSnapshots.has(snapshotKey);
    const statusIcon = snapshot.status === 'started' ? '&#8635;' : '&#128196;';
    let extractorHtml = '';
    if (snapshot.active_extractors && snapshot.active_extractors.length > 0) {
        extractorHtml = `
            <div class="extractor-list" style="${isExpanded ? '' : 'display:none'}">
                ${snapshot.active_extractors.map(e => renderExtractor(e)).join('')}
            </div>
        `;
    }
    // NOTE(review): progress === 0 while 'started' is treated as "unknown"
    // and rendered as an indeterminate bar.
    return `
        <div class="snapshot-item" data-snapshot-key="${snapshotKey}">
            <div class="snapshot-header" onclick="window.toggleSnapshot('${snapshotKey}')">
                <span class="expand-icon ${isExpanded ? 'expanded' : ''}">${snapshot.active_extractors?.length ? '&#9654;' : ''}</span>
                <span class="snapshot-icon">${statusIcon}</span>
                <div class="snapshot-info">
                    <div class="snapshot-url">${formatUrl(snapshot.url)}</div>
                    <div class="snapshot-meta">
                        ${snapshot.completed_extractors}/${snapshot.total_extractors} extractors
                        ${snapshot.failed_extractors > 0 ? `<span style="color:#f85149">(${snapshot.failed_extractors} failed)</span>` : ''}
                    </div>
                </div>
                <span class="status-badge ${snapshot.status}">${snapshot.status}</span>
            </div>
            <div class="snapshot-progress">
                <div class="progress-bar-container">
                    <div class="progress-bar snapshot ${snapshot.status === 'started' && snapshot.progress === 0 ? 'indeterminate' : ''}"
                        style="width: ${snapshot.progress}%"></div>
                </div>
            </div>
            ${extractorHtml}
        </div>
    `;
}
// Render one crawl card (label, depth/snapshot counts, progress bar, and the
// collapsible list of its active snapshots) as HTML.
function renderCrawl(crawl) {
    const isExpanded = expandedCrawls.has(crawl.id);
    const statusIcon = crawl.status === 'started' ? '&#8635;' : '&#128269;';
    let snapshotsHtml = '';
    if (crawl.active_snapshots && crawl.active_snapshots.length > 0) {
        snapshotsHtml = crawl.active_snapshots.map(s => renderSnapshot(s, crawl.id)).join('');
    }
    // Body (snapshot list) is hidden unless this crawl id is in expandedCrawls.
    return `
        <div class="crawl-item" data-crawl-id="${crawl.id}">
            <div class="crawl-header" onclick="window.toggleCrawl('${crawl.id}')">
                <span class="expand-icon ${isExpanded ? 'expanded' : ''}">${crawl.active_snapshots?.length ? '&#9654;' : ''}</span>
                <span class="crawl-icon">${statusIcon}</span>
                <div class="crawl-info">
                    <div class="crawl-label">${crawl.label}</div>
                    <div class="crawl-meta">depth: ${crawl.max_depth} | ${crawl.total_snapshots} snapshots</div>
                </div>
                <div class="crawl-stats">
                    <span style="color:#3fb950">${crawl.completed_snapshots} done</span>
                    <span style="color:#8b949e">${crawl.pending_snapshots} pending</span>
                </div>
                <span class="status-badge ${crawl.status}">${crawl.status}</span>
            </div>
            <div class="crawl-progress">
                <div class="progress-bar-container">
                    <div class="progress-bar crawl ${crawl.status === 'started' && crawl.progress === 0 ? 'indeterminate' : ''}"
                        style="width: ${crawl.progress}%"></div>
                </div>
            </div>
            <div class="crawl-body" style="${isExpanded ? '' : 'display:none'}">
                <div class="snapshot-list">
                    ${snapshotsHtml}
                </div>
            </div>
        </div>
    `;
}
// Expand/collapse one crawl card in place (no re-render) and persist the set
// of expanded crawl ids to localStorage so the state survives page reloads.
window.toggleCrawl = function(crawlId) {
    const row = document.querySelector(`[data-crawl-id="${crawlId}"]`);
    const body = row.querySelector('.crawl-body');
    const icon = row.querySelector('.expand-icon');
    const nowExpanded = !expandedCrawls.has(crawlId);
    if (nowExpanded) {
        expandedCrawls.add(crawlId);
        body.style.display = '';
        icon.classList.add('expanded');
    } else {
        expandedCrawls.delete(crawlId);
        body.style.display = 'none';
        icon.classList.remove('expanded');
    }
    localStorage.setItem('progress-monitor-expanded-crawls', JSON.stringify([...expandedCrawls]));
};
// Expand/collapse the extractor list under one snapshot row; no-op when the
// row has no extractor list rendered. Persists expanded keys to localStorage.
window.toggleSnapshot = function(snapshotKey) {
    const item = document.querySelector(`[data-snapshot-key="${snapshotKey}"]`);
    const extractorList = item.querySelector('.extractor-list');
    const icon = item.querySelector('.expand-icon');
    if (!extractorList) return;
    if (expandedSnapshots.has(snapshotKey)) {
        expandedSnapshots.delete(snapshotKey);
        extractorList.style.display = 'none';
        icon.classList.remove('expanded');
    } else {
        expandedSnapshots.add(snapshotKey);
        extractorList.style.display = '';
        icon.classList.add('expanded');
    }
    localStorage.setItem('progress-monitor-expanded-snapshots', JSON.stringify([...expandedSnapshots]));
};
// Apply one poll response from /admin/live-progress/ to the monitor UI:
// orchestrator status dot, aggregate counters, and the crawl tree.
// Note: the tree is rebuilt via innerHTML each poll; expansion state is
// preserved because render* functions consult expandedCrawls/expandedSnapshots.
function updateProgress(data) {
    // Calculate if there's activity
    const hasActivity = data.active_crawls.length > 0 ||
        data.crawls_pending > 0 || data.crawls_started > 0 ||
        data.snapshots_pending > 0 || data.snapshots_started > 0 ||
        data.archiveresults_pending > 0 || data.archiveresults_started > 0;
    // Update orchestrator status
    const dot = document.getElementById('orchestrator-dot');
    const text = document.getElementById('orchestrator-text');
    if (data.orchestrator_running) {
        dot.classList.remove('stopped');
        dot.classList.add('running');
        text.textContent = 'Running';
    } else {
        dot.classList.remove('running');
        dot.classList.add('stopped');
        text.textContent = 'Stopped';
    }
    // Update stats
    document.getElementById('worker-count').textContent = data.total_workers;
    document.getElementById('total-queued').textContent =
        data.crawls_pending + data.snapshots_pending + data.archiveresults_pending;
    document.getElementById('total-succeeded').textContent = data.archiveresults_succeeded;
    document.getElementById('total-failed').textContent = data.archiveresults_failed;
    // Render crawl tree
    if (data.active_crawls.length > 0) {
        idleMessage.style.display = 'none';
        crawlTree.innerHTML = data.active_crawls.map(c => renderCrawl(c)).join('');
    } else if (hasActivity) {
        // Work is queued/running but no crawl detail available yet.
        idleMessage.style.display = 'none';
        crawlTree.innerHTML = `
            <div class="idle-message">
                ${data.snapshots_started} snapshots processing, ${data.archiveresults_started} extractors running
            </div>
        `;
    } else {
        idleMessage.style.display = '';
        // Build the URL for recent crawls (last 24 hours)
        var yesterday = new Date(Date.now() - 24*60*60*1000).toISOString().split('T')[0];
        var recentUrl = '/admin/crawls/crawl/?created_at__gte=' + yesterday + '&o=-1';
        idleMessage.innerHTML = `No active crawls (${data.crawls_pending} pending, ${data.crawls_started} started, <a href="${recentUrl}" style="color: #58a6ff;">${data.crawls_recent} recent</a>)`;
        crawlTree.innerHTML = '';
    }
}
// Poll the admin live-progress endpoint once and feed the JSON to
// updateProgress(). API-reported errors and network failures are surfaced
// in the idle-message area instead of throwing.
function fetchProgress() {
    fetch('/admin/live-progress/')
        .then(response => response.json())
        .then(data => {
            if (data.error) {
                console.error('Progress API error:', data.error, data.traceback);
                idleMessage.textContent = 'API Error: ' + data.error;
                idleMessage.style.color = '#f85149';
            }
            // Still rendered even on error so counters stay current.
            updateProgress(data);
        })
        .catch(error => {
            console.error('Progress fetch error:', error);
            idleMessage.textContent = 'Fetch Error: ' + error.message;
            idleMessage.style.color = '#f85149';
        });
}
// Begin polling the progress endpoint; idempotent (no-op if already polling).
// Fires one immediate fetch so the UI populates without waiting a full tick.
function startPolling() {
    if (pollInterval) return;
    fetchProgress();
    pollInterval = setInterval(fetchProgress, 1000); // Poll every 1 second
}
// Stop polling and clear the timer handle so startPolling() can restart it.
function stopPolling() {
    if (pollInterval) {
        clearInterval(pollInterval);
        pollInterval = null;
    }
}
// Collapse toggle: flips the details panel and persists the choice so the
// collapsed/expanded preference survives page reloads.
collapseBtn.addEventListener('click', function() {
    isCollapsed = !isCollapsed;
    localStorage.setItem('progress-monitor-collapsed', isCollapsed);
    if (isCollapsed) {
        monitor.classList.add('collapsed');
        collapseBtn.textContent = 'Expand';
    } else {
        monitor.classList.remove('collapsed');
        collapseBtn.textContent = 'Details';
    }
});
// Apply initial state (restored from localStorage above)
if (isCollapsed) {
    monitor.classList.add('collapsed');
    collapseBtn.textContent = 'Expand';
}
// Start polling when page loads
startPolling();
// Pause polling when tab is hidden to avoid useless background requests
document.addEventListener('visibilitychange', function() {
    if (document.hidden) {
        stopPolling();
    } else {
        startPolling();
    }
});
})();
</script>

View File

@@ -192,6 +192,42 @@
border: 0px;
border-top: 3px solid #aa1e55;
}
#main-frame-wrapper {
width: 100%;
height: calc(100vh - 210px);
border-top: 3px solid #aa1e55;
overflow: hidden;
}
#main-frame-wrapper iframe {
width: 100%;
height: 100%;
border: none;
}
.full-page-wrapper {
width: 100%;
height: calc(100vh - 210px);
}
.thumbnail-wrapper {
height: 100px;
overflow: hidden;
background-color: #333;
pointer-events: none;
}
.thumbnail-wrapper iframe {
width: 405%;
height: 430px;
margin-bottom: -330px;
margin-left: -1%;
transform: scale(0.25);
transform-origin: 0 0;
border: none;
}
.thumbnail-wrapper img {
width: 100%;
height: 100%;
object-fit: cover;
object-position: top center;
}
.card.selected-card {
border: 2px solid orange;
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
@@ -403,12 +439,18 @@
<div class="card {% if forloop.first %}selected-card{% endif %}">
<div class="card-body">
<a href="{{result.path|urlencode}}" target="preview" title="./{{result.path}} (downloaded {{result.ts}})">
<h4>{{result.name|truncatechars:24}} <small>({{result.size|filesizeformat}})</small></h4>
<!-- <p class="card-text" ><code>./{{result.path|truncatechars:30}}</code></p> -->
<h4>{% extractor_icon result.name %} {{result.name|extractor_name|truncatechars:20}} <small>({{result.size|filesizeformat}})</small></h4>
</a>
<!--<a href="{{result.path}}" target="preview"><h4 class="card-title">{{result.name}}</h4></a>-->
</div>
<iframe class="card-img-top" src="{{result.path|urlencode}}?autoplay=0" allow="autoplay 'none'; fullscreen 'none'; navigation-override 'none'; " sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
{% if result.result %}
{# Use plugin-specific thumbnail template when ArchiveResult is available #}
<div class="card-img-top thumbnail-wrapper">
{% extractor_thumbnail result.result %}
</div>
{% else %}
{# Fall back to generic iframe for filesystem-discovered files #}
<iframe class="card-img-top" src="{{result.path|urlencode}}?autoplay=0" allow="autoplay 'none'; fullscreen 'none'; navigation-override 'none'; " sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
{% endif %}
</div>
</div>
{% endfor %}
@@ -431,7 +473,15 @@
<iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_result.path|urlencode}}" name="preview"></iframe>
{% if best_result.result %}
{# Use plugin-specific fullscreen template when ArchiveResult is available #}
<div id="main-frame-wrapper" class="full-page-wrapper">
{% extractor_fullscreen best_result.result %}
</div>
{% else %}
{# Fall back to generic iframe #}
<iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_result.path|urlencode}}" name="preview"></iframe>
{% endif %}

View File

@@ -1,23 +1,13 @@
"""
Workers admin module.
The orchestrator/worker system doesn't need Django admin registration
as workers are managed via CLI commands and the orchestrator.
"""
__package__ = 'archivebox.workers'
from django.contrib.auth import get_permission_codename
from huey_monitor.apps import HueyMonitorConfig
from huey_monitor.admin import TaskModel, TaskModelAdmin, SignalInfoModel, SignalInfoModelAdmin
HueyMonitorConfig.verbose_name = 'Background Workers'
class CustomTaskModelAdmin(TaskModelAdmin):
actions = ["delete_selected"]
def has_delete_permission(self, request, obj=None):
codename = get_permission_codename("delete", self.opts)
return request.user.has_perm("%s.%s" % (self.opts.app_label, codename))
def register_admin(admin_site):
admin_site.register(TaskModel, CustomTaskModelAdmin)
admin_site.register(SignalInfoModel, SignalInfoModelAdmin)
"""No models to register - workers are process-based, not Django models."""
pass

View File

@@ -0,0 +1,15 @@
from django.core.management.base import BaseCommand
from workers.orchestrator import Orchestrator
class Command(BaseCommand):
    """Django management command: run the ArchiveBox orchestrator in the foreground.

    Without ``--daemon`` the orchestrator exits once all work queues are idle;
    with it, the runloop keeps running indefinitely.
    """
    help = 'Run the archivebox orchestrator'
    def add_arguments(self, parser):
        parser.add_argument('--daemon', '-d', action='store_true', help="Run forever (don't exit on idle)")
    def handle(self, *args, **kwargs):
        # argparse stores --daemon as a bool; default False means exit-on-idle.
        daemon = kwargs.get('daemon', False)
        orchestrator = Orchestrator(exit_on_idle=not daemon)
        orchestrator.runloop()

Some files were not shown because too many files have changed in this diff Show More