mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-03 06:17:53 +10:00
remove huey
This commit is contained in:
@@ -42,6 +42,7 @@ def register_urls(api: NinjaAPI) -> NinjaAPI:
|
||||
api.add_router('/crawls/', 'api.v1_crawls.router')
|
||||
api.add_router('/cli/', 'api.v1_cli.router')
|
||||
api.add_router('/workers/', 'api.v1_workers.router')
|
||||
api.add_router('/machine/', 'api.v1_machine.router')
|
||||
return api
|
||||
|
||||
|
||||
|
||||
@@ -107,7 +107,7 @@ class RemoveCommandSchema(Schema):
|
||||
@router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]')
|
||||
def cli_add(request, args: AddCommandSchema):
|
||||
from archivebox.cli.archivebox_add import add
|
||||
|
||||
|
||||
result = add(
|
||||
urls=args.urls,
|
||||
tag=args.tag,
|
||||
@@ -115,8 +115,9 @@ def cli_add(request, args: AddCommandSchema):
|
||||
update=args.update,
|
||||
index_only=args.index_only,
|
||||
overwrite=args.overwrite,
|
||||
extract=args.extract,
|
||||
plugins=args.extract, # extract in API maps to plugins param
|
||||
parser=args.parser,
|
||||
bg=True, # Always run in background for API calls
|
||||
)
|
||||
|
||||
return {
|
||||
|
||||
206
archivebox/api/v1_machine.py
Normal file
206
archivebox/api/v1_machine.py
Normal file
@@ -0,0 +1,206 @@
|
||||
__package__ = 'archivebox.api'
|
||||
|
||||
from uuid import UUID
|
||||
from typing import List, Optional
|
||||
from datetime import datetime
|
||||
|
||||
from ninja import Router, Schema, FilterSchema, Field, Query
|
||||
from ninja.pagination import paginate
|
||||
|
||||
from api.v1_core import CustomPagination
|
||||
|
||||
|
||||
router = Router(tags=['Machine and Dependencies'])
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Machine Schemas
|
||||
# ============================================================================
|
||||
|
||||
class MachineSchema(Schema):
|
||||
"""Schema for Machine model."""
|
||||
TYPE: str = 'machine.Machine'
|
||||
id: UUID
|
||||
created_at: datetime
|
||||
modified_at: datetime
|
||||
guid: str
|
||||
hostname: str
|
||||
hw_in_docker: bool
|
||||
hw_in_vm: bool
|
||||
hw_manufacturer: str
|
||||
hw_product: str
|
||||
hw_uuid: str
|
||||
os_arch: str
|
||||
os_family: str
|
||||
os_platform: str
|
||||
os_release: str
|
||||
os_kernel: str
|
||||
stats: dict
|
||||
num_uses_succeeded: int
|
||||
num_uses_failed: int
|
||||
|
||||
|
||||
class MachineFilterSchema(FilterSchema):
|
||||
id: Optional[str] = Field(None, q='id__startswith')
|
||||
hostname: Optional[str] = Field(None, q='hostname__icontains')
|
||||
os_platform: Optional[str] = Field(None, q='os_platform__icontains')
|
||||
os_arch: Optional[str] = Field(None, q='os_arch')
|
||||
hw_in_docker: Optional[bool] = Field(None, q='hw_in_docker')
|
||||
hw_in_vm: Optional[bool] = Field(None, q='hw_in_vm')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Dependency Schemas
|
||||
# ============================================================================
|
||||
|
||||
class DependencySchema(Schema):
|
||||
"""Schema for Dependency model."""
|
||||
TYPE: str = 'machine.Dependency'
|
||||
id: UUID
|
||||
created_at: datetime
|
||||
modified_at: datetime
|
||||
bin_name: str
|
||||
bin_providers: str
|
||||
custom_cmds: dict
|
||||
config: dict
|
||||
is_installed: bool
|
||||
installed_count: int
|
||||
|
||||
@staticmethod
|
||||
def resolve_is_installed(obj) -> bool:
|
||||
return obj.is_installed
|
||||
|
||||
@staticmethod
|
||||
def resolve_installed_count(obj) -> int:
|
||||
return obj.installed_binaries.count()
|
||||
|
||||
|
||||
class DependencyFilterSchema(FilterSchema):
|
||||
id: Optional[str] = Field(None, q='id__startswith')
|
||||
bin_name: Optional[str] = Field(None, q='bin_name__icontains')
|
||||
bin_providers: Optional[str] = Field(None, q='bin_providers__icontains')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# InstalledBinary Schemas
|
||||
# ============================================================================
|
||||
|
||||
class InstalledBinarySchema(Schema):
|
||||
"""Schema for InstalledBinary model."""
|
||||
TYPE: str = 'machine.InstalledBinary'
|
||||
id: UUID
|
||||
created_at: datetime
|
||||
modified_at: datetime
|
||||
machine_id: UUID
|
||||
machine_hostname: str
|
||||
dependency_id: Optional[UUID]
|
||||
dependency_bin_name: Optional[str]
|
||||
name: str
|
||||
binprovider: str
|
||||
abspath: str
|
||||
version: str
|
||||
sha256: str
|
||||
is_valid: bool
|
||||
num_uses_succeeded: int
|
||||
num_uses_failed: int
|
||||
|
||||
@staticmethod
|
||||
def resolve_machine_hostname(obj) -> str:
|
||||
return obj.machine.hostname
|
||||
|
||||
@staticmethod
|
||||
def resolve_dependency_id(obj) -> Optional[UUID]:
|
||||
return obj.dependency_id
|
||||
|
||||
@staticmethod
|
||||
def resolve_dependency_bin_name(obj) -> Optional[str]:
|
||||
return obj.dependency.bin_name if obj.dependency else None
|
||||
|
||||
@staticmethod
|
||||
def resolve_is_valid(obj) -> bool:
|
||||
return obj.is_valid
|
||||
|
||||
|
||||
class InstalledBinaryFilterSchema(FilterSchema):
|
||||
id: Optional[str] = Field(None, q='id__startswith')
|
||||
name: Optional[str] = Field(None, q='name__icontains')
|
||||
binprovider: Optional[str] = Field(None, q='binprovider')
|
||||
machine_id: Optional[str] = Field(None, q='machine_id__startswith')
|
||||
dependency_id: Optional[str] = Field(None, q='dependency_id__startswith')
|
||||
version: Optional[str] = Field(None, q='version__icontains')
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Machine Endpoints
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/machines", response=List[MachineSchema], url_name="get_machines")
|
||||
@paginate(CustomPagination)
|
||||
def get_machines(request, filters: MachineFilterSchema = Query(...)):
|
||||
"""List all machines."""
|
||||
from machine.models import Machine
|
||||
return filters.filter(Machine.objects.all()).distinct()
|
||||
|
||||
|
||||
@router.get("/machine/{machine_id}", response=MachineSchema, url_name="get_machine")
|
||||
def get_machine(request, machine_id: str):
|
||||
"""Get a specific machine by ID."""
|
||||
from machine.models import Machine
|
||||
from django.db.models import Q
|
||||
return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id))
|
||||
|
||||
|
||||
@router.get("/machine/current", response=MachineSchema, url_name="get_current_machine")
|
||||
def get_current_machine(request):
|
||||
"""Get the current machine."""
|
||||
from machine.models import Machine
|
||||
return Machine.current()
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Dependency Endpoints
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/dependencies", response=List[DependencySchema], url_name="get_dependencies")
|
||||
@paginate(CustomPagination)
|
||||
def get_dependencies(request, filters: DependencyFilterSchema = Query(...)):
|
||||
"""List all dependencies."""
|
||||
from machine.models import Dependency
|
||||
return filters.filter(Dependency.objects.all()).distinct()
|
||||
|
||||
|
||||
@router.get("/dependency/{dependency_id}", response=DependencySchema, url_name="get_dependency")
|
||||
def get_dependency(request, dependency_id: str):
|
||||
"""Get a specific dependency by ID or bin_name."""
|
||||
from machine.models import Dependency
|
||||
from django.db.models import Q
|
||||
try:
|
||||
return Dependency.objects.get(Q(id__startswith=dependency_id))
|
||||
except Dependency.DoesNotExist:
|
||||
return Dependency.objects.get(bin_name__iexact=dependency_id)
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# InstalledBinary Endpoints
|
||||
# ============================================================================
|
||||
|
||||
@router.get("/binaries", response=List[InstalledBinarySchema], url_name="get_binaries")
|
||||
@paginate(CustomPagination)
|
||||
def get_binaries(request, filters: InstalledBinaryFilterSchema = Query(...)):
|
||||
"""List all installed binaries."""
|
||||
from machine.models import InstalledBinary
|
||||
return filters.filter(InstalledBinary.objects.all().select_related('machine', 'dependency')).distinct()
|
||||
|
||||
|
||||
@router.get("/binary/{binary_id}", response=InstalledBinarySchema, url_name="get_binary")
|
||||
def get_binary(request, binary_id: str):
|
||||
"""Get a specific installed binary by ID."""
|
||||
from machine.models import InstalledBinary
|
||||
return InstalledBinary.objects.select_related('machine', 'dependency').get(id__startswith=binary_id)
|
||||
|
||||
|
||||
@router.get("/binary/by-name/{name}", response=List[InstalledBinarySchema], url_name="get_binaries_by_name")
|
||||
def get_binaries_by_name(request, name: str):
|
||||
"""Get all installed binaries with the given name."""
|
||||
from machine.models import InstalledBinary
|
||||
return list(InstalledBinary.objects.filter(name__iexact=name).select_related('machine', 'dependency'))
|
||||
@@ -4,125 +4,157 @@ from uuid import UUID
|
||||
from typing import List, Any
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
from ninja import Router, Schema
|
||||
|
||||
|
||||
router = Router(tags=['Workers and Tasks'])
|
||||
|
||||
|
||||
class TaskSchema(Schema):
|
||||
class QueueItemSchema(Schema):
|
||||
"""Schema for a single item in a worker's queue."""
|
||||
TYPE: str
|
||||
|
||||
id: UUID
|
||||
description: str
|
||||
|
||||
status: str
|
||||
retry_at: datetime | None
|
||||
|
||||
created_at: datetime
|
||||
modified_at: datetime
|
||||
created_by_id: int
|
||||
|
||||
description: str
|
||||
|
||||
@staticmethod
|
||||
def resolve_TYPE(obj) -> str:
|
||||
return f'{obj._meta.app_label}.{obj._meta.model_name}'
|
||||
|
||||
@staticmethod
|
||||
def resolve_description(obj) -> str:
|
||||
return str(obj)
|
||||
|
||||
|
||||
class ActorSchema(Schema):
|
||||
# TYPE: str = 'workers.actor.ActorType'
|
||||
|
||||
# name: str
|
||||
#pid: int | None
|
||||
idle_count: int
|
||||
launch_kwargs: dict[str, Any]
|
||||
mode: str
|
||||
|
||||
class WorkerSchema(Schema):
|
||||
"""Schema for a Worker type."""
|
||||
name: str
|
||||
model: str
|
||||
statemachine: str
|
||||
ACTIVE_STATE: str
|
||||
EVENT_NAME: str
|
||||
CLAIM_ORDER: list[str]
|
||||
CLAIM_FROM_TOP_N: int
|
||||
CLAIM_ATOMIC: bool
|
||||
MAX_TICK_TIME: int
|
||||
MAX_CONCURRENT_ACTORS: int
|
||||
|
||||
future: list[TaskSchema]
|
||||
pending: list[TaskSchema]
|
||||
stalled: list[TaskSchema]
|
||||
active: list[TaskSchema]
|
||||
past: list[TaskSchema]
|
||||
|
||||
max_tick_time: int
|
||||
max_concurrent_tasks: int
|
||||
poll_interval: float
|
||||
idle_timeout: int
|
||||
running_count: int
|
||||
running_workers: List[dict[str, Any]]
|
||||
queue_count: int
|
||||
queue: List[QueueItemSchema]
|
||||
|
||||
@staticmethod
|
||||
def resolve_model(obj) -> str:
|
||||
return obj.Model.__name__
|
||||
|
||||
@staticmethod
|
||||
def resolve_statemachine(obj) -> str:
|
||||
return obj.StateMachineClass.__name__
|
||||
|
||||
@staticmethod
|
||||
def resolve_name(obj) -> str:
|
||||
return str(obj)
|
||||
Model = obj.get_model()
|
||||
return f'{Model._meta.app_label}.{Model._meta.model_name}'
|
||||
|
||||
@staticmethod
|
||||
def resolve_ACTIVE_STATE(obj) -> str:
|
||||
return str(obj.ACTIVE_STATE)
|
||||
|
||||
@staticmethod
|
||||
def resolve_FINAL_STATES(obj) -> list[str]:
|
||||
return [str(state) for state in obj.FINAL_STATES]
|
||||
|
||||
@staticmethod
|
||||
def resolve_future(obj) -> list[TaskSchema]:
|
||||
return [obj for obj in obj.qs.filter(obj.future_q).order_by('-retry_at')]
|
||||
|
||||
@staticmethod
|
||||
def resolve_pending(obj) -> list[TaskSchema]:
|
||||
return [obj for obj in obj.qs.filter(obj.pending_q).order_by('-retry_at')]
|
||||
|
||||
@staticmethod
|
||||
def resolve_stalled(obj) -> list[TaskSchema]:
|
||||
return [obj for obj in obj.qs.filter(obj.stalled_q).order_by('-retry_at')]
|
||||
|
||||
@staticmethod
|
||||
def resolve_active(obj) -> list[TaskSchema]:
|
||||
return [obj for obj in obj.qs.filter(obj.active_q).order_by('-retry_at')]
|
||||
def resolve_max_tick_time(obj) -> int:
|
||||
return obj.MAX_TICK_TIME
|
||||
|
||||
@staticmethod
|
||||
def resolve_past(obj) -> list[TaskSchema]:
|
||||
return [obj for obj in obj.qs.filter(obj.final_q).order_by('-modified_at')]
|
||||
def resolve_max_concurrent_tasks(obj) -> int:
|
||||
return obj.MAX_CONCURRENT_TASKS
|
||||
|
||||
@staticmethod
|
||||
def resolve_poll_interval(obj) -> float:
|
||||
return obj.POLL_INTERVAL
|
||||
|
||||
@staticmethod
|
||||
def resolve_idle_timeout(obj) -> int:
|
||||
return obj.IDLE_TIMEOUT
|
||||
|
||||
@staticmethod
|
||||
def resolve_running_count(obj) -> int:
|
||||
return len(obj.get_running_workers())
|
||||
|
||||
@staticmethod
|
||||
def resolve_running_workers(obj) -> List[dict[str, Any]]:
|
||||
return obj.get_running_workers()
|
||||
|
||||
@staticmethod
|
||||
def resolve_queue_count(obj) -> int:
|
||||
return obj.get_queue().count()
|
||||
|
||||
@staticmethod
|
||||
def resolve_queue(obj) -> List[QueueItemSchema]:
|
||||
return list(obj.get_queue()[:50]) # Limit to 50 items
|
||||
|
||||
|
||||
class OrchestratorSchema(Schema):
|
||||
# TYPE: str = 'workers.orchestrator.Orchestrator'
|
||||
|
||||
#pid: int | None
|
||||
exit_on_idle: bool
|
||||
mode: str
|
||||
|
||||
actors: list[ActorSchema]
|
||||
|
||||
@staticmethod
|
||||
def resolve_actors(obj) -> list[ActorSchema]:
|
||||
return [actor() for actor in obj.actor_types.values()]
|
||||
"""Schema for the Orchestrator."""
|
||||
is_running: bool
|
||||
poll_interval: float
|
||||
idle_timeout: int
|
||||
max_workers_per_type: int
|
||||
max_total_workers: int
|
||||
total_worker_count: int
|
||||
workers: List[WorkerSchema]
|
||||
|
||||
|
||||
@router.get("/orchestrators", response=List[OrchestratorSchema], url_name="get_orchestrators")
|
||||
def get_orchestrators(request):
|
||||
"""List all the task orchestrators (aka Orchestrators) that are currently running"""
|
||||
|
||||
@router.get("/orchestrator", response=OrchestratorSchema, url_name="get_orchestrator")
|
||||
def get_orchestrator(request):
|
||||
"""Get the orchestrator status and all worker queues."""
|
||||
from workers.orchestrator import Orchestrator
|
||||
from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
|
||||
|
||||
orchestrator = Orchestrator()
|
||||
|
||||
return [orchestrator]
|
||||
# Create temporary worker instances to query their queues
|
||||
workers = [
|
||||
CrawlWorker(worker_id=-1),
|
||||
SnapshotWorker(worker_id=-1),
|
||||
ArchiveResultWorker(worker_id=-1),
|
||||
]
|
||||
|
||||
return {
|
||||
'is_running': orchestrator.is_running(),
|
||||
'poll_interval': orchestrator.POLL_INTERVAL,
|
||||
'idle_timeout': orchestrator.IDLE_TIMEOUT,
|
||||
'max_workers_per_type': orchestrator.MAX_WORKERS_PER_TYPE,
|
||||
'max_total_workers': orchestrator.MAX_TOTAL_WORKERS,
|
||||
'total_worker_count': orchestrator.get_total_worker_count(),
|
||||
'workers': workers,
|
||||
}
|
||||
|
||||
|
||||
@router.get("/actors", response=List[ActorSchema], url_name="get_actors")
|
||||
def get_actors(request):
|
||||
"""List all the task consumer workers (aka Actors) that are currently running"""
|
||||
@router.get("/workers", response=List[WorkerSchema], url_name="get_workers")
|
||||
def get_workers(request):
|
||||
"""List all worker types and their current status."""
|
||||
from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
|
||||
|
||||
from workers.orchestrator import Orchestrator
|
||||
orchestrator = Orchestrator()
|
||||
return orchestrator.actor_types.values()
|
||||
# Create temporary instances to query their queues
|
||||
return [
|
||||
CrawlWorker(worker_id=-1),
|
||||
SnapshotWorker(worker_id=-1),
|
||||
ArchiveResultWorker(worker_id=-1),
|
||||
]
|
||||
|
||||
|
||||
@router.get("/worker/{worker_name}", response=WorkerSchema, url_name="get_worker")
|
||||
def get_worker(request, worker_name: str):
|
||||
"""Get status and queue for a specific worker type."""
|
||||
from workers.worker import WORKER_TYPES
|
||||
|
||||
if worker_name not in WORKER_TYPES:
|
||||
from ninja.errors import HttpError
|
||||
raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}")
|
||||
|
||||
WorkerClass = WORKER_TYPES[worker_name]
|
||||
return WorkerClass(worker_id=-1)
|
||||
|
||||
|
||||
@router.get("/worker/{worker_name}/queue", response=List[QueueItemSchema], url_name="get_worker_queue")
|
||||
def get_worker_queue(request, worker_name: str, limit: int = 100):
|
||||
"""Get the current queue for a specific worker type."""
|
||||
from workers.worker import WORKER_TYPES
|
||||
|
||||
if worker_name not in WORKER_TYPES:
|
||||
from ninja.errors import HttpError
|
||||
raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}")
|
||||
|
||||
WorkerClass = WORKER_TYPES[worker_name]
|
||||
worker = WorkerClass(worker_id=-1)
|
||||
return list(worker.get_queue()[:limit])
|
||||
|
||||
|
||||
# Progress endpoint moved to core.views.live_progress_view for simplicity
|
||||
|
||||
@@ -2,76 +2,226 @@
|
||||
|
||||
__package__ = 'archivebox.base_models'
|
||||
|
||||
import json
|
||||
|
||||
from django import forms
|
||||
from django.contrib import admin
|
||||
from django.utils.html import format_html, mark_safe
|
||||
from django_object_actions import DjangoObjectActions
|
||||
|
||||
|
||||
class KeyValueWidget(forms.Widget):
|
||||
"""
|
||||
A widget that renders JSON dict as editable key-value input fields
|
||||
with + and - buttons to add/remove rows.
|
||||
Includes autocomplete for available config keys from the plugin system.
|
||||
"""
|
||||
template_name = None # We render manually
|
||||
|
||||
class Media:
|
||||
css = {
|
||||
'all': []
|
||||
}
|
||||
js = []
|
||||
|
||||
def _get_config_options(self):
|
||||
"""Get available config options from plugins."""
|
||||
try:
|
||||
from archivebox.hooks import discover_plugin_configs
|
||||
plugin_configs = discover_plugin_configs()
|
||||
options = {}
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
for key, prop in schema.get('properties', {}).items():
|
||||
options[key] = {
|
||||
'plugin': plugin_name,
|
||||
'type': prop.get('type', 'string'),
|
||||
'default': prop.get('default', ''),
|
||||
'description': prop.get('description', ''),
|
||||
}
|
||||
return options
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
def render(self, name, value, attrs=None, renderer=None):
|
||||
# Parse JSON value to dict
|
||||
if value is None:
|
||||
data = {}
|
||||
elif isinstance(value, str):
|
||||
try:
|
||||
data = json.loads(value) if value else {}
|
||||
except json.JSONDecodeError:
|
||||
data = {}
|
||||
elif isinstance(value, dict):
|
||||
data = value
|
||||
else:
|
||||
data = {}
|
||||
|
||||
widget_id = attrs.get('id', name) if attrs else name
|
||||
config_options = self._get_config_options()
|
||||
|
||||
# Build datalist options
|
||||
datalist_options = '\n'.join(
|
||||
f'<option value="{self._escape(key)}">{self._escape(opt["description"][:60] or opt["type"])}</option>'
|
||||
for key, opt in sorted(config_options.items())
|
||||
)
|
||||
|
||||
# Build config metadata as JSON for JS
|
||||
config_meta_json = json.dumps(config_options)
|
||||
|
||||
html = f'''
|
||||
<div id="{widget_id}_container" class="key-value-editor" style="max-width: 700px;">
|
||||
<datalist id="{widget_id}_keys">
|
||||
{datalist_options}
|
||||
</datalist>
|
||||
<div id="{widget_id}_rows" class="key-value-rows">
|
||||
'''
|
||||
|
||||
# Render existing key-value pairs
|
||||
row_idx = 0
|
||||
for key, val in data.items():
|
||||
val_str = json.dumps(val) if not isinstance(val, str) else val
|
||||
html += self._render_row(widget_id, row_idx, key, val_str)
|
||||
row_idx += 1
|
||||
|
||||
# Always add one empty row for new entries
|
||||
html += self._render_row(widget_id, row_idx, '', '')
|
||||
|
||||
html += f'''
|
||||
</div>
|
||||
<div style="display: flex; gap: 8px; align-items: center; margin-top: 8px;">
|
||||
<button type="button" onclick="addKeyValueRow_{widget_id}()"
|
||||
style="padding: 4px 12px; cursor: pointer; background: #417690; color: white; border: none; border-radius: 4px;">
|
||||
+ Add Row
|
||||
</button>
|
||||
<span id="{widget_id}_hint" style="font-size: 11px; color: #666; font-style: italic;"></span>
|
||||
</div>
|
||||
<input type="hidden" name="{name}" id="{widget_id}" value="">
|
||||
<script>
|
||||
(function() {{
|
||||
var configMeta_{widget_id} = {config_meta_json};
|
||||
|
||||
function showKeyHint_{widget_id}(key) {{
|
||||
var hint = document.getElementById('{widget_id}_hint');
|
||||
var meta = configMeta_{widget_id}[key];
|
||||
if (meta) {{
|
||||
hint.innerHTML = '<b>' + key + '</b>: ' + (meta.description || meta.type) +
|
||||
(meta.default !== '' ? ' <span style="color:#888">(default: ' + meta.default + ')</span>' : '');
|
||||
}} else {{
|
||||
hint.textContent = key ? 'Custom key: ' + key : '';
|
||||
}}
|
||||
}}
|
||||
|
||||
function updateHiddenField_{widget_id}() {{
|
||||
var container = document.getElementById('{widget_id}_rows');
|
||||
var rows = container.querySelectorAll('.key-value-row');
|
||||
var result = {{}};
|
||||
rows.forEach(function(row) {{
|
||||
var keyInput = row.querySelector('.kv-key');
|
||||
var valInput = row.querySelector('.kv-value');
|
||||
if (keyInput && valInput && keyInput.value.trim()) {{
|
||||
var key = keyInput.value.trim();
|
||||
var val = valInput.value.trim();
|
||||
// Try to parse as JSON (for booleans, numbers, etc)
|
||||
try {{
|
||||
if (val === 'true') result[key] = true;
|
||||
else if (val === 'false') result[key] = false;
|
||||
else if (val === 'null') result[key] = null;
|
||||
else if (!isNaN(val) && val !== '') result[key] = Number(val);
|
||||
else if ((val.startsWith('{{') && val.endsWith('}}')) ||
|
||||
(val.startsWith('[') && val.endsWith(']')) ||
|
||||
(val.startsWith('"') && val.endsWith('"')))
|
||||
result[key] = JSON.parse(val);
|
||||
else result[key] = val;
|
||||
}} catch(e) {{
|
||||
result[key] = val;
|
||||
}}
|
||||
}}
|
||||
}});
|
||||
document.getElementById('{widget_id}').value = JSON.stringify(result);
|
||||
}}
|
||||
|
||||
window.addKeyValueRow_{widget_id} = function() {{
|
||||
var container = document.getElementById('{widget_id}_rows');
|
||||
var rows = container.querySelectorAll('.key-value-row');
|
||||
var newIdx = rows.length;
|
||||
var newRow = document.createElement('div');
|
||||
newRow.className = 'key-value-row';
|
||||
newRow.style.cssText = 'display: flex; gap: 8px; margin-bottom: 6px; align-items: center;';
|
||||
newRow.innerHTML = '<input type="text" class="kv-key" placeholder="KEY" list="{widget_id}_keys" ' +
|
||||
'style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;" ' +
|
||||
'onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}(); showKeyHint_{widget_id}(this.value)" onfocus="showKeyHint_{widget_id}(this.value)">' +
|
||||
'<input type="text" class="kv-value" placeholder="value" ' +
|
||||
'style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;" ' +
|
||||
'onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}()">' +
|
||||
'<button type="button" onclick="removeKeyValueRow_{widget_id}(this)" ' +
|
||||
'style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;">−</button>';
|
||||
container.appendChild(newRow);
|
||||
newRow.querySelector('.kv-key').focus();
|
||||
}};
|
||||
|
||||
window.removeKeyValueRow_{widget_id} = function(btn) {{
|
||||
var row = btn.parentElement;
|
||||
row.remove();
|
||||
updateHiddenField_{widget_id}();
|
||||
}};
|
||||
|
||||
window.showKeyHint_{widget_id} = showKeyHint_{widget_id};
|
||||
window.updateHiddenField_{widget_id} = updateHiddenField_{widget_id};
|
||||
|
||||
// Initialize on load
|
||||
document.addEventListener('DOMContentLoaded', function() {{
|
||||
updateHiddenField_{widget_id}();
|
||||
}});
|
||||
// Also run immediately in case DOM is already ready
|
||||
if (document.readyState !== 'loading') {{
|
||||
updateHiddenField_{widget_id}();
|
||||
}}
|
||||
|
||||
// Update on any input change
|
||||
document.getElementById('{widget_id}_rows').addEventListener('input', updateHiddenField_{widget_id});
|
||||
}})();
|
||||
</script>
|
||||
</div>
|
||||
'''
|
||||
return mark_safe(html)
|
||||
|
||||
def _render_row(self, widget_id, idx, key, value):
|
||||
return f'''
|
||||
<div class="key-value-row" style="display: flex; gap: 8px; margin-bottom: 6px; align-items: center;">
|
||||
<input type="text" class="kv-key" value="{self._escape(key)}" placeholder="KEY" list="{widget_id}_keys"
|
||||
style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;"
|
||||
onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}(); showKeyHint_{widget_id}(this.value)" onfocus="showKeyHint_{widget_id}(this.value)">
|
||||
<input type="text" class="kv-value" value="{self._escape(value)}" placeholder="value"
|
||||
style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;"
|
||||
onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}()">
|
||||
<button type="button" onclick="removeKeyValueRow_{widget_id}(this)"
|
||||
style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;">−</button>
|
||||
</div>
|
||||
'''
|
||||
|
||||
def _escape(self, s):
|
||||
"""Escape HTML special chars in attribute values."""
|
||||
if not s:
|
||||
return ''
|
||||
return str(s).replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
|
||||
|
||||
def value_from_datadict(self, data, files, name):
|
||||
value = data.get(name, '{}')
|
||||
return value
|
||||
|
||||
|
||||
class ConfigEditorMixin:
|
||||
"""
|
||||
Mixin for admin classes with a config JSON field.
|
||||
|
||||
Provides a readonly field that shows available config options
|
||||
from all discovered plugin schemas.
|
||||
Provides a key-value editor widget with autocomplete for available config keys.
|
||||
"""
|
||||
|
||||
@admin.display(description='Available Config Options')
|
||||
def available_config_options(self, obj):
|
||||
"""Show documentation for available config keys."""
|
||||
try:
|
||||
from archivebox.hooks import discover_plugin_configs
|
||||
plugin_configs = discover_plugin_configs()
|
||||
except ImportError:
|
||||
return format_html('<i>Plugin config system not available</i>')
|
||||
|
||||
html_parts = [
|
||||
'<details>',
|
||||
'<summary style="cursor: pointer; font-weight: bold; padding: 4px;">',
|
||||
'Click to see available config keys ({})</summary>'.format(
|
||||
sum(len(s.get('properties', {})) for s in plugin_configs.values())
|
||||
),
|
||||
'<div style="max-height: 400px; overflow-y: auto; padding: 8px; background: #f8f8f8; border-radius: 4px; font-family: monospace; font-size: 11px;">',
|
||||
]
|
||||
|
||||
for plugin_name, schema in sorted(plugin_configs.items()):
|
||||
properties = schema.get('properties', {})
|
||||
if not properties:
|
||||
continue
|
||||
|
||||
html_parts.append(f'<div style="margin: 8px 0;"><strong style="color: #333;">{plugin_name}</strong></div>')
|
||||
html_parts.append('<table style="width: 100%; border-collapse: collapse; margin-bottom: 12px;">')
|
||||
html_parts.append('<tr style="background: #eee;"><th style="text-align: left; padding: 4px;">Key</th><th style="text-align: left; padding: 4px;">Type</th><th style="text-align: left; padding: 4px;">Default</th><th style="text-align: left; padding: 4px;">Description</th></tr>')
|
||||
|
||||
for key, prop in sorted(properties.items()):
|
||||
prop_type = prop.get('type', 'string')
|
||||
default = prop.get('default', '')
|
||||
description = prop.get('description', '')
|
||||
|
||||
# Truncate long defaults
|
||||
default_str = str(default)
|
||||
if len(default_str) > 30:
|
||||
default_str = default_str[:27] + '...'
|
||||
|
||||
html_parts.append(
|
||||
f'<tr style="border-bottom: 1px solid #ddd;">'
|
||||
f'<td style="padding: 4px; font-weight: bold;">{key}</td>'
|
||||
f'<td style="padding: 4px; color: #666;">{prop_type}</td>'
|
||||
f'<td style="padding: 4px; color: #666;">{default_str}</td>'
|
||||
f'<td style="padding: 4px;">{description}</td>'
|
||||
f'</tr>'
|
||||
)
|
||||
|
||||
html_parts.append('</table>')
|
||||
|
||||
html_parts.append('</div></details>')
|
||||
html_parts.append(
|
||||
'<p style="margin-top: 8px; color: #666; font-size: 11px;">'
|
||||
'<strong>Usage:</strong> Add key-value pairs in JSON format, e.g., '
|
||||
'<code>{"SAVE_WGET": false, "WGET_TIMEOUT": 120}</code>'
|
||||
'</p>'
|
||||
)
|
||||
|
||||
return mark_safe(''.join(html_parts))
|
||||
def formfield_for_dbfield(self, db_field, request, **kwargs):
|
||||
"""Use KeyValueWidget for the config JSON field."""
|
||||
if db_field.name == 'config':
|
||||
kwargs['widget'] = KeyValueWidget()
|
||||
return super().formfield_for_dbfield(db_field, request, **kwargs)
|
||||
|
||||
|
||||
class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin):
|
||||
|
||||
@@ -72,9 +72,10 @@ def add(urls: str | list[str],
|
||||
cli_args[0] = 'archivebox'
|
||||
cmd_str = ' '.join(cli_args)
|
||||
|
||||
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||
seed = Seed.from_file(
|
||||
sources_file,
|
||||
label=f'{USER}@{HOSTNAME} $ {cmd_str}',
|
||||
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
|
||||
parser=parser,
|
||||
tag=tag,
|
||||
created_by=created_by_id,
|
||||
|
||||
@@ -11,21 +11,53 @@ __package__ = "archivebox.config"
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional, List, Type, TYPE_CHECKING, cast
|
||||
from typing import Any, Dict, Optional, List, Type, Tuple, TYPE_CHECKING, cast
|
||||
from configparser import ConfigParser
|
||||
|
||||
from pydantic import Field
|
||||
from pydantic_settings import BaseSettings
|
||||
from pydantic_settings import BaseSettings, PydanticBaseSettingsSource
|
||||
|
||||
|
||||
class IniConfigSettingsSource(PydanticBaseSettingsSource):
|
||||
"""
|
||||
Custom settings source that reads from ArchiveBox.conf (INI format).
|
||||
Flattens all sections into a single namespace.
|
||||
"""
|
||||
|
||||
def get_field_value(self, field: Any, field_name: str) -> Tuple[Any, str, bool]:
|
||||
config_vals = self._load_config_file()
|
||||
field_value = config_vals.get(field_name.upper())
|
||||
return field_value, field_name, False
|
||||
|
||||
def __call__(self) -> Dict[str, Any]:
|
||||
return self._load_config_file()
|
||||
|
||||
def _load_config_file(self) -> Dict[str, Any]:
|
||||
try:
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
config_path = CONSTANTS.CONFIG_FILE
|
||||
except ImportError:
|
||||
return {}
|
||||
|
||||
if not config_path.exists():
|
||||
return {}
|
||||
|
||||
parser = ConfigParser()
|
||||
parser.optionxform = lambda x: x # preserve case
|
||||
parser.read(config_path)
|
||||
|
||||
# Flatten all sections into single namespace (ignore section headers)
|
||||
return {key.upper(): value for section in parser.sections() for key, value in parser.items(section)}
|
||||
|
||||
|
||||
class BaseConfigSet(BaseSettings):
|
||||
"""
|
||||
Base class for config sections.
|
||||
|
||||
Automatically loads values from:
|
||||
1. Environment variables (highest priority)
|
||||
2. ArchiveBox.conf file (if exists)
|
||||
3. Default values (lowest priority)
|
||||
Automatically loads values from (highest to lowest priority):
|
||||
1. Environment variables
|
||||
2. ArchiveBox.conf file (INI format, flattened)
|
||||
3. Default values
|
||||
|
||||
Subclasses define fields with defaults and types:
|
||||
|
||||
@@ -35,11 +67,30 @@ class BaseConfigSet(BaseSettings):
|
||||
"""
|
||||
|
||||
class Config:
|
||||
# Use env vars with ARCHIVEBOX_ prefix or raw name
|
||||
env_prefix = ""
|
||||
extra = "ignore"
|
||||
validate_default = True
|
||||
|
||||
@classmethod
|
||||
def settings_customise_sources(
|
||||
cls,
|
||||
settings_cls: Type[BaseSettings],
|
||||
init_settings: PydanticBaseSettingsSource,
|
||||
env_settings: PydanticBaseSettingsSource,
|
||||
dotenv_settings: PydanticBaseSettingsSource,
|
||||
file_secret_settings: PydanticBaseSettingsSource,
|
||||
) -> Tuple[PydanticBaseSettingsSource, ...]:
|
||||
"""
|
||||
Define the order of settings sources (first = highest priority).
|
||||
"""
|
||||
return (
|
||||
init_settings, # 1. Passed to __init__
|
||||
env_settings, # 2. Environment variables
|
||||
IniConfigSettingsSource(settings_cls), # 3. ArchiveBox.conf file
|
||||
# dotenv_settings, # Skip .env files
|
||||
# file_secret_settings, # Skip secrets files
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def load_from_file(cls, config_path: Path) -> Dict[str, str]:
|
||||
"""Load config values from INI file."""
|
||||
@@ -47,7 +98,7 @@ class BaseConfigSet(BaseSettings):
|
||||
return {}
|
||||
|
||||
parser = ConfigParser()
|
||||
parser.optionxform = lambda x: x # type: ignore # preserve case
|
||||
parser.optionxform = lambda x: x # preserve case
|
||||
parser.read(config_path)
|
||||
|
||||
# Flatten all sections into single namespace
|
||||
|
||||
@@ -256,7 +256,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
# Show a helpful message when no plugins found
|
||||
rows['Name'].append('(no plugins found)')
|
||||
rows['Source'].append('-')
|
||||
rows['Path'].append(format_html('<code>archivebox/plugins/</code> or <code>data/plugins/</code>'))
|
||||
rows['Path'].append(mark_safe('<code>archivebox/plugins/</code> or <code>data/plugins/</code>'))
|
||||
rows['Hooks'].append('-')
|
||||
|
||||
return TableContext(
|
||||
|
||||
@@ -9,25 +9,17 @@ from django.core.exceptions import ValidationError
|
||||
from django.urls import reverse, resolve
|
||||
from django.utils import timezone
|
||||
|
||||
from huey_monitor.admin import TaskModel
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.misc.paginators import AccelleratedPaginator
|
||||
from archivebox.base_models.admin import BaseModelAdmin
|
||||
from archivebox.hooks import get_extractor_icon
|
||||
|
||||
|
||||
from core.models import ArchiveResult, Snapshot
|
||||
|
||||
|
||||
|
||||
|
||||
def result_url(result: TaskModel) -> str:
|
||||
url = reverse("admin:huey_monitor_taskmodel_change", args=[str(result.id)])
|
||||
return format_html('<a href="{url}" class="fade-in-progress-url">See progress...</a>'.format(url=url))
|
||||
|
||||
|
||||
|
||||
class ArchiveResultInline(admin.TabularInline):
|
||||
name = 'Archive Results Log'
|
||||
model = ArchiveResult
|
||||
@@ -101,9 +93,9 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
|
||||
|
||||
class ArchiveResultAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor', 'cmd_str', 'output_str')
|
||||
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
|
||||
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
|
||||
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary')
|
||||
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
|
||||
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
||||
fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
|
||||
autocomplete_fields = ['snapshot']
|
||||
@@ -144,17 +136,29 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
def tags_str(self, result):
|
||||
return result.snapshot.tags_str()
|
||||
|
||||
@admin.display(description='Extractor', ordering='extractor')
|
||||
def extractor_with_icon(self, result):
|
||||
icon = get_extractor_icon(result.extractor)
|
||||
return format_html(
|
||||
'<span title="{}">{}</span> {}',
|
||||
result.extractor,
|
||||
icon,
|
||||
result.extractor,
|
||||
)
|
||||
|
||||
def cmd_str(self, result):
|
||||
return format_html(
|
||||
'<pre>{}</pre>',
|
||||
' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
|
||||
)
|
||||
|
||||
|
||||
def output_str(self, result):
|
||||
# Determine output link path - use output if file exists, otherwise link to index
|
||||
output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html'
|
||||
return format_html(
|
||||
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
|
||||
result.snapshot.timestamp,
|
||||
result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html',
|
||||
output_path,
|
||||
result.output,
|
||||
)
|
||||
|
||||
@@ -185,7 +189,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
is_hidden = filename.startswith('.')
|
||||
output_str += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
|
||||
|
||||
return output_str + format_html('</code></pre>')
|
||||
return output_str + mark_safe('</code></pre>')
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -35,8 +35,19 @@ def register_admin_site():
|
||||
|
||||
admin.site = archivebox_admin
|
||||
sites.site = archivebox_admin
|
||||
|
||||
# Plugin admin registration is now handled by individual app admins
|
||||
# No longer using archivebox.pm.hook.register_admin()
|
||||
|
||||
|
||||
# Register admin views for each app
|
||||
# (Previously handled by ABX plugin system, now called directly)
|
||||
from core.admin import register_admin as register_core_admin
|
||||
from crawls.admin import register_admin as register_crawls_admin
|
||||
from api.admin import register_admin as register_api_admin
|
||||
from machine.admin import register_admin as register_machine_admin
|
||||
from workers.admin import register_admin as register_workers_admin
|
||||
|
||||
register_core_admin(archivebox_admin)
|
||||
register_crawls_admin(archivebox_admin)
|
||||
register_api_admin(archivebox_admin)
|
||||
register_machine_admin(archivebox_admin)
|
||||
register_workers_admin(archivebox_admin)
|
||||
|
||||
return archivebox_admin
|
||||
|
||||
@@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add
|
||||
|
||||
from core.models import Tag
|
||||
from core.admin_tags import TagInline
|
||||
from core.admin_archiveresults import ArchiveResultInline, result_url
|
||||
from core.admin_archiveresults import ArchiveResultInline
|
||||
|
||||
|
||||
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
|
||||
@@ -54,10 +54,10 @@ class SnapshotActionForm(ActionForm):
|
||||
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
|
||||
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
|
||||
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'link_dir', 'available_config_options')
|
||||
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir')
|
||||
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
|
||||
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
|
||||
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', 'available_config_options', *readonly_fields[:-1])
|
||||
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields)
|
||||
ordering = ['-created_at']
|
||||
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
|
||||
inlines = [TagInline, ArchiveResultInline]
|
||||
@@ -93,12 +93,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
# self.request = request
|
||||
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
|
||||
|
||||
@admin.action(
|
||||
description="Imported Timestamp"
|
||||
)
|
||||
@admin.display(description="Imported Timestamp")
|
||||
def imported_timestamp(self, obj):
|
||||
context = RequestContext(self.request, {
|
||||
'bookmarked_date': obj.bookmarked,
|
||||
'bookmarked_date': obj.bookmarked_at,
|
||||
'timestamp': obj.timestamp,
|
||||
})
|
||||
|
||||
@@ -145,22 +143,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
|
||||
def status_info(self, obj):
|
||||
return format_html(
|
||||
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
|
||||
'''
|
||||
Archived: {} ({} files {})
|
||||
Favicon: <img src="{}" style="height: 20px"/>
|
||||
Status code: {} <br/>
|
||||
Server: {}
|
||||
Content type: {}
|
||||
Extension: {}
|
||||
''',
|
||||
'✅' if obj.is_archived else '❌',
|
||||
obj.num_outputs,
|
||||
self.size(obj) or '0kb',
|
||||
f'/archive/{obj.timestamp}/favicon.ico',
|
||||
obj.status_code or '-',
|
||||
obj.headers and obj.headers.get('Server') or '-',
|
||||
obj.headers and obj.headers.get('Content-Type') or '-',
|
||||
obj.extension or '-',
|
||||
)
|
||||
|
||||
@@ -184,8 +175,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
obj.archive_path,
|
||||
obj.archive_path,
|
||||
obj.archive_path,
|
||||
'fetched' if obj.latest_title or obj.title else 'pending',
|
||||
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
|
||||
'fetched' if obj.title else 'pending',
|
||||
urldecode(htmldecode(obj.title or ''))[:128] or 'Pending...'
|
||||
) + mark_safe(f' <span class="tags">{tags}</span>')
|
||||
|
||||
@admin.display(
|
||||
@@ -259,14 +250,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
description="ℹ️ Get Title"
|
||||
)
|
||||
def update_titles(self, request, queryset):
|
||||
from core.models import Snapshot
|
||||
count = queryset.count()
|
||||
|
||||
# Queue snapshots for archiving via the state machine system
|
||||
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
|
||||
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
|
||||
messages.success(
|
||||
request,
|
||||
mark_safe(f"Title and favicon are updating in the background for {count} URLs. {result_url(result)}"),
|
||||
f"Queued {queued} snapshots for title/favicon update. The orchestrator will process them in the background.",
|
||||
)
|
||||
|
||||
@admin.action(
|
||||
@@ -275,11 +265,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
def update_snapshots(self, request, queryset):
|
||||
count = queryset.count()
|
||||
|
||||
result = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
|
||||
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
|
||||
|
||||
messages.success(
|
||||
request,
|
||||
mark_safe(f"Re-trying any previously failed methods for {count} URLs in the background. {result_url(result)}"),
|
||||
f"Queued {queued} snapshots for re-archiving. The orchestrator will process them in the background.",
|
||||
)
|
||||
|
||||
|
||||
@@ -291,11 +281,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
timestamp = timezone.now().isoformat('T', 'seconds')
|
||||
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
|
||||
|
||||
result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
|
||||
bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
|
||||
|
||||
messages.success(
|
||||
request,
|
||||
mark_safe(f"Creating new fresh snapshots for {queryset.count()} URLs in the background. {result_url(result)}"),
|
||||
f"Creating {queryset.count()} new fresh snapshots. The orchestrator will process them in the background.",
|
||||
)
|
||||
|
||||
@admin.action(
|
||||
@@ -304,11 +294,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
def overwrite_snapshots(self, request, queryset):
|
||||
count = queryset.count()
|
||||
|
||||
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
|
||||
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
|
||||
|
||||
messages.success(
|
||||
request,
|
||||
mark_safe(f"Clearing all previous results and re-downloading {count} URLs in the background. {result_url(result)}"),
|
||||
f"Queued {queued} snapshots for full re-archive (overwriting existing). The orchestrator will process them in the background.",
|
||||
)
|
||||
|
||||
@admin.action(
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
import sys
|
||||
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
@@ -10,6 +12,41 @@ class CoreConfig(AppConfig):
|
||||
"""Register the archivebox.core.admin_site as the main django admin site"""
|
||||
from core.admin_site import register_admin_site
|
||||
register_admin_site()
|
||||
|
||||
|
||||
# Auto-start the orchestrator when running the web server
|
||||
self._maybe_start_orchestrator()
|
||||
|
||||
def _maybe_start_orchestrator(self):
|
||||
"""Start the orchestrator if we're running a web server."""
|
||||
import os
|
||||
|
||||
# Don't start orchestrator during migrations, shell, tests, etc.
|
||||
# Only start when running: runserver, daphne, gunicorn, uwsgi
|
||||
if not self._is_web_server():
|
||||
return
|
||||
|
||||
# Don't start if RUN_ORCHESTRATOR env var is explicitly set to false
|
||||
if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'):
|
||||
return
|
||||
|
||||
# Don't start in autoreload child process (avoid double-start)
|
||||
if os.environ.get('RUN_MAIN') != 'true' and 'runserver' in sys.argv:
|
||||
return
|
||||
|
||||
try:
|
||||
from workers.orchestrator import Orchestrator
|
||||
|
||||
if not Orchestrator.is_running():
|
||||
# Start orchestrator as daemon (won't exit on idle when started by server)
|
||||
orchestrator = Orchestrator(exit_on_idle=False)
|
||||
orchestrator.start()
|
||||
except Exception as e:
|
||||
# Don't crash the server if orchestrator fails to start
|
||||
import logging
|
||||
logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}')
|
||||
|
||||
def _is_web_server(self) -> bool:
|
||||
"""Check if we're running a web server command."""
|
||||
# Check for common web server indicators
|
||||
server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server')
|
||||
return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands)
|
||||
|
||||
@@ -23,7 +23,11 @@ from archivebox.config import CONSTANTS
|
||||
from archivebox.misc.system import get_dir_size, atomic_write
|
||||
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
|
||||
from archivebox.misc.hashing import get_dir_info
|
||||
from archivebox.hooks import ARCHIVE_METHODS_INDEXING_PRECEDENCE
|
||||
from archivebox.hooks import (
|
||||
ARCHIVE_METHODS_INDEXING_PRECEDENCE,
|
||||
get_extractors, get_extractor_name, get_extractor_icon,
|
||||
DEFAULT_EXTRACTOR_ICONS,
|
||||
)
|
||||
from archivebox.base_models.models import (
|
||||
ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
|
||||
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
|
||||
@@ -343,45 +347,37 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
def icons(self) -> str:
|
||||
"""Generate HTML icons showing which extractors have succeeded for this snapshot"""
|
||||
from django.utils.html import format_html, mark_safe
|
||||
from collections import defaultdict
|
||||
|
||||
cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
|
||||
|
||||
def calc_icons():
|
||||
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
|
||||
archive_results = [r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output]
|
||||
archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output}
|
||||
else:
|
||||
archive_results = self.archiveresult_set.filter(status="succeeded", output__isnull=False)
|
||||
archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)}
|
||||
|
||||
path = self.archive_path
|
||||
canon = self.canonical_outputs()
|
||||
output = ""
|
||||
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
|
||||
icons = {
|
||||
"singlefile": "❶", "wget": "🆆", "dom": "🅷", "pdf": "📄",
|
||||
"screenshot": "💻", "media": "📼", "git": "🅶", "archive_org": "🏛",
|
||||
"readability": "🆁", "mercury": "🅼", "warc": "📦"
|
||||
}
|
||||
exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]
|
||||
|
||||
extractor_outputs = defaultdict(lambda: None)
|
||||
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
|
||||
for result in archive_results:
|
||||
if result.extractor == extractor:
|
||||
extractor_outputs[extractor] = result
|
||||
# Get all extractors from hooks system (sorted by numeric prefix)
|
||||
all_extractors = [get_extractor_name(e) for e in get_extractors()]
|
||||
|
||||
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
|
||||
if extractor not in exclude:
|
||||
existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
||||
output += format_html(output_template, path, canon.get(extractor, ''), str(bool(existing)), extractor, icons.get(extractor, "?"))
|
||||
if extractor == "wget":
|
||||
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
||||
output += format_html(output_template, path, canon.get("warc", "warc/"), str(bool(exists)), "warc", icons.get("warc", "?"))
|
||||
if extractor == "archive_org":
|
||||
exists = extractor in extractor_outputs and extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
||||
output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon.get("archive_org", ""), str(exists), "archive_org", icons.get("archive_org", "?"))
|
||||
for extractor in all_extractors:
|
||||
result = archive_results.get(extractor)
|
||||
existing = result and result.status == 'succeeded' and result.output
|
||||
icon = get_extractor_icon(extractor)
|
||||
output += format_html(
|
||||
output_template,
|
||||
path,
|
||||
canon.get(extractor, extractor + '/'),
|
||||
str(bool(existing)),
|
||||
extractor,
|
||||
icon
|
||||
)
|
||||
|
||||
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
|
||||
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
|
||||
|
||||
cache_result = cache.get(cache_key)
|
||||
if cache_result:
|
||||
@@ -767,12 +763,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
FAILED = 'failed', 'Failed'
|
||||
SKIPPED = 'skipped', 'Skipped'
|
||||
|
||||
EXTRACTOR_CHOICES = (
|
||||
('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'),
|
||||
('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'),
|
||||
('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'),
|
||||
('dom', 'dom'), ('title', 'title'), ('wget', 'wget'),
|
||||
)
|
||||
@classmethod
|
||||
def get_extractor_choices(cls):
|
||||
"""Get extractor choices from discovered hooks (for forms/admin)."""
|
||||
extractors = [get_extractor_name(e) for e in get_extractors()]
|
||||
return tuple((e, e) for e in extractors)
|
||||
|
||||
# Keep AutoField for backward compatibility with 0.7.x databases
|
||||
# UUID field is added separately by migration for new records
|
||||
@@ -783,7 +778,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore
|
||||
extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32, blank=False, null=False, db_index=True)
|
||||
# No choices= constraint - extractor names come from plugin system and can be any string
|
||||
extractor = models.CharField(max_length=32, blank=False, null=False, db_index=True)
|
||||
pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
|
||||
cmd = models.JSONField(default=None, null=True, blank=True)
|
||||
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
|
||||
@@ -835,6 +831,25 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
def output_exists(self) -> bool:
|
||||
return os.path.exists(Path(self.snapshot_dir) / self.extractor)
|
||||
|
||||
def embed_path(self) -> Optional[str]:
|
||||
"""
|
||||
Get the relative path to the embeddable output file for this result.
|
||||
|
||||
Returns the output field if set and file exists, otherwise tries to
|
||||
find a reasonable default based on the extractor type.
|
||||
"""
|
||||
if self.output:
|
||||
return self.output
|
||||
|
||||
# Try to find output file based on extractor's canonical output path
|
||||
canonical = self.snapshot.canonical_outputs()
|
||||
extractor_key = f'{self.extractor}_path'
|
||||
if extractor_key in canonical:
|
||||
return canonical[extractor_key]
|
||||
|
||||
# Fallback to extractor directory
|
||||
return f'{self.extractor}/'
|
||||
|
||||
def create_output_dir(self):
|
||||
output_dir = Path(self.snapshot_dir) / self.extractor
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
@@ -891,6 +906,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
output_dir=extractor_dir,
|
||||
config_objects=config_objects,
|
||||
url=self.snapshot.url,
|
||||
snapshot_id=str(self.snapshot.id),
|
||||
)
|
||||
end_ts = timezone.now()
|
||||
|
||||
@@ -1000,6 +1016,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
hook,
|
||||
output_dir=self.output_dir,
|
||||
config_objects=config_objects,
|
||||
url=self.snapshot.url,
|
||||
snapshot_id=str(self.snapshot.id),
|
||||
extractor=self.extractor,
|
||||
)
|
||||
|
||||
@@ -68,9 +68,6 @@ INSTALLED_APPS = [
|
||||
# 3rd-party apps from PyPI that need to be loaded last
|
||||
"admin_data_views", # handles rendering some convenient automatic read-only views of data in Django admin
|
||||
"django_extensions", # provides Django Debug Toolbar (and other non-debug helpers)
|
||||
"django_huey", # provides multi-queue support for django huey https://github.com/gaiacoop/django-huey
|
||||
"bx_django_utils", # needed for huey_monitor https://github.com/boxine/bx_django_utils
|
||||
"huey_monitor", # adds an admin UI for monitoring background huey tasks https://github.com/boxine/django-huey-monitor
|
||||
]
|
||||
|
||||
|
||||
@@ -215,70 +212,6 @@ MIGRATION_MODULES = {"signal_webhooks": None}
|
||||
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
|
||||
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
|
||||
|
||||
HUEY = {
|
||||
"huey_class": "huey.SqliteHuey",
|
||||
"filename": CONSTANTS.QUEUE_DATABASE_FILENAME,
|
||||
"name": "commands",
|
||||
"results": True,
|
||||
"store_none": True,
|
||||
"immediate": False,
|
||||
"utc": True,
|
||||
"consumer": {
|
||||
"workers": 1,
|
||||
"worker_type": "thread",
|
||||
"initial_delay": 0.1, # Smallest polling interval, same as -d.
|
||||
"backoff": 1.15, # Exponential backoff using this rate, -b.
|
||||
"max_delay": 10.0, # Max possible polling interval, -m.
|
||||
"scheduler_interval": 1, # Check schedule every second, -s.
|
||||
"periodic": True, # Enable crontab feature.
|
||||
"check_worker_health": True, # Enable worker health checks.
|
||||
"health_check_interval": 1, # Check worker health every second.
|
||||
},
|
||||
}
|
||||
|
||||
# https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up
|
||||
# https://github.com/gaiacoop/django-huey
|
||||
DJANGO_HUEY = {
|
||||
"default": "commands",
|
||||
"queues": {
|
||||
HUEY["name"]: HUEY.copy(),
|
||||
# more registered here at plugin import-time by BaseQueue.register()
|
||||
# Additional huey queues configured via settings
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class HueyDBRouter:
|
||||
"""
|
||||
A router to store all the Huey result k:v / Huey Monitor models in the queue.sqlite3 database.
|
||||
We keep the databases separate because the queue database receives many more reads/writes per second
|
||||
and we want to avoid single-write lock contention with the main database. Also all the in-progress task
|
||||
data is ephemeral/not-important-long-term. This makes it easier to for the user to clear non-critical
|
||||
temp data by just deleting queue.sqlite3 and leaving index.sqlite3.
|
||||
"""
|
||||
|
||||
route_app_labels = {"huey_monitor", "django_huey", "djhuey"}
|
||||
db_name = "queue"
|
||||
|
||||
def db_for_read(self, model, **hints):
|
||||
if model._meta.app_label in self.route_app_labels:
|
||||
return self.db_name
|
||||
return "default"
|
||||
|
||||
def db_for_write(self, model, **hints):
|
||||
if model._meta.app_label in self.route_app_labels:
|
||||
return self.db_name
|
||||
return "default"
|
||||
|
||||
def allow_relation(self, obj1, obj2, **hints):
|
||||
if obj1._meta.app_label in self.route_app_labels or obj2._meta.app_label in self.route_app_labels:
|
||||
return obj1._meta.app_label == obj2._meta.app_label
|
||||
return None
|
||||
|
||||
def allow_migrate(self, db, app_label, model_name=None, **hints):
|
||||
if app_label in self.route_app_labels:
|
||||
return db == self.db_name
|
||||
return db == "default"
|
||||
|
||||
|
||||
# class FilestoreDBRouter:
|
||||
@@ -311,7 +244,7 @@ class HueyDBRouter:
|
||||
# return db == self.db_name
|
||||
# return db == "default"
|
||||
|
||||
DATABASE_ROUTERS = ["core.settings.HueyDBRouter"]
|
||||
DATABASE_ROUTERS = []
|
||||
|
||||
CACHES = {
|
||||
"default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"},
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
from django import template
|
||||
from django.contrib.admin.templatetags.base import InclusionAdminNode
|
||||
|
||||
from django.utils.safestring import mark_safe
|
||||
|
||||
from typing import Union
|
||||
|
||||
from archivebox.hooks import (
|
||||
get_extractor_icon, get_extractor_template, get_extractor_name,
|
||||
)
|
||||
|
||||
|
||||
register = template.Library()
|
||||
|
||||
@@ -44,3 +48,115 @@ def url_replace(context, **kwargs):
|
||||
dict_ = context['request'].GET.copy()
|
||||
dict_.update(**kwargs)
|
||||
return dict_.urlencode()
|
||||
|
||||
|
||||
@register.simple_tag
|
||||
def extractor_icon(extractor: str) -> str:
|
||||
"""
|
||||
Render the icon for an extractor.
|
||||
|
||||
Usage: {% extractor_icon "screenshot" %}
|
||||
"""
|
||||
return mark_safe(get_extractor_icon(extractor))
|
||||
|
||||
|
||||
@register.simple_tag(takes_context=True)
|
||||
def extractor_thumbnail(context, result) -> str:
|
||||
"""
|
||||
Render the thumbnail template for an archive result.
|
||||
|
||||
Usage: {% extractor_thumbnail result %}
|
||||
|
||||
Context variables passed to template:
|
||||
- result: ArchiveResult object
|
||||
- snapshot: Parent Snapshot object
|
||||
- output_path: Path to output relative to snapshot dir (from embed_path())
|
||||
- extractor: Extractor base name
|
||||
"""
|
||||
extractor = get_extractor_name(result.extractor)
|
||||
template_str = get_extractor_template(extractor, 'thumbnail')
|
||||
|
||||
if not template_str:
|
||||
return ''
|
||||
|
||||
# Use embed_path() for the display path (includes canonical paths)
|
||||
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
|
||||
|
||||
# Create a mini template and render it with context
|
||||
try:
|
||||
tpl = template.Template(template_str)
|
||||
ctx = template.Context({
|
||||
'result': result,
|
||||
'snapshot': result.snapshot,
|
||||
'output_path': output_path,
|
||||
'extractor': extractor,
|
||||
})
|
||||
return mark_safe(tpl.render(ctx))
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
|
||||
@register.simple_tag(takes_context=True)
|
||||
def extractor_embed(context, result) -> str:
|
||||
"""
|
||||
Render the embed iframe template for an archive result.
|
||||
|
||||
Usage: {% extractor_embed result %}
|
||||
"""
|
||||
extractor = get_extractor_name(result.extractor)
|
||||
template_str = get_extractor_template(extractor, 'embed')
|
||||
|
||||
if not template_str:
|
||||
return ''
|
||||
|
||||
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
|
||||
|
||||
try:
|
||||
tpl = template.Template(template_str)
|
||||
ctx = template.Context({
|
||||
'result': result,
|
||||
'snapshot': result.snapshot,
|
||||
'output_path': output_path,
|
||||
'extractor': extractor,
|
||||
})
|
||||
return mark_safe(tpl.render(ctx))
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
|
||||
@register.simple_tag(takes_context=True)
|
||||
def extractor_fullscreen(context, result) -> str:
|
||||
"""
|
||||
Render the fullscreen template for an archive result.
|
||||
|
||||
Usage: {% extractor_fullscreen result %}
|
||||
"""
|
||||
extractor = get_extractor_name(result.extractor)
|
||||
template_str = get_extractor_template(extractor, 'fullscreen')
|
||||
|
||||
if not template_str:
|
||||
return ''
|
||||
|
||||
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
|
||||
|
||||
try:
|
||||
tpl = template.Template(template_str)
|
||||
ctx = template.Context({
|
||||
'result': result,
|
||||
'snapshot': result.snapshot,
|
||||
'output_path': output_path,
|
||||
'extractor': extractor,
|
||||
})
|
||||
return mark_safe(tpl.render(ctx))
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
|
||||
@register.filter
|
||||
def extractor_name(value: str) -> str:
|
||||
"""
|
||||
Get the base name of an extractor (strips numeric prefix).
|
||||
|
||||
Usage: {{ result.extractor|extractor_name }}
|
||||
"""
|
||||
return get_extractor_name(value)
|
||||
|
||||
@@ -8,7 +8,7 @@ from django.views.generic.base import RedirectView
|
||||
from archivebox.misc.serve_static import serve_static
|
||||
|
||||
from core.admin_site import archivebox_admin
|
||||
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
|
||||
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
|
||||
|
||||
from workers.views import JobsDashboardView
|
||||
|
||||
@@ -43,8 +43,10 @@ urlpatterns = [
|
||||
|
||||
|
||||
path('accounts/', include('django.contrib.auth.urls')),
|
||||
|
||||
path('admin/live-progress/', live_progress_view, name='live_progress'),
|
||||
path('admin/', archivebox_admin.urls),
|
||||
|
||||
|
||||
path("api/", include('api.urls'), name='api'),
|
||||
|
||||
path('health/', HealthCheckView.as_view(), name='healthcheck'),
|
||||
|
||||
@@ -34,6 +34,7 @@ from archivebox.search import query_search_index
|
||||
from core.models import Snapshot
|
||||
from core.forms import AddLinkForm
|
||||
from crawls.models import Seed, Crawl
|
||||
from archivebox.hooks import get_extractors, get_extractor_name
|
||||
|
||||
|
||||
|
||||
@@ -54,8 +55,10 @@ class SnapshotView(View):
|
||||
@staticmethod
|
||||
def render_live_index(request, snapshot):
|
||||
TITLE_LOADING_MSG = 'Not yet archived...'
|
||||
HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org')
|
||||
|
||||
# Dict of extractor -> ArchiveResult object
|
||||
archiveresult_objects = {}
|
||||
# Dict of extractor -> result info dict (for template compatibility)
|
||||
archiveresults = {}
|
||||
|
||||
results = snapshot.archiveresult_set.all()
|
||||
@@ -65,18 +68,21 @@ class SnapshotView(View):
|
||||
abs_path = result.snapshot_dir / (embed_path or 'None')
|
||||
|
||||
if (result.status == 'succeeded'
|
||||
and (result.extractor not in HIDDEN_RESULTS)
|
||||
and embed_path
|
||||
and os.access(abs_path, os.R_OK)
|
||||
and abs_path.exists()):
|
||||
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
|
||||
continue
|
||||
|
||||
# Store the full ArchiveResult object for template tags
|
||||
archiveresult_objects[result.extractor] = result
|
||||
|
||||
result_info = {
|
||||
'name': result.extractor,
|
||||
'path': embed_path,
|
||||
'ts': ts_to_date_str(result.end_ts),
|
||||
'size': abs_path.stat().st_size or '?',
|
||||
'result': result, # Include the full object for template tags
|
||||
}
|
||||
archiveresults[result.extractor] = result_info
|
||||
|
||||
@@ -101,11 +107,11 @@ class SnapshotView(View):
|
||||
}
|
||||
|
||||
|
||||
# iterate through all the files in the snapshot dir and add the biggest ones to1 the result list
|
||||
# iterate through all the files in the snapshot dir and add the biggest ones to the result list
|
||||
snap_dir = Path(snapshot.output_dir)
|
||||
if not os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK):
|
||||
return {}
|
||||
|
||||
|
||||
for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
|
||||
extension = result_file.suffix.lstrip('.').lower()
|
||||
if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions:
|
||||
@@ -121,12 +127,16 @@ class SnapshotView(View):
|
||||
'path': result_file.relative_to(snap_dir),
|
||||
'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
|
||||
'size': file_size,
|
||||
'result': None, # No ArchiveResult object for filesystem-discovered files
|
||||
}
|
||||
|
||||
preferred_types = ('singlefile', 'screenshot', 'wget', 'dom', 'media', 'pdf', 'readability', 'mercury')
|
||||
# Get available extractors from hooks (sorted by numeric prefix for ordering)
|
||||
# Convert to base names for display ordering
|
||||
all_extractors = [get_extractor_name(e) for e in get_extractors()]
|
||||
preferred_types = tuple(all_extractors)
|
||||
all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
|
||||
|
||||
best_result = {'path': 'None'}
|
||||
best_result = {'path': 'None', 'result': None}
|
||||
for result_type in preferred_types:
|
||||
if result_type in archiveresults:
|
||||
best_result = archiveresults[result_type]
|
||||
@@ -157,6 +167,7 @@ class SnapshotView(View):
|
||||
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
|
||||
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
|
||||
'best_result': best_result,
|
||||
'snapshot': snapshot, # Pass the snapshot object for template tags
|
||||
}
|
||||
return render(template_name='core/snapshot_live.html', request=request, context=context)
|
||||
|
||||
@@ -436,7 +447,7 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
def form_valid(self, form):
|
||||
urls = form.cleaned_data["url"]
|
||||
print(f'[+] Adding URL: {urls}')
|
||||
parser = form.cleaned_data["parser"]
|
||||
parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser
|
||||
tag = form.cleaned_data["tag"]
|
||||
depth = 0 if form.cleaned_data["depth"] == "0" else 1
|
||||
extractors = ','.join(form.cleaned_data["archive_methods"])
|
||||
@@ -452,18 +463,19 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
if extractors:
|
||||
input_kwargs.update({"extractors": extractors})
|
||||
|
||||
|
||||
|
||||
from archivebox.config.permissions import HOSTNAME
|
||||
|
||||
|
||||
|
||||
|
||||
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
|
||||
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
|
||||
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
||||
|
||||
|
||||
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
|
||||
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||
seed = Seed.from_file(
|
||||
sources_file,
|
||||
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path}',
|
||||
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
|
||||
parser=parser,
|
||||
tag=tag,
|
||||
created_by=self.request.user.pk,
|
||||
@@ -472,7 +484,7 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
# 'INDEX_ONLY': index_only,
|
||||
# 'OVERWRITE': False,
|
||||
'DEPTH': depth,
|
||||
'EXTRACTORS': parser,
|
||||
'EXTRACTORS': extractors or '',
|
||||
# 'DEFAULT_PERSONA': persona or 'Default',
|
||||
})
|
||||
# 3. create a new Crawl pointing to the Seed
|
||||
@@ -490,10 +502,15 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
self.request,
|
||||
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
|
||||
)
|
||||
# if not bg:
|
||||
# from workers.orchestrator import Orchestrator
|
||||
# orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
|
||||
# orchestrator.start()
|
||||
|
||||
# Start orchestrator in background to process the queued crawl
|
||||
try:
|
||||
from archivebox.workers.tasks import ensure_orchestrator_running
|
||||
ensure_orchestrator_running()
|
||||
except Exception as e:
|
||||
# Orchestrator may already be running via supervisord, or fail to start
|
||||
# This is not fatal - the crawl will be processed when orchestrator runs
|
||||
print(f'[!] Failed to start orchestrator: {e}')
|
||||
|
||||
return redirect(crawl.admin_change_url)
|
||||
|
||||
@@ -513,6 +530,141 @@ class HealthCheckView(View):
|
||||
)
|
||||
|
||||
|
||||
import json
|
||||
from django.http import JsonResponse
|
||||
|
||||
def live_progress_view(request):
|
||||
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
|
||||
try:
|
||||
from workers.orchestrator import Orchestrator
|
||||
from crawls.models import Crawl
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
|
||||
# Get orchestrator status
|
||||
orchestrator_running = Orchestrator.is_running()
|
||||
total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0
|
||||
|
||||
# Get model counts by status
|
||||
crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
|
||||
crawls_started = Crawl.objects.filter(status=Crawl.StatusChoices.STARTED).count()
|
||||
|
||||
# Get recent crawls (last 24 hours)
|
||||
from datetime import timedelta
|
||||
one_day_ago = timezone.now() - timedelta(days=1)
|
||||
crawls_recent = Crawl.objects.filter(created_at__gte=one_day_ago).count()
|
||||
|
||||
snapshots_pending = Snapshot.objects.filter(status=Snapshot.StatusChoices.QUEUED).count()
|
||||
snapshots_started = Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED).count()
|
||||
|
||||
archiveresults_pending = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
|
||||
archiveresults_started = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.STARTED).count()
|
||||
archiveresults_succeeded = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
|
||||
archiveresults_failed = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.FAILED).count()
|
||||
|
||||
# Build hierarchical active crawls with nested snapshots and archive results
|
||||
active_crawls = []
|
||||
for crawl in Crawl.objects.filter(
|
||||
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
|
||||
).order_by('-modified_at')[:10]:
|
||||
# Get snapshots for this crawl
|
||||
crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
|
||||
total_snapshots = crawl_snapshots.count()
|
||||
completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
|
||||
pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()
|
||||
|
||||
# Calculate crawl progress
|
||||
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
|
||||
|
||||
# Get active snapshots for this crawl
|
||||
active_snapshots_for_crawl = []
|
||||
for snapshot in crawl_snapshots.filter(
|
||||
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
|
||||
).order_by('-modified_at')[:5]:
|
||||
# Get archive results for this snapshot
|
||||
snapshot_results = ArchiveResult.objects.filter(snapshot=snapshot)
|
||||
total_extractors = snapshot_results.count()
|
||||
completed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
|
||||
failed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.FAILED).count()
|
||||
pending_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
|
||||
|
||||
# Calculate snapshot progress
|
||||
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
|
||||
|
||||
# Get active extractors for this snapshot
|
||||
active_extractors = [
|
||||
{
|
||||
'id': str(ar.id),
|
||||
'extractor': ar.extractor,
|
||||
'status': ar.status,
|
||||
'started': ar.start_ts.isoformat() if ar.start_ts else None,
|
||||
'progress': 50,
|
||||
}
|
||||
for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
|
||||
]
|
||||
|
||||
active_snapshots_for_crawl.append({
|
||||
'id': str(snapshot.id),
|
||||
'url': snapshot.url[:80],
|
||||
'status': snapshot.status,
|
||||
'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None,
|
||||
'progress': snapshot_progress,
|
||||
'total_extractors': total_extractors,
|
||||
'completed_extractors': completed_extractors,
|
||||
'failed_extractors': failed_extractors,
|
||||
'pending_extractors': pending_extractors,
|
||||
'active_extractors': active_extractors,
|
||||
})
|
||||
|
||||
active_crawls.append({
|
||||
'id': str(crawl.id),
|
||||
'label': str(crawl)[:60],
|
||||
'status': crawl.status,
|
||||
'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
|
||||
'progress': crawl_progress,
|
||||
'max_depth': crawl.max_depth,
|
||||
'total_snapshots': total_snapshots,
|
||||
'completed_snapshots': completed_snapshots,
|
||||
'failed_snapshots': 0,
|
||||
'pending_snapshots': pending_snapshots,
|
||||
'active_snapshots': active_snapshots_for_crawl,
|
||||
})
|
||||
|
||||
return JsonResponse({
|
||||
'orchestrator_running': orchestrator_running,
|
||||
'total_workers': total_workers,
|
||||
'crawls_pending': crawls_pending,
|
||||
'crawls_started': crawls_started,
|
||||
'crawls_recent': crawls_recent,
|
||||
'snapshots_pending': snapshots_pending,
|
||||
'snapshots_started': snapshots_started,
|
||||
'archiveresults_pending': archiveresults_pending,
|
||||
'archiveresults_started': archiveresults_started,
|
||||
'archiveresults_succeeded': archiveresults_succeeded,
|
||||
'archiveresults_failed': archiveresults_failed,
|
||||
'active_crawls': active_crawls,
|
||||
'server_time': timezone.now().isoformat(),
|
||||
})
|
||||
except Exception as e:
|
||||
import traceback
|
||||
return JsonResponse({
|
||||
'error': str(e),
|
||||
'traceback': traceback.format_exc(),
|
||||
'orchestrator_running': False,
|
||||
'total_workers': 0,
|
||||
'crawls_pending': 0,
|
||||
'crawls_started': 0,
|
||||
'crawls_recent': 0,
|
||||
'snapshots_pending': 0,
|
||||
'snapshots_started': 0,
|
||||
'archiveresults_pending': 0,
|
||||
'archiveresults_started': 0,
|
||||
'archiveresults_succeeded': 0,
|
||||
'archiveresults_failed': 0,
|
||||
'active_crawls': [],
|
||||
'server_time': timezone.now().isoformat(),
|
||||
}, status=500)
|
||||
|
||||
|
||||
def find_config_section(key: str) -> str:
|
||||
CONFIGS = get_all_configs()
|
||||
|
||||
|
||||
@@ -1,10 +1,18 @@
|
||||
__package__ = 'archivebox.crawls'
|
||||
|
||||
from django.utils.html import format_html, format_html_join
|
||||
from django.contrib import admin
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from django.utils.html import format_html, format_html_join, mark_safe
|
||||
from django.contrib import admin, messages
|
||||
from django.urls import path
|
||||
from django.http import JsonResponse
|
||||
from django.views.decorators.http import require_POST
|
||||
|
||||
from archivebox import DATA_DIR
|
||||
|
||||
from django_object_actions import action
|
||||
|
||||
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||
|
||||
from core.models import Snapshot
|
||||
@@ -16,8 +24,8 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
||||
search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
||||
|
||||
readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents', 'available_config_options')
|
||||
fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'available_config_options', 'created_by', *readonly_fields[:-1])
|
||||
readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
|
||||
fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
|
||||
|
||||
list_filter = ('extractor', 'created_by')
|
||||
ordering = ['-created_at']
|
||||
@@ -34,19 +42,19 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||
(scheduledcrawl.admin_change_url, scheduledcrawl)
|
||||
for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
|
||||
)) or format_html('<i>No Scheduled Crawls yet...</i>')
|
||||
)) or mark_safe('<i>No Scheduled Crawls yet...</i>')
|
||||
|
||||
def crawls(self, obj):
|
||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||
(crawl.admin_change_url, crawl)
|
||||
for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
|
||||
)) or format_html('<i>No Crawls yet...</i>')
|
||||
)) or mark_safe('<i>No Crawls yet...</i>')
|
||||
|
||||
def snapshots(self, obj):
|
||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||
(snapshot.admin_change_url, snapshot)
|
||||
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
|
||||
)) or format_html('<i>No Snapshots yet...</i>')
|
||||
)) or mark_safe('<i>No Snapshots yet...</i>')
|
||||
|
||||
def contents(self, obj):
|
||||
if obj.uri.startswith('file:///data/'):
|
||||
@@ -69,14 +77,81 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
|
||||
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
|
||||
|
||||
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_contents', 'available_config_options')
|
||||
fields = ('label', 'notes', 'urls', 'config', 'available_config_options', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields[:-1])
|
||||
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
|
||||
fields = ('label', 'notes', 'seed_urls_editor', 'config', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', 'created_at', 'modified_at', 'snapshots')
|
||||
|
||||
list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
|
||||
ordering = ['-created_at', '-retry_at']
|
||||
list_per_page = 100
|
||||
actions = ["delete_selected"]
|
||||
|
||||
change_actions = ['recrawl']
|
||||
|
||||
@action(label='Recrawl', description='Create a new crawl with the same settings')
|
||||
def recrawl(self, request, obj):
|
||||
"""Duplicate this crawl as a new crawl with the same seed and settings."""
|
||||
from django.utils import timezone
|
||||
|
||||
new_crawl = Crawl.objects.create(
|
||||
seed=obj.seed,
|
||||
urls=obj.urls,
|
||||
max_depth=obj.max_depth,
|
||||
config=obj.config,
|
||||
schedule=obj.schedule,
|
||||
label=f"{obj.label} (recrawl)" if obj.label else "",
|
||||
notes=obj.notes,
|
||||
created_by=request.user,
|
||||
status=Crawl.StatusChoices.QUEUED,
|
||||
retry_at=timezone.now(),
|
||||
)
|
||||
|
||||
messages.success(
|
||||
request,
|
||||
f'Created new crawl {new_crawl.id} with the same settings. '
|
||||
f'It will start processing shortly.'
|
||||
)
|
||||
|
||||
# Redirect to the new crawl's change page
|
||||
from django.shortcuts import redirect
|
||||
return redirect('admin:crawls_crawl_change', new_crawl.id)
|
||||
|
||||
def get_urls(self):
|
||||
urls = super().get_urls()
|
||||
custom_urls = [
|
||||
path('<path:object_id>/save_seed_contents/',
|
||||
self.admin_site.admin_view(self.save_seed_contents_view),
|
||||
name='crawls_crawl_save_seed_contents'),
|
||||
]
|
||||
return custom_urls + urls
|
||||
|
||||
def save_seed_contents_view(self, request, object_id):
|
||||
"""Handle saving seed file contents via AJAX."""
|
||||
if request.method != 'POST':
|
||||
return JsonResponse({'success': False, 'error': 'POST required'}, status=405)
|
||||
|
||||
try:
|
||||
crawl = Crawl.objects.get(pk=object_id)
|
||||
except Crawl.DoesNotExist:
|
||||
return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
|
||||
|
||||
if not (crawl.seed and crawl.seed.uri and crawl.seed.uri.startswith('file:///data/')):
|
||||
return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
|
||||
|
||||
try:
|
||||
data = json.loads(request.body)
|
||||
contents = data.get('contents', '')
|
||||
except json.JSONDecodeError:
|
||||
return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
|
||||
|
||||
source_file = DATA_DIR / crawl.seed.uri.replace('file:///data/', '', 1)
|
||||
|
||||
try:
|
||||
# Ensure parent directory exists
|
||||
source_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
source_file.write_text(contents)
|
||||
return JsonResponse({'success': True, 'message': f'Saved {len(contents)} bytes to {source_file.name}'})
|
||||
except Exception as e:
|
||||
return JsonResponse({'success': False, 'error': str(e)}, status=500)
|
||||
|
||||
def num_snapshots(self, obj):
|
||||
return obj.snapshot_set.count()
|
||||
|
||||
@@ -84,35 +159,175 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
return format_html_join('<br/>', '<a href="{}">{}</a>', (
|
||||
(snapshot.admin_change_url, snapshot)
|
||||
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
|
||||
)) or format_html('<i>No Snapshots yet...</i>')
|
||||
|
||||
)) or mark_safe('<i>No Snapshots yet...</i>')
|
||||
|
||||
@admin.display(description='Schedule', ordering='schedule')
|
||||
def schedule_str(self, obj):
|
||||
if not obj.schedule:
|
||||
return format_html('<i>None</i>')
|
||||
return mark_safe('<i>None</i>')
|
||||
return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
|
||||
|
||||
|
||||
@admin.display(description='Seed', ordering='seed')
|
||||
def seed_str(self, obj):
|
||||
if not obj.seed:
|
||||
return format_html('<i>None</i>')
|
||||
return mark_safe('<i>None</i>')
|
||||
return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)
|
||||
|
||||
def seed_contents(self, obj):
|
||||
if not (obj.seed and obj.seed.uri):
|
||||
return format_html('<i>None</i>')
|
||||
|
||||
if obj.seed.uri.startswith('file:///data/'):
|
||||
source_file = DATA_DIR / obj.seed.uri.replace('file:///data/', '', 1)
|
||||
contents = ""
|
||||
|
||||
@admin.display(description='URLs')
|
||||
def seed_urls_editor(self, obj):
|
||||
"""Combined editor showing seed URL and file contents."""
|
||||
widget_id = f'seed_urls_{obj.pk}'
|
||||
|
||||
# Get the seed URI (or use urls field if no seed)
|
||||
seed_uri = ''
|
||||
if obj.seed and obj.seed.uri:
|
||||
seed_uri = obj.seed.uri
|
||||
elif obj.urls:
|
||||
seed_uri = obj.urls
|
||||
|
||||
# Check if it's a local file we can edit
|
||||
is_file = seed_uri.startswith('file:///data/')
|
||||
contents = ""
|
||||
error = None
|
||||
source_file = None
|
||||
|
||||
if is_file:
|
||||
source_file = DATA_DIR / seed_uri.replace('file:///data/', '', 1)
|
||||
try:
|
||||
contents = source_file.read_text().strip()[:14_000]
|
||||
contents = source_file.read_text().strip()
|
||||
except Exception as e:
|
||||
contents = f'Error reading {source_file}: {e}'
|
||||
|
||||
return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
|
||||
|
||||
return format_html('See URLs here: <a href="{}">{}</a>', obj.seed.uri, obj.seed.uri)
|
||||
error = f'Error reading {source_file}: {e}'
|
||||
|
||||
# Escape for safe HTML embedding
|
||||
escaped_uri = seed_uri.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
|
||||
escaped_contents = (contents or '').replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
|
||||
|
||||
# Count lines for auto-expand logic
|
||||
line_count = len(contents.split('\n')) if contents else 0
|
||||
uri_rows = min(max(1, seed_uri.count('\n') + 1), 3)
|
||||
|
||||
html = f'''
|
||||
<div id="{widget_id}_container" style="max-width: 900px;">
|
||||
<!-- Seed URL input (auto-expands) -->
|
||||
<div style="margin-bottom: 12px;">
|
||||
<label style="font-weight: bold; display: block; margin-bottom: 4px;">Seed URL:</label>
|
||||
<textarea id="{widget_id}_uri"
|
||||
style="width: 100%; font-family: monospace; font-size: 13px;
|
||||
padding: 8px; border: 1px solid #ccc; border-radius: 4px;
|
||||
resize: vertical; min-height: 32px; overflow: hidden;"
|
||||
rows="{uri_rows}"
|
||||
placeholder="file:///data/sources/... or https://..."
|
||||
{"readonly" if not obj.pk else ""}>{escaped_uri}</textarea>
|
||||
</div>
|
||||
|
||||
{"" if not is_file else f'''
|
||||
<!-- File contents editor -->
|
||||
<div style="margin-bottom: 8px;">
|
||||
<label style="font-weight: bold; display: block; margin-bottom: 4px;">
|
||||
File Contents: <code style="font-weight: normal; color: #666;">{source_file}</code>
|
||||
</label>
|
||||
{"<div style='color: #dc3545; margin-bottom: 8px;'>" + error + "</div>" if error else ""}
|
||||
<textarea id="{widget_id}_contents"
|
||||
style="width: 100%; height: {min(400, max(150, line_count * 18))}px; font-family: monospace; font-size: 12px;
|
||||
padding: 8px; border: 1px solid #ccc; border-radius: 4px; resize: vertical;"
|
||||
placeholder="Enter URLs, one per line...">{escaped_contents}</textarea>
|
||||
</div>
|
||||
|
||||
<div style="display: flex; gap: 12px; align-items: center; flex-wrap: wrap;">
|
||||
<button type="button" id="{widget_id}_save_btn"
|
||||
onclick="saveSeedUrls_{widget_id}()"
|
||||
style="padding: 8px 20px; background: #417690; color: white; border: none;
|
||||
border-radius: 4px; cursor: pointer; font-weight: bold;">
|
||||
Save URLs
|
||||
</button>
|
||||
<span id="{widget_id}_line_count" style="color: #666; font-size: 12px;"></span>
|
||||
<span id="{widget_id}_status" style="color: #666; font-size: 12px;"></span>
|
||||
</div>
|
||||
'''}
|
||||
|
||||
{"" if is_file else f'''
|
||||
<div style="margin-top: 8px; color: #666;">
|
||||
<a href="{seed_uri}" target="_blank">{seed_uri}</a>
|
||||
</div>
|
||||
'''}
|
||||
|
||||
<script>
|
||||
(function() {{
|
||||
var uriInput = document.getElementById('{widget_id}_uri');
|
||||
var contentsInput = document.getElementById('{widget_id}_contents');
|
||||
var status = document.getElementById('{widget_id}_status');
|
||||
var lineCount = document.getElementById('{widget_id}_line_count');
|
||||
var saveBtn = document.getElementById('{widget_id}_save_btn');
|
||||
|
||||
// Auto-resize URI input
|
||||
function autoResizeUri() {{
|
||||
uriInput.style.height = 'auto';
|
||||
uriInput.style.height = Math.min(100, uriInput.scrollHeight) + 'px';
|
||||
}}
|
||||
uriInput.addEventListener('input', autoResizeUri);
|
||||
autoResizeUri();
|
||||
|
||||
if (contentsInput) {{
|
||||
function updateLineCount() {{
|
||||
var lines = contentsInput.value.split('\\n').filter(function(l) {{ return l.trim(); }});
|
||||
lineCount.textContent = lines.length + ' URLs';
|
||||
}}
|
||||
|
||||
contentsInput.addEventListener('input', function() {{
|
||||
updateLineCount();
|
||||
if (status) {{
|
||||
status.textContent = '(unsaved changes)';
|
||||
status.style.color = '#c4820e';
|
||||
}}
|
||||
}});
|
||||
|
||||
updateLineCount();
|
||||
}}
|
||||
|
||||
window.saveSeedUrls_{widget_id} = function() {{
|
||||
if (!saveBtn) return;
|
||||
saveBtn.disabled = true;
|
||||
saveBtn.textContent = 'Saving...';
|
||||
if (status) status.textContent = '';
|
||||
|
||||
fetch(window.location.pathname + 'save_seed_contents/', {{
|
||||
method: 'POST',
|
||||
headers: {{
|
||||
'Content-Type': 'application/json',
|
||||
'X-CSRFToken': document.querySelector('[name=csrfmiddlewaretoken]').value
|
||||
}},
|
||||
body: JSON.stringify({{ contents: contentsInput ? contentsInput.value : '' }})
|
||||
}})
|
||||
.then(function(response) {{ return response.json(); }})
|
||||
.then(function(data) {{
|
||||
if (data.success) {{
|
||||
if (status) {{
|
||||
status.textContent = '✓ ' + data.message;
|
||||
status.style.color = '#28a745';
|
||||
}}
|
||||
}} else {{
|
||||
if (status) {{
|
||||
status.textContent = '✗ ' + data.error;
|
||||
status.style.color = '#dc3545';
|
||||
}}
|
||||
}}
|
||||
}})
|
||||
.catch(function(err) {{
|
||||
if (status) {{
|
||||
status.textContent = '✗ Error: ' + err;
|
||||
status.style.color = '#dc3545';
|
||||
}}
|
||||
}})
|
||||
.finally(function() {{
|
||||
saveBtn.disabled = false;
|
||||
saveBtn.textContent = 'Save URLs';
|
||||
}});
|
||||
}};
|
||||
}})();
|
||||
</script>
|
||||
</div>
|
||||
'''
|
||||
return mark_safe(html)
|
||||
|
||||
|
||||
|
||||
@@ -143,14 +358,14 @@ class CrawlScheduleAdmin(BaseModelAdmin):
|
||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||
(crawl.admin_change_url, crawl)
|
||||
for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
|
||||
)) or format_html('<i>No Crawls yet...</i>')
|
||||
)) or mark_safe('<i>No Crawls yet...</i>')
|
||||
|
||||
def snapshots(self, obj):
|
||||
crawl_ids = obj.crawl_set.values_list('pk', flat=True)
|
||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||
(snapshot.admin_change_url, snapshot)
|
||||
for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
|
||||
)) or format_html('<i>No Snapshots yet...</i>')
|
||||
)) or mark_safe('<i>No Snapshots yet...</i>')
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
|
||||
@@ -865,3 +865,189 @@ def export_plugin_config_to_env(
|
||||
return env
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Plugin Template Discovery
|
||||
# =============================================================================
|
||||
#
|
||||
# Plugins can provide custom templates for rendering their output in the UI.
|
||||
# Templates are discovered by filename convention inside each plugin's templates/ dir:
|
||||
#
|
||||
# archivebox/plugins/<plugin_name>/
|
||||
# templates/
|
||||
# icon.html # Icon for admin table view (small inline HTML)
|
||||
# thumbnail.html # Preview thumbnail for snapshot cards
|
||||
# embed.html # Iframe embed content for main preview
|
||||
# fullscreen.html # Fullscreen view template
|
||||
#
|
||||
# Template context variables available:
|
||||
# {{ result }} - ArchiveResult object
|
||||
# {{ snapshot }} - Parent Snapshot object
|
||||
# {{ output_path }} - Path to output file/dir relative to snapshot dir
|
||||
# {{ extractor }} - Extractor name (e.g., 'screenshot', 'singlefile')
|
||||
#
|
||||
|
||||
# Default templates used when plugin doesn't provide one
|
||||
DEFAULT_TEMPLATES = {
|
||||
'icon': '''<span title="{{ extractor }}">{{ icon }}</span>''',
|
||||
'thumbnail': '''
|
||||
<img src="{{ output_path }}"
|
||||
alt="{{ extractor }} output"
|
||||
style="max-width: 100%; max-height: 100px; object-fit: cover;"
|
||||
onerror="this.style.display='none'">
|
||||
''',
|
||||
'embed': '''
|
||||
<iframe src="{{ output_path }}"
|
||||
style="width: 100%; height: 100%; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts">
|
||||
</iframe>
|
||||
''',
|
||||
'fullscreen': '''
|
||||
<iframe src="{{ output_path }}"
|
||||
style="width: 100%; height: 100vh; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||
</iframe>
|
||||
''',
|
||||
}
|
||||
|
||||
# Default icons for known extractors (emoji or short HTML)
|
||||
DEFAULT_EXTRACTOR_ICONS = {
|
||||
'screenshot': '📷',
|
||||
'pdf': '📄',
|
||||
'singlefile': '📦',
|
||||
'dom': '🌐',
|
||||
'wget': '📥',
|
||||
'media': '🎬',
|
||||
'git': '📂',
|
||||
'readability': '📖',
|
||||
'mercury': '☿️',
|
||||
'favicon': '⭐',
|
||||
'title': '📝',
|
||||
'headers': '📋',
|
||||
'archive_org': '🏛️',
|
||||
'htmltotext': '📃',
|
||||
'warc': '🗄️',
|
||||
}
|
||||
|
||||
|
||||
def get_plugin_template(extractor: str, template_name: str) -> Optional[str]:
|
||||
"""
|
||||
Get a plugin template by extractor name and template type.
|
||||
|
||||
Args:
|
||||
extractor: Extractor name (e.g., 'screenshot', '15_singlefile')
|
||||
template_name: One of 'icon', 'thumbnail', 'embed', 'fullscreen'
|
||||
|
||||
Returns:
|
||||
Template content as string, or None if not found.
|
||||
"""
|
||||
base_name = get_extractor_name(extractor)
|
||||
|
||||
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||
if not base_dir.exists():
|
||||
continue
|
||||
|
||||
# Look for plugin directory matching extractor name
|
||||
for plugin_dir in base_dir.iterdir():
|
||||
if not plugin_dir.is_dir():
|
||||
continue
|
||||
|
||||
# Match by directory name (exact or partial)
|
||||
if plugin_dir.name == base_name or plugin_dir.name.endswith(f'_{base_name}'):
|
||||
template_path = plugin_dir / 'templates' / f'{template_name}.html'
|
||||
if template_path.exists():
|
||||
return template_path.read_text()
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_extractor_template(extractor: str, template_name: str) -> str:
|
||||
"""
|
||||
Get template for an extractor, falling back to defaults.
|
||||
|
||||
Args:
|
||||
extractor: Extractor name (e.g., 'screenshot', '15_singlefile')
|
||||
template_name: One of 'icon', 'thumbnail', 'embed', 'fullscreen'
|
||||
|
||||
Returns:
|
||||
Template content as string (plugin template or default).
|
||||
"""
|
||||
# Try plugin-provided template first
|
||||
template = get_plugin_template(extractor, template_name)
|
||||
if template:
|
||||
return template
|
||||
|
||||
# Fall back to default template
|
||||
return DEFAULT_TEMPLATES.get(template_name, '')
|
||||
|
||||
|
||||
def get_extractor_icon(extractor: str) -> str:
|
||||
"""
|
||||
Get the icon for an extractor.
|
||||
|
||||
First checks for plugin-provided icon.html template,
|
||||
then falls back to DEFAULT_EXTRACTOR_ICONS.
|
||||
|
||||
Args:
|
||||
extractor: Extractor name (e.g., 'screenshot', '15_singlefile')
|
||||
|
||||
Returns:
|
||||
Icon HTML/emoji string.
|
||||
"""
|
||||
base_name = get_extractor_name(extractor)
|
||||
|
||||
# Try plugin-provided icon template
|
||||
icon_template = get_plugin_template(extractor, 'icon')
|
||||
if icon_template:
|
||||
return icon_template.strip()
|
||||
|
||||
# Fall back to default icon
|
||||
return DEFAULT_EXTRACTOR_ICONS.get(base_name, '📁')
|
||||
|
||||
|
||||
def get_all_extractor_icons() -> Dict[str, str]:
|
||||
"""
|
||||
Get icons for all discovered extractors.
|
||||
|
||||
Returns:
|
||||
Dict mapping extractor base names to their icons.
|
||||
"""
|
||||
icons = {}
|
||||
for extractor in get_extractors():
|
||||
base_name = get_extractor_name(extractor)
|
||||
icons[base_name] = get_extractor_icon(extractor)
|
||||
return icons
|
||||
|
||||
|
||||
def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
|
||||
"""
|
||||
Discover all plugin templates organized by extractor.
|
||||
|
||||
Returns:
|
||||
Dict mapping extractor names to dicts of template_name -> template_path.
|
||||
e.g., {'screenshot': {'icon': '/path/to/icon.html', 'thumbnail': '/path/to/thumbnail.html'}}
|
||||
"""
|
||||
templates: Dict[str, Dict[str, str]] = {}
|
||||
|
||||
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||
if not base_dir.exists():
|
||||
continue
|
||||
|
||||
for plugin_dir in base_dir.iterdir():
|
||||
if not plugin_dir.is_dir():
|
||||
continue
|
||||
|
||||
templates_dir = plugin_dir / 'templates'
|
||||
if not templates_dir.exists():
|
||||
continue
|
||||
|
||||
plugin_templates = {}
|
||||
for template_file in templates_dir.glob('*.html'):
|
||||
template_name = template_file.stem # icon, thumbnail, embed, fullscreen
|
||||
plugin_templates[template_name] = str(template_file)
|
||||
|
||||
if plugin_templates:
|
||||
templates[plugin_dir.name] = plugin_templates
|
||||
|
||||
return templates
|
||||
|
||||
|
||||
|
||||
@@ -3,16 +3,16 @@ __package__ = 'archivebox.machine'
|
||||
from django.contrib import admin
|
||||
from django.utils.html import format_html
|
||||
|
||||
from archivebox.base_models.admin import BaseModelAdmin
|
||||
from machine.models import Machine, NetworkInterface, InstalledBinary
|
||||
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||
from machine.models import Machine, NetworkInterface, InstalledBinary, Dependency
|
||||
|
||||
|
||||
class MachineAdmin(BaseModelAdmin):
|
||||
class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health')
|
||||
sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
|
||||
|
||||
readonly_fields = ('guid', 'created_at', 'modified_at', 'ips')
|
||||
fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'num_uses_succeeded', 'num_uses_failed')
|
||||
fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'config', 'num_uses_succeeded', 'num_uses_failed')
|
||||
|
||||
list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform')
|
||||
ordering = ['-created_at']
|
||||
@@ -48,15 +48,43 @@ class NetworkInterfaceAdmin(BaseModelAdmin):
|
||||
)
|
||||
|
||||
|
||||
class DependencyAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'bin_name', 'bin_providers', 'is_installed', 'installed_count')
|
||||
sort_fields = ('id', 'created_at', 'bin_name', 'bin_providers')
|
||||
search_fields = ('id', 'bin_name', 'bin_providers')
|
||||
|
||||
readonly_fields = ('id', 'created_at', 'modified_at', 'is_installed', 'installed_count')
|
||||
fields = ('bin_name', 'bin_providers', 'custom_cmds', 'config', *readonly_fields)
|
||||
|
||||
list_filter = ('bin_providers', 'created_at')
|
||||
ordering = ['-created_at']
|
||||
list_per_page = 100
|
||||
actions = ["delete_selected"]
|
||||
|
||||
@admin.display(description='Installed', boolean=True)
|
||||
def is_installed(self, dependency):
|
||||
return dependency.is_installed
|
||||
|
||||
@admin.display(description='# Binaries')
|
||||
def installed_count(self, dependency):
|
||||
count = dependency.installed_binaries.count()
|
||||
if count:
|
||||
return format_html(
|
||||
'<a href="/admin/machine/installedbinary/?dependency__id__exact={}">{}</a>',
|
||||
dependency.id, count,
|
||||
)
|
||||
return '0'
|
||||
|
||||
|
||||
class InstalledBinaryAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'health')
|
||||
list_display = ('id', 'created_at', 'machine_info', 'name', 'dependency_link', 'binprovider', 'version', 'abspath', 'sha256', 'health')
|
||||
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256')
|
||||
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256')
|
||||
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'dependency__bin_name')
|
||||
|
||||
readonly_fields = ('created_at', 'modified_at')
|
||||
fields = ('machine', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed')
|
||||
fields = ('machine', 'dependency', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed')
|
||||
|
||||
list_filter = ('name', 'binprovider', 'machine_id')
|
||||
list_filter = ('name', 'binprovider', 'machine_id', 'dependency')
|
||||
ordering = ['-created_at']
|
||||
list_per_page = 100
|
||||
actions = ["delete_selected"]
|
||||
@@ -68,8 +96,18 @@ class InstalledBinaryAdmin(BaseModelAdmin):
|
||||
installed_binary.machine.id, str(installed_binary.machine.id)[:8], installed_binary.machine.hostname,
|
||||
)
|
||||
|
||||
@admin.display(description='Dependency', ordering='dependency__bin_name')
|
||||
def dependency_link(self, installed_binary):
|
||||
if installed_binary.dependency:
|
||||
return format_html(
|
||||
'<a href="/admin/machine/dependency/{}/change">{}</a>',
|
||||
installed_binary.dependency.id, installed_binary.dependency.bin_name,
|
||||
)
|
||||
return '-'
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
admin_site.register(Machine, MachineAdmin)
|
||||
admin_site.register(NetworkInterface, NetworkInterfaceAdmin)
|
||||
admin_site.register(Dependency, DependencyAdmin)
|
||||
admin_site.register(InstalledBinary, InstalledBinaryAdmin)
|
||||
|
||||
@@ -37,15 +37,13 @@ def apply_migrations(out_dir: Path = DATA_DIR) -> List[str]:
|
||||
"""Apply pending Django migrations"""
|
||||
from django.core.management import call_command
|
||||
|
||||
out1, out2 = StringIO(), StringIO()
|
||||
out1 = StringIO()
|
||||
|
||||
call_command("migrate", interactive=False, database='default', stdout=out1)
|
||||
out1.seek(0)
|
||||
call_command("migrate", "huey_monitor", interactive=False, database='queue', stdout=out2)
|
||||
out2.seek(0)
|
||||
|
||||
return [
|
||||
line.strip() for line in out1.readlines() + out2.readlines() if line.strip()
|
||||
line.strip() for line in out1.readlines() if line.strip()
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -480,6 +480,138 @@ def printable_filesize(num_bytes: Union[int, float]) -> str:
|
||||
return '%3.1f %s' % (num_bytes, 'TB')
|
||||
|
||||
|
||||
@enforce_types
|
||||
def format_duration(seconds: float) -> str:
|
||||
"""Format duration in human-readable form."""
|
||||
if seconds < 1:
|
||||
return f'{seconds*1000:.0f}ms'
|
||||
elif seconds < 60:
|
||||
return f'{seconds:.1f}s'
|
||||
elif seconds < 3600:
|
||||
minutes = int(seconds // 60)
|
||||
secs = int(seconds % 60)
|
||||
return f'{minutes}min {secs}s' if secs else f'{minutes}min'
|
||||
else:
|
||||
hours = int(seconds // 3600)
|
||||
minutes = int((seconds % 3600) // 60)
|
||||
return f'{hours}hr {minutes}min' if minutes else f'{hours}hr'
|
||||
|
||||
|
||||
@enforce_types
|
||||
def truncate_url(url: str, max_length: int = 60) -> str:
|
||||
"""Truncate URL to max_length, keeping domain and adding ellipsis."""
|
||||
if len(url) <= max_length:
|
||||
return url
|
||||
# Try to keep the domain and beginning of path
|
||||
if '://' in url:
|
||||
protocol, rest = url.split('://', 1)
|
||||
if '/' in rest:
|
||||
domain, path = rest.split('/', 1)
|
||||
available = max_length - len(protocol) - len(domain) - 6 # for "://", "/", "..."
|
||||
if available > 10:
|
||||
return f'{protocol}://{domain}/{path[:available]}...'
|
||||
# Fallback: just truncate
|
||||
return url[:max_length-3] + '...'
|
||||
|
||||
|
||||
@enforce_types
|
||||
def log_worker_event(
|
||||
worker_type: str,
|
||||
event: str,
|
||||
indent_level: int = 0,
|
||||
pid: Optional[int] = None,
|
||||
worker_id: Optional[str] = None,
|
||||
url: Optional[str] = None,
|
||||
extractor: Optional[str] = None,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
error: Optional[Exception] = None,
|
||||
) -> None:
|
||||
"""
|
||||
Log a worker event with structured metadata and indentation.
|
||||
|
||||
Args:
|
||||
worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker, etc.)
|
||||
event: Event name (Starting, Completed, Failed, etc.)
|
||||
indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker, 3=ArchiveResultWorker)
|
||||
pid: Process ID
|
||||
worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, extractor for ArchiveResultWorker)
|
||||
url: URL being processed (for SnapshotWorker/ArchiveResultWorker)
|
||||
extractor: Extractor name (for ArchiveResultWorker)
|
||||
metadata: Dict of metadata to show in curly braces
|
||||
error: Exception if event is an error
|
||||
"""
|
||||
indent = ' ' * indent_level
|
||||
|
||||
# Build worker identifier
|
||||
worker_parts = [worker_type]
|
||||
if pid:
|
||||
worker_parts.append(f'pid={pid}')
|
||||
if worker_id and worker_type in ('CrawlWorker', 'Orchestrator'):
|
||||
worker_parts.append(f'id={worker_id}')
|
||||
if url and worker_type == 'SnapshotWorker':
|
||||
worker_parts.append(f'url={truncate_url(url)}')
|
||||
if extractor and worker_type == 'ArchiveResultWorker':
|
||||
worker_parts.append(f'extractor={extractor}')
|
||||
|
||||
worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
|
||||
|
||||
# Build metadata string
|
||||
metadata_str = ''
|
||||
if metadata:
|
||||
# Format metadata nicely
|
||||
meta_parts = []
|
||||
for k, v in metadata.items():
|
||||
if isinstance(v, float):
|
||||
# Format floats nicely (durations, sizes)
|
||||
if 'duration' in k.lower():
|
||||
meta_parts.append(f'{k}: {format_duration(v)}')
|
||||
elif 'size' in k.lower():
|
||||
meta_parts.append(f'{k}: {printable_filesize(int(v))}')
|
||||
else:
|
||||
meta_parts.append(f'{k}: {v:.2f}')
|
||||
elif isinstance(v, int):
|
||||
# Format integers - check if it's a size
|
||||
if 'size' in k.lower() or 'bytes' in k.lower():
|
||||
meta_parts.append(f'{k}: {printable_filesize(v)}')
|
||||
else:
|
||||
meta_parts.append(f'{k}: {v}')
|
||||
elif isinstance(v, (list, tuple)):
|
||||
meta_parts.append(f'{k}: {len(v)}')
|
||||
else:
|
||||
meta_parts.append(f'{k}: {v}')
|
||||
metadata_str = ' {' + ', '.join(meta_parts) + '}'
|
||||
|
||||
# Determine color based on event
|
||||
color = 'white'
|
||||
if event in ('Starting...', 'Started', 'STARTED', 'Started in background'):
|
||||
color = 'green'
|
||||
elif event in ('Processing...', 'PROCESSING'):
|
||||
color = 'blue'
|
||||
elif event in ('Completed', 'COMPLETED', 'All work complete'):
|
||||
color = 'blue'
|
||||
elif event in ('Failed', 'ERROR', 'Failed to spawn worker'):
|
||||
color = 'red'
|
||||
elif event in ('Shutting down', 'SHUTDOWN'):
|
||||
color = 'grey53'
|
||||
|
||||
# Build final message
|
||||
error_str = f' {type(error).__name__}: {error}' if error else ''
|
||||
# Build colored message - worker_label needs to be inside color tags
|
||||
# But first we need to format the color tags separately from the worker label
|
||||
from archivebox.misc.logging import CONSOLE
|
||||
from rich.text import Text
|
||||
|
||||
# Create a Rich Text object for proper formatting
|
||||
text = Text()
|
||||
text.append(indent) # Indentation
|
||||
# Append worker label and event with color
|
||||
text.append(f'{worker_label} {event}{error_str}', style=color)
|
||||
# Append metadata without color
|
||||
text.append(metadata_str)
|
||||
|
||||
CONSOLE.print(text)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def printable_folders(folders: Dict[str, Optional["Snapshot"]], with_headers: bool=False) -> str:
|
||||
return '\n'.join(
|
||||
|
||||
1
archivebox/plugins/archive_org/templates/icon.html
Normal file
1
archivebox/plugins/archive_org/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
🏛️
|
||||
@@ -7,7 +7,7 @@ new plugin-based output structure to the legacy canonical output paths that
|
||||
ArchiveBox has historically used. This maintains backward compatibility with
|
||||
existing tools and scripts that expect outputs at specific locations.
|
||||
|
||||
Canonical output paths (from Snapshot.canonical_outputs()):
|
||||
Canonical output paths:
|
||||
- favicon.ico → favicon/favicon.ico
|
||||
- singlefile.html → singlefile/singlefile.html
|
||||
- readability/content.html → readability/content.html
|
||||
@@ -27,27 +27,20 @@ New plugin outputs:
|
||||
- redirects.json → redirects/redirects.json
|
||||
- console.jsonl → consolelog/console.jsonl
|
||||
|
||||
Usage: on_Snapshot__91_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
|
||||
Usage: on_Snapshot__92_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
|
||||
|
||||
Environment variables:
|
||||
SAVE_CANONICAL_SYMLINKS: Enable canonical symlinks (default: true)
|
||||
DATA_DIR: ArchiveBox data directory
|
||||
ARCHIVE_DIR: Archive output directory
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.plugins.canonical_outputs'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional
|
||||
|
||||
# Configure Django if running standalone
|
||||
if __name__ == '__main__':
|
||||
parent_dir = str(Path(__file__).resolve().parent.parent.parent)
|
||||
if parent_dir not in sys.path:
|
||||
sys.path.insert(0, parent_dir)
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
|
||||
import django
|
||||
django.setup()
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -150,10 +143,7 @@ def create_canonical_symlinks(snapshot_dir: Path) -> Dict[str, bool]:
|
||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||
def main(url: str, snapshot_id: str):
|
||||
"""Create symlinks from plugin outputs to canonical legacy locations."""
|
||||
from datetime import datetime
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
start_ts = datetime.now()
|
||||
start_ts = datetime.now(timezone.utc)
|
||||
status = 'failed'
|
||||
output = None
|
||||
error = ''
|
||||
@@ -161,31 +151,20 @@ def main(url: str, snapshot_id: str):
|
||||
|
||||
try:
|
||||
# Check if enabled
|
||||
from archivebox.config import CONSTANTS
|
||||
save_canonical = os.getenv('SAVE_CANONICAL_SYMLINKS', 'true').lower() in ('true', '1', 'yes', 'on')
|
||||
|
||||
if not save_canonical:
|
||||
click.echo('Skipping canonical symlinks (SAVE_CANONICAL_SYMLINKS=False)')
|
||||
status = 'skipped'
|
||||
end_ts = datetime.now()
|
||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
||||
click.echo(f'STATUS={status}')
|
||||
click.echo(f'RESULT_JSON={{"extractor": "canonical_outputs", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
|
||||
click.echo(json.dumps({'status': status, 'output': 'SAVE_CANONICAL_SYMLINKS=false'}))
|
||||
sys.exit(0)
|
||||
|
||||
# Get snapshot
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
error = f'Snapshot {snapshot_id} not found'
|
||||
raise ValueError(error)
|
||||
# Working directory is the extractor output dir (e.g., <snapshot>/canonical_outputs/)
|
||||
# Parent is the snapshot directory
|
||||
output_dir = Path.cwd()
|
||||
snapshot_dir = output_dir.parent
|
||||
|
||||
# Get snapshot directory
|
||||
snapshot_dir = Path(snapshot.output_dir)
|
||||
if not snapshot_dir.exists():
|
||||
error = f'Snapshot directory not found: {snapshot_dir}'
|
||||
raise FileNotFoundError(error)
|
||||
raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')
|
||||
|
||||
# Create canonical symlinks
|
||||
results = create_canonical_symlinks(snapshot_dir)
|
||||
@@ -203,37 +182,18 @@ def main(url: str, snapshot_id: str):
|
||||
status = 'failed'
|
||||
click.echo(f'Error: {error}', err=True)
|
||||
|
||||
end_ts = datetime.now()
|
||||
duration = (end_ts - start_ts).total_seconds()
|
||||
end_ts = datetime.now(timezone.utc)
|
||||
|
||||
# Print results
|
||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
||||
click.echo(f'DURATION={duration:.2f}')
|
||||
if output:
|
||||
click.echo(f'OUTPUT={output}')
|
||||
click.echo(f'STATUS={status}')
|
||||
|
||||
if error:
|
||||
click.echo(f'ERROR={error}', err=True)
|
||||
|
||||
# Print JSON result
|
||||
import json
|
||||
result_json = {
|
||||
'extractor': 'canonical_outputs',
|
||||
'url': url,
|
||||
'snapshot_id': snapshot_id,
|
||||
# Print JSON result for hook runner
|
||||
result = {
|
||||
'status': status,
|
||||
'start_ts': start_ts.isoformat(),
|
||||
'end_ts': end_ts.isoformat(),
|
||||
'duration': round(duration, 2),
|
||||
'output': output,
|
||||
'symlinks_created': symlinks_created,
|
||||
'error': error or None,
|
||||
'symlinks_created': symlinks_created,
|
||||
}
|
||||
click.echo(f'RESULT_JSON={json.dumps(result_json)}')
|
||||
click.echo(json.dumps(result))
|
||||
|
||||
sys.exit(0 if status == 'succeeded' else 1)
|
||||
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -1,149 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install Chrome/Chromium if not already available.
|
||||
|
||||
Runs at crawl start to ensure Chrome is installed.
|
||||
Uses playwright to install chromium if no system Chrome found.
|
||||
Outputs JSONL for InstalledBinary.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def find_chrome():
|
||||
"""Try to find system Chrome/Chromium."""
|
||||
# Comprehensive list of Chrome/Chromium binary names and paths
|
||||
chromium_names_linux = [
|
||||
'chromium',
|
||||
'chromium-browser',
|
||||
'chromium-browser-beta',
|
||||
'chromium-browser-unstable',
|
||||
'chromium-browser-canary',
|
||||
'chromium-browser-dev',
|
||||
]
|
||||
|
||||
chrome_names_linux = [
|
||||
'google-chrome',
|
||||
'google-chrome-stable',
|
||||
'google-chrome-beta',
|
||||
'google-chrome-canary',
|
||||
'google-chrome-unstable',
|
||||
'google-chrome-dev',
|
||||
'chrome',
|
||||
]
|
||||
|
||||
chrome_paths_macos = [
|
||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
||||
'/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
|
||||
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
||||
]
|
||||
|
||||
chrome_paths_linux = [
|
||||
'/usr/bin/google-chrome',
|
||||
'/usr/bin/google-chrome-stable',
|
||||
'/usr/bin/chromium',
|
||||
'/usr/bin/chromium-browser',
|
||||
'/snap/bin/chromium',
|
||||
'/opt/google/chrome/chrome',
|
||||
]
|
||||
|
||||
all_chrome_names = chrome_names_linux + chromium_names_linux
|
||||
all_chrome_paths = chrome_paths_macos + chrome_paths_linux
|
||||
|
||||
# Check env var first
|
||||
env_path = os.environ.get('CHROME_BINARY', '')
|
||||
if env_path and Path(env_path).is_file():
|
||||
return env_path
|
||||
|
||||
# Try shutil.which for various names
|
||||
for name in all_chrome_names:
|
||||
abspath = shutil.which(name)
|
||||
if abspath:
|
||||
return abspath
|
||||
|
||||
# Check common paths
|
||||
for path in all_chrome_paths:
|
||||
if Path(path).is_file():
|
||||
return path
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
# First try to find system Chrome
|
||||
system_chrome = find_chrome()
|
||||
if system_chrome:
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': 'chrome',
|
||||
'abspath': str(system_chrome),
|
||||
'version': None,
|
||||
'sha256': None,
|
||||
'binprovider': 'env',
|
||||
}))
|
||||
sys.exit(0)
|
||||
|
||||
# If not found in system, try to install chromium via apt/brew
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
AptProvider.model_rebuild()
|
||||
BrewProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# Try chromium-browser or chromium via system package managers
|
||||
for binary_name in ['chromium', 'chromium-browser', 'google-chrome']:
|
||||
try:
|
||||
chrome_binary = Binary(
|
||||
name=binary_name,
|
||||
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
||||
)
|
||||
|
||||
# Try to load, install if not found
|
||||
try:
|
||||
loaded = chrome_binary.load()
|
||||
if not loaded or not loaded.abspath:
|
||||
raise Exception("Not loaded")
|
||||
except Exception:
|
||||
# Install via system package manager
|
||||
loaded = chrome_binary.install()
|
||||
|
||||
if loaded and loaded.abspath:
|
||||
# Output InstalledBinary JSONL
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': 'chrome',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256,
|
||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
||||
}))
|
||||
sys.exit(0)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# If all attempts failed
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'chrome',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
print("Failed to install Chrome/Chromium", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'chrome',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
print(f"Error installing Chrome: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -2,7 +2,7 @@
|
||||
Integration tests for chrome_session plugin
|
||||
|
||||
Tests verify:
|
||||
1. Install hook finds system Chrome or installs chromium
|
||||
1. Validate hook checks for Chrome/Chromium binary
|
||||
2. Verify deps with abx-pkg
|
||||
3. Chrome session script exists
|
||||
"""
|
||||
@@ -14,7 +14,7 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
CHROME_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_chrome.py'
|
||||
CHROME_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_chrome.py'
|
||||
CHROME_SESSION_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_session.js'
|
||||
|
||||
|
||||
@@ -23,37 +23,50 @@ def test_hook_script_exists():
|
||||
assert CHROME_SESSION_HOOK.exists(), f"Hook not found: {CHROME_SESSION_HOOK}"
|
||||
|
||||
|
||||
def test_chrome_install_hook():
|
||||
"""Test chrome install hook to find or install Chrome/Chromium."""
|
||||
def test_chrome_validate_hook():
|
||||
"""Test chrome validate hook checks for Chrome/Chromium binary."""
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(CHROME_INSTALL_HOOK)],
|
||||
[sys.executable, str(CHROME_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'chrome'
|
||||
assert record['abspath']
|
||||
assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}"
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||
if result.returncode == 0:
|
||||
# Binary found - verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'chrome'
|
||||
assert record['abspath']
|
||||
assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}"
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||
else:
|
||||
# Binary not found - verify Dependency JSONL output
|
||||
found_dependency = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
assert record['bin_name'] == 'chrome'
|
||||
found_dependency = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_dependency, "Should output Dependency record when binary not found"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify chrome is available via abx-pkg after hook installation."""
|
||||
"""Verify chrome is available via abx-pkg."""
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
AptProvider.model_rebuild()
|
||||
@@ -75,10 +88,10 @@ def test_verify_deps_with_abx_pkg():
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# If we get here, chrome should still be available from system
|
||||
# If we get here, chrome not available
|
||||
import shutil
|
||||
assert shutil.which('chromium') or shutil.which('chrome') or shutil.which('google-chrome'), \
|
||||
"Chrome should be available after install hook"
|
||||
if not (shutil.which('chromium') or shutil.which('chrome') or shutil.which('google-chrome')):
|
||||
pytest.skip("Chrome/Chromium not available - Dependency record should have been emitted")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
6
archivebox/plugins/dom/templates/embed.html
Normal file
6
archivebox/plugins/dom/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- DOM embed - full iframe of captured DOM HTML -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-embed dom-embed"
|
||||
style="width: 100%; height: 100%; min-height: 500px; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||
</iframe>
|
||||
6
archivebox/plugins/dom/templates/fullscreen.html
Normal file
6
archivebox/plugins/dom/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- DOM fullscreen - full page iframe -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-fullscreen dom-fullscreen"
|
||||
style="width: 100%; height: 100vh; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
|
||||
</iframe>
|
||||
1
archivebox/plugins/dom/templates/icon.html
Normal file
1
archivebox/plugins/dom/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
🌐
|
||||
8
archivebox/plugins/dom/templates/thumbnail.html
Normal file
8
archivebox/plugins/dom/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
||||
<!-- DOM thumbnail - scaled down iframe preview of captured DOM HTML -->
|
||||
<div class="extractor-thumbnail dom-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
|
||||
<iframe src="{{ output_path }}"
|
||||
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
|
||||
loading="lazy"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
</div>
|
||||
1
archivebox/plugins/favicon/templates/icon.html
Normal file
1
archivebox/plugins/favicon/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
⭐
|
||||
@@ -1,68 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install git if not already available.
|
||||
|
||||
Runs at crawl start to ensure git is installed.
|
||||
Outputs JSONL for InstalledBinary.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
AptProvider.model_rebuild()
|
||||
BrewProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# git binary and package have same name
|
||||
git_binary = Binary(
|
||||
name='git',
|
||||
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
||||
)
|
||||
|
||||
# Try to load, install if not found
|
||||
try:
|
||||
loaded = git_binary.load()
|
||||
if not loaded or not loaded.abspath:
|
||||
raise Exception("Not loaded")
|
||||
except Exception:
|
||||
# Install via system package manager
|
||||
loaded = git_binary.install()
|
||||
|
||||
if loaded and loaded.abspath:
|
||||
# Output InstalledBinary JSONL
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': 'git',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256,
|
||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
||||
}))
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'git',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
print("Failed to install git", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'git',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
print(f"Error installing git: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
6
archivebox/plugins/git/templates/embed.html
Normal file
6
archivebox/plugins/git/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Git embed - directory listing of cloned repo -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-embed git-embed"
|
||||
style="width: 100%; height: 100%; min-height: 400px; border: none; background: #fff;"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
6
archivebox/plugins/git/templates/fullscreen.html
Normal file
6
archivebox/plugins/git/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Git fullscreen - full directory listing -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-fullscreen git-fullscreen"
|
||||
style="width: 100%; height: 100vh; border: none; background: #fff;"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
1
archivebox/plugins/git/templates/icon.html
Normal file
1
archivebox/plugins/git/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📂
|
||||
5
archivebox/plugins/git/templates/thumbnail.html
Normal file
5
archivebox/plugins/git/templates/thumbnail.html
Normal file
@@ -0,0 +1,5 @@
|
||||
<!-- Git thumbnail - shows git repository icon and info -->
|
||||
<div class="extractor-thumbnail git-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #f6f8fa; display: flex; flex-direction: column; align-items: center; justify-content: center; padding: 10px;">
|
||||
<span style="font-size: 32px;">📂</span>
|
||||
<span style="font-size: 11px; color: #586069; margin-top: 4px;">Git Repository</span>
|
||||
</div>
|
||||
@@ -2,7 +2,7 @@
|
||||
Integration tests for git plugin
|
||||
|
||||
Tests verify:
|
||||
1. Install hook installs git via abx-pkg
|
||||
1. Validate hook checks for git binary
|
||||
2. Verify deps with abx-pkg
|
||||
3. Standalone git extractor execution
|
||||
"""
|
||||
@@ -17,50 +17,64 @@ import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
GIT_HOOK = PLUGIN_DIR / 'on_Snapshot__12_git.py'
|
||||
GIT_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_git.py'
|
||||
GIT_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_git.py'
|
||||
TEST_URL = 'https://github.com/example/repo.git'
|
||||
|
||||
def test_hook_script_exists():
|
||||
assert GIT_HOOK.exists()
|
||||
|
||||
def test_git_install_hook():
|
||||
"""Test git install hook to install git if needed."""
|
||||
def test_git_validate_hook():
|
||||
"""Test git validate hook checks for git binary."""
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(GIT_INSTALL_HOOK)],
|
||||
[sys.executable, str(GIT_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'git'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||
if result.returncode == 0:
|
||||
# Binary found - verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'git'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||
else:
|
||||
# Binary not found - verify Dependency JSONL output
|
||||
found_dependency = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
assert record['bin_name'] == 'git'
|
||||
assert 'env' in record['bin_providers']
|
||||
found_dependency = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_dependency, "Should output Dependency record when binary not found"
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify git is available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
AptProvider.model_rebuild()
|
||||
BrewProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
"""Verify git is available via abx-pkg."""
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||
git_loaded = git_binary.load()
|
||||
assert git_loaded and git_loaded.abspath, "git should be available after install hook"
|
||||
|
||||
if git_loaded and git_loaded.abspath:
|
||||
assert True, "git is available"
|
||||
else:
|
||||
pytest.skip("git not available - Dependency record should have been emitted")
|
||||
|
||||
def test_reports_missing_git():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
|
||||
1
archivebox/plugins/headers/templates/icon.html
Normal file
1
archivebox/plugins/headers/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📋
|
||||
1
archivebox/plugins/htmltotext/templates/icon.html
Normal file
1
archivebox/plugins/htmltotext/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📃
|
||||
@@ -1,67 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install yt-dlp if not already available.
|
||||
|
||||
Runs at crawl start to ensure yt-dlp is installed.
|
||||
Outputs JSONL for InstalledBinary.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
PipProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# yt-dlp binary and package have same name
|
||||
ytdlp_binary = Binary(
|
||||
name='yt-dlp',
|
||||
binproviders=[PipProvider(), EnvProvider()]
|
||||
)
|
||||
|
||||
# Try to load, install if not found
|
||||
try:
|
||||
loaded = ytdlp_binary.load()
|
||||
if not loaded or not loaded.abspath:
|
||||
raise Exception("Not loaded")
|
||||
except Exception:
|
||||
# Install via pip
|
||||
loaded = ytdlp_binary.install()
|
||||
|
||||
if loaded and loaded.abspath:
|
||||
# Output InstalledBinary JSONL
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': 'yt-dlp',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256,
|
||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
||||
}))
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'yt-dlp',
|
||||
'bin_providers': 'pip,brew,env',
|
||||
}))
|
||||
print("Failed to install yt-dlp", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'yt-dlp',
|
||||
'bin_providers': 'pip,brew,env',
|
||||
}))
|
||||
print(f"Error installing yt-dlp: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
278
archivebox/plugins/media/on_Crawl__00_validate_ytdlp.py
Executable file
278
archivebox/plugins/media/on_Crawl__00_validate_ytdlp.py
Executable file
@@ -0,0 +1,278 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for yt-dlp and its dependencies (node, ffmpeg).
|
||||
|
||||
Runs at crawl start to verify yt-dlp and required binaries are available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, version_flag],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_ytdlp() -> dict | None:
|
||||
"""Find yt-dlp binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider
|
||||
|
||||
class YtdlpBinary(Binary):
|
||||
name: str = 'yt-dlp'
|
||||
binproviders_supported = [PipProvider(), EnvProvider()]
|
||||
|
||||
binary = YtdlpBinary()
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'yt-dlp',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('yt-dlp') or os.environ.get('YTDLP_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'yt-dlp',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def find_node() -> dict | None:
|
||||
"""Find node binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
class NodeBinary(Binary):
|
||||
name: str = 'node'
|
||||
binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
|
||||
overrides: dict = {'apt': {'packages': ['nodejs']}}
|
||||
|
||||
binary = NodeBinary()
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'node',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('node') or os.environ.get('NODE_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'node',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def find_ffmpeg() -> dict | None:
|
||||
"""Find ffmpeg binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
class FfmpegBinary(Binary):
|
||||
name: str = 'ffmpeg'
|
||||
binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
|
||||
|
||||
binary = FfmpegBinary()
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'ffmpeg',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('ffmpeg') or os.environ.get('FFMPEG_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'ffmpeg',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
# Check for yt-dlp (required)
|
||||
ytdlp_result = find_ytdlp()
|
||||
|
||||
# Check for node (required for JS extraction)
|
||||
node_result = find_node()
|
||||
|
||||
# Check for ffmpeg (required for video conversion)
|
||||
ffmpeg_result = find_ffmpeg()
|
||||
|
||||
missing_deps = []
|
||||
|
||||
# Emit results for yt-dlp
|
||||
if ytdlp_result and ytdlp_result.get('abspath'):
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': ytdlp_result['name'],
|
||||
'abspath': ytdlp_result['abspath'],
|
||||
'version': ytdlp_result['version'],
|
||||
'sha256': ytdlp_result['sha256'],
|
||||
'binprovider': ytdlp_result['binprovider'],
|
||||
}))
|
||||
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/YTDLP_BINARY',
|
||||
'value': ytdlp_result['abspath'],
|
||||
}))
|
||||
|
||||
if ytdlp_result['version']:
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/YTDLP_VERSION',
|
||||
'value': ytdlp_result['version'],
|
||||
}))
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'yt-dlp',
|
||||
'bin_providers': 'pip,env',
|
||||
}))
|
||||
missing_deps.append('yt-dlp')
|
||||
|
||||
# Emit results for node
|
||||
if node_result and node_result.get('abspath'):
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': node_result['name'],
|
||||
'abspath': node_result['abspath'],
|
||||
'version': node_result['version'],
|
||||
'sha256': node_result['sha256'],
|
||||
'binprovider': node_result['binprovider'],
|
||||
}))
|
||||
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/NODE_BINARY',
|
||||
'value': node_result['abspath'],
|
||||
}))
|
||||
|
||||
if node_result['version']:
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/NODE_VERSION',
|
||||
'value': node_result['version'],
|
||||
}))
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'node',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
missing_deps.append('node')
|
||||
|
||||
# Emit results for ffmpeg
|
||||
if ffmpeg_result and ffmpeg_result.get('abspath'):
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': ffmpeg_result['name'],
|
||||
'abspath': ffmpeg_result['abspath'],
|
||||
'version': ffmpeg_result['version'],
|
||||
'sha256': ffmpeg_result['sha256'],
|
||||
'binprovider': ffmpeg_result['binprovider'],
|
||||
}))
|
||||
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/FFMPEG_BINARY',
|
||||
'value': ffmpeg_result['abspath'],
|
||||
}))
|
||||
|
||||
if ffmpeg_result['version']:
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/FFMPEG_VERSION',
|
||||
'value': ffmpeg_result['version'],
|
||||
}))
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'ffmpeg',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
missing_deps.append('ffmpeg')
|
||||
|
||||
if missing_deps:
|
||||
print(f"Missing dependencies: {', '.join(missing_deps)}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
else:
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
9
archivebox/plugins/media/templates/embed.html
Normal file
9
archivebox/plugins/media/templates/embed.html
Normal file
@@ -0,0 +1,9 @@
|
||||
<!-- Media embed - video/audio player -->
|
||||
<div class="extractor-embed media-embed" style="width: 100%; height: 100%; min-height: 400px; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
|
||||
<video src="{{ output_path }}"
|
||||
style="max-width: 100%; max-height: 100%;"
|
||||
controls
|
||||
preload="metadata">
|
||||
Your browser does not support the video tag.
|
||||
</video>
|
||||
</div>
|
||||
10
archivebox/plugins/media/templates/fullscreen.html
Normal file
10
archivebox/plugins/media/templates/fullscreen.html
Normal file
@@ -0,0 +1,10 @@
|
||||
<!-- Media fullscreen - full video/audio player -->
|
||||
<div class="extractor-fullscreen media-fullscreen" style="width: 100%; height: 100vh; background: #000; display: flex; align-items: center; justify-content: center;">
|
||||
<video src="{{ output_path }}"
|
||||
style="max-width: 100%; max-height: 100%;"
|
||||
controls
|
||||
autoplay
|
||||
preload="auto">
|
||||
Your browser does not support the video tag.
|
||||
</video>
|
||||
</div>
|
||||
1
archivebox/plugins/media/templates/icon.html
Normal file
1
archivebox/plugins/media/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
🎬
|
||||
14
archivebox/plugins/media/templates/thumbnail.html
Normal file
14
archivebox/plugins/media/templates/thumbnail.html
Normal file
@@ -0,0 +1,14 @@
|
||||
<!-- Media thumbnail - shows video/audio player or placeholder -->
|
||||
<div class="extractor-thumbnail media-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
|
||||
<video src="{{ output_path }}"
|
||||
style="width: 100%; height: 100px; object-fit: contain;"
|
||||
poster=""
|
||||
preload="metadata"
|
||||
muted
|
||||
onerror="this.style.display='none'; this.nextElementSibling.style.display='flex';">
|
||||
</video>
|
||||
<div style="display: none; flex-direction: column; align-items: center; color: #888; font-size: 12px;">
|
||||
<span style="font-size: 32px;">🎬</span>
|
||||
<span>Media</span>
|
||||
</div>
|
||||
</div>
|
||||
@@ -21,7 +21,7 @@ import pytest
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
MEDIA_HOOK = PLUGIN_DIR / 'on_Snapshot__51_media.py'
|
||||
MEDIA_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_ytdlp.py'
|
||||
MEDIA_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_ytdlp.py'
|
||||
TEST_URL = 'https://example.com/video.mp4'
|
||||
|
||||
def test_hook_script_exists():
|
||||
@@ -29,46 +29,72 @@ def test_hook_script_exists():
|
||||
assert MEDIA_HOOK.exists(), f"Hook not found: {MEDIA_HOOK}"
|
||||
|
||||
|
||||
def test_ytdlp_install_hook():
|
||||
"""Test yt-dlp install hook to install yt-dlp if needed."""
|
||||
# Run yt-dlp install hook
|
||||
def test_ytdlp_validate_hook():
|
||||
"""Test yt-dlp validate hook checks for yt-dlp and dependencies (node, ffmpeg)."""
|
||||
# Run yt-dlp validate hook
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MEDIA_INSTALL_HOOK)],
|
||||
[sys.executable, str(MEDIA_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
# Hook exits 0 if all binaries found, 1 if any not found
|
||||
# Parse output for InstalledBinary and Dependency records
|
||||
found_binaries = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
|
||||
found_dependencies = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'yt-dlp'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
name = record['name']
|
||||
if name in found_binaries:
|
||||
assert record['abspath'], f"{name} should have abspath"
|
||||
found_binaries[name] = True
|
||||
elif record.get('type') == 'Dependency':
|
||||
name = record['bin_name']
|
||||
if name in found_dependencies:
|
||||
found_dependencies[name] = True
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
# Each binary should either be found (InstalledBinary) or missing (Dependency)
|
||||
for binary_name in ['yt-dlp', 'node', 'ffmpeg']:
|
||||
assert found_binaries[binary_name] or found_dependencies[binary_name], \
|
||||
f"{binary_name} should have either InstalledBinary or Dependency record"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify yt-dlp is available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
||||
"""Verify yt-dlp, node, and ffmpeg are available via abx-pkg."""
|
||||
from abx_pkg import Binary, PipProvider, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
PipProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
missing_binaries = []
|
||||
|
||||
# Verify yt-dlp is available
|
||||
ytdlp_binary = Binary(name='yt-dlp', binproviders=[PipProvider(), EnvProvider()])
|
||||
ytdlp_loaded = ytdlp_binary.load()
|
||||
assert ytdlp_loaded and ytdlp_loaded.abspath, "yt-dlp should be available after install hook"
|
||||
if not (ytdlp_loaded and ytdlp_loaded.abspath):
|
||||
missing_binaries.append('yt-dlp')
|
||||
|
||||
# Verify node is available (yt-dlp needs it for JS extraction)
|
||||
node_binary = Binary(
|
||||
name='node',
|
||||
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
||||
)
|
||||
node_loaded = node_binary.load()
|
||||
if not (node_loaded and node_loaded.abspath):
|
||||
missing_binaries.append('node')
|
||||
|
||||
# Verify ffmpeg is available (yt-dlp needs it for video conversion)
|
||||
ffmpeg_binary = Binary(name='ffmpeg', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||
ffmpeg_loaded = ffmpeg_binary.load()
|
||||
if not (ffmpeg_loaded and ffmpeg_loaded.abspath):
|
||||
missing_binaries.append('ffmpeg')
|
||||
|
||||
if missing_binaries:
|
||||
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
|
||||
|
||||
def test_handles_non_media_url():
|
||||
"""Test that media extractor handles non-media URLs gracefully via hook."""
|
||||
|
||||
@@ -1,68 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install mercury-parser if not already available.
|
||||
|
||||
Runs at crawl start to ensure mercury-parser is installed.
|
||||
Outputs JSONL for InstalledBinary.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
NpmProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# Note: npm package is @postlight/mercury-parser, binary is mercury-parser
|
||||
mercury_binary = Binary(
|
||||
name='mercury-parser',
|
||||
binproviders=[NpmProvider(), EnvProvider()],
|
||||
overrides={'npm': {'packages': ['@postlight/mercury-parser']}}
|
||||
)
|
||||
|
||||
# Try to load, install if not found
|
||||
try:
|
||||
loaded = mercury_binary.load()
|
||||
if not loaded or not loaded.abspath:
|
||||
raise Exception("Not loaded")
|
||||
except Exception:
|
||||
# Install via npm
|
||||
loaded = mercury_binary.install()
|
||||
|
||||
if loaded and loaded.abspath:
|
||||
# Output InstalledBinary JSONL
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': 'mercury-parser',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256,
|
||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
||||
}))
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'mercury-parser',
|
||||
'bin_providers': 'npm,env',
|
||||
}))
|
||||
print("Failed to install mercury-parser", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'mercury-parser',
|
||||
'bin_providers': 'npm,env',
|
||||
}))
|
||||
print(f"Error installing mercury-parser: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
123
archivebox/plugins/mercury/on_Crawl__00_validate_mercury.py
Executable file
123
archivebox/plugins/mercury/on_Crawl__00_validate_mercury.py
Executable file
@@ -0,0 +1,123 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for postlight-parser binary.
|
||||
|
||||
Runs at crawl start to verify postlight-parser is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_mercury() -> dict | None:
|
||||
"""Find postlight-parser binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider
|
||||
|
||||
class MercuryBinary(Binary):
|
||||
name: str = 'postlight-parser'
|
||||
binproviders_supported = [NpmProvider(), EnvProvider()]
|
||||
overrides: dict = {'npm': {'packages': ['@postlight/parser']}}
|
||||
|
||||
binary = MercuryBinary()
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'postlight-parser',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('postlight-parser') or os.environ.get('MERCURY_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'postlight-parser',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
result = find_mercury()
|
||||
|
||||
if result and result.get('abspath'):
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': result['name'],
|
||||
'abspath': result['abspath'],
|
||||
'version': result['version'],
|
||||
'sha256': result['sha256'],
|
||||
'binprovider': result['binprovider'],
|
||||
}))
|
||||
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/MERCURY_BINARY',
|
||||
'value': result['abspath'],
|
||||
}))
|
||||
|
||||
if result['version']:
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/MERCURY_VERSION',
|
||||
'value': result['version'],
|
||||
}))
|
||||
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'postlight-parser',
|
||||
'bin_providers': 'npm,env',
|
||||
}))
|
||||
print(f"postlight-parser binary not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -6,10 +6,10 @@ Usage: on_Snapshot__mercury.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Creates mercury/ directory with content.html, content.txt, article.json
|
||||
|
||||
Environment variables:
|
||||
MERCURY_BINARY: Path to mercury-parser binary
|
||||
MERCURY_BINARY: Path to postlight-parser binary
|
||||
TIMEOUT: Timeout in seconds (default: 60)
|
||||
|
||||
Note: Requires mercury-parser: npm install -g @postlight/mercury-parser
|
||||
Note: Requires postlight-parser: npm install -g @postlight/parser
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -25,7 +25,7 @@ import rich_click as click
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'mercury'
|
||||
BIN_NAME = 'mercury-parser'
|
||||
BIN_NAME = 'postlight-parser'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = 'mercury'
|
||||
|
||||
@@ -42,12 +42,12 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
|
||||
|
||||
def find_mercury() -> str | None:
|
||||
"""Find mercury-parser binary."""
|
||||
"""Find postlight-parser binary."""
|
||||
mercury = get_env('MERCURY_BINARY')
|
||||
if mercury and os.path.isfile(mercury):
|
||||
return mercury
|
||||
|
||||
for name in ['mercury-parser', 'mercury']:
|
||||
for name in ['postlight-parser']:
|
||||
binary = shutil.which(name)
|
||||
if binary:
|
||||
return binary
|
||||
@@ -56,7 +56,7 @@ def find_mercury() -> str | None:
|
||||
|
||||
|
||||
def get_version(binary: str) -> str:
|
||||
"""Get mercury-parser version."""
|
||||
"""Get postlight-parser version."""
|
||||
try:
|
||||
result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
|
||||
return result.stdout.strip()[:64]
|
||||
@@ -83,12 +83,12 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
|
||||
if result_text.returncode != 0:
|
||||
stderr = result_text.stderr.decode('utf-8', errors='replace')
|
||||
return False, None, f'mercury-parser failed: {stderr[:200]}'
|
||||
return False, None, f'postlight-parser failed: {stderr[:200]}'
|
||||
|
||||
try:
|
||||
text_json = json.loads(result_text.stdout)
|
||||
except json.JSONDecodeError:
|
||||
return False, None, 'mercury-parser returned invalid JSON'
|
||||
return False, None, 'postlight-parser returned invalid JSON'
|
||||
|
||||
if text_json.get('failed'):
|
||||
return False, None, 'Mercury was not able to extract article'
|
||||
@@ -139,7 +139,7 @@ def main(url: str, snapshot_id: str):
|
||||
# Find binary
|
||||
binary = find_mercury()
|
||||
if not binary:
|
||||
print(f'ERROR: mercury-parser binary not found', file=sys.stderr)
|
||||
print(f'ERROR: postlight-parser binary not found', file=sys.stderr)
|
||||
print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
|
||||
print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
6
archivebox/plugins/mercury/templates/embed.html
Normal file
6
archivebox/plugins/mercury/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Mercury embed - Mercury parser article view -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-embed mercury-embed"
|
||||
style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
6
archivebox/plugins/mercury/templates/fullscreen.html
Normal file
6
archivebox/plugins/mercury/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Mercury fullscreen - full Mercury parser article -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-fullscreen mercury-fullscreen"
|
||||
style="width: 100%; height: 100vh; border: none; background: #fefefe;"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
1
archivebox/plugins/mercury/templates/icon.html
Normal file
1
archivebox/plugins/mercury/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
☿️
|
||||
8
archivebox/plugins/mercury/templates/thumbnail.html
Normal file
8
archivebox/plugins/mercury/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
||||
<!-- Mercury thumbnail - shows Mercury parser extracted article content -->
|
||||
<div class="extractor-thumbnail mercury-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fefefe; padding: 8px; font-family: Georgia, serif; font-size: 11px; line-height: 1.4; color: #333;">
|
||||
<iframe src="{{ output_path }}"
|
||||
style="width: 100%; height: 300px; border: none; pointer-events: none;"
|
||||
loading="lazy"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
</div>
|
||||
@@ -21,7 +21,7 @@ import pytest
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
MERCURY_HOOK = PLUGIN_DIR / 'on_Snapshot__53_mercury.py'
|
||||
MERCURY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_mercury.py'
|
||||
MERCURY_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_mercury.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
def test_hook_script_exists():
|
||||
@@ -29,53 +29,70 @@ def test_hook_script_exists():
|
||||
assert MERCURY_HOOK.exists(), f"Hook not found: {MERCURY_HOOK}"
|
||||
|
||||
|
||||
def test_mercury_install_hook():
|
||||
"""Test mercury install hook to install mercury-parser if needed."""
|
||||
# Run mercury install hook
|
||||
def test_mercury_validate_hook():
|
||||
"""Test mercury validate hook checks for postlight-parser."""
|
||||
# Run mercury validate hook
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MERCURY_INSTALL_HOOK)],
|
||||
[sys.executable, str(MERCURY_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'mercury-parser'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||
if result.returncode == 0:
|
||||
# Binary found - verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'postlight-parser'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||
else:
|
||||
# Binary not found - verify Dependency JSONL output
|
||||
found_dependency = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
assert record['bin_name'] == 'postlight-parser'
|
||||
assert 'npm' in record['bin_providers']
|
||||
found_dependency = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_dependency, "Should output Dependency record when binary not found"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify mercury-parser is available via abx-pkg after hook installation."""
|
||||
"""Verify postlight-parser is available via abx-pkg."""
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
NpmProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# Verify mercury-parser is available
|
||||
# Verify postlight-parser is available
|
||||
mercury_binary = Binary(
|
||||
name='mercury-parser',
|
||||
name='postlight-parser',
|
||||
binproviders=[NpmProvider(), EnvProvider()],
|
||||
overrides={'npm': {'packages': ['@postlight/mercury-parser']}}
|
||||
overrides={'npm': {'packages': ['@postlight/parser']}}
|
||||
)
|
||||
mercury_loaded = mercury_binary.load()
|
||||
assert mercury_loaded and mercury_loaded.abspath, "mercury-parser should be available after install hook"
|
||||
|
||||
# If validate hook found it (exit 0), this should succeed
|
||||
# If validate hook didn't find it (exit 1), this may fail unless binprovider installed it
|
||||
if mercury_loaded and mercury_loaded.abspath:
|
||||
assert True, "postlight-parser is available"
|
||||
else:
|
||||
pytest.skip("postlight-parser not available - Dependency record should have been emitted")
|
||||
|
||||
def test_extracts_with_mercury_parser():
|
||||
"""Test full workflow: extract with mercury-parser from real HTML via hook."""
|
||||
"""Test full workflow: extract with postlight-parser from real HTML via hook."""
|
||||
# Prerequisites checked by earlier test
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
|
||||
@@ -2,46 +2,28 @@
|
||||
"""
|
||||
Create a Merkle tree of all archived outputs.
|
||||
|
||||
This plugin runs after all extractors and post-processing complete (priority 92)
|
||||
and generates a cryptographic Merkle tree of all files in the snapshot directory.
|
||||
This provides:
|
||||
- Tamper detection: verify archive integrity
|
||||
- Efficient updates: only re-hash changed files
|
||||
- Compact proofs: prove file inclusion without sending all files
|
||||
- Deduplication: identify identical content across snapshots
|
||||
This plugin runs after all extractors complete (priority 93) and generates
|
||||
a cryptographic Merkle tree of all files in the snapshot directory.
|
||||
|
||||
Output: merkletree/merkletree.json containing:
|
||||
- root_hash: SHA256 hash of the Merkle root
|
||||
- tree: Full tree structure with internal nodes
|
||||
- files: List of all files with their hashes
|
||||
- metadata: Timestamp, file count, total size
|
||||
Output: merkletree.json containing root_hash, tree structure, file list, metadata
|
||||
|
||||
Usage: on_Snapshot__92_merkletree.py --url=<url> --snapshot-id=<uuid>
|
||||
Usage: on_Snapshot__93_merkletree.py --url=<url> --snapshot-id=<uuid>
|
||||
|
||||
Environment variables:
|
||||
SAVE_MERKLETREE: Enable merkle tree generation (default: true)
|
||||
DATA_DIR: ArchiveBox data directory
|
||||
ARCHIVE_DIR: Archive output directory
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.plugins.merkletree'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Optional, Tuple, Any
|
||||
|
||||
# Configure Django if running standalone
|
||||
if __name__ == '__main__':
|
||||
parent_dir = str(Path(__file__).resolve().parent.parent.parent)
|
||||
if parent_dir not in sys.path:
|
||||
sys.path.insert(0, parent_dir)
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
|
||||
import django
|
||||
django.setup()
|
||||
|
||||
import rich_click as click
|
||||
import click
|
||||
|
||||
|
||||
def sha256_file(filepath: Path) -> str:
|
||||
@@ -49,12 +31,10 @@ def sha256_file(filepath: Path) -> str:
|
||||
h = hashlib.sha256()
|
||||
try:
|
||||
with open(filepath, 'rb') as f:
|
||||
# Read in 64kb chunks
|
||||
while chunk := f.read(65536):
|
||||
h.update(chunk)
|
||||
return h.hexdigest()
|
||||
except (OSError, PermissionError):
|
||||
# If we can't read the file, return a null hash
|
||||
return '0' * 64
|
||||
|
||||
|
||||
@@ -64,74 +44,45 @@ def sha256_data(data: bytes) -> str:
|
||||
|
||||
|
||||
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
|
||||
"""
|
||||
Recursively collect all files in snapshot directory.
|
||||
|
||||
Args:
|
||||
snapshot_dir: Root directory to scan
|
||||
exclude_dirs: Directory names to exclude (e.g., ['merkletree', '.git'])
|
||||
|
||||
Returns:
|
||||
List of (relative_path, sha256_hash, file_size) tuples
|
||||
"""
|
||||
"""Recursively collect all files in snapshot directory."""
|
||||
exclude_dirs = exclude_dirs or ['merkletree', '.git', '__pycache__']
|
||||
files = []
|
||||
|
||||
for root, dirs, filenames in os.walk(snapshot_dir):
|
||||
# Filter out excluded directories
|
||||
dirs[:] = [d for d in dirs if d not in exclude_dirs]
|
||||
|
||||
for filename in filenames:
|
||||
filepath = Path(root) / filename
|
||||
rel_path = filepath.relative_to(snapshot_dir)
|
||||
|
||||
# Skip symlinks (we hash the target, not the link)
|
||||
if filepath.is_symlink():
|
||||
continue
|
||||
|
||||
# Compute hash and size
|
||||
file_hash = sha256_file(filepath)
|
||||
file_size = filepath.stat().st_size if filepath.exists() else 0
|
||||
|
||||
files.append((rel_path, file_hash, file_size))
|
||||
|
||||
# Sort by path for deterministic tree
|
||||
files.sort(key=lambda x: str(x[0]))
|
||||
return files
|
||||
|
||||
|
||||
def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
|
||||
"""
|
||||
Build a Merkle tree from a list of leaf hashes.
|
||||
|
||||
Args:
|
||||
file_hashes: List of SHA256 hashes (leaves)
|
||||
|
||||
Returns:
|
||||
(root_hash, tree_levels) where tree_levels is a list of hash lists per level
|
||||
"""
|
||||
"""Build a Merkle tree from a list of leaf hashes."""
|
||||
if not file_hashes:
|
||||
# Empty tree
|
||||
return sha256_data(b''), [[]]
|
||||
|
||||
# Initialize with leaf level
|
||||
tree_levels = [file_hashes.copy()]
|
||||
|
||||
# Build tree bottom-up
|
||||
while len(tree_levels[-1]) > 1:
|
||||
current_level = tree_levels[-1]
|
||||
next_level = []
|
||||
|
||||
# Process pairs
|
||||
for i in range(0, len(current_level), 2):
|
||||
left = current_level[i]
|
||||
|
||||
if i + 1 < len(current_level):
|
||||
# Combine left + right
|
||||
right = current_level[i + 1]
|
||||
combined = left + right
|
||||
else:
|
||||
# Odd number of nodes: duplicate the last one
|
||||
combined = left + left
|
||||
|
||||
parent_hash = sha256_data(combined.encode('utf-8'))
|
||||
@@ -139,67 +90,41 @@ def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
|
||||
|
||||
tree_levels.append(next_level)
|
||||
|
||||
# Root is the single hash at the top level
|
||||
root_hash = tree_levels[-1][0]
|
||||
return root_hash, tree_levels
|
||||
|
||||
|
||||
def create_merkle_tree(snapshot_dir: Path) -> Dict[str, Any]:
|
||||
"""
|
||||
Create a complete Merkle tree of all files in snapshot directory.
|
||||
|
||||
Args:
|
||||
snapshot_dir: The snapshot directory to scan
|
||||
|
||||
Returns:
|
||||
Dict containing root_hash, tree structure, file list, and metadata
|
||||
"""
|
||||
# Collect all files
|
||||
"""Create a complete Merkle tree of all files in snapshot directory."""
|
||||
files = collect_files(snapshot_dir)
|
||||
|
||||
# Extract just the hashes for tree building
|
||||
file_hashes = [file_hash for _, file_hash, _ in files]
|
||||
|
||||
# Build Merkle tree
|
||||
root_hash, tree_levels = build_merkle_tree(file_hashes)
|
||||
|
||||
# Calculate total size
|
||||
total_size = sum(size for _, _, size in files)
|
||||
|
||||
# Prepare file list with metadata
|
||||
file_list = [
|
||||
{
|
||||
'path': str(path),
|
||||
'hash': file_hash,
|
||||
'size': size,
|
||||
}
|
||||
{'path': str(path), 'hash': file_hash, 'size': size}
|
||||
for path, file_hash, size in files
|
||||
]
|
||||
|
||||
# Prepare result
|
||||
result = {
|
||||
return {
|
||||
'root_hash': root_hash,
|
||||
'tree_levels': tree_levels,
|
||||
'files': file_list,
|
||||
'metadata': {
|
||||
'timestamp': datetime.now().isoformat(),
|
||||
'timestamp': datetime.now(timezone.utc).isoformat(),
|
||||
'file_count': len(files),
|
||||
'total_size': total_size,
|
||||
'tree_depth': len(tree_levels),
|
||||
},
|
||||
}
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='URL being archived')
|
||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||
def main(url: str, snapshot_id: str):
|
||||
"""Generate Merkle tree of all archived outputs."""
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
start_ts = datetime.now()
|
||||
start_ts = datetime.now(timezone.utc)
|
||||
status = 'failed'
|
||||
output = None
|
||||
error = ''
|
||||
@@ -211,30 +136,19 @@ def main(url: str, snapshot_id: str):
|
||||
save_merkletree = os.getenv('SAVE_MERKLETREE', 'true').lower() in ('true', '1', 'yes', 'on')
|
||||
|
||||
if not save_merkletree:
|
||||
click.echo('Skipping merkle tree (SAVE_MERKLETREE=False)')
|
||||
status = 'skipped'
|
||||
end_ts = datetime.now()
|
||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
||||
click.echo(f'STATUS={status}')
|
||||
click.echo(f'RESULT_JSON={{"extractor": "merkletree", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
|
||||
click.echo(json.dumps({'status': status, 'output': 'SAVE_MERKLETREE=false'}))
|
||||
sys.exit(0)
|
||||
|
||||
# Get snapshot
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
error = f'Snapshot {snapshot_id} not found'
|
||||
raise ValueError(error)
|
||||
# Working directory is the extractor output dir (e.g., <snapshot>/merkletree/)
|
||||
# Parent is the snapshot directory
|
||||
output_dir = Path.cwd()
|
||||
snapshot_dir = output_dir.parent
|
||||
|
||||
# Get snapshot directory
|
||||
snapshot_dir = Path(snapshot.output_dir)
|
||||
if not snapshot_dir.exists():
|
||||
error = f'Snapshot directory not found: {snapshot_dir}'
|
||||
raise FileNotFoundError(error)
|
||||
raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')
|
||||
|
||||
# Create output directory
|
||||
output_dir = snapshot_dir / 'merkletree'
|
||||
# Ensure output directory exists
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
output_path = output_dir / 'merkletree.json'
|
||||
|
||||
@@ -246,49 +160,31 @@ def main(url: str, snapshot_id: str):
|
||||
json.dump(merkle_data, f, indent=2)
|
||||
|
||||
status = 'succeeded'
|
||||
output = str(output_path)
|
||||
output = 'merkletree.json'
|
||||
root_hash = merkle_data['root_hash']
|
||||
file_count = merkle_data['metadata']['file_count']
|
||||
total_size = merkle_data['metadata']['total_size']
|
||||
|
||||
click.echo(f'Merkle tree created: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes')
|
||||
click.echo(f'Merkle tree: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes')
|
||||
|
||||
except Exception as e:
|
||||
error = f'{type(e).__name__}: {e}'
|
||||
status = 'failed'
|
||||
click.echo(f'Error: {error}', err=True)
|
||||
|
||||
end_ts = datetime.now()
|
||||
duration = (end_ts - start_ts).total_seconds()
|
||||
end_ts = datetime.now(timezone.utc)
|
||||
|
||||
# Print results
|
||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
||||
click.echo(f'DURATION={duration:.2f}')
|
||||
if output:
|
||||
click.echo(f'OUTPUT={output}')
|
||||
click.echo(f'STATUS={status}')
|
||||
|
||||
if error:
|
||||
click.echo(f'ERROR={error}', err=True)
|
||||
|
||||
# Print JSON result
|
||||
result_json = {
|
||||
'extractor': 'merkletree',
|
||||
'url': url,
|
||||
'snapshot_id': snapshot_id,
|
||||
# Print JSON result for hook runner
|
||||
result = {
|
||||
'status': status,
|
||||
'start_ts': start_ts.isoformat(),
|
||||
'end_ts': end_ts.isoformat(),
|
||||
'duration': round(duration, 2),
|
||||
'output': output,
|
||||
'error': error or None,
|
||||
'root_hash': root_hash,
|
||||
'file_count': file_count,
|
||||
'error': error or None,
|
||||
}
|
||||
click.echo(f'RESULT_JSON={json.dumps(result_json)}')
|
||||
click.echo(json.dumps(result))
|
||||
|
||||
sys.exit(0 if status == 'succeeded' else 1)
|
||||
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
🔗
|
||||
@@ -133,7 +133,8 @@ def fetch_content(url: str) -> str:
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='HTML URL to parse')
|
||||
def main(url: str):
|
||||
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||
def main(url: str, snapshot_id: str = None):
|
||||
"""Parse HTML and extract href URLs."""
|
||||
|
||||
# Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage)
|
||||
|
||||
1
archivebox/plugins/parse_html_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_html_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
🔗
|
||||
@@ -127,7 +127,8 @@ def fetch_content(url: str) -> str:
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='JSONL file URL to parse')
|
||||
def main(url: str):
|
||||
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||
def main(url: str, snapshot_id: str = None):
|
||||
"""Parse JSONL bookmark file and extract URLs."""
|
||||
|
||||
try:
|
||||
|
||||
1
archivebox/plugins/parse_jsonl_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_jsonl_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📋
|
||||
@@ -52,7 +52,8 @@ def fetch_content(url: str) -> str:
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='Netscape bookmark file URL to parse')
|
||||
def main(url: str):
|
||||
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||
def main(url: str, snapshot_id: str = None):
|
||||
"""Parse Netscape bookmark HTML and extract URLs."""
|
||||
|
||||
try:
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
🔖
|
||||
@@ -51,7 +51,8 @@ def fetch_content(url: str) -> str:
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='RSS/Atom feed URL to parse')
|
||||
def main(url: str):
|
||||
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||
def main(url: str, snapshot_id: str = None):
|
||||
"""Parse RSS/Atom feed and extract article URLs."""
|
||||
|
||||
if feedparser is None:
|
||||
|
||||
1
archivebox/plugins/parse_rss_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_rss_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📡
|
||||
@@ -100,7 +100,8 @@ def fetch_content(url: str) -> str:
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='URL to parse (file:// or https://)')
|
||||
def main(url: str):
|
||||
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||
def main(url: str, snapshot_id: str = None):
|
||||
"""Parse plain text and extract URLs."""
|
||||
|
||||
try:
|
||||
|
||||
1
archivebox/plugins/parse_txt_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_txt_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📃
|
||||
5
archivebox/plugins/pdf/templates/embed.html
Normal file
5
archivebox/plugins/pdf/templates/embed.html
Normal file
@@ -0,0 +1,5 @@
|
||||
<!-- PDF embed - full PDF viewer -->
|
||||
<embed src="{{ output_path }}#toolbar=1&navpanes=1"
|
||||
type="application/pdf"
|
||||
class="extractor-embed pdf-embed"
|
||||
style="width: 100%; height: 100%; min-height: 500px;">
|
||||
5
archivebox/plugins/pdf/templates/fullscreen.html
Normal file
5
archivebox/plugins/pdf/templates/fullscreen.html
Normal file
@@ -0,0 +1,5 @@
|
||||
<!-- PDF fullscreen - full PDF viewer -->
|
||||
<embed src="{{ output_path }}#toolbar=1&navpanes=1&view=FitH"
|
||||
type="application/pdf"
|
||||
class="extractor-fullscreen pdf-fullscreen"
|
||||
style="width: 100%; height: 100vh;">
|
||||
1
archivebox/plugins/pdf/templates/icon.html
Normal file
1
archivebox/plugins/pdf/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📄
|
||||
6
archivebox/plugins/pdf/templates/thumbnail.html
Normal file
6
archivebox/plugins/pdf/templates/thumbnail.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- PDF thumbnail - shows first page preview -->
|
||||
<div class="extractor-thumbnail pdf-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #f5f5f5;">
|
||||
<embed src="{{ output_path }}#toolbar=0&navpanes=0&scrollbar=0&page=1&view=FitH"
|
||||
type="application/pdf"
|
||||
style="width: 100%; height: 200px; margin-top: -20px; pointer-events: none;">
|
||||
</div>
|
||||
@@ -1,68 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install readability-extractor if not already available.
|
||||
|
||||
Runs at crawl start to ensure readability-extractor is installed.
|
||||
Outputs JSONL for InstalledBinary.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
NpmProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# Note: npm package is from github:ArchiveBox/readability-extractor
|
||||
readability_binary = Binary(
|
||||
name='readability-extractor',
|
||||
binproviders=[NpmProvider(), EnvProvider()],
|
||||
overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
||||
)
|
||||
|
||||
# Try to load, install if not found
|
||||
try:
|
||||
loaded = readability_binary.load()
|
||||
if not loaded or not loaded.abspath:
|
||||
raise Exception("Not loaded")
|
||||
except Exception:
|
||||
# Install via npm from GitHub repo
|
||||
loaded = readability_binary.install()
|
||||
|
||||
if loaded and loaded.abspath:
|
||||
# Output InstalledBinary JSONL
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': 'readability-extractor',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256,
|
||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
||||
}))
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'readability-extractor',
|
||||
'bin_providers': 'npm,env',
|
||||
}))
|
||||
print("Failed to install readability-extractor", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'readability-extractor',
|
||||
'bin_providers': 'npm,env',
|
||||
}))
|
||||
print(f"Error installing readability-extractor: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
123
archivebox/plugins/readability/on_Crawl__00_validate_readability.py
Executable file
123
archivebox/plugins/readability/on_Crawl__00_validate_readability.py
Executable file
@@ -0,0 +1,123 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for readability-extractor binary.
|
||||
|
||||
Runs at crawl start to verify readability-extractor is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_readability() -> dict | None:
|
||||
"""Find readability-extractor binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider
|
||||
|
||||
class ReadabilityBinary(Binary):
|
||||
name: str = 'readability-extractor'
|
||||
binproviders_supported = [NpmProvider(), EnvProvider()]
|
||||
overrides: dict = {'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
||||
|
||||
binary = ReadabilityBinary()
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'readability-extractor',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('readability-extractor') or os.environ.get('READABILITY_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'readability-extractor',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
result = find_readability()
|
||||
|
||||
if result and result.get('abspath'):
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': result['name'],
|
||||
'abspath': result['abspath'],
|
||||
'version': result['version'],
|
||||
'sha256': result['sha256'],
|
||||
'binprovider': result['binprovider'],
|
||||
}))
|
||||
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/READABILITY_BINARY',
|
||||
'value': result['abspath'],
|
||||
}))
|
||||
|
||||
if result['version']:
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/READABILITY_VERSION',
|
||||
'value': result['version'],
|
||||
}))
|
||||
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'readability-extractor',
|
||||
'bin_providers': 'npm,env',
|
||||
}))
|
||||
print(f"readability-extractor binary not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
6
archivebox/plugins/readability/templates/embed.html
Normal file
6
archivebox/plugins/readability/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Readability embed - reader-mode article view -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-embed readability-embed"
|
||||
style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
6
archivebox/plugins/readability/templates/fullscreen.html
Normal file
6
archivebox/plugins/readability/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Readability fullscreen - full reader-mode article -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-fullscreen readability-fullscreen"
|
||||
style="width: 100%; height: 100vh; border: none; background: #fefefe;"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
1
archivebox/plugins/readability/templates/icon.html
Normal file
1
archivebox/plugins/readability/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📖
|
||||
8
archivebox/plugins/readability/templates/thumbnail.html
Normal file
8
archivebox/plugins/readability/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
||||
<!-- Readability thumbnail - shows reader-mode extracted article content -->
|
||||
<div class="extractor-thumbnail readability-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fefefe; padding: 8px; font-family: Georgia, serif; font-size: 11px; line-height: 1.4; color: #333;">
|
||||
<iframe src="{{ output_path }}"
|
||||
style="width: 100%; height: 300px; border: none; pointer-events: none;"
|
||||
loading="lazy"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
</div>
|
||||
@@ -2,7 +2,7 @@
|
||||
Integration tests for readability plugin
|
||||
|
||||
Tests verify:
|
||||
1. Install hook installs readability-extractor via abx-pkg
|
||||
1. Validate hook checks for readability-extractor binary
|
||||
2. Verify deps with abx-pkg
|
||||
3. Plugin reports missing dependency correctly
|
||||
4. Extraction works against real example.com content
|
||||
@@ -21,7 +21,7 @@ import pytest
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py'))
|
||||
READABILITY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_readability.py'
|
||||
READABILITY_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_readability.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
@@ -101,48 +101,63 @@ def test_reports_missing_dependency_when_not_installed():
|
||||
assert 'readability-extractor' in combined or 'BIN_NAME' in combined, "Should mention readability-extractor"
|
||||
|
||||
|
||||
def test_readability_install_hook():
|
||||
"""Test readability install hook to install readability-extractor if needed."""
|
||||
def test_readability_validate_hook():
|
||||
"""Test readability validate hook checks for readability-extractor binary."""
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(READABILITY_INSTALL_HOOK)],
|
||||
[sys.executable, str(READABILITY_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'readability-extractor'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||
if result.returncode == 0:
|
||||
# Binary found - verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'readability-extractor'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||
else:
|
||||
# Binary not found - verify Dependency JSONL output
|
||||
found_dependency = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
assert record['bin_name'] == 'readability-extractor'
|
||||
assert 'npm' in record['bin_providers']
|
||||
found_dependency = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_dependency, "Should output Dependency record when binary not found"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify readability-extractor is available via abx-pkg after hook installation."""
|
||||
"""Verify readability-extractor is available via abx-pkg."""
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
NpmProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
readability_binary = Binary(
|
||||
name='readability-extractor',
|
||||
binproviders=[NpmProvider(), EnvProvider()],
|
||||
overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
||||
)
|
||||
readability_loaded = readability_binary.load()
|
||||
assert readability_loaded and readability_loaded.abspath, "readability-extractor should be available after install hook"
|
||||
|
||||
if readability_loaded and readability_loaded.abspath:
|
||||
assert True, "readability-extractor is available"
|
||||
else:
|
||||
pytest.skip("readability-extractor not available - Dependency record should have been emitted")
|
||||
|
||||
|
||||
def test_extracts_article_after_installation():
|
||||
|
||||
5
archivebox/plugins/screenshot/templates/embed.html
Normal file
5
archivebox/plugins/screenshot/templates/embed.html
Normal file
@@ -0,0 +1,5 @@
|
||||
<!-- Screenshot embed - full image view -->
|
||||
<img src="{{ output_path }}"
|
||||
alt="Screenshot of page"
|
||||
class="extractor-embed screenshot-embed"
|
||||
style="max-width: 100%; height: auto;">
|
||||
8
archivebox/plugins/screenshot/templates/fullscreen.html
Normal file
8
archivebox/plugins/screenshot/templates/fullscreen.html
Normal file
@@ -0,0 +1,8 @@
|
||||
<!-- Screenshot fullscreen - zoomable image -->
|
||||
<div style="width: 100%; height: 100vh; overflow: auto; background: #222; display: flex; align-items: start; justify-content: center;">
|
||||
<img src="{{ output_path }}"
|
||||
alt="Screenshot of page"
|
||||
class="extractor-fullscreen screenshot-fullscreen"
|
||||
style="max-width: 100%; cursor: zoom-in;"
|
||||
onclick="this.style.maxWidth = this.style.maxWidth === 'none' ? '100%' : 'none'; this.style.cursor = this.style.maxWidth === 'none' ? 'zoom-out' : 'zoom-in';">
|
||||
</div>
|
||||
1
archivebox/plugins/screenshot/templates/icon.html
Normal file
1
archivebox/plugins/screenshot/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📷
|
||||
8
archivebox/plugins/screenshot/templates/thumbnail.html
Normal file
8
archivebox/plugins/screenshot/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
||||
<!-- Screenshot thumbnail - shows the captured screenshot image -->
|
||||
<img src="{{ output_path }}"
|
||||
alt="Screenshot of page"
|
||||
class="extractor-thumbnail screenshot-thumbnail"
|
||||
style="width: 100%; height: 100px; object-fit: cover; object-position: top center; background: #333;"
|
||||
loading="lazy"
|
||||
onerror="this.style.display='none'; this.nextElementSibling.style.display='block';">
|
||||
<div style="display: none; text-align: center; padding: 20px; color: #999;">📷 Screenshot</div>
|
||||
6
archivebox/plugins/singlefile/templates/embed.html
Normal file
6
archivebox/plugins/singlefile/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Singlefile embed - full iframe of archived HTML -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-embed singlefile-embed"
|
||||
style="width: 100%; height: 100%; min-height: 500px; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||
</iframe>
|
||||
6
archivebox/plugins/singlefile/templates/fullscreen.html
Normal file
6
archivebox/plugins/singlefile/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Singlefile fullscreen - full page iframe -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-fullscreen singlefile-fullscreen"
|
||||
style="width: 100%; height: 100vh; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
|
||||
</iframe>
|
||||
1
archivebox/plugins/singlefile/templates/icon.html
Normal file
1
archivebox/plugins/singlefile/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📦
|
||||
8
archivebox/plugins/singlefile/templates/thumbnail.html
Normal file
8
archivebox/plugins/singlefile/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
||||
<!-- Singlefile thumbnail - scaled down iframe preview of archived HTML -->
|
||||
<div class="extractor-thumbnail singlefile-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
|
||||
<iframe src="{{ output_path }}"
|
||||
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
|
||||
loading="lazy"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
</div>
|
||||
1
archivebox/plugins/staticfile/templates/icon.html
Normal file
1
archivebox/plugins/staticfile/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📁
|
||||
1
archivebox/plugins/title/templates/icon.html
Normal file
1
archivebox/plugins/title/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📝
|
||||
@@ -1,68 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install wget if not already available.
|
||||
|
||||
Runs at crawl start to ensure wget is installed.
|
||||
Outputs JSONL for InstalledBinary.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
AptProvider.model_rebuild()
|
||||
BrewProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# wget binary and package have same name
|
||||
wget_binary = Binary(
|
||||
name='wget',
|
||||
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
||||
)
|
||||
|
||||
# Try to load, install if not found
|
||||
try:
|
||||
loaded = wget_binary.load()
|
||||
if not loaded or not loaded.abspath:
|
||||
raise Exception("Not loaded")
|
||||
except Exception:
|
||||
# Install via system package manager
|
||||
loaded = wget_binary.install()
|
||||
|
||||
if loaded and loaded.abspath:
|
||||
# Output InstalledBinary JSONL
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': 'wget',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256,
|
||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
||||
}))
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'wget',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
print("Failed to install wget", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'wget',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
print(f"Error installing wget: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
6
archivebox/plugins/wget/templates/embed.html
Normal file
6
archivebox/plugins/wget/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Wget embed - full iframe of mirrored site -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-embed wget-embed"
|
||||
style="width: 100%; height: 100%; min-height: 500px; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||
</iframe>
|
||||
6
archivebox/plugins/wget/templates/fullscreen.html
Normal file
6
archivebox/plugins/wget/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Wget fullscreen - full page iframe of mirrored site -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-fullscreen wget-fullscreen"
|
||||
style="width: 100%; height: 100vh; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
|
||||
</iframe>
|
||||
1
archivebox/plugins/wget/templates/icon.html
Normal file
1
archivebox/plugins/wget/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📥
|
||||
8
archivebox/plugins/wget/templates/thumbnail.html
Normal file
8
archivebox/plugins/wget/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
||||
<!-- Wget thumbnail - scaled down iframe preview of mirrored site -->
|
||||
<div class="extractor-thumbnail wget-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
|
||||
<iframe src="{{ output_path }}"
|
||||
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
|
||||
loading="lazy"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
</div>
|
||||
@@ -2,8 +2,8 @@
|
||||
Integration tests for wget plugin
|
||||
|
||||
Tests verify:
|
||||
1. Plugin reports missing dependency correctly
|
||||
2. wget can be installed via brew/apt provider hooks
|
||||
1. Validate hook checks for wget binary
|
||||
2. Verify deps with abx-pkg
|
||||
3. Config options work (SAVE_WGET, SAVE_WARC, etc.)
|
||||
4. Extraction works against real example.com
|
||||
5. Output files contain actual page content
|
||||
@@ -26,7 +26,7 @@ import pytest
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
|
||||
WGET_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_wget.py'
|
||||
WGET_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_wget.py'
|
||||
BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
|
||||
APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
@@ -37,45 +37,59 @@ def test_hook_script_exists():
|
||||
assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"
|
||||
|
||||
|
||||
def test_wget_install_hook():
|
||||
"""Test wget install hook to install wget if needed."""
|
||||
def test_wget_validate_hook():
|
||||
"""Test wget validate hook checks for wget binary."""
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(WGET_INSTALL_HOOK)],
|
||||
[sys.executable, str(WGET_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'wget'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||
if result.returncode == 0:
|
||||
# Binary found - verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'wget'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||
else:
|
||||
# Binary not found - verify Dependency JSONL output
|
||||
found_dependency = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
assert record['bin_name'] == 'wget'
|
||||
assert 'env' in record['bin_providers']
|
||||
found_dependency = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_dependency, "Should output Dependency record when binary not found"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify wget is available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
AptProvider.model_rebuild()
|
||||
BrewProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
"""Verify wget is available via abx-pkg."""
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
wget_binary = Binary(name='wget', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||
wget_loaded = wget_binary.load()
|
||||
assert wget_loaded and wget_loaded.abspath, "wget should be available after install hook"
|
||||
|
||||
if wget_loaded and wget_loaded.abspath:
|
||||
assert True, "wget is available"
|
||||
else:
|
||||
pytest.skip("wget not available - Dependency record should have been emitted")
|
||||
|
||||
|
||||
def test_reports_missing_dependency_when_not_installed():
|
||||
|
||||
@@ -110,6 +110,10 @@
|
||||
{% block nav-global %}{% endblock %}
|
||||
</div>
|
||||
|
||||
{% if has_permission %}
|
||||
{% include 'admin/progress_monitor.html' %}
|
||||
{% endif %}
|
||||
|
||||
{% block breadcrumbs %}
|
||||
<div class="breadcrumbs">
|
||||
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
|
||||
|
||||
648
archivebox/templates/admin/progress_monitor.html
Normal file
648
archivebox/templates/admin/progress_monitor.html
Normal file
@@ -0,0 +1,648 @@
|
||||
<style>
|
||||
/* Progress Monitor Container */
|
||||
#progress-monitor {
|
||||
background: linear-gradient(135deg, #0d1117 0%, #161b22 100%);
|
||||
color: #c9d1d9;
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Noto Sans', Helvetica, Arial, sans-serif;
|
||||
font-size: 12px;
|
||||
border-bottom: 1px solid #30363d;
|
||||
position: relative;
|
||||
z-index: 100;
|
||||
}
|
||||
#progress-monitor.hidden {
|
||||
display: none;
|
||||
}
|
||||
#progress-monitor .tree-container {
|
||||
max-height: 350px;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
/* Header Bar */
|
||||
#progress-monitor .header-bar {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
padding: 8px 16px;
|
||||
background: rgba(0,0,0,0.2);
|
||||
border-bottom: 1px solid #30363d;
|
||||
position: sticky;
|
||||
top: 0;
|
||||
z-index: 10;
|
||||
}
|
||||
#progress-monitor .header-left {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 16px;
|
||||
}
|
||||
#progress-monitor .header-right {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
/* Orchestrator Status */
|
||||
#progress-monitor .orchestrator-status {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 6px;
|
||||
}
|
||||
#progress-monitor .status-dot {
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
border-radius: 50%;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
#progress-monitor .status-dot.running {
|
||||
background: #3fb950;
|
||||
box-shadow: 0 0 8px #3fb950;
|
||||
animation: pulse 2s infinite;
|
||||
}
|
||||
#progress-monitor .status-dot.stopped {
|
||||
background: #f85149;
|
||||
}
|
||||
@keyframes pulse {
|
||||
0%, 100% { opacity: 1; box-shadow: 0 0 8px #3fb950; }
|
||||
50% { opacity: 0.6; box-shadow: 0 0 4px #3fb950; }
|
||||
}
|
||||
|
||||
/* Stats */
|
||||
#progress-monitor .stats {
|
||||
display: flex;
|
||||
gap: 16px;
|
||||
}
|
||||
#progress-monitor .stat {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 4px;
|
||||
}
|
||||
#progress-monitor .stat-label {
|
||||
color: #8b949e;
|
||||
font-size: 10px;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.5px;
|
||||
}
|
||||
#progress-monitor .stat-value {
|
||||
font-weight: 600;
|
||||
font-variant-numeric: tabular-nums;
|
||||
}
|
||||
#progress-monitor .stat-value.success { color: #3fb950; }
|
||||
#progress-monitor .stat-value.error { color: #f85149; }
|
||||
#progress-monitor .stat-value.warning { color: #d29922; }
|
||||
#progress-monitor .stat-value.info { color: #58a6ff; }
|
||||
|
||||
/* Toggle Button */
|
||||
#progress-monitor .toggle-btn {
|
||||
background: transparent;
|
||||
border: 1px solid #30363d;
|
||||
color: #8b949e;
|
||||
cursor: pointer;
|
||||
padding: 4px 8px;
|
||||
border-radius: 6px;
|
||||
font-size: 11px;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
#progress-monitor .toggle-btn:hover {
|
||||
background: #21262d;
|
||||
color: #c9d1d9;
|
||||
border-color: #8b949e;
|
||||
}
|
||||
|
||||
/* Tree Container */
|
||||
#progress-monitor .tree-container {
|
||||
padding: 12px 16px;
|
||||
}
|
||||
#progress-monitor.collapsed .tree-container {
|
||||
display: none;
|
||||
}
|
||||
|
||||
/* Idle Message */
|
||||
#progress-monitor .idle-message {
|
||||
color: #8b949e;
|
||||
font-style: italic;
|
||||
padding: 8px 0;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
/* Crawl Item */
|
||||
#progress-monitor .crawl-item {
|
||||
background: #161b22;
|
||||
border: 1px solid #30363d;
|
||||
border-radius: 8px;
|
||||
margin-bottom: 12px;
|
||||
overflow: hidden;
|
||||
}
|
||||
#progress-monitor .crawl-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 12px;
|
||||
padding: 10px 14px;
|
||||
background: rgba(0,0,0,0.2);
|
||||
cursor: pointer;
|
||||
}
|
||||
#progress-monitor .crawl-header:hover {
|
||||
background: rgba(88, 166, 255, 0.1);
|
||||
}
|
||||
#progress-monitor .crawl-icon {
|
||||
font-size: 16px;
|
||||
width: 20px;
|
||||
text-align: center;
|
||||
}
|
||||
#progress-monitor .crawl-info {
|
||||
flex: 1;
|
||||
min-width: 0;
|
||||
}
|
||||
#progress-monitor .crawl-label {
|
||||
font-weight: 600;
|
||||
color: #58a6ff;
|
||||
white-space: nowrap;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
}
|
||||
#progress-monitor .crawl-meta {
|
||||
font-size: 11px;
|
||||
color: #8b949e;
|
||||
margin-top: 2px;
|
||||
}
|
||||
#progress-monitor .crawl-stats {
|
||||
display: flex;
|
||||
gap: 12px;
|
||||
font-size: 11px;
|
||||
}
|
||||
|
||||
/* Progress Bar */
|
||||
#progress-monitor .progress-bar-container {
|
||||
height: 4px;
|
||||
background: #21262d;
|
||||
border-radius: 2px;
|
||||
overflow: hidden;
|
||||
position: relative;
|
||||
}
|
||||
#progress-monitor .progress-bar {
|
||||
height: 100%;
|
||||
border-radius: 2px;
|
||||
transition: width 0.5s ease-out;
|
||||
position: relative;
|
||||
}
|
||||
#progress-monitor .progress-bar.crawl {
|
||||
background: linear-gradient(90deg, #238636 0%, #3fb950 100%);
|
||||
}
|
||||
#progress-monitor .progress-bar.snapshot {
|
||||
background: linear-gradient(90deg, #1f6feb 0%, #58a6ff 100%);
|
||||
}
|
||||
#progress-monitor .progress-bar.extractor {
|
||||
background: linear-gradient(90deg, #8957e5 0%, #a371f7 100%);
|
||||
}
|
||||
#progress-monitor .progress-bar.indeterminate {
|
||||
background: linear-gradient(90deg, transparent 0%, #58a6ff 50%, transparent 100%);
|
||||
animation: indeterminate 1.5s infinite linear;
|
||||
width: 30% !important;
|
||||
}
|
||||
@keyframes indeterminate {
|
||||
0% { transform: translateX(-100%); }
|
||||
100% { transform: translateX(400%); }
|
||||
}
|
||||
|
||||
/* Crawl Body */
|
||||
#progress-monitor .crawl-body {
|
||||
padding: 0 14px 14px;
|
||||
}
|
||||
#progress-monitor .crawl-progress {
|
||||
padding: 10px 14px;
|
||||
border-bottom: 1px solid #21262d;
|
||||
}
|
||||
|
||||
/* Snapshot List */
|
||||
#progress-monitor .snapshot-list {
|
||||
margin-top: 8px;
|
||||
}
|
||||
#progress-monitor .snapshot-item {
|
||||
background: #0d1117;
|
||||
border: 1px solid #21262d;
|
||||
border-radius: 6px;
|
||||
margin-bottom: 8px;
|
||||
overflow: hidden;
|
||||
}
|
||||
#progress-monitor .snapshot-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
padding: 8px 12px;
|
||||
cursor: pointer;
|
||||
}
|
||||
#progress-monitor .snapshot-header:hover {
|
||||
background: rgba(88, 166, 255, 0.05);
|
||||
}
|
||||
#progress-monitor .snapshot-icon {
|
||||
font-size: 14px;
|
||||
width: 18px;
|
||||
text-align: center;
|
||||
color: #58a6ff;
|
||||
}
|
||||
#progress-monitor .snapshot-info {
|
||||
flex: 1;
|
||||
min-width: 0;
|
||||
}
|
||||
#progress-monitor .snapshot-url {
|
||||
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
||||
font-size: 11px;
|
||||
color: #c9d1d9;
|
||||
white-space: nowrap;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
}
|
||||
#progress-monitor .snapshot-meta {
|
||||
font-size: 10px;
|
||||
color: #8b949e;
|
||||
margin-top: 2px;
|
||||
}
|
||||
#progress-monitor .snapshot-progress {
|
||||
padding: 0 12px 8px;
|
||||
}
|
||||
|
||||
/* Extractor List */
|
||||
#progress-monitor .extractor-list {
|
||||
padding: 8px 12px;
|
||||
background: rgba(0,0,0,0.2);
|
||||
border-top: 1px solid #21262d;
|
||||
}
|
||||
#progress-monitor .extractor-item {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 8px;
|
||||
padding: 4px 0;
|
||||
}
|
||||
#progress-monitor .extractor-icon {
|
||||
font-size: 12px;
|
||||
width: 16px;
|
||||
text-align: center;
|
||||
}
|
||||
#progress-monitor .extractor-icon.running {
|
||||
color: #d29922;
|
||||
animation: spin 1s linear infinite;
|
||||
}
|
||||
#progress-monitor .extractor-icon.success {
|
||||
color: #3fb950;
|
||||
}
|
||||
#progress-monitor .extractor-icon.failed {
|
||||
color: #f85149;
|
||||
}
|
||||
#progress-monitor .extractor-icon.pending {
|
||||
color: #8b949e;
|
||||
}
|
||||
@keyframes spin {
|
||||
from { transform: rotate(0deg); }
|
||||
to { transform: rotate(360deg); }
|
||||
}
|
||||
#progress-monitor .extractor-name {
|
||||
flex: 1;
|
||||
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
|
||||
font-size: 11px;
|
||||
}
|
||||
#progress-monitor .extractor-progress {
|
||||
width: 60px;
|
||||
}
|
||||
|
||||
/* Status Badge */
|
||||
#progress-monitor .status-badge {
|
||||
font-size: 10px;
|
||||
padding: 2px 6px;
|
||||
border-radius: 10px;
|
||||
font-weight: 500;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.3px;
|
||||
}
|
||||
#progress-monitor .status-badge.queued {
|
||||
background: #21262d;
|
||||
color: #8b949e;
|
||||
}
|
||||
#progress-monitor .status-badge.started {
|
||||
background: rgba(210, 153, 34, 0.2);
|
||||
color: #d29922;
|
||||
}
|
||||
#progress-monitor .status-badge.sealed,
|
||||
#progress-monitor .status-badge.succeeded {
|
||||
background: rgba(63, 185, 80, 0.2);
|
||||
color: #3fb950;
|
||||
}
|
||||
#progress-monitor .status-badge.failed {
|
||||
background: rgba(248, 81, 73, 0.2);
|
||||
color: #f85149;
|
||||
}
|
||||
|
||||
/* Expand/Collapse Icons */
|
||||
#progress-monitor .expand-icon {
|
||||
color: #8b949e;
|
||||
font-size: 10px;
|
||||
transition: transform 0.2s;
|
||||
}
|
||||
#progress-monitor .expand-icon.expanded {
|
||||
transform: rotate(90deg);
|
||||
}
|
||||
</style>
|
||||
|
||||
<div id="progress-monitor">
|
||||
<div class="header-bar">
|
||||
<div class="header-left">
|
||||
<div class="orchestrator-status">
|
||||
<span class="status-dot stopped" id="orchestrator-dot"></span>
|
||||
<span id="orchestrator-text">Stopped</span>
|
||||
</div>
|
||||
<div class="stats">
|
||||
<div class="stat">
|
||||
<span class="stat-label">Workers</span>
|
||||
<span class="stat-value info" id="worker-count">0</span>
|
||||
</div>
|
||||
<div class="stat">
|
||||
<span class="stat-label">Queued</span>
|
||||
<span class="stat-value warning" id="total-queued">0</span>
|
||||
</div>
|
||||
<div class="stat">
|
||||
<span class="stat-label">Done</span>
|
||||
<span class="stat-value success" id="total-succeeded">0</span>
|
||||
</div>
|
||||
<div class="stat">
|
||||
<span class="stat-label">Failed</span>
|
||||
<span class="stat-value error" id="total-failed">0</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="header-right">
|
||||
<button class="toggle-btn" id="progress-collapse" title="Toggle details">Details</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tree-container" id="tree-container">
|
||||
<div class="idle-message" id="idle-message">No active crawls</div>
|
||||
<div id="crawl-tree"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
(function() {
|
||||
const monitor = document.getElementById('progress-monitor');
|
||||
const collapseBtn = document.getElementById('progress-collapse');
|
||||
const treeContainer = document.getElementById('tree-container');
|
||||
const crawlTree = document.getElementById('crawl-tree');
|
||||
const idleMessage = document.getElementById('idle-message');
|
||||
|
||||
let pollInterval = null;
|
||||
let isCollapsed = localStorage.getItem('progress-monitor-collapsed') === 'true';
|
||||
let expandedCrawls = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-crawls') || '[]'));
|
||||
let expandedSnapshots = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-snapshots') || '[]'));
|
||||
|
||||
function formatUrl(url) {
|
||||
try {
|
||||
const u = new URL(url);
|
||||
return u.hostname + u.pathname.substring(0, 30) + (u.pathname.length > 30 ? '...' : '');
|
||||
} catch {
|
||||
return url.substring(0, 50) + (url.length > 50 ? '...' : '');
|
||||
}
|
||||
}
|
||||
|
||||
function renderExtractor(extractor) {
|
||||
const iconClass = extractor.status === 'started' ? 'running' :
|
||||
extractor.status === 'succeeded' ? 'success' :
|
||||
extractor.status === 'failed' ? 'failed' : 'pending';
|
||||
const icon = extractor.status === 'started' ? '↻' :
|
||||
extractor.status === 'succeeded' ? '✓' :
|
||||
extractor.status === 'failed' ? '✗' : '○';
|
||||
|
||||
return `
|
||||
<div class="extractor-item">
|
||||
<span class="extractor-icon ${iconClass}">${icon}</span>
|
||||
<span class="extractor-name">${extractor.extractor}</span>
|
||||
<div class="extractor-progress">
|
||||
<div class="progress-bar-container">
|
||||
<div class="progress-bar extractor ${extractor.status === 'started' ? 'indeterminate' : ''}"
|
||||
style="width: ${extractor.status === 'succeeded' ? '100' : extractor.status === 'failed' ? '100' : extractor.progress}%"></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
function renderSnapshot(snapshot, crawlId) {
|
||||
const snapshotKey = `${crawlId}-${snapshot.id}`;
|
||||
const isExpanded = expandedSnapshots.has(snapshotKey);
|
||||
const statusIcon = snapshot.status === 'started' ? '↻' : '📄';
|
||||
|
||||
let extractorHtml = '';
|
||||
if (snapshot.active_extractors && snapshot.active_extractors.length > 0) {
|
||||
extractorHtml = `
|
||||
<div class="extractor-list" style="${isExpanded ? '' : 'display:none'}">
|
||||
${snapshot.active_extractors.map(e => renderExtractor(e)).join('')}
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
return `
|
||||
<div class="snapshot-item" data-snapshot-key="${snapshotKey}">
|
||||
<div class="snapshot-header" onclick="window.toggleSnapshot('${snapshotKey}')">
|
||||
<span class="expand-icon ${isExpanded ? 'expanded' : ''}">${snapshot.active_extractors?.length ? '▶' : ''}</span>
|
||||
<span class="snapshot-icon">${statusIcon}</span>
|
||||
<div class="snapshot-info">
|
||||
<div class="snapshot-url">${formatUrl(snapshot.url)}</div>
|
||||
<div class="snapshot-meta">
|
||||
${snapshot.completed_extractors}/${snapshot.total_extractors} extractors
|
||||
${snapshot.failed_extractors > 0 ? `<span style="color:#f85149">(${snapshot.failed_extractors} failed)</span>` : ''}
|
||||
</div>
|
||||
</div>
|
||||
<span class="status-badge ${snapshot.status}">${snapshot.status}</span>
|
||||
</div>
|
||||
<div class="snapshot-progress">
|
||||
<div class="progress-bar-container">
|
||||
<div class="progress-bar snapshot ${snapshot.status === 'started' && snapshot.progress === 0 ? 'indeterminate' : ''}"
|
||||
style="width: ${snapshot.progress}%"></div>
|
||||
</div>
|
||||
</div>
|
||||
${extractorHtml}
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
function renderCrawl(crawl) {
|
||||
const isExpanded = expandedCrawls.has(crawl.id);
|
||||
const statusIcon = crawl.status === 'started' ? '↻' : '🔍';
|
||||
|
||||
let snapshotsHtml = '';
|
||||
if (crawl.active_snapshots && crawl.active_snapshots.length > 0) {
|
||||
snapshotsHtml = crawl.active_snapshots.map(s => renderSnapshot(s, crawl.id)).join('');
|
||||
}
|
||||
|
||||
return `
|
||||
<div class="crawl-item" data-crawl-id="${crawl.id}">
|
||||
<div class="crawl-header" onclick="window.toggleCrawl('${crawl.id}')">
|
||||
<span class="expand-icon ${isExpanded ? 'expanded' : ''}">${crawl.active_snapshots?.length ? '▶' : ''}</span>
|
||||
<span class="crawl-icon">${statusIcon}</span>
|
||||
<div class="crawl-info">
|
||||
<div class="crawl-label">${crawl.label}</div>
|
||||
<div class="crawl-meta">depth: ${crawl.max_depth} | ${crawl.total_snapshots} snapshots</div>
|
||||
</div>
|
||||
<div class="crawl-stats">
|
||||
<span style="color:#3fb950">${crawl.completed_snapshots} done</span>
|
||||
<span style="color:#8b949e">${crawl.pending_snapshots} pending</span>
|
||||
</div>
|
||||
<span class="status-badge ${crawl.status}">${crawl.status}</span>
|
||||
</div>
|
||||
<div class="crawl-progress">
|
||||
<div class="progress-bar-container">
|
||||
<div class="progress-bar crawl ${crawl.status === 'started' && crawl.progress === 0 ? 'indeterminate' : ''}"
|
||||
style="width: ${crawl.progress}%"></div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="crawl-body" style="${isExpanded ? '' : 'display:none'}">
|
||||
<div class="snapshot-list">
|
||||
${snapshotsHtml}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
`;
|
||||
}
|
||||
|
||||
window.toggleCrawl = function(crawlId) {
|
||||
const item = document.querySelector(`[data-crawl-id="${crawlId}"]`);
|
||||
const body = item.querySelector('.crawl-body');
|
||||
const icon = item.querySelector('.expand-icon');
|
||||
|
||||
if (expandedCrawls.has(crawlId)) {
|
||||
expandedCrawls.delete(crawlId);
|
||||
body.style.display = 'none';
|
||||
icon.classList.remove('expanded');
|
||||
} else {
|
||||
expandedCrawls.add(crawlId);
|
||||
body.style.display = '';
|
||||
icon.classList.add('expanded');
|
||||
}
|
||||
localStorage.setItem('progress-monitor-expanded-crawls', JSON.stringify([...expandedCrawls]));
|
||||
};
|
||||
|
||||
window.toggleSnapshot = function(snapshotKey) {
|
||||
const item = document.querySelector(`[data-snapshot-key="${snapshotKey}"]`);
|
||||
const extractorList = item.querySelector('.extractor-list');
|
||||
const icon = item.querySelector('.expand-icon');
|
||||
|
||||
if (!extractorList) return;
|
||||
|
||||
if (expandedSnapshots.has(snapshotKey)) {
|
||||
expandedSnapshots.delete(snapshotKey);
|
||||
extractorList.style.display = 'none';
|
||||
icon.classList.remove('expanded');
|
||||
} else {
|
||||
expandedSnapshots.add(snapshotKey);
|
||||
extractorList.style.display = '';
|
||||
icon.classList.add('expanded');
|
||||
}
|
||||
localStorage.setItem('progress-monitor-expanded-snapshots', JSON.stringify([...expandedSnapshots]));
|
||||
};
|
||||
|
||||
// Re-render the whole progress monitor from one polling payload.
function updateProgress(data) {
    // Anything queued or running anywhere in the pipeline counts as activity.
    const hasActivity = data.active_crawls.length > 0 ||
        data.crawls_pending > 0 || data.crawls_started > 0 ||
        data.snapshots_pending > 0 || data.snapshots_started > 0 ||
        data.archiveresults_pending > 0 || data.archiveresults_started > 0;

    // Orchestrator status indicator: swap the dot's colour class + label text.
    const statusDot = document.getElementById('orchestrator-dot');
    const statusText = document.getElementById('orchestrator-text');
    const running = data.orchestrator_running;
    statusDot.classList.remove(running ? 'stopped' : 'running');
    statusDot.classList.add(running ? 'running' : 'stopped');
    statusText.textContent = running ? 'Running' : 'Stopped';

    // Aggregate counters across all three queue types.
    document.getElementById('worker-count').textContent = data.total_workers;
    document.getElementById('total-queued').textContent =
        data.crawls_pending + data.snapshots_pending + data.archiveresults_pending;
    document.getElementById('total-succeeded').textContent = data.archiveresults_succeeded;
    document.getElementById('total-failed').textContent = data.archiveresults_failed;

    // Crawl tree area: full tree, activity summary, or idle message.
    if (data.active_crawls.length > 0) {
        idleMessage.style.display = 'none';
        crawlTree.innerHTML = data.active_crawls.map(c => renderCrawl(c)).join('');
    } else if (hasActivity) {
        // Work is in flight but no crawl detail is available: show a summary line.
        idleMessage.style.display = 'none';
        crawlTree.innerHTML = `
            <div class="idle-message">
                ${data.snapshots_started} snapshots processing, ${data.archiveresults_started} extractors running
            </div>
        `;
    } else {
        idleMessage.style.display = '';
        // Link to crawls created within the last 24 hours, newest first.
        const yesterday = new Date(Date.now() - 24*60*60*1000).toISOString().split('T')[0];
        const recentUrl = '/admin/crawls/crawl/?created_at__gte=' + yesterday + '&o=-1';
        idleMessage.innerHTML = `No active crawls (${data.crawls_pending} pending, ${data.crawls_started} started, <a href="${recentUrl}" style="color: #58a6ff;">${data.crawls_recent} recent</a>)`;
        crawlTree.innerHTML = '';
    }
}
|
||||
|
||||
// Poll the live-progress endpoint once and feed the payload to updateProgress.
function fetchProgress() {
    fetch('/admin/live-progress/')
        .then(resp => resp.json())
        .then(payload => {
            if (payload.error) {
                // Server-side failure: surface it in the idle-message area,
                // then still hand the payload to updateProgress as before.
                console.error('Progress API error:', payload.error, payload.traceback);
                idleMessage.textContent = 'API Error: ' + payload.error;
                idleMessage.style.color = '#f85149';
            }
            updateProgress(payload);
        })
        .catch(err => {
            // Network/parse failure: report it without killing the poll loop.
            console.error('Progress fetch error:', err);
            idleMessage.textContent = 'Fetch Error: ' + err.message;
            idleMessage.style.color = '#f85149';
        });
}
|
||||
|
||||
// Begin polling; guarded so repeated calls never stack multiple intervals.
function startPolling() {
    if (pollInterval) return;
    fetchProgress();                                  // immediate first paint
    pollInterval = setInterval(fetchProgress, 1000);  // then once per second
}
|
||||
|
||||
// Tear down the polling loop if it is running; safe to call when it is not.
function stopPolling() {
    if (!pollInterval) return;
    clearInterval(pollInterval);
    pollInterval = null;
}
|
||||
|
||||
// Collapse/expand the whole monitor panel, persisting the choice.
collapseBtn.addEventListener('click', function() {
    isCollapsed = !isCollapsed;
    localStorage.setItem('progress-monitor-collapsed', isCollapsed);
    monitor.classList.toggle('collapsed', isCollapsed);
    collapseBtn.textContent = isCollapsed ? 'Expand' : 'Details';
});

// Restore the persisted collapsed state on load.
if (isCollapsed) {
    monitor.classList.add('collapsed');
    collapseBtn.textContent = 'Expand';
}

// Kick off polling as soon as the page loads...
startPolling();

// ...but pause while the tab is hidden to avoid useless server load,
// resuming (with an immediate refresh) when it becomes visible again.
document.addEventListener('visibilitychange', function() {
    if (document.hidden) {
        stopPolling();
    } else {
        startPolling();
    }
});
|
||||
})();
|
||||
</script>
|
||||
@@ -192,6 +192,42 @@
|
||||
border: 0px;
|
||||
border-top: 3px solid #aa1e55;
|
||||
}
|
||||
#main-frame-wrapper {
|
||||
width: 100%;
|
||||
height: calc(100vh - 210px);
|
||||
border-top: 3px solid #aa1e55;
|
||||
overflow: hidden;
|
||||
}
|
||||
#main-frame-wrapper iframe {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
border: none;
|
||||
}
|
||||
.full-page-wrapper {
|
||||
width: 100%;
|
||||
height: calc(100vh - 210px);
|
||||
}
|
||||
.thumbnail-wrapper {
|
||||
height: 100px;
|
||||
overflow: hidden;
|
||||
background-color: #333;
|
||||
pointer-events: none;
|
||||
}
|
||||
.thumbnail-wrapper iframe {
|
||||
width: 405%;
|
||||
height: 430px;
|
||||
margin-bottom: -330px;
|
||||
margin-left: -1%;
|
||||
transform: scale(0.25);
|
||||
transform-origin: 0 0;
|
||||
border: none;
|
||||
}
|
||||
.thumbnail-wrapper img {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
object-fit: cover;
|
||||
object-position: top center;
|
||||
}
|
||||
.card.selected-card {
|
||||
border: 2px solid orange;
|
||||
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
|
||||
@@ -403,12 +439,18 @@
|
||||
<div class="card {% if forloop.first %}selected-card{% endif %}">
|
||||
<div class="card-body">
|
||||
<a href="{{result.path|urlencode}}" target="preview" title="./{{result.path}} (downloaded {{result.ts}})">
|
||||
<h4>{{result.name|truncatechars:24}} <small>({{result.size|filesizeformat}})</small></h4>
|
||||
<!-- <p class="card-text" ><code>./{{result.path|truncatechars:30}}</code></p> -->
|
||||
<h4>{% extractor_icon result.name %} {{result.name|extractor_name|truncatechars:20}} <small>({{result.size|filesizeformat}})</small></h4>
|
||||
</a>
|
||||
<!--<a href="{{result.path}}" target="preview"><h4 class="card-title">{{result.name}}</h4></a>-->
|
||||
</div>
|
||||
<iframe class="card-img-top" src="{{result.path|urlencode}}?autoplay=0" allow="autoplay 'none'; fullscreen 'none'; navigation-override 'none'; " sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
|
||||
{% if result.result %}
|
||||
{# Use plugin-specific thumbnail template when ArchiveResult is available #}
|
||||
<div class="card-img-top thumbnail-wrapper">
|
||||
{% extractor_thumbnail result.result %}
|
||||
</div>
|
||||
{% else %}
|
||||
{# Fall back to generic iframe for filesystem-discovered files #}
|
||||
<iframe class="card-img-top" src="{{result.path|urlencode}}?autoplay=0" allow="autoplay 'none'; fullscreen 'none'; navigation-override 'none'; " sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
@@ -431,7 +473,15 @@
|
||||
|
||||
|
||||
|
||||
<iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_result.path|urlencode}}" name="preview"></iframe>
|
||||
{% if best_result.result %}
|
||||
{# Use plugin-specific fullscreen template when ArchiveResult is available #}
|
||||
<div id="main-frame-wrapper" class="full-page-wrapper">
|
||||
{% extractor_fullscreen best_result.result %}
|
||||
</div>
|
||||
{% else %}
|
||||
{# Fall back to generic iframe #}
|
||||
<iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_result.path|urlencode}}" name="preview"></iframe>
|
||||
{% endif %}
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,23 +1,13 @@
|
||||
"""
|
||||
Workers admin module.
|
||||
|
||||
The orchestrator/worker system doesn't need Django admin registration
|
||||
as workers are managed via CLI commands and the orchestrator.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.workers'
|
||||
|
||||
from django.contrib.auth import get_permission_codename
|
||||
|
||||
from huey_monitor.apps import HueyMonitorConfig
|
||||
from huey_monitor.admin import TaskModel, TaskModelAdmin, SignalInfoModel, SignalInfoModelAdmin
|
||||
|
||||
|
||||
HueyMonitorConfig.verbose_name = 'Background Workers'
|
||||
|
||||
|
||||
class CustomTaskModelAdmin(TaskModelAdmin):
    """Admin for huey_monitor's TaskModel with the bulk "delete selected"
    action enabled, gated on the user's actual delete permission.

    NOTE(review): this commit removes huey, so this class (and the
    huey_monitor imports it depends on) is being deleted.
    """

    # Expose the bulk-delete admin action for task rows.
    actions = ["delete_selected"]

    def has_delete_permission(self, request, obj=None):
        # Standard Django check: "<app_label>.delete_<model_name>".
        codename = get_permission_codename("delete", self.opts)
        return request.user.has_perm("%s.%s" % (self.opts.app_label, codename))
|
||||
|
||||
|
||||
|
||||
def register_admin(admin_site):
    """No models to register - workers are process-based, not Django models.

    Kept as a no-op so callers that iterate over every app's
    ``register_admin`` hook can still call it uniformly.
    """
    # Previously registered huey_monitor's TaskModel/SignalInfoModel here;
    # those registrations were removed along with the huey dependency.
    pass
|
||||
|
||||
0
archivebox/workers/management/__init__.py
Normal file
0
archivebox/workers/management/__init__.py
Normal file
0
archivebox/workers/management/commands/__init__.py
Normal file
0
archivebox/workers/management/commands/__init__.py
Normal file
15
archivebox/workers/management/commands/orchestrator.py
Normal file
15
archivebox/workers/management/commands/orchestrator.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from workers.orchestrator import Orchestrator
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """``manage.py orchestrator`` - run the ArchiveBox worker orchestrator."""

    help = 'Run the archivebox orchestrator'

    def add_arguments(self, parser):
        """Register the --daemon/-d flag on the command's argument parser."""
        parser.add_argument(
            '--daemon', '-d',
            action='store_true',
            help="Run forever (don't exit on idle)",
        )

    def handle(self, *args, **kwargs):
        """Construct an Orchestrator and block in its run loop."""
        run_forever = kwargs.get('daemon', False)
        # Without --daemon the orchestrator exits once its queues go idle.
        Orchestrator(exit_on_idle=not run_forever).runloop()
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user