mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-03 14:27:55 +10:00
remove huey
This commit is contained in:
@@ -42,6 +42,7 @@ def register_urls(api: NinjaAPI) -> NinjaAPI:
|
|||||||
api.add_router('/crawls/', 'api.v1_crawls.router')
|
api.add_router('/crawls/', 'api.v1_crawls.router')
|
||||||
api.add_router('/cli/', 'api.v1_cli.router')
|
api.add_router('/cli/', 'api.v1_cli.router')
|
||||||
api.add_router('/workers/', 'api.v1_workers.router')
|
api.add_router('/workers/', 'api.v1_workers.router')
|
||||||
|
api.add_router('/machine/', 'api.v1_machine.router')
|
||||||
return api
|
return api
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -115,8 +115,9 @@ def cli_add(request, args: AddCommandSchema):
|
|||||||
update=args.update,
|
update=args.update,
|
||||||
index_only=args.index_only,
|
index_only=args.index_only,
|
||||||
overwrite=args.overwrite,
|
overwrite=args.overwrite,
|
||||||
extract=args.extract,
|
plugins=args.extract, # extract in API maps to plugins param
|
||||||
parser=args.parser,
|
parser=args.parser,
|
||||||
|
bg=True, # Always run in background for API calls
|
||||||
)
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
206
archivebox/api/v1_machine.py
Normal file
206
archivebox/api/v1_machine.py
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
__package__ = 'archivebox.api'
|
||||||
|
|
||||||
|
from uuid import UUID
|
||||||
|
from typing import List, Optional
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from ninja import Router, Schema, FilterSchema, Field, Query
|
||||||
|
from ninja.pagination import paginate
|
||||||
|
|
||||||
|
from api.v1_core import CustomPagination
|
||||||
|
|
||||||
|
|
||||||
|
router = Router(tags=['Machine and Dependencies'])
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Machine Schemas
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class MachineSchema(Schema):
|
||||||
|
"""Schema for Machine model."""
|
||||||
|
TYPE: str = 'machine.Machine'
|
||||||
|
id: UUID
|
||||||
|
created_at: datetime
|
||||||
|
modified_at: datetime
|
||||||
|
guid: str
|
||||||
|
hostname: str
|
||||||
|
hw_in_docker: bool
|
||||||
|
hw_in_vm: bool
|
||||||
|
hw_manufacturer: str
|
||||||
|
hw_product: str
|
||||||
|
hw_uuid: str
|
||||||
|
os_arch: str
|
||||||
|
os_family: str
|
||||||
|
os_platform: str
|
||||||
|
os_release: str
|
||||||
|
os_kernel: str
|
||||||
|
stats: dict
|
||||||
|
num_uses_succeeded: int
|
||||||
|
num_uses_failed: int
|
||||||
|
|
||||||
|
|
||||||
|
class MachineFilterSchema(FilterSchema):
|
||||||
|
id: Optional[str] = Field(None, q='id__startswith')
|
||||||
|
hostname: Optional[str] = Field(None, q='hostname__icontains')
|
||||||
|
os_platform: Optional[str] = Field(None, q='os_platform__icontains')
|
||||||
|
os_arch: Optional[str] = Field(None, q='os_arch')
|
||||||
|
hw_in_docker: Optional[bool] = Field(None, q='hw_in_docker')
|
||||||
|
hw_in_vm: Optional[bool] = Field(None, q='hw_in_vm')
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Dependency Schemas
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class DependencySchema(Schema):
|
||||||
|
"""Schema for Dependency model."""
|
||||||
|
TYPE: str = 'machine.Dependency'
|
||||||
|
id: UUID
|
||||||
|
created_at: datetime
|
||||||
|
modified_at: datetime
|
||||||
|
bin_name: str
|
||||||
|
bin_providers: str
|
||||||
|
custom_cmds: dict
|
||||||
|
config: dict
|
||||||
|
is_installed: bool
|
||||||
|
installed_count: int
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def resolve_is_installed(obj) -> bool:
|
||||||
|
return obj.is_installed
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def resolve_installed_count(obj) -> int:
|
||||||
|
return obj.installed_binaries.count()
|
||||||
|
|
||||||
|
|
||||||
|
class DependencyFilterSchema(FilterSchema):
|
||||||
|
id: Optional[str] = Field(None, q='id__startswith')
|
||||||
|
bin_name: Optional[str] = Field(None, q='bin_name__icontains')
|
||||||
|
bin_providers: Optional[str] = Field(None, q='bin_providers__icontains')
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# InstalledBinary Schemas
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class InstalledBinarySchema(Schema):
|
||||||
|
"""Schema for InstalledBinary model."""
|
||||||
|
TYPE: str = 'machine.InstalledBinary'
|
||||||
|
id: UUID
|
||||||
|
created_at: datetime
|
||||||
|
modified_at: datetime
|
||||||
|
machine_id: UUID
|
||||||
|
machine_hostname: str
|
||||||
|
dependency_id: Optional[UUID]
|
||||||
|
dependency_bin_name: Optional[str]
|
||||||
|
name: str
|
||||||
|
binprovider: str
|
||||||
|
abspath: str
|
||||||
|
version: str
|
||||||
|
sha256: str
|
||||||
|
is_valid: bool
|
||||||
|
num_uses_succeeded: int
|
||||||
|
num_uses_failed: int
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def resolve_machine_hostname(obj) -> str:
|
||||||
|
return obj.machine.hostname
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def resolve_dependency_id(obj) -> Optional[UUID]:
|
||||||
|
return obj.dependency_id
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def resolve_dependency_bin_name(obj) -> Optional[str]:
|
||||||
|
return obj.dependency.bin_name if obj.dependency else None
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def resolve_is_valid(obj) -> bool:
|
||||||
|
return obj.is_valid
|
||||||
|
|
||||||
|
|
||||||
|
class InstalledBinaryFilterSchema(FilterSchema):
|
||||||
|
id: Optional[str] = Field(None, q='id__startswith')
|
||||||
|
name: Optional[str] = Field(None, q='name__icontains')
|
||||||
|
binprovider: Optional[str] = Field(None, q='binprovider')
|
||||||
|
machine_id: Optional[str] = Field(None, q='machine_id__startswith')
|
||||||
|
dependency_id: Optional[str] = Field(None, q='dependency_id__startswith')
|
||||||
|
version: Optional[str] = Field(None, q='version__icontains')
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Machine Endpoints
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@router.get("/machines", response=List[MachineSchema], url_name="get_machines")
|
||||||
|
@paginate(CustomPagination)
|
||||||
|
def get_machines(request, filters: MachineFilterSchema = Query(...)):
|
||||||
|
"""List all machines."""
|
||||||
|
from machine.models import Machine
|
||||||
|
return filters.filter(Machine.objects.all()).distinct()
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/machine/{machine_id}", response=MachineSchema, url_name="get_machine")
|
||||||
|
def get_machine(request, machine_id: str):
|
||||||
|
"""Get a specific machine by ID."""
|
||||||
|
from machine.models import Machine
|
||||||
|
from django.db.models import Q
|
||||||
|
return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id))
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/machine/current", response=MachineSchema, url_name="get_current_machine")
|
||||||
|
def get_current_machine(request):
|
||||||
|
"""Get the current machine."""
|
||||||
|
from machine.models import Machine
|
||||||
|
return Machine.current()
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Dependency Endpoints
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@router.get("/dependencies", response=List[DependencySchema], url_name="get_dependencies")
|
||||||
|
@paginate(CustomPagination)
|
||||||
|
def get_dependencies(request, filters: DependencyFilterSchema = Query(...)):
|
||||||
|
"""List all dependencies."""
|
||||||
|
from machine.models import Dependency
|
||||||
|
return filters.filter(Dependency.objects.all()).distinct()
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/dependency/{dependency_id}", response=DependencySchema, url_name="get_dependency")
|
||||||
|
def get_dependency(request, dependency_id: str):
|
||||||
|
"""Get a specific dependency by ID or bin_name."""
|
||||||
|
from machine.models import Dependency
|
||||||
|
from django.db.models import Q
|
||||||
|
try:
|
||||||
|
return Dependency.objects.get(Q(id__startswith=dependency_id))
|
||||||
|
except Dependency.DoesNotExist:
|
||||||
|
return Dependency.objects.get(bin_name__iexact=dependency_id)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# InstalledBinary Endpoints
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
@router.get("/binaries", response=List[InstalledBinarySchema], url_name="get_binaries")
|
||||||
|
@paginate(CustomPagination)
|
||||||
|
def get_binaries(request, filters: InstalledBinaryFilterSchema = Query(...)):
|
||||||
|
"""List all installed binaries."""
|
||||||
|
from machine.models import InstalledBinary
|
||||||
|
return filters.filter(InstalledBinary.objects.all().select_related('machine', 'dependency')).distinct()
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/binary/{binary_id}", response=InstalledBinarySchema, url_name="get_binary")
|
||||||
|
def get_binary(request, binary_id: str):
|
||||||
|
"""Get a specific installed binary by ID."""
|
||||||
|
from machine.models import InstalledBinary
|
||||||
|
return InstalledBinary.objects.select_related('machine', 'dependency').get(id__startswith=binary_id)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/binary/by-name/{name}", response=List[InstalledBinarySchema], url_name="get_binaries_by_name")
|
||||||
|
def get_binaries_by_name(request, name: str):
|
||||||
|
"""Get all installed binaries with the given name."""
|
||||||
|
from machine.models import InstalledBinary
|
||||||
|
return list(InstalledBinary.objects.filter(name__iexact=name).select_related('machine', 'dependency'))
|
||||||
@@ -4,125 +4,157 @@ from uuid import UUID
|
|||||||
from typing import List, Any
|
from typing import List, Any
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
from ninja import Router, Schema
|
from ninja import Router, Schema
|
||||||
|
|
||||||
|
|
||||||
router = Router(tags=['Workers and Tasks'])
|
router = Router(tags=['Workers and Tasks'])
|
||||||
|
|
||||||
|
|
||||||
class TaskSchema(Schema):
|
class QueueItemSchema(Schema):
|
||||||
|
"""Schema for a single item in a worker's queue."""
|
||||||
TYPE: str
|
TYPE: str
|
||||||
|
|
||||||
id: UUID
|
id: UUID
|
||||||
description: str
|
|
||||||
|
|
||||||
status: str
|
status: str
|
||||||
retry_at: datetime | None
|
retry_at: datetime | None
|
||||||
|
|
||||||
created_at: datetime
|
created_at: datetime
|
||||||
modified_at: datetime
|
modified_at: datetime
|
||||||
created_by_id: int
|
description: str
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def resolve_TYPE(obj) -> str:
|
||||||
|
return f'{obj._meta.app_label}.{obj._meta.model_name}'
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_description(obj) -> str:
|
def resolve_description(obj) -> str:
|
||||||
return str(obj)
|
return str(obj)
|
||||||
|
|
||||||
|
|
||||||
class ActorSchema(Schema):
|
class WorkerSchema(Schema):
|
||||||
# TYPE: str = 'workers.actor.ActorType'
|
"""Schema for a Worker type."""
|
||||||
|
name: str
|
||||||
# name: str
|
|
||||||
#pid: int | None
|
|
||||||
idle_count: int
|
|
||||||
launch_kwargs: dict[str, Any]
|
|
||||||
mode: str
|
|
||||||
|
|
||||||
model: str
|
model: str
|
||||||
statemachine: str
|
max_tick_time: int
|
||||||
ACTIVE_STATE: str
|
max_concurrent_tasks: int
|
||||||
EVENT_NAME: str
|
poll_interval: float
|
||||||
CLAIM_ORDER: list[str]
|
idle_timeout: int
|
||||||
CLAIM_FROM_TOP_N: int
|
running_count: int
|
||||||
CLAIM_ATOMIC: bool
|
running_workers: List[dict[str, Any]]
|
||||||
MAX_TICK_TIME: int
|
queue_count: int
|
||||||
MAX_CONCURRENT_ACTORS: int
|
queue: List[QueueItemSchema]
|
||||||
|
|
||||||
future: list[TaskSchema]
|
|
||||||
pending: list[TaskSchema]
|
|
||||||
stalled: list[TaskSchema]
|
|
||||||
active: list[TaskSchema]
|
|
||||||
past: list[TaskSchema]
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_model(obj) -> str:
|
def resolve_model(obj) -> str:
|
||||||
return obj.Model.__name__
|
Model = obj.get_model()
|
||||||
|
return f'{Model._meta.app_label}.{Model._meta.model_name}'
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_statemachine(obj) -> str:
|
def resolve_max_tick_time(obj) -> int:
|
||||||
return obj.StateMachineClass.__name__
|
return obj.MAX_TICK_TIME
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_name(obj) -> str:
|
def resolve_max_concurrent_tasks(obj) -> int:
|
||||||
return str(obj)
|
return obj.MAX_CONCURRENT_TASKS
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_ACTIVE_STATE(obj) -> str:
|
def resolve_poll_interval(obj) -> float:
|
||||||
return str(obj.ACTIVE_STATE)
|
return obj.POLL_INTERVAL
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_FINAL_STATES(obj) -> list[str]:
|
def resolve_idle_timeout(obj) -> int:
|
||||||
return [str(state) for state in obj.FINAL_STATES]
|
return obj.IDLE_TIMEOUT
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_future(obj) -> list[TaskSchema]:
|
def resolve_running_count(obj) -> int:
|
||||||
return [obj for obj in obj.qs.filter(obj.future_q).order_by('-retry_at')]
|
return len(obj.get_running_workers())
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_pending(obj) -> list[TaskSchema]:
|
def resolve_running_workers(obj) -> List[dict[str, Any]]:
|
||||||
return [obj for obj in obj.qs.filter(obj.pending_q).order_by('-retry_at')]
|
return obj.get_running_workers()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_stalled(obj) -> list[TaskSchema]:
|
def resolve_queue_count(obj) -> int:
|
||||||
return [obj for obj in obj.qs.filter(obj.stalled_q).order_by('-retry_at')]
|
return obj.get_queue().count()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def resolve_active(obj) -> list[TaskSchema]:
|
def resolve_queue(obj) -> List[QueueItemSchema]:
|
||||||
return [obj for obj in obj.qs.filter(obj.active_q).order_by('-retry_at')]
|
return list(obj.get_queue()[:50]) # Limit to 50 items
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def resolve_past(obj) -> list[TaskSchema]:
|
|
||||||
return [obj for obj in obj.qs.filter(obj.final_q).order_by('-modified_at')]
|
|
||||||
|
|
||||||
|
|
||||||
class OrchestratorSchema(Schema):
|
class OrchestratorSchema(Schema):
|
||||||
# TYPE: str = 'workers.orchestrator.Orchestrator'
|
"""Schema for the Orchestrator."""
|
||||||
|
is_running: bool
|
||||||
#pid: int | None
|
poll_interval: float
|
||||||
exit_on_idle: bool
|
idle_timeout: int
|
||||||
mode: str
|
max_workers_per_type: int
|
||||||
|
max_total_workers: int
|
||||||
actors: list[ActorSchema]
|
total_worker_count: int
|
||||||
|
workers: List[WorkerSchema]
|
||||||
@staticmethod
|
|
||||||
def resolve_actors(obj) -> list[ActorSchema]:
|
|
||||||
return [actor() for actor in obj.actor_types.values()]
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/orchestrators", response=List[OrchestratorSchema], url_name="get_orchestrators")
|
@router.get("/orchestrator", response=OrchestratorSchema, url_name="get_orchestrator")
|
||||||
def get_orchestrators(request):
|
def get_orchestrator(request):
|
||||||
"""List all the task orchestrators (aka Orchestrators) that are currently running"""
|
"""Get the orchestrator status and all worker queues."""
|
||||||
|
|
||||||
from workers.orchestrator import Orchestrator
|
from workers.orchestrator import Orchestrator
|
||||||
|
from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
|
||||||
|
|
||||||
orchestrator = Orchestrator()
|
orchestrator = Orchestrator()
|
||||||
|
|
||||||
return [orchestrator]
|
# Create temporary worker instances to query their queues
|
||||||
|
workers = [
|
||||||
|
CrawlWorker(worker_id=-1),
|
||||||
|
SnapshotWorker(worker_id=-1),
|
||||||
|
ArchiveResultWorker(worker_id=-1),
|
||||||
|
]
|
||||||
|
|
||||||
|
return {
|
||||||
|
'is_running': orchestrator.is_running(),
|
||||||
|
'poll_interval': orchestrator.POLL_INTERVAL,
|
||||||
|
'idle_timeout': orchestrator.IDLE_TIMEOUT,
|
||||||
|
'max_workers_per_type': orchestrator.MAX_WORKERS_PER_TYPE,
|
||||||
|
'max_total_workers': orchestrator.MAX_TOTAL_WORKERS,
|
||||||
|
'total_worker_count': orchestrator.get_total_worker_count(),
|
||||||
|
'workers': workers,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@router.get("/actors", response=List[ActorSchema], url_name="get_actors")
|
@router.get("/workers", response=List[WorkerSchema], url_name="get_workers")
|
||||||
def get_actors(request):
|
def get_workers(request):
|
||||||
"""List all the task consumer workers (aka Actors) that are currently running"""
|
"""List all worker types and their current status."""
|
||||||
|
from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
|
||||||
|
|
||||||
from workers.orchestrator import Orchestrator
|
# Create temporary instances to query their queues
|
||||||
orchestrator = Orchestrator()
|
return [
|
||||||
return orchestrator.actor_types.values()
|
CrawlWorker(worker_id=-1),
|
||||||
|
SnapshotWorker(worker_id=-1),
|
||||||
|
ArchiveResultWorker(worker_id=-1),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/worker/{worker_name}", response=WorkerSchema, url_name="get_worker")
|
||||||
|
def get_worker(request, worker_name: str):
|
||||||
|
"""Get status and queue for a specific worker type."""
|
||||||
|
from workers.worker import WORKER_TYPES
|
||||||
|
|
||||||
|
if worker_name not in WORKER_TYPES:
|
||||||
|
from ninja.errors import HttpError
|
||||||
|
raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}")
|
||||||
|
|
||||||
|
WorkerClass = WORKER_TYPES[worker_name]
|
||||||
|
return WorkerClass(worker_id=-1)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/worker/{worker_name}/queue", response=List[QueueItemSchema], url_name="get_worker_queue")
|
||||||
|
def get_worker_queue(request, worker_name: str, limit: int = 100):
|
||||||
|
"""Get the current queue for a specific worker type."""
|
||||||
|
from workers.worker import WORKER_TYPES
|
||||||
|
|
||||||
|
if worker_name not in WORKER_TYPES:
|
||||||
|
from ninja.errors import HttpError
|
||||||
|
raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}")
|
||||||
|
|
||||||
|
WorkerClass = WORKER_TYPES[worker_name]
|
||||||
|
worker = WorkerClass(worker_id=-1)
|
||||||
|
return list(worker.get_queue()[:limit])
|
||||||
|
|
||||||
|
|
||||||
|
# Progress endpoint moved to core.views.live_progress_view for simplicity
|
||||||
|
|||||||
@@ -2,76 +2,226 @@
|
|||||||
|
|
||||||
__package__ = 'archivebox.base_models'
|
__package__ = 'archivebox.base_models'
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
from django import forms
|
||||||
from django.contrib import admin
|
from django.contrib import admin
|
||||||
from django.utils.html import format_html, mark_safe
|
from django.utils.html import format_html, mark_safe
|
||||||
from django_object_actions import DjangoObjectActions
|
from django_object_actions import DjangoObjectActions
|
||||||
|
|
||||||
|
|
||||||
|
class KeyValueWidget(forms.Widget):
|
||||||
|
"""
|
||||||
|
A widget that renders JSON dict as editable key-value input fields
|
||||||
|
with + and - buttons to add/remove rows.
|
||||||
|
Includes autocomplete for available config keys from the plugin system.
|
||||||
|
"""
|
||||||
|
template_name = None # We render manually
|
||||||
|
|
||||||
|
class Media:
|
||||||
|
css = {
|
||||||
|
'all': []
|
||||||
|
}
|
||||||
|
js = []
|
||||||
|
|
||||||
|
def _get_config_options(self):
|
||||||
|
"""Get available config options from plugins."""
|
||||||
|
try:
|
||||||
|
from archivebox.hooks import discover_plugin_configs
|
||||||
|
plugin_configs = discover_plugin_configs()
|
||||||
|
options = {}
|
||||||
|
for plugin_name, schema in plugin_configs.items():
|
||||||
|
for key, prop in schema.get('properties', {}).items():
|
||||||
|
options[key] = {
|
||||||
|
'plugin': plugin_name,
|
||||||
|
'type': prop.get('type', 'string'),
|
||||||
|
'default': prop.get('default', ''),
|
||||||
|
'description': prop.get('description', ''),
|
||||||
|
}
|
||||||
|
return options
|
||||||
|
except Exception:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
def render(self, name, value, attrs=None, renderer=None):
|
||||||
|
# Parse JSON value to dict
|
||||||
|
if value is None:
|
||||||
|
data = {}
|
||||||
|
elif isinstance(value, str):
|
||||||
|
try:
|
||||||
|
data = json.loads(value) if value else {}
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
data = {}
|
||||||
|
elif isinstance(value, dict):
|
||||||
|
data = value
|
||||||
|
else:
|
||||||
|
data = {}
|
||||||
|
|
||||||
|
widget_id = attrs.get('id', name) if attrs else name
|
||||||
|
config_options = self._get_config_options()
|
||||||
|
|
||||||
|
# Build datalist options
|
||||||
|
datalist_options = '\n'.join(
|
||||||
|
f'<option value="{self._escape(key)}">{self._escape(opt["description"][:60] or opt["type"])}</option>'
|
||||||
|
for key, opt in sorted(config_options.items())
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build config metadata as JSON for JS
|
||||||
|
config_meta_json = json.dumps(config_options)
|
||||||
|
|
||||||
|
html = f'''
|
||||||
|
<div id="{widget_id}_container" class="key-value-editor" style="max-width: 700px;">
|
||||||
|
<datalist id="{widget_id}_keys">
|
||||||
|
{datalist_options}
|
||||||
|
</datalist>
|
||||||
|
<div id="{widget_id}_rows" class="key-value-rows">
|
||||||
|
'''
|
||||||
|
|
||||||
|
# Render existing key-value pairs
|
||||||
|
row_idx = 0
|
||||||
|
for key, val in data.items():
|
||||||
|
val_str = json.dumps(val) if not isinstance(val, str) else val
|
||||||
|
html += self._render_row(widget_id, row_idx, key, val_str)
|
||||||
|
row_idx += 1
|
||||||
|
|
||||||
|
# Always add one empty row for new entries
|
||||||
|
html += self._render_row(widget_id, row_idx, '', '')
|
||||||
|
|
||||||
|
html += f'''
|
||||||
|
</div>
|
||||||
|
<div style="display: flex; gap: 8px; align-items: center; margin-top: 8px;">
|
||||||
|
<button type="button" onclick="addKeyValueRow_{widget_id}()"
|
||||||
|
style="padding: 4px 12px; cursor: pointer; background: #417690; color: white; border: none; border-radius: 4px;">
|
||||||
|
+ Add Row
|
||||||
|
</button>
|
||||||
|
<span id="{widget_id}_hint" style="font-size: 11px; color: #666; font-style: italic;"></span>
|
||||||
|
</div>
|
||||||
|
<input type="hidden" name="{name}" id="{widget_id}" value="">
|
||||||
|
<script>
|
||||||
|
(function() {{
|
||||||
|
var configMeta_{widget_id} = {config_meta_json};
|
||||||
|
|
||||||
|
function showKeyHint_{widget_id}(key) {{
|
||||||
|
var hint = document.getElementById('{widget_id}_hint');
|
||||||
|
var meta = configMeta_{widget_id}[key];
|
||||||
|
if (meta) {{
|
||||||
|
hint.innerHTML = '<b>' + key + '</b>: ' + (meta.description || meta.type) +
|
||||||
|
(meta.default !== '' ? ' <span style="color:#888">(default: ' + meta.default + ')</span>' : '');
|
||||||
|
}} else {{
|
||||||
|
hint.textContent = key ? 'Custom key: ' + key : '';
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
|
||||||
|
function updateHiddenField_{widget_id}() {{
|
||||||
|
var container = document.getElementById('{widget_id}_rows');
|
||||||
|
var rows = container.querySelectorAll('.key-value-row');
|
||||||
|
var result = {{}};
|
||||||
|
rows.forEach(function(row) {{
|
||||||
|
var keyInput = row.querySelector('.kv-key');
|
||||||
|
var valInput = row.querySelector('.kv-value');
|
||||||
|
if (keyInput && valInput && keyInput.value.trim()) {{
|
||||||
|
var key = keyInput.value.trim();
|
||||||
|
var val = valInput.value.trim();
|
||||||
|
// Try to parse as JSON (for booleans, numbers, etc)
|
||||||
|
try {{
|
||||||
|
if (val === 'true') result[key] = true;
|
||||||
|
else if (val === 'false') result[key] = false;
|
||||||
|
else if (val === 'null') result[key] = null;
|
||||||
|
else if (!isNaN(val) && val !== '') result[key] = Number(val);
|
||||||
|
else if ((val.startsWith('{{') && val.endsWith('}}')) ||
|
||||||
|
(val.startsWith('[') && val.endsWith(']')) ||
|
||||||
|
(val.startsWith('"') && val.endsWith('"')))
|
||||||
|
result[key] = JSON.parse(val);
|
||||||
|
else result[key] = val;
|
||||||
|
}} catch(e) {{
|
||||||
|
result[key] = val;
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}});
|
||||||
|
document.getElementById('{widget_id}').value = JSON.stringify(result);
|
||||||
|
}}
|
||||||
|
|
||||||
|
window.addKeyValueRow_{widget_id} = function() {{
|
||||||
|
var container = document.getElementById('{widget_id}_rows');
|
||||||
|
var rows = container.querySelectorAll('.key-value-row');
|
||||||
|
var newIdx = rows.length;
|
||||||
|
var newRow = document.createElement('div');
|
||||||
|
newRow.className = 'key-value-row';
|
||||||
|
newRow.style.cssText = 'display: flex; gap: 8px; margin-bottom: 6px; align-items: center;';
|
||||||
|
newRow.innerHTML = '<input type="text" class="kv-key" placeholder="KEY" list="{widget_id}_keys" ' +
|
||||||
|
'style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;" ' +
|
||||||
|
'onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}(); showKeyHint_{widget_id}(this.value)" onfocus="showKeyHint_{widget_id}(this.value)">' +
|
||||||
|
'<input type="text" class="kv-value" placeholder="value" ' +
|
||||||
|
'style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;" ' +
|
||||||
|
'onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}()">' +
|
||||||
|
'<button type="button" onclick="removeKeyValueRow_{widget_id}(this)" ' +
|
||||||
|
'style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;">−</button>';
|
||||||
|
container.appendChild(newRow);
|
||||||
|
newRow.querySelector('.kv-key').focus();
|
||||||
|
}};
|
||||||
|
|
||||||
|
window.removeKeyValueRow_{widget_id} = function(btn) {{
|
||||||
|
var row = btn.parentElement;
|
||||||
|
row.remove();
|
||||||
|
updateHiddenField_{widget_id}();
|
||||||
|
}};
|
||||||
|
|
||||||
|
window.showKeyHint_{widget_id} = showKeyHint_{widget_id};
|
||||||
|
window.updateHiddenField_{widget_id} = updateHiddenField_{widget_id};
|
||||||
|
|
||||||
|
// Initialize on load
|
||||||
|
document.addEventListener('DOMContentLoaded', function() {{
|
||||||
|
updateHiddenField_{widget_id}();
|
||||||
|
}});
|
||||||
|
// Also run immediately in case DOM is already ready
|
||||||
|
if (document.readyState !== 'loading') {{
|
||||||
|
updateHiddenField_{widget_id}();
|
||||||
|
}}
|
||||||
|
|
||||||
|
// Update on any input change
|
||||||
|
document.getElementById('{widget_id}_rows').addEventListener('input', updateHiddenField_{widget_id});
|
||||||
|
}})();
|
||||||
|
</script>
|
||||||
|
</div>
|
||||||
|
'''
|
||||||
|
return mark_safe(html)
|
||||||
|
|
||||||
|
def _render_row(self, widget_id, idx, key, value):
|
||||||
|
return f'''
|
||||||
|
<div class="key-value-row" style="display: flex; gap: 8px; margin-bottom: 6px; align-items: center;">
|
||||||
|
<input type="text" class="kv-key" value="{self._escape(key)}" placeholder="KEY" list="{widget_id}_keys"
|
||||||
|
style="flex: 1; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;"
|
||||||
|
onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}(); showKeyHint_{widget_id}(this.value)" onfocus="showKeyHint_{widget_id}(this.value)">
|
||||||
|
<input type="text" class="kv-value" value="{self._escape(value)}" placeholder="value"
|
||||||
|
style="flex: 2; padding: 6px 8px; border: 1px solid #ccc; border-radius: 4px; font-family: monospace; font-size: 12px;"
|
||||||
|
onchange="updateHiddenField_{widget_id}()" oninput="updateHiddenField_{widget_id}()">
|
||||||
|
<button type="button" onclick="removeKeyValueRow_{widget_id}(this)"
|
||||||
|
style="padding: 4px 10px; cursor: pointer; background: #ba2121; color: white; border: none; border-radius: 4px; font-weight: bold;">−</button>
|
||||||
|
</div>
|
||||||
|
'''
|
||||||
|
|
||||||
|
def _escape(self, s):
|
||||||
|
"""Escape HTML special chars in attribute values."""
|
||||||
|
if not s:
|
||||||
|
return ''
|
||||||
|
return str(s).replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
|
||||||
|
|
||||||
|
def value_from_datadict(self, data, files, name):
|
||||||
|
value = data.get(name, '{}')
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
class ConfigEditorMixin:
|
class ConfigEditorMixin:
|
||||||
"""
|
"""
|
||||||
Mixin for admin classes with a config JSON field.
|
Mixin for admin classes with a config JSON field.
|
||||||
|
|
||||||
Provides a readonly field that shows available config options
|
Provides a key-value editor widget with autocomplete for available config keys.
|
||||||
from all discovered plugin schemas.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@admin.display(description='Available Config Options')
|
def formfield_for_dbfield(self, db_field, request, **kwargs):
|
||||||
def available_config_options(self, obj):
|
"""Use KeyValueWidget for the config JSON field."""
|
||||||
"""Show documentation for available config keys."""
|
if db_field.name == 'config':
|
||||||
try:
|
kwargs['widget'] = KeyValueWidget()
|
||||||
from archivebox.hooks import discover_plugin_configs
|
return super().formfield_for_dbfield(db_field, request, **kwargs)
|
||||||
plugin_configs = discover_plugin_configs()
|
|
||||||
except ImportError:
|
|
||||||
return format_html('<i>Plugin config system not available</i>')
|
|
||||||
|
|
||||||
html_parts = [
|
|
||||||
'<details>',
|
|
||||||
'<summary style="cursor: pointer; font-weight: bold; padding: 4px;">',
|
|
||||||
'Click to see available config keys ({})</summary>'.format(
|
|
||||||
sum(len(s.get('properties', {})) for s in plugin_configs.values())
|
|
||||||
),
|
|
||||||
'<div style="max-height: 400px; overflow-y: auto; padding: 8px; background: #f8f8f8; border-radius: 4px; font-family: monospace; font-size: 11px;">',
|
|
||||||
]
|
|
||||||
|
|
||||||
for plugin_name, schema in sorted(plugin_configs.items()):
|
|
||||||
properties = schema.get('properties', {})
|
|
||||||
if not properties:
|
|
||||||
continue
|
|
||||||
|
|
||||||
html_parts.append(f'<div style="margin: 8px 0;"><strong style="color: #333;">{plugin_name}</strong></div>')
|
|
||||||
html_parts.append('<table style="width: 100%; border-collapse: collapse; margin-bottom: 12px;">')
|
|
||||||
html_parts.append('<tr style="background: #eee;"><th style="text-align: left; padding: 4px;">Key</th><th style="text-align: left; padding: 4px;">Type</th><th style="text-align: left; padding: 4px;">Default</th><th style="text-align: left; padding: 4px;">Description</th></tr>')
|
|
||||||
|
|
||||||
for key, prop in sorted(properties.items()):
|
|
||||||
prop_type = prop.get('type', 'string')
|
|
||||||
default = prop.get('default', '')
|
|
||||||
description = prop.get('description', '')
|
|
||||||
|
|
||||||
# Truncate long defaults
|
|
||||||
default_str = str(default)
|
|
||||||
if len(default_str) > 30:
|
|
||||||
default_str = default_str[:27] + '...'
|
|
||||||
|
|
||||||
html_parts.append(
|
|
||||||
f'<tr style="border-bottom: 1px solid #ddd;">'
|
|
||||||
f'<td style="padding: 4px; font-weight: bold;">{key}</td>'
|
|
||||||
f'<td style="padding: 4px; color: #666;">{prop_type}</td>'
|
|
||||||
f'<td style="padding: 4px; color: #666;">{default_str}</td>'
|
|
||||||
f'<td style="padding: 4px;">{description}</td>'
|
|
||||||
f'</tr>'
|
|
||||||
)
|
|
||||||
|
|
||||||
html_parts.append('</table>')
|
|
||||||
|
|
||||||
html_parts.append('</div></details>')
|
|
||||||
html_parts.append(
|
|
||||||
'<p style="margin-top: 8px; color: #666; font-size: 11px;">'
|
|
||||||
'<strong>Usage:</strong> Add key-value pairs in JSON format, e.g., '
|
|
||||||
'<code>{"SAVE_WGET": false, "WGET_TIMEOUT": 120}</code>'
|
|
||||||
'</p>'
|
|
||||||
)
|
|
||||||
|
|
||||||
return mark_safe(''.join(html_parts))
|
|
||||||
|
|
||||||
|
|
||||||
class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin):
|
class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin):
|
||||||
|
|||||||
@@ -72,9 +72,10 @@ def add(urls: str | list[str],
|
|||||||
cli_args[0] = 'archivebox'
|
cli_args[0] = 'archivebox'
|
||||||
cmd_str = ' '.join(cli_args)
|
cmd_str = ' '.join(cli_args)
|
||||||
|
|
||||||
|
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||||
seed = Seed.from_file(
|
seed = Seed.from_file(
|
||||||
sources_file,
|
sources_file,
|
||||||
label=f'{USER}@{HOSTNAME} $ {cmd_str}',
|
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
|
||||||
parser=parser,
|
parser=parser,
|
||||||
tag=tag,
|
tag=tag,
|
||||||
created_by=created_by_id,
|
created_by=created_by_id,
|
||||||
|
|||||||
@@ -11,21 +11,53 @@ __package__ = "archivebox.config"
|
|||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, Optional, List, Type, TYPE_CHECKING, cast
|
from typing import Any, Dict, Optional, List, Type, Tuple, TYPE_CHECKING, cast
|
||||||
from configparser import ConfigParser
|
from configparser import ConfigParser
|
||||||
|
|
||||||
from pydantic import Field
|
from pydantic import Field
|
||||||
from pydantic_settings import BaseSettings
|
from pydantic_settings import BaseSettings, PydanticBaseSettingsSource
|
||||||
|
|
||||||
|
|
||||||
|
class IniConfigSettingsSource(PydanticBaseSettingsSource):
|
||||||
|
"""
|
||||||
|
Custom settings source that reads from ArchiveBox.conf (INI format).
|
||||||
|
Flattens all sections into a single namespace.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def get_field_value(self, field: Any, field_name: str) -> Tuple[Any, str, bool]:
|
||||||
|
config_vals = self._load_config_file()
|
||||||
|
field_value = config_vals.get(field_name.upper())
|
||||||
|
return field_value, field_name, False
|
||||||
|
|
||||||
|
def __call__(self) -> Dict[str, Any]:
|
||||||
|
return self._load_config_file()
|
||||||
|
|
||||||
|
def _load_config_file(self) -> Dict[str, Any]:
|
||||||
|
try:
|
||||||
|
from archivebox.config.constants import CONSTANTS
|
||||||
|
config_path = CONSTANTS.CONFIG_FILE
|
||||||
|
except ImportError:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
if not config_path.exists():
|
||||||
|
return {}
|
||||||
|
|
||||||
|
parser = ConfigParser()
|
||||||
|
parser.optionxform = lambda x: x # preserve case
|
||||||
|
parser.read(config_path)
|
||||||
|
|
||||||
|
# Flatten all sections into single namespace (ignore section headers)
|
||||||
|
return {key.upper(): value for section in parser.sections() for key, value in parser.items(section)}
|
||||||
|
|
||||||
|
|
||||||
class BaseConfigSet(BaseSettings):
|
class BaseConfigSet(BaseSettings):
|
||||||
"""
|
"""
|
||||||
Base class for config sections.
|
Base class for config sections.
|
||||||
|
|
||||||
Automatically loads values from:
|
Automatically loads values from (highest to lowest priority):
|
||||||
1. Environment variables (highest priority)
|
1. Environment variables
|
||||||
2. ArchiveBox.conf file (if exists)
|
2. ArchiveBox.conf file (INI format, flattened)
|
||||||
3. Default values (lowest priority)
|
3. Default values
|
||||||
|
|
||||||
Subclasses define fields with defaults and types:
|
Subclasses define fields with defaults and types:
|
||||||
|
|
||||||
@@ -35,11 +67,30 @@ class BaseConfigSet(BaseSettings):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
# Use env vars with ARCHIVEBOX_ prefix or raw name
|
|
||||||
env_prefix = ""
|
env_prefix = ""
|
||||||
extra = "ignore"
|
extra = "ignore"
|
||||||
validate_default = True
|
validate_default = True
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def settings_customise_sources(
|
||||||
|
cls,
|
||||||
|
settings_cls: Type[BaseSettings],
|
||||||
|
init_settings: PydanticBaseSettingsSource,
|
||||||
|
env_settings: PydanticBaseSettingsSource,
|
||||||
|
dotenv_settings: PydanticBaseSettingsSource,
|
||||||
|
file_secret_settings: PydanticBaseSettingsSource,
|
||||||
|
) -> Tuple[PydanticBaseSettingsSource, ...]:
|
||||||
|
"""
|
||||||
|
Define the order of settings sources (first = highest priority).
|
||||||
|
"""
|
||||||
|
return (
|
||||||
|
init_settings, # 1. Passed to __init__
|
||||||
|
env_settings, # 2. Environment variables
|
||||||
|
IniConfigSettingsSource(settings_cls), # 3. ArchiveBox.conf file
|
||||||
|
# dotenv_settings, # Skip .env files
|
||||||
|
# file_secret_settings, # Skip secrets files
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load_from_file(cls, config_path: Path) -> Dict[str, str]:
|
def load_from_file(cls, config_path: Path) -> Dict[str, str]:
|
||||||
"""Load config values from INI file."""
|
"""Load config values from INI file."""
|
||||||
@@ -47,7 +98,7 @@ class BaseConfigSet(BaseSettings):
|
|||||||
return {}
|
return {}
|
||||||
|
|
||||||
parser = ConfigParser()
|
parser = ConfigParser()
|
||||||
parser.optionxform = lambda x: x # type: ignore # preserve case
|
parser.optionxform = lambda x: x # preserve case
|
||||||
parser.read(config_path)
|
parser.read(config_path)
|
||||||
|
|
||||||
# Flatten all sections into single namespace
|
# Flatten all sections into single namespace
|
||||||
|
|||||||
@@ -256,7 +256,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
|||||||
# Show a helpful message when no plugins found
|
# Show a helpful message when no plugins found
|
||||||
rows['Name'].append('(no plugins found)')
|
rows['Name'].append('(no plugins found)')
|
||||||
rows['Source'].append('-')
|
rows['Source'].append('-')
|
||||||
rows['Path'].append(format_html('<code>archivebox/plugins/</code> or <code>data/plugins/</code>'))
|
rows['Path'].append(mark_safe('<code>archivebox/plugins/</code> or <code>data/plugins/</code>'))
|
||||||
rows['Hooks'].append('-')
|
rows['Hooks'].append('-')
|
||||||
|
|
||||||
return TableContext(
|
return TableContext(
|
||||||
|
|||||||
@@ -9,25 +9,17 @@ from django.core.exceptions import ValidationError
|
|||||||
from django.urls import reverse, resolve
|
from django.urls import reverse, resolve
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
from huey_monitor.admin import TaskModel
|
|
||||||
|
|
||||||
from archivebox.config import DATA_DIR
|
from archivebox.config import DATA_DIR
|
||||||
from archivebox.config.common import SERVER_CONFIG
|
from archivebox.config.common import SERVER_CONFIG
|
||||||
from archivebox.misc.paginators import AccelleratedPaginator
|
from archivebox.misc.paginators import AccelleratedPaginator
|
||||||
from archivebox.base_models.admin import BaseModelAdmin
|
from archivebox.base_models.admin import BaseModelAdmin
|
||||||
|
from archivebox.hooks import get_extractor_icon
|
||||||
|
|
||||||
|
|
||||||
from core.models import ArchiveResult, Snapshot
|
from core.models import ArchiveResult, Snapshot
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def result_url(result: TaskModel) -> str:
|
|
||||||
url = reverse("admin:huey_monitor_taskmodel_change", args=[str(result.id)])
|
|
||||||
return format_html('<a href="{url}" class="fade-in-progress-url">See progress...</a>'.format(url=url))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class ArchiveResultInline(admin.TabularInline):
|
class ArchiveResultInline(admin.TabularInline):
|
||||||
name = 'Archive Results Log'
|
name = 'Archive Results Log'
|
||||||
model = ArchiveResult
|
model = ArchiveResult
|
||||||
@@ -101,9 +93,9 @@ class ArchiveResultInline(admin.TabularInline):
|
|||||||
|
|
||||||
|
|
||||||
class ArchiveResultAdmin(BaseModelAdmin):
|
class ArchiveResultAdmin(BaseModelAdmin):
|
||||||
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor', 'cmd_str', 'output_str')
|
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
|
||||||
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
|
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
|
||||||
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary')
|
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
|
||||||
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
||||||
fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
|
fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
|
||||||
autocomplete_fields = ['snapshot']
|
autocomplete_fields = ['snapshot']
|
||||||
@@ -144,6 +136,16 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
|||||||
def tags_str(self, result):
|
def tags_str(self, result):
|
||||||
return result.snapshot.tags_str()
|
return result.snapshot.tags_str()
|
||||||
|
|
||||||
|
@admin.display(description='Extractor', ordering='extractor')
|
||||||
|
def extractor_with_icon(self, result):
|
||||||
|
icon = get_extractor_icon(result.extractor)
|
||||||
|
return format_html(
|
||||||
|
'<span title="{}">{}</span> {}',
|
||||||
|
result.extractor,
|
||||||
|
icon,
|
||||||
|
result.extractor,
|
||||||
|
)
|
||||||
|
|
||||||
def cmd_str(self, result):
|
def cmd_str(self, result):
|
||||||
return format_html(
|
return format_html(
|
||||||
'<pre>{}</pre>',
|
'<pre>{}</pre>',
|
||||||
@@ -151,10 +153,12 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def output_str(self, result):
|
def output_str(self, result):
|
||||||
|
# Determine output link path - use output if file exists, otherwise link to index
|
||||||
|
output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html'
|
||||||
return format_html(
|
return format_html(
|
||||||
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
|
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
|
||||||
result.snapshot.timestamp,
|
result.snapshot.timestamp,
|
||||||
result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html',
|
output_path,
|
||||||
result.output,
|
result.output,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -185,7 +189,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
|||||||
is_hidden = filename.startswith('.')
|
is_hidden = filename.startswith('.')
|
||||||
output_str += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
|
output_str += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
|
||||||
|
|
||||||
return output_str + format_html('</code></pre>')
|
return output_str + mark_safe('</code></pre>')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -36,7 +36,18 @@ def register_admin_site():
|
|||||||
admin.site = archivebox_admin
|
admin.site = archivebox_admin
|
||||||
sites.site = archivebox_admin
|
sites.site = archivebox_admin
|
||||||
|
|
||||||
# Plugin admin registration is now handled by individual app admins
|
# Register admin views for each app
|
||||||
# No longer using archivebox.pm.hook.register_admin()
|
# (Previously handled by ABX plugin system, now called directly)
|
||||||
|
from core.admin import register_admin as register_core_admin
|
||||||
|
from crawls.admin import register_admin as register_crawls_admin
|
||||||
|
from api.admin import register_admin as register_api_admin
|
||||||
|
from machine.admin import register_admin as register_machine_admin
|
||||||
|
from workers.admin import register_admin as register_workers_admin
|
||||||
|
|
||||||
|
register_core_admin(archivebox_admin)
|
||||||
|
register_crawls_admin(archivebox_admin)
|
||||||
|
register_api_admin(archivebox_admin)
|
||||||
|
register_machine_admin(archivebox_admin)
|
||||||
|
register_workers_admin(archivebox_admin)
|
||||||
|
|
||||||
return archivebox_admin
|
return archivebox_admin
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add
|
|||||||
|
|
||||||
from core.models import Tag
|
from core.models import Tag
|
||||||
from core.admin_tags import TagInline
|
from core.admin_tags import TagInline
|
||||||
from core.admin_archiveresults import ArchiveResultInline, result_url
|
from core.admin_archiveresults import ArchiveResultInline
|
||||||
|
|
||||||
|
|
||||||
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
|
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
|
||||||
@@ -54,10 +54,10 @@ class SnapshotActionForm(ActionForm):
|
|||||||
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||||
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
|
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
|
||||||
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
|
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
|
||||||
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'link_dir', 'available_config_options')
|
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir')
|
||||||
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
|
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
|
||||||
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
|
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
|
||||||
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', 'available_config_options', *readonly_fields[:-1])
|
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields)
|
||||||
ordering = ['-created_at']
|
ordering = ['-created_at']
|
||||||
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
|
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
|
||||||
inlines = [TagInline, ArchiveResultInline]
|
inlines = [TagInline, ArchiveResultInline]
|
||||||
@@ -93,12 +93,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
|||||||
# self.request = request
|
# self.request = request
|
||||||
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
|
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
|
||||||
|
|
||||||
@admin.action(
|
@admin.display(description="Imported Timestamp")
|
||||||
description="Imported Timestamp"
|
|
||||||
)
|
|
||||||
def imported_timestamp(self, obj):
|
def imported_timestamp(self, obj):
|
||||||
context = RequestContext(self.request, {
|
context = RequestContext(self.request, {
|
||||||
'bookmarked_date': obj.bookmarked,
|
'bookmarked_date': obj.bookmarked_at,
|
||||||
'timestamp': obj.timestamp,
|
'timestamp': obj.timestamp,
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -145,22 +143,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
|||||||
|
|
||||||
def status_info(self, obj):
|
def status_info(self, obj):
|
||||||
return format_html(
|
return format_html(
|
||||||
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
|
|
||||||
'''
|
'''
|
||||||
Archived: {} ({} files {})
|
Archived: {} ({} files {})
|
||||||
Favicon: <img src="{}" style="height: 20px"/>
|
Favicon: <img src="{}" style="height: 20px"/>
|
||||||
Status code: {} <br/>
|
|
||||||
Server: {}
|
|
||||||
Content type: {}
|
|
||||||
Extension: {}
|
Extension: {}
|
||||||
''',
|
''',
|
||||||
'✅' if obj.is_archived else '❌',
|
'✅' if obj.is_archived else '❌',
|
||||||
obj.num_outputs,
|
obj.num_outputs,
|
||||||
self.size(obj) or '0kb',
|
self.size(obj) or '0kb',
|
||||||
f'/archive/{obj.timestamp}/favicon.ico',
|
f'/archive/{obj.timestamp}/favicon.ico',
|
||||||
obj.status_code or '-',
|
|
||||||
obj.headers and obj.headers.get('Server') or '-',
|
|
||||||
obj.headers and obj.headers.get('Content-Type') or '-',
|
|
||||||
obj.extension or '-',
|
obj.extension or '-',
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -184,8 +175,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
|||||||
obj.archive_path,
|
obj.archive_path,
|
||||||
obj.archive_path,
|
obj.archive_path,
|
||||||
obj.archive_path,
|
obj.archive_path,
|
||||||
'fetched' if obj.latest_title or obj.title else 'pending',
|
'fetched' if obj.title else 'pending',
|
||||||
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
|
urldecode(htmldecode(obj.title or ''))[:128] or 'Pending...'
|
||||||
) + mark_safe(f' <span class="tags">{tags}</span>')
|
) + mark_safe(f' <span class="tags">{tags}</span>')
|
||||||
|
|
||||||
@admin.display(
|
@admin.display(
|
||||||
@@ -259,14 +250,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
|||||||
description="ℹ️ Get Title"
|
description="ℹ️ Get Title"
|
||||||
)
|
)
|
||||||
def update_titles(self, request, queryset):
|
def update_titles(self, request, queryset):
|
||||||
from core.models import Snapshot
|
|
||||||
count = queryset.count()
|
count = queryset.count()
|
||||||
|
|
||||||
# Queue snapshots for archiving via the state machine system
|
# Queue snapshots for archiving via the state machine system
|
||||||
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
|
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
|
||||||
messages.success(
|
messages.success(
|
||||||
request,
|
request,
|
||||||
mark_safe(f"Title and favicon are updating in the background for {count} URLs. {result_url(result)}"),
|
f"Queued {queued} snapshots for title/favicon update. The orchestrator will process them in the background.",
|
||||||
)
|
)
|
||||||
|
|
||||||
@admin.action(
|
@admin.action(
|
||||||
@@ -275,11 +265,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
|||||||
def update_snapshots(self, request, queryset):
|
def update_snapshots(self, request, queryset):
|
||||||
count = queryset.count()
|
count = queryset.count()
|
||||||
|
|
||||||
result = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
|
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
|
||||||
|
|
||||||
messages.success(
|
messages.success(
|
||||||
request,
|
request,
|
||||||
mark_safe(f"Re-trying any previously failed methods for {count} URLs in the background. {result_url(result)}"),
|
f"Queued {queued} snapshots for re-archiving. The orchestrator will process them in the background.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -291,11 +281,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
|||||||
timestamp = timezone.now().isoformat('T', 'seconds')
|
timestamp = timezone.now().isoformat('T', 'seconds')
|
||||||
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
|
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
|
||||||
|
|
||||||
result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
|
bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
|
||||||
|
|
||||||
messages.success(
|
messages.success(
|
||||||
request,
|
request,
|
||||||
mark_safe(f"Creating new fresh snapshots for {queryset.count()} URLs in the background. {result_url(result)}"),
|
f"Creating {queryset.count()} new fresh snapshots. The orchestrator will process them in the background.",
|
||||||
)
|
)
|
||||||
|
|
||||||
@admin.action(
|
@admin.action(
|
||||||
@@ -304,11 +294,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
|||||||
def overwrite_snapshots(self, request, queryset):
|
def overwrite_snapshots(self, request, queryset):
|
||||||
count = queryset.count()
|
count = queryset.count()
|
||||||
|
|
||||||
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
|
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
|
||||||
|
|
||||||
messages.success(
|
messages.success(
|
||||||
request,
|
request,
|
||||||
mark_safe(f"Clearing all previous results and re-downloading {count} URLs in the background. {result_url(result)}"),
|
f"Queued {queued} snapshots for full re-archive (overwriting existing). The orchestrator will process them in the background.",
|
||||||
)
|
)
|
||||||
|
|
||||||
@admin.action(
|
@admin.action(
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
__package__ = 'archivebox.core'
|
__package__ = 'archivebox.core'
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
from django.apps import AppConfig
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
|
||||||
@@ -11,5 +13,40 @@ class CoreConfig(AppConfig):
|
|||||||
from core.admin_site import register_admin_site
|
from core.admin_site import register_admin_site
|
||||||
register_admin_site()
|
register_admin_site()
|
||||||
|
|
||||||
|
# Auto-start the orchestrator when running the web server
|
||||||
|
self._maybe_start_orchestrator()
|
||||||
|
|
||||||
|
def _maybe_start_orchestrator(self):
|
||||||
|
"""Start the orchestrator if we're running a web server."""
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Don't start orchestrator during migrations, shell, tests, etc.
|
||||||
|
# Only start when running: runserver, daphne, gunicorn, uwsgi
|
||||||
|
if not self._is_web_server():
|
||||||
|
return
|
||||||
|
|
||||||
|
# Don't start if RUN_ORCHESTRATOR env var is explicitly set to false
|
||||||
|
if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'):
|
||||||
|
return
|
||||||
|
|
||||||
|
# Don't start in autoreload child process (avoid double-start)
|
||||||
|
if os.environ.get('RUN_MAIN') != 'true' and 'runserver' in sys.argv:
|
||||||
|
return
|
||||||
|
|
||||||
|
try:
|
||||||
|
from workers.orchestrator import Orchestrator
|
||||||
|
|
||||||
|
if not Orchestrator.is_running():
|
||||||
|
# Start orchestrator as daemon (won't exit on idle when started by server)
|
||||||
|
orchestrator = Orchestrator(exit_on_idle=False)
|
||||||
|
orchestrator.start()
|
||||||
|
except Exception as e:
|
||||||
|
# Don't crash the server if orchestrator fails to start
|
||||||
|
import logging
|
||||||
|
logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}')
|
||||||
|
|
||||||
|
def _is_web_server(self) -> bool:
|
||||||
|
"""Check if we're running a web server command."""
|
||||||
|
# Check for common web server indicators
|
||||||
|
server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server')
|
||||||
|
return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands)
|
||||||
|
|||||||
@@ -23,7 +23,11 @@ from archivebox.config import CONSTANTS
|
|||||||
from archivebox.misc.system import get_dir_size, atomic_write
|
from archivebox.misc.system import get_dir_size, atomic_write
|
||||||
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
|
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
|
||||||
from archivebox.misc.hashing import get_dir_info
|
from archivebox.misc.hashing import get_dir_info
|
||||||
from archivebox.hooks import ARCHIVE_METHODS_INDEXING_PRECEDENCE
|
from archivebox.hooks import (
|
||||||
|
ARCHIVE_METHODS_INDEXING_PRECEDENCE,
|
||||||
|
get_extractors, get_extractor_name, get_extractor_icon,
|
||||||
|
DEFAULT_EXTRACTOR_ICONS,
|
||||||
|
)
|
||||||
from archivebox.base_models.models import (
|
from archivebox.base_models.models import (
|
||||||
ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
|
ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
|
||||||
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
|
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
|
||||||
@@ -343,45 +347,37 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
def icons(self) -> str:
|
def icons(self) -> str:
|
||||||
"""Generate HTML icons showing which extractors have succeeded for this snapshot"""
|
"""Generate HTML icons showing which extractors have succeeded for this snapshot"""
|
||||||
from django.utils.html import format_html, mark_safe
|
from django.utils.html import format_html, mark_safe
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
|
cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
|
||||||
|
|
||||||
def calc_icons():
|
def calc_icons():
|
||||||
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
|
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
|
||||||
archive_results = [r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output]
|
archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output}
|
||||||
else:
|
else:
|
||||||
archive_results = self.archiveresult_set.filter(status="succeeded", output__isnull=False)
|
archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)}
|
||||||
|
|
||||||
path = self.archive_path
|
path = self.archive_path
|
||||||
canon = self.canonical_outputs()
|
canon = self.canonical_outputs()
|
||||||
output = ""
|
output = ""
|
||||||
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
|
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
|
||||||
icons = {
|
|
||||||
"singlefile": "❶", "wget": "🆆", "dom": "🅷", "pdf": "📄",
|
|
||||||
"screenshot": "💻", "media": "📼", "git": "🅶", "archive_org": "🏛",
|
|
||||||
"readability": "🆁", "mercury": "🅼", "warc": "📦"
|
|
||||||
}
|
|
||||||
exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]
|
|
||||||
|
|
||||||
extractor_outputs = defaultdict(lambda: None)
|
# Get all extractors from hooks system (sorted by numeric prefix)
|
||||||
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
|
all_extractors = [get_extractor_name(e) for e in get_extractors()]
|
||||||
for result in archive_results:
|
|
||||||
if result.extractor == extractor:
|
|
||||||
extractor_outputs[extractor] = result
|
|
||||||
|
|
||||||
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
|
for extractor in all_extractors:
|
||||||
if extractor not in exclude:
|
result = archive_results.get(extractor)
|
||||||
existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
existing = result and result.status == 'succeeded' and result.output
|
||||||
output += format_html(output_template, path, canon.get(extractor, ''), str(bool(existing)), extractor, icons.get(extractor, "?"))
|
icon = get_extractor_icon(extractor)
|
||||||
if extractor == "wget":
|
output += format_html(
|
||||||
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
output_template,
|
||||||
output += format_html(output_template, path, canon.get("warc", "warc/"), str(bool(exists)), "warc", icons.get("warc", "?"))
|
path,
|
||||||
if extractor == "archive_org":
|
canon.get(extractor, extractor + '/'),
|
||||||
exists = extractor in extractor_outputs and extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
|
str(bool(existing)),
|
||||||
output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon.get("archive_org", ""), str(exists), "archive_org", icons.get("archive_org", "?"))
|
extractor,
|
||||||
|
icon
|
||||||
|
)
|
||||||
|
|
||||||
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
|
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
|
||||||
|
|
||||||
cache_result = cache.get(cache_key)
|
cache_result = cache.get(cache_key)
|
||||||
if cache_result:
|
if cache_result:
|
||||||
@@ -767,12 +763,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
FAILED = 'failed', 'Failed'
|
FAILED = 'failed', 'Failed'
|
||||||
SKIPPED = 'skipped', 'Skipped'
|
SKIPPED = 'skipped', 'Skipped'
|
||||||
|
|
||||||
EXTRACTOR_CHOICES = (
|
@classmethod
|
||||||
('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'),
|
def get_extractor_choices(cls):
|
||||||
('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'),
|
"""Get extractor choices from discovered hooks (for forms/admin)."""
|
||||||
('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'),
|
extractors = [get_extractor_name(e) for e in get_extractors()]
|
||||||
('dom', 'dom'), ('title', 'title'), ('wget', 'wget'),
|
return tuple((e, e) for e in extractors)
|
||||||
)
|
|
||||||
|
|
||||||
# Keep AutoField for backward compatibility with 0.7.x databases
|
# Keep AutoField for backward compatibility with 0.7.x databases
|
||||||
# UUID field is added separately by migration for new records
|
# UUID field is added separately by migration for new records
|
||||||
@@ -783,7 +778,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
modified_at = models.DateTimeField(auto_now=True)
|
modified_at = models.DateTimeField(auto_now=True)
|
||||||
|
|
||||||
snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore
|
snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore
|
||||||
extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32, blank=False, null=False, db_index=True)
|
# No choices= constraint - extractor names come from plugin system and can be any string
|
||||||
|
extractor = models.CharField(max_length=32, blank=False, null=False, db_index=True)
|
||||||
pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
|
pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
|
||||||
cmd = models.JSONField(default=None, null=True, blank=True)
|
cmd = models.JSONField(default=None, null=True, blank=True)
|
||||||
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
|
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
|
||||||
@@ -835,6 +831,25 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
def output_exists(self) -> bool:
|
def output_exists(self) -> bool:
|
||||||
return os.path.exists(Path(self.snapshot_dir) / self.extractor)
|
return os.path.exists(Path(self.snapshot_dir) / self.extractor)
|
||||||
|
|
||||||
|
def embed_path(self) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Get the relative path to the embeddable output file for this result.
|
||||||
|
|
||||||
|
Returns the output field if set and file exists, otherwise tries to
|
||||||
|
find a reasonable default based on the extractor type.
|
||||||
|
"""
|
||||||
|
if self.output:
|
||||||
|
return self.output
|
||||||
|
|
||||||
|
# Try to find output file based on extractor's canonical output path
|
||||||
|
canonical = self.snapshot.canonical_outputs()
|
||||||
|
extractor_key = f'{self.extractor}_path'
|
||||||
|
if extractor_key in canonical:
|
||||||
|
return canonical[extractor_key]
|
||||||
|
|
||||||
|
# Fallback to extractor directory
|
||||||
|
return f'{self.extractor}/'
|
||||||
|
|
||||||
def create_output_dir(self):
|
def create_output_dir(self):
|
||||||
output_dir = Path(self.snapshot_dir) / self.extractor
|
output_dir = Path(self.snapshot_dir) / self.extractor
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
@@ -891,6 +906,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
output_dir=extractor_dir,
|
output_dir=extractor_dir,
|
||||||
config_objects=config_objects,
|
config_objects=config_objects,
|
||||||
url=self.snapshot.url,
|
url=self.snapshot.url,
|
||||||
|
snapshot_id=str(self.snapshot.id),
|
||||||
)
|
)
|
||||||
end_ts = timezone.now()
|
end_ts = timezone.now()
|
||||||
|
|
||||||
@@ -1000,6 +1016,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
hook,
|
hook,
|
||||||
output_dir=self.output_dir,
|
output_dir=self.output_dir,
|
||||||
config_objects=config_objects,
|
config_objects=config_objects,
|
||||||
|
url=self.snapshot.url,
|
||||||
snapshot_id=str(self.snapshot.id),
|
snapshot_id=str(self.snapshot.id),
|
||||||
extractor=self.extractor,
|
extractor=self.extractor,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -68,9 +68,6 @@ INSTALLED_APPS = [
|
|||||||
# 3rd-party apps from PyPI that need to be loaded last
|
# 3rd-party apps from PyPI that need to be loaded last
|
||||||
"admin_data_views", # handles rendering some convenient automatic read-only views of data in Django admin
|
"admin_data_views", # handles rendering some convenient automatic read-only views of data in Django admin
|
||||||
"django_extensions", # provides Django Debug Toolbar (and other non-debug helpers)
|
"django_extensions", # provides Django Debug Toolbar (and other non-debug helpers)
|
||||||
"django_huey", # provides multi-queue support for django huey https://github.com/gaiacoop/django-huey
|
|
||||||
"bx_django_utils", # needed for huey_monitor https://github.com/boxine/bx_django_utils
|
|
||||||
"huey_monitor", # adds an admin UI for monitoring background huey tasks https://github.com/boxine/django-huey-monitor
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@@ -215,70 +212,6 @@ MIGRATION_MODULES = {"signal_webhooks": None}
|
|||||||
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
|
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
|
||||||
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
|
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
|
||||||
|
|
||||||
HUEY = {
|
|
||||||
"huey_class": "huey.SqliteHuey",
|
|
||||||
"filename": CONSTANTS.QUEUE_DATABASE_FILENAME,
|
|
||||||
"name": "commands",
|
|
||||||
"results": True,
|
|
||||||
"store_none": True,
|
|
||||||
"immediate": False,
|
|
||||||
"utc": True,
|
|
||||||
"consumer": {
|
|
||||||
"workers": 1,
|
|
||||||
"worker_type": "thread",
|
|
||||||
"initial_delay": 0.1, # Smallest polling interval, same as -d.
|
|
||||||
"backoff": 1.15, # Exponential backoff using this rate, -b.
|
|
||||||
"max_delay": 10.0, # Max possible polling interval, -m.
|
|
||||||
"scheduler_interval": 1, # Check schedule every second, -s.
|
|
||||||
"periodic": True, # Enable crontab feature.
|
|
||||||
"check_worker_health": True, # Enable worker health checks.
|
|
||||||
"health_check_interval": 1, # Check worker health every second.
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
# https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up
|
|
||||||
# https://github.com/gaiacoop/django-huey
|
|
||||||
DJANGO_HUEY = {
|
|
||||||
"default": "commands",
|
|
||||||
"queues": {
|
|
||||||
HUEY["name"]: HUEY.copy(),
|
|
||||||
# more registered here at plugin import-time by BaseQueue.register()
|
|
||||||
# Additional huey queues configured via settings
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class HueyDBRouter:
|
|
||||||
"""
|
|
||||||
A router to store all the Huey result k:v / Huey Monitor models in the queue.sqlite3 database.
|
|
||||||
We keep the databases separate because the queue database receives many more reads/writes per second
|
|
||||||
and we want to avoid single-write lock contention with the main database. Also all the in-progress task
|
|
||||||
data is ephemeral/not-important-long-term. This makes it easier to for the user to clear non-critical
|
|
||||||
temp data by just deleting queue.sqlite3 and leaving index.sqlite3.
|
|
||||||
"""
|
|
||||||
|
|
||||||
route_app_labels = {"huey_monitor", "django_huey", "djhuey"}
|
|
||||||
db_name = "queue"
|
|
||||||
|
|
||||||
def db_for_read(self, model, **hints):
|
|
||||||
if model._meta.app_label in self.route_app_labels:
|
|
||||||
return self.db_name
|
|
||||||
return "default"
|
|
||||||
|
|
||||||
def db_for_write(self, model, **hints):
|
|
||||||
if model._meta.app_label in self.route_app_labels:
|
|
||||||
return self.db_name
|
|
||||||
return "default"
|
|
||||||
|
|
||||||
def allow_relation(self, obj1, obj2, **hints):
|
|
||||||
if obj1._meta.app_label in self.route_app_labels or obj2._meta.app_label in self.route_app_labels:
|
|
||||||
return obj1._meta.app_label == obj2._meta.app_label
|
|
||||||
return None
|
|
||||||
|
|
||||||
def allow_migrate(self, db, app_label, model_name=None, **hints):
|
|
||||||
if app_label in self.route_app_labels:
|
|
||||||
return db == self.db_name
|
|
||||||
return db == "default"
|
|
||||||
|
|
||||||
|
|
||||||
# class FilestoreDBRouter:
|
# class FilestoreDBRouter:
|
||||||
@@ -311,7 +244,7 @@ class HueyDBRouter:
|
|||||||
# return db == self.db_name
|
# return db == self.db_name
|
||||||
# return db == "default"
|
# return db == "default"
|
||||||
|
|
||||||
DATABASE_ROUTERS = ["core.settings.HueyDBRouter"]
|
DATABASE_ROUTERS = []
|
||||||
|
|
||||||
CACHES = {
|
CACHES = {
|
||||||
"default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"},
|
"default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"},
|
||||||
|
|||||||
@@ -1,9 +1,13 @@
|
|||||||
from django import template
|
from django import template
|
||||||
from django.contrib.admin.templatetags.base import InclusionAdminNode
|
from django.contrib.admin.templatetags.base import InclusionAdminNode
|
||||||
|
from django.utils.safestring import mark_safe
|
||||||
|
|
||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
|
from archivebox.hooks import (
|
||||||
|
get_extractor_icon, get_extractor_template, get_extractor_name,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
register = template.Library()
|
register = template.Library()
|
||||||
|
|
||||||
@@ -44,3 +48,115 @@ def url_replace(context, **kwargs):
|
|||||||
dict_ = context['request'].GET.copy()
|
dict_ = context['request'].GET.copy()
|
||||||
dict_.update(**kwargs)
|
dict_.update(**kwargs)
|
||||||
return dict_.urlencode()
|
return dict_.urlencode()
|
||||||
|
|
||||||
|
|
||||||
|
@register.simple_tag
|
||||||
|
def extractor_icon(extractor: str) -> str:
|
||||||
|
"""
|
||||||
|
Render the icon for an extractor.
|
||||||
|
|
||||||
|
Usage: {% extractor_icon "screenshot" %}
|
||||||
|
"""
|
||||||
|
return mark_safe(get_extractor_icon(extractor))
|
||||||
|
|
||||||
|
|
||||||
|
@register.simple_tag(takes_context=True)
|
||||||
|
def extractor_thumbnail(context, result) -> str:
|
||||||
|
"""
|
||||||
|
Render the thumbnail template for an archive result.
|
||||||
|
|
||||||
|
Usage: {% extractor_thumbnail result %}
|
||||||
|
|
||||||
|
Context variables passed to template:
|
||||||
|
- result: ArchiveResult object
|
||||||
|
- snapshot: Parent Snapshot object
|
||||||
|
- output_path: Path to output relative to snapshot dir (from embed_path())
|
||||||
|
- extractor: Extractor base name
|
||||||
|
"""
|
||||||
|
extractor = get_extractor_name(result.extractor)
|
||||||
|
template_str = get_extractor_template(extractor, 'thumbnail')
|
||||||
|
|
||||||
|
if not template_str:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
# Use embed_path() for the display path (includes canonical paths)
|
||||||
|
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
|
||||||
|
|
||||||
|
# Create a mini template and render it with context
|
||||||
|
try:
|
||||||
|
tpl = template.Template(template_str)
|
||||||
|
ctx = template.Context({
|
||||||
|
'result': result,
|
||||||
|
'snapshot': result.snapshot,
|
||||||
|
'output_path': output_path,
|
||||||
|
'extractor': extractor,
|
||||||
|
})
|
||||||
|
return mark_safe(tpl.render(ctx))
|
||||||
|
except Exception:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
|
||||||
|
@register.simple_tag(takes_context=True)
|
||||||
|
def extractor_embed(context, result) -> str:
|
||||||
|
"""
|
||||||
|
Render the embed iframe template for an archive result.
|
||||||
|
|
||||||
|
Usage: {% extractor_embed result %}
|
||||||
|
"""
|
||||||
|
extractor = get_extractor_name(result.extractor)
|
||||||
|
template_str = get_extractor_template(extractor, 'embed')
|
||||||
|
|
||||||
|
if not template_str:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
|
||||||
|
|
||||||
|
try:
|
||||||
|
tpl = template.Template(template_str)
|
||||||
|
ctx = template.Context({
|
||||||
|
'result': result,
|
||||||
|
'snapshot': result.snapshot,
|
||||||
|
'output_path': output_path,
|
||||||
|
'extractor': extractor,
|
||||||
|
})
|
||||||
|
return mark_safe(tpl.render(ctx))
|
||||||
|
except Exception:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
|
||||||
|
@register.simple_tag(takes_context=True)
|
||||||
|
def extractor_fullscreen(context, result) -> str:
|
||||||
|
"""
|
||||||
|
Render the fullscreen template for an archive result.
|
||||||
|
|
||||||
|
Usage: {% extractor_fullscreen result %}
|
||||||
|
"""
|
||||||
|
extractor = get_extractor_name(result.extractor)
|
||||||
|
template_str = get_extractor_template(extractor, 'fullscreen')
|
||||||
|
|
||||||
|
if not template_str:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
|
||||||
|
|
||||||
|
try:
|
||||||
|
tpl = template.Template(template_str)
|
||||||
|
ctx = template.Context({
|
||||||
|
'result': result,
|
||||||
|
'snapshot': result.snapshot,
|
||||||
|
'output_path': output_path,
|
||||||
|
'extractor': extractor,
|
||||||
|
})
|
||||||
|
return mark_safe(tpl.render(ctx))
|
||||||
|
except Exception:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
|
||||||
|
@register.filter
|
||||||
|
def extractor_name(value: str) -> str:
|
||||||
|
"""
|
||||||
|
Get the base name of an extractor (strips numeric prefix).
|
||||||
|
|
||||||
|
Usage: {{ result.extractor|extractor_name }}
|
||||||
|
"""
|
||||||
|
return get_extractor_name(value)
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ from django.views.generic.base import RedirectView
|
|||||||
from archivebox.misc.serve_static import serve_static
|
from archivebox.misc.serve_static import serve_static
|
||||||
|
|
||||||
from core.admin_site import archivebox_admin
|
from core.admin_site import archivebox_admin
|
||||||
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
|
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
|
||||||
|
|
||||||
from workers.views import JobsDashboardView
|
from workers.views import JobsDashboardView
|
||||||
|
|
||||||
@@ -43,6 +43,8 @@ urlpatterns = [
|
|||||||
|
|
||||||
|
|
||||||
path('accounts/', include('django.contrib.auth.urls')),
|
path('accounts/', include('django.contrib.auth.urls')),
|
||||||
|
|
||||||
|
path('admin/live-progress/', live_progress_view, name='live_progress'),
|
||||||
path('admin/', archivebox_admin.urls),
|
path('admin/', archivebox_admin.urls),
|
||||||
|
|
||||||
path("api/", include('api.urls'), name='api'),
|
path("api/", include('api.urls'), name='api'),
|
||||||
|
|||||||
@@ -34,6 +34,7 @@ from archivebox.search import query_search_index
|
|||||||
from core.models import Snapshot
|
from core.models import Snapshot
|
||||||
from core.forms import AddLinkForm
|
from core.forms import AddLinkForm
|
||||||
from crawls.models import Seed, Crawl
|
from crawls.models import Seed, Crawl
|
||||||
|
from archivebox.hooks import get_extractors, get_extractor_name
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -54,8 +55,10 @@ class SnapshotView(View):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def render_live_index(request, snapshot):
|
def render_live_index(request, snapshot):
|
||||||
TITLE_LOADING_MSG = 'Not yet archived...'
|
TITLE_LOADING_MSG = 'Not yet archived...'
|
||||||
HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org')
|
|
||||||
|
|
||||||
|
# Dict of extractor -> ArchiveResult object
|
||||||
|
archiveresult_objects = {}
|
||||||
|
# Dict of extractor -> result info dict (for template compatibility)
|
||||||
archiveresults = {}
|
archiveresults = {}
|
||||||
|
|
||||||
results = snapshot.archiveresult_set.all()
|
results = snapshot.archiveresult_set.all()
|
||||||
@@ -65,18 +68,21 @@ class SnapshotView(View):
|
|||||||
abs_path = result.snapshot_dir / (embed_path or 'None')
|
abs_path = result.snapshot_dir / (embed_path or 'None')
|
||||||
|
|
||||||
if (result.status == 'succeeded'
|
if (result.status == 'succeeded'
|
||||||
and (result.extractor not in HIDDEN_RESULTS)
|
|
||||||
and embed_path
|
and embed_path
|
||||||
and os.access(abs_path, os.R_OK)
|
and os.access(abs_path, os.R_OK)
|
||||||
and abs_path.exists()):
|
and abs_path.exists()):
|
||||||
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
|
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Store the full ArchiveResult object for template tags
|
||||||
|
archiveresult_objects[result.extractor] = result
|
||||||
|
|
||||||
result_info = {
|
result_info = {
|
||||||
'name': result.extractor,
|
'name': result.extractor,
|
||||||
'path': embed_path,
|
'path': embed_path,
|
||||||
'ts': ts_to_date_str(result.end_ts),
|
'ts': ts_to_date_str(result.end_ts),
|
||||||
'size': abs_path.stat().st_size or '?',
|
'size': abs_path.stat().st_size or '?',
|
||||||
|
'result': result, # Include the full object for template tags
|
||||||
}
|
}
|
||||||
archiveresults[result.extractor] = result_info
|
archiveresults[result.extractor] = result_info
|
||||||
|
|
||||||
@@ -101,7 +107,7 @@ class SnapshotView(View):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# iterate through all the files in the snapshot dir and add the biggest ones to1 the result list
|
# iterate through all the files in the snapshot dir and add the biggest ones to the result list
|
||||||
snap_dir = Path(snapshot.output_dir)
|
snap_dir = Path(snapshot.output_dir)
|
||||||
if not os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK):
|
if not os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK):
|
||||||
return {}
|
return {}
|
||||||
@@ -121,12 +127,16 @@ class SnapshotView(View):
|
|||||||
'path': result_file.relative_to(snap_dir),
|
'path': result_file.relative_to(snap_dir),
|
||||||
'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
|
'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
|
||||||
'size': file_size,
|
'size': file_size,
|
||||||
|
'result': None, # No ArchiveResult object for filesystem-discovered files
|
||||||
}
|
}
|
||||||
|
|
||||||
preferred_types = ('singlefile', 'screenshot', 'wget', 'dom', 'media', 'pdf', 'readability', 'mercury')
|
# Get available extractors from hooks (sorted by numeric prefix for ordering)
|
||||||
|
# Convert to base names for display ordering
|
||||||
|
all_extractors = [get_extractor_name(e) for e in get_extractors()]
|
||||||
|
preferred_types = tuple(all_extractors)
|
||||||
all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
|
all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
|
||||||
|
|
||||||
best_result = {'path': 'None'}
|
best_result = {'path': 'None', 'result': None}
|
||||||
for result_type in preferred_types:
|
for result_type in preferred_types:
|
||||||
if result_type in archiveresults:
|
if result_type in archiveresults:
|
||||||
best_result = archiveresults[result_type]
|
best_result = archiveresults[result_type]
|
||||||
@@ -157,6 +167,7 @@ class SnapshotView(View):
|
|||||||
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
|
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
|
||||||
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
|
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
|
||||||
'best_result': best_result,
|
'best_result': best_result,
|
||||||
|
'snapshot': snapshot, # Pass the snapshot object for template tags
|
||||||
}
|
}
|
||||||
return render(template_name='core/snapshot_live.html', request=request, context=context)
|
return render(template_name='core/snapshot_live.html', request=request, context=context)
|
||||||
|
|
||||||
@@ -436,7 +447,7 @@ class AddView(UserPassesTestMixin, FormView):
|
|||||||
def form_valid(self, form):
|
def form_valid(self, form):
|
||||||
urls = form.cleaned_data["url"]
|
urls = form.cleaned_data["url"]
|
||||||
print(f'[+] Adding URL: {urls}')
|
print(f'[+] Adding URL: {urls}')
|
||||||
parser = form.cleaned_data["parser"]
|
parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser
|
||||||
tag = form.cleaned_data["tag"]
|
tag = form.cleaned_data["tag"]
|
||||||
depth = 0 if form.cleaned_data["depth"] == "0" else 1
|
depth = 0 if form.cleaned_data["depth"] == "0" else 1
|
||||||
extractors = ','.join(form.cleaned_data["archive_methods"])
|
extractors = ','.join(form.cleaned_data["archive_methods"])
|
||||||
@@ -461,9 +472,10 @@ class AddView(UserPassesTestMixin, FormView):
|
|||||||
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
|
||||||
|
|
||||||
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
|
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
|
||||||
|
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
|
||||||
seed = Seed.from_file(
|
seed = Seed.from_file(
|
||||||
sources_file,
|
sources_file,
|
||||||
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path}',
|
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
|
||||||
parser=parser,
|
parser=parser,
|
||||||
tag=tag,
|
tag=tag,
|
||||||
created_by=self.request.user.pk,
|
created_by=self.request.user.pk,
|
||||||
@@ -472,7 +484,7 @@ class AddView(UserPassesTestMixin, FormView):
|
|||||||
# 'INDEX_ONLY': index_only,
|
# 'INDEX_ONLY': index_only,
|
||||||
# 'OVERWRITE': False,
|
# 'OVERWRITE': False,
|
||||||
'DEPTH': depth,
|
'DEPTH': depth,
|
||||||
'EXTRACTORS': parser,
|
'EXTRACTORS': extractors or '',
|
||||||
# 'DEFAULT_PERSONA': persona or 'Default',
|
# 'DEFAULT_PERSONA': persona or 'Default',
|
||||||
})
|
})
|
||||||
# 3. create a new Crawl pointing to the Seed
|
# 3. create a new Crawl pointing to the Seed
|
||||||
@@ -490,10 +502,15 @@ class AddView(UserPassesTestMixin, FormView):
|
|||||||
self.request,
|
self.request,
|
||||||
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
|
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
|
||||||
)
|
)
|
||||||
# if not bg:
|
|
||||||
# from workers.orchestrator import Orchestrator
|
# Start orchestrator in background to process the queued crawl
|
||||||
# orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
|
try:
|
||||||
# orchestrator.start()
|
from archivebox.workers.tasks import ensure_orchestrator_running
|
||||||
|
ensure_orchestrator_running()
|
||||||
|
except Exception as e:
|
||||||
|
# Orchestrator may already be running via supervisord, or fail to start
|
||||||
|
# This is not fatal - the crawl will be processed when orchestrator runs
|
||||||
|
print(f'[!] Failed to start orchestrator: {e}')
|
||||||
|
|
||||||
return redirect(crawl.admin_change_url)
|
return redirect(crawl.admin_change_url)
|
||||||
|
|
||||||
@@ -513,6 +530,141 @@ class HealthCheckView(View):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
import json
|
||||||
|
from django.http import JsonResponse
|
||||||
|
|
||||||
|
def live_progress_view(request):
|
||||||
|
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
|
||||||
|
try:
|
||||||
|
from workers.orchestrator import Orchestrator
|
||||||
|
from crawls.models import Crawl
|
||||||
|
from core.models import Snapshot, ArchiveResult
|
||||||
|
|
||||||
|
# Get orchestrator status
|
||||||
|
orchestrator_running = Orchestrator.is_running()
|
||||||
|
total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0
|
||||||
|
|
||||||
|
# Get model counts by status
|
||||||
|
crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
|
||||||
|
crawls_started = Crawl.objects.filter(status=Crawl.StatusChoices.STARTED).count()
|
||||||
|
|
||||||
|
# Get recent crawls (last 24 hours)
|
||||||
|
from datetime import timedelta
|
||||||
|
one_day_ago = timezone.now() - timedelta(days=1)
|
||||||
|
crawls_recent = Crawl.objects.filter(created_at__gte=one_day_ago).count()
|
||||||
|
|
||||||
|
snapshots_pending = Snapshot.objects.filter(status=Snapshot.StatusChoices.QUEUED).count()
|
||||||
|
snapshots_started = Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED).count()
|
||||||
|
|
||||||
|
archiveresults_pending = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
|
||||||
|
archiveresults_started = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.STARTED).count()
|
||||||
|
archiveresults_succeeded = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
|
||||||
|
archiveresults_failed = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.FAILED).count()
|
||||||
|
|
||||||
|
# Build hierarchical active crawls with nested snapshots and archive results
|
||||||
|
active_crawls = []
|
||||||
|
for crawl in Crawl.objects.filter(
|
||||||
|
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
|
||||||
|
).order_by('-modified_at')[:10]:
|
||||||
|
# Get snapshots for this crawl
|
||||||
|
crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
|
||||||
|
total_snapshots = crawl_snapshots.count()
|
||||||
|
completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
|
||||||
|
pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()
|
||||||
|
|
||||||
|
# Calculate crawl progress
|
||||||
|
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
|
||||||
|
|
||||||
|
# Get active snapshots for this crawl
|
||||||
|
active_snapshots_for_crawl = []
|
||||||
|
for snapshot in crawl_snapshots.filter(
|
||||||
|
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
|
||||||
|
).order_by('-modified_at')[:5]:
|
||||||
|
# Get archive results for this snapshot
|
||||||
|
snapshot_results = ArchiveResult.objects.filter(snapshot=snapshot)
|
||||||
|
total_extractors = snapshot_results.count()
|
||||||
|
completed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
|
||||||
|
failed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.FAILED).count()
|
||||||
|
pending_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
|
||||||
|
|
||||||
|
# Calculate snapshot progress
|
||||||
|
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
|
||||||
|
|
||||||
|
# Get active extractors for this snapshot
|
||||||
|
active_extractors = [
|
||||||
|
{
|
||||||
|
'id': str(ar.id),
|
||||||
|
'extractor': ar.extractor,
|
||||||
|
'status': ar.status,
|
||||||
|
'started': ar.start_ts.isoformat() if ar.start_ts else None,
|
||||||
|
'progress': 50,
|
||||||
|
}
|
||||||
|
for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
|
||||||
|
]
|
||||||
|
|
||||||
|
active_snapshots_for_crawl.append({
|
||||||
|
'id': str(snapshot.id),
|
||||||
|
'url': snapshot.url[:80],
|
||||||
|
'status': snapshot.status,
|
||||||
|
'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None,
|
||||||
|
'progress': snapshot_progress,
|
||||||
|
'total_extractors': total_extractors,
|
||||||
|
'completed_extractors': completed_extractors,
|
||||||
|
'failed_extractors': failed_extractors,
|
||||||
|
'pending_extractors': pending_extractors,
|
||||||
|
'active_extractors': active_extractors,
|
||||||
|
})
|
||||||
|
|
||||||
|
active_crawls.append({
|
||||||
|
'id': str(crawl.id),
|
||||||
|
'label': str(crawl)[:60],
|
||||||
|
'status': crawl.status,
|
||||||
|
'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
|
||||||
|
'progress': crawl_progress,
|
||||||
|
'max_depth': crawl.max_depth,
|
||||||
|
'total_snapshots': total_snapshots,
|
||||||
|
'completed_snapshots': completed_snapshots,
|
||||||
|
'failed_snapshots': 0,
|
||||||
|
'pending_snapshots': pending_snapshots,
|
||||||
|
'active_snapshots': active_snapshots_for_crawl,
|
||||||
|
})
|
||||||
|
|
||||||
|
return JsonResponse({
|
||||||
|
'orchestrator_running': orchestrator_running,
|
||||||
|
'total_workers': total_workers,
|
||||||
|
'crawls_pending': crawls_pending,
|
||||||
|
'crawls_started': crawls_started,
|
||||||
|
'crawls_recent': crawls_recent,
|
||||||
|
'snapshots_pending': snapshots_pending,
|
||||||
|
'snapshots_started': snapshots_started,
|
||||||
|
'archiveresults_pending': archiveresults_pending,
|
||||||
|
'archiveresults_started': archiveresults_started,
|
||||||
|
'archiveresults_succeeded': archiveresults_succeeded,
|
||||||
|
'archiveresults_failed': archiveresults_failed,
|
||||||
|
'active_crawls': active_crawls,
|
||||||
|
'server_time': timezone.now().isoformat(),
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
import traceback
|
||||||
|
return JsonResponse({
|
||||||
|
'error': str(e),
|
||||||
|
'traceback': traceback.format_exc(),
|
||||||
|
'orchestrator_running': False,
|
||||||
|
'total_workers': 0,
|
||||||
|
'crawls_pending': 0,
|
||||||
|
'crawls_started': 0,
|
||||||
|
'crawls_recent': 0,
|
||||||
|
'snapshots_pending': 0,
|
||||||
|
'snapshots_started': 0,
|
||||||
|
'archiveresults_pending': 0,
|
||||||
|
'archiveresults_started': 0,
|
||||||
|
'archiveresults_succeeded': 0,
|
||||||
|
'archiveresults_failed': 0,
|
||||||
|
'active_crawls': [],
|
||||||
|
'server_time': timezone.now().isoformat(),
|
||||||
|
}, status=500)
|
||||||
|
|
||||||
|
|
||||||
def find_config_section(key: str) -> str:
|
def find_config_section(key: str) -> str:
|
||||||
CONFIGS = get_all_configs()
|
CONFIGS = get_all_configs()
|
||||||
|
|
||||||
|
|||||||
@@ -1,10 +1,18 @@
|
|||||||
__package__ = 'archivebox.crawls'
|
__package__ = 'archivebox.crawls'
|
||||||
|
|
||||||
from django.utils.html import format_html, format_html_join
|
import json
|
||||||
from django.contrib import admin
|
from pathlib import Path
|
||||||
|
|
||||||
|
from django.utils.html import format_html, format_html_join, mark_safe
|
||||||
|
from django.contrib import admin, messages
|
||||||
|
from django.urls import path
|
||||||
|
from django.http import JsonResponse
|
||||||
|
from django.views.decorators.http import require_POST
|
||||||
|
|
||||||
from archivebox import DATA_DIR
|
from archivebox import DATA_DIR
|
||||||
|
|
||||||
|
from django_object_actions import action
|
||||||
|
|
||||||
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||||
|
|
||||||
from core.models import Snapshot
|
from core.models import Snapshot
|
||||||
@@ -16,8 +24,8 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|||||||
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
||||||
search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
||||||
|
|
||||||
readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents', 'available_config_options')
|
readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
|
||||||
fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'available_config_options', 'created_by', *readonly_fields[:-1])
|
fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
|
||||||
|
|
||||||
list_filter = ('extractor', 'created_by')
|
list_filter = ('extractor', 'created_by')
|
||||||
ordering = ['-created_at']
|
ordering = ['-created_at']
|
||||||
@@ -34,19 +42,19 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|||||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||||
(scheduledcrawl.admin_change_url, scheduledcrawl)
|
(scheduledcrawl.admin_change_url, scheduledcrawl)
|
||||||
for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
|
for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
|
||||||
)) or format_html('<i>No Scheduled Crawls yet...</i>')
|
)) or mark_safe('<i>No Scheduled Crawls yet...</i>')
|
||||||
|
|
||||||
def crawls(self, obj):
|
def crawls(self, obj):
|
||||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||||
(crawl.admin_change_url, crawl)
|
(crawl.admin_change_url, crawl)
|
||||||
for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
|
for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
|
||||||
)) or format_html('<i>No Crawls yet...</i>')
|
)) or mark_safe('<i>No Crawls yet...</i>')
|
||||||
|
|
||||||
def snapshots(self, obj):
|
def snapshots(self, obj):
|
||||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||||
(snapshot.admin_change_url, snapshot)
|
(snapshot.admin_change_url, snapshot)
|
||||||
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
|
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
|
||||||
)) or format_html('<i>No Snapshots yet...</i>')
|
)) or mark_safe('<i>No Snapshots yet...</i>')
|
||||||
|
|
||||||
def contents(self, obj):
|
def contents(self, obj):
|
||||||
if obj.uri.startswith('file:///data/'):
|
if obj.uri.startswith('file:///data/'):
|
||||||
@@ -69,13 +77,80 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|||||||
sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
|
sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
|
||||||
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
|
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
|
||||||
|
|
||||||
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_contents', 'available_config_options')
|
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
|
||||||
fields = ('label', 'notes', 'urls', 'config', 'available_config_options', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields[:-1])
|
fields = ('label', 'notes', 'seed_urls_editor', 'config', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', 'created_at', 'modified_at', 'snapshots')
|
||||||
|
|
||||||
list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
|
list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
|
||||||
ordering = ['-created_at', '-retry_at']
|
ordering = ['-created_at', '-retry_at']
|
||||||
list_per_page = 100
|
list_per_page = 100
|
||||||
actions = ["delete_selected"]
|
actions = ["delete_selected"]
|
||||||
|
change_actions = ['recrawl']
|
||||||
|
|
||||||
|
@action(label='Recrawl', description='Create a new crawl with the same settings')
|
||||||
|
def recrawl(self, request, obj):
|
||||||
|
"""Duplicate this crawl as a new crawl with the same seed and settings."""
|
||||||
|
from django.utils import timezone
|
||||||
|
|
||||||
|
new_crawl = Crawl.objects.create(
|
||||||
|
seed=obj.seed,
|
||||||
|
urls=obj.urls,
|
||||||
|
max_depth=obj.max_depth,
|
||||||
|
config=obj.config,
|
||||||
|
schedule=obj.schedule,
|
||||||
|
label=f"{obj.label} (recrawl)" if obj.label else "",
|
||||||
|
notes=obj.notes,
|
||||||
|
created_by=request.user,
|
||||||
|
status=Crawl.StatusChoices.QUEUED,
|
||||||
|
retry_at=timezone.now(),
|
||||||
|
)
|
||||||
|
|
||||||
|
messages.success(
|
||||||
|
request,
|
||||||
|
f'Created new crawl {new_crawl.id} with the same settings. '
|
||||||
|
f'It will start processing shortly.'
|
||||||
|
)
|
||||||
|
|
||||||
|
# Redirect to the new crawl's change page
|
||||||
|
from django.shortcuts import redirect
|
||||||
|
return redirect('admin:crawls_crawl_change', new_crawl.id)
|
||||||
|
|
||||||
|
def get_urls(self):
|
||||||
|
urls = super().get_urls()
|
||||||
|
custom_urls = [
|
||||||
|
path('<path:object_id>/save_seed_contents/',
|
||||||
|
self.admin_site.admin_view(self.save_seed_contents_view),
|
||||||
|
name='crawls_crawl_save_seed_contents'),
|
||||||
|
]
|
||||||
|
return custom_urls + urls
|
||||||
|
|
||||||
|
def save_seed_contents_view(self, request, object_id):
|
||||||
|
"""Handle saving seed file contents via AJAX."""
|
||||||
|
if request.method != 'POST':
|
||||||
|
return JsonResponse({'success': False, 'error': 'POST required'}, status=405)
|
||||||
|
|
||||||
|
try:
|
||||||
|
crawl = Crawl.objects.get(pk=object_id)
|
||||||
|
except Crawl.DoesNotExist:
|
||||||
|
return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
|
||||||
|
|
||||||
|
if not (crawl.seed and crawl.seed.uri and crawl.seed.uri.startswith('file:///data/')):
|
||||||
|
return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
|
||||||
|
|
||||||
|
try:
|
||||||
|
data = json.loads(request.body)
|
||||||
|
contents = data.get('contents', '')
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
|
||||||
|
|
||||||
|
source_file = DATA_DIR / crawl.seed.uri.replace('file:///data/', '', 1)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Ensure parent directory exists
|
||||||
|
source_file.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
source_file.write_text(contents)
|
||||||
|
return JsonResponse({'success': True, 'message': f'Saved {len(contents)} bytes to {source_file.name}'})
|
||||||
|
except Exception as e:
|
||||||
|
return JsonResponse({'success': False, 'error': str(e)}, status=500)
|
||||||
|
|
||||||
def num_snapshots(self, obj):
|
def num_snapshots(self, obj):
|
||||||
return obj.snapshot_set.count()
|
return obj.snapshot_set.count()
|
||||||
@@ -84,35 +159,175 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
|||||||
return format_html_join('<br/>', '<a href="{}">{}</a>', (
|
return format_html_join('<br/>', '<a href="{}">{}</a>', (
|
||||||
(snapshot.admin_change_url, snapshot)
|
(snapshot.admin_change_url, snapshot)
|
||||||
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
|
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
|
||||||
)) or format_html('<i>No Snapshots yet...</i>')
|
)) or mark_safe('<i>No Snapshots yet...</i>')
|
||||||
|
|
||||||
@admin.display(description='Schedule', ordering='schedule')
|
@admin.display(description='Schedule', ordering='schedule')
|
||||||
def schedule_str(self, obj):
|
def schedule_str(self, obj):
|
||||||
if not obj.schedule:
|
if not obj.schedule:
|
||||||
return format_html('<i>None</i>')
|
return mark_safe('<i>None</i>')
|
||||||
return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
|
return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
|
||||||
|
|
||||||
@admin.display(description='Seed', ordering='seed')
|
@admin.display(description='Seed', ordering='seed')
|
||||||
def seed_str(self, obj):
|
def seed_str(self, obj):
|
||||||
if not obj.seed:
|
if not obj.seed:
|
||||||
return format_html('<i>None</i>')
|
return mark_safe('<i>None</i>')
|
||||||
return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)
|
return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)
|
||||||
|
|
||||||
def seed_contents(self, obj):
|
@admin.display(description='URLs')
|
||||||
if not (obj.seed and obj.seed.uri):
|
def seed_urls_editor(self, obj):
|
||||||
return format_html('<i>None</i>')
|
"""Combined editor showing seed URL and file contents."""
|
||||||
|
widget_id = f'seed_urls_{obj.pk}'
|
||||||
|
|
||||||
if obj.seed.uri.startswith('file:///data/'):
|
# Get the seed URI (or use urls field if no seed)
|
||||||
source_file = DATA_DIR / obj.seed.uri.replace('file:///data/', '', 1)
|
seed_uri = ''
|
||||||
contents = ""
|
if obj.seed and obj.seed.uri:
|
||||||
|
seed_uri = obj.seed.uri
|
||||||
|
elif obj.urls:
|
||||||
|
seed_uri = obj.urls
|
||||||
|
|
||||||
|
# Check if it's a local file we can edit
|
||||||
|
is_file = seed_uri.startswith('file:///data/')
|
||||||
|
contents = ""
|
||||||
|
error = None
|
||||||
|
source_file = None
|
||||||
|
|
||||||
|
if is_file:
|
||||||
|
source_file = DATA_DIR / seed_uri.replace('file:///data/', '', 1)
|
||||||
try:
|
try:
|
||||||
contents = source_file.read_text().strip()[:14_000]
|
contents = source_file.read_text().strip()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
contents = f'Error reading {source_file}: {e}'
|
error = f'Error reading {source_file}: {e}'
|
||||||
|
|
||||||
return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
|
# Escape for safe HTML embedding
|
||||||
|
escaped_uri = seed_uri.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
|
||||||
|
escaped_contents = (contents or '').replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"')
|
||||||
|
|
||||||
return format_html('See URLs here: <a href="{}">{}</a>', obj.seed.uri, obj.seed.uri)
|
# Count lines for auto-expand logic
|
||||||
|
line_count = len(contents.split('\n')) if contents else 0
|
||||||
|
uri_rows = min(max(1, seed_uri.count('\n') + 1), 3)
|
||||||
|
|
||||||
|
html = f'''
|
||||||
|
<div id="{widget_id}_container" style="max-width: 900px;">
|
||||||
|
<!-- Seed URL input (auto-expands) -->
|
||||||
|
<div style="margin-bottom: 12px;">
|
||||||
|
<label style="font-weight: bold; display: block; margin-bottom: 4px;">Seed URL:</label>
|
||||||
|
<textarea id="{widget_id}_uri"
|
||||||
|
style="width: 100%; font-family: monospace; font-size: 13px;
|
||||||
|
padding: 8px; border: 1px solid #ccc; border-radius: 4px;
|
||||||
|
resize: vertical; min-height: 32px; overflow: hidden;"
|
||||||
|
rows="{uri_rows}"
|
||||||
|
placeholder="file:///data/sources/... or https://..."
|
||||||
|
{"readonly" if not obj.pk else ""}>{escaped_uri}</textarea>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{"" if not is_file else f'''
|
||||||
|
<!-- File contents editor -->
|
||||||
|
<div style="margin-bottom: 8px;">
|
||||||
|
<label style="font-weight: bold; display: block; margin-bottom: 4px;">
|
||||||
|
File Contents: <code style="font-weight: normal; color: #666;">{source_file}</code>
|
||||||
|
</label>
|
||||||
|
{"<div style='color: #dc3545; margin-bottom: 8px;'>" + error + "</div>" if error else ""}
|
||||||
|
<textarea id="{widget_id}_contents"
|
||||||
|
style="width: 100%; height: {min(400, max(150, line_count * 18))}px; font-family: monospace; font-size: 12px;
|
||||||
|
padding: 8px; border: 1px solid #ccc; border-radius: 4px; resize: vertical;"
|
||||||
|
placeholder="Enter URLs, one per line...">{escaped_contents}</textarea>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div style="display: flex; gap: 12px; align-items: center; flex-wrap: wrap;">
|
||||||
|
<button type="button" id="{widget_id}_save_btn"
|
||||||
|
onclick="saveSeedUrls_{widget_id}()"
|
||||||
|
style="padding: 8px 20px; background: #417690; color: white; border: none;
|
||||||
|
border-radius: 4px; cursor: pointer; font-weight: bold;">
|
||||||
|
Save URLs
|
||||||
|
</button>
|
||||||
|
<span id="{widget_id}_line_count" style="color: #666; font-size: 12px;"></span>
|
||||||
|
<span id="{widget_id}_status" style="color: #666; font-size: 12px;"></span>
|
||||||
|
</div>
|
||||||
|
'''}
|
||||||
|
|
||||||
|
{"" if is_file else f'''
|
||||||
|
<div style="margin-top: 8px; color: #666;">
|
||||||
|
<a href="{seed_uri}" target="_blank">{seed_uri}</a>
|
||||||
|
</div>
|
||||||
|
'''}
|
||||||
|
|
||||||
|
<script>
|
||||||
|
(function() {{
|
||||||
|
var uriInput = document.getElementById('{widget_id}_uri');
|
||||||
|
var contentsInput = document.getElementById('{widget_id}_contents');
|
||||||
|
var status = document.getElementById('{widget_id}_status');
|
||||||
|
var lineCount = document.getElementById('{widget_id}_line_count');
|
||||||
|
var saveBtn = document.getElementById('{widget_id}_save_btn');
|
||||||
|
|
||||||
|
// Auto-resize URI input
|
||||||
|
function autoResizeUri() {{
|
||||||
|
uriInput.style.height = 'auto';
|
||||||
|
uriInput.style.height = Math.min(100, uriInput.scrollHeight) + 'px';
|
||||||
|
}}
|
||||||
|
uriInput.addEventListener('input', autoResizeUri);
|
||||||
|
autoResizeUri();
|
||||||
|
|
||||||
|
if (contentsInput) {{
|
||||||
|
function updateLineCount() {{
|
||||||
|
var lines = contentsInput.value.split('\\n').filter(function(l) {{ return l.trim(); }});
|
||||||
|
lineCount.textContent = lines.length + ' URLs';
|
||||||
|
}}
|
||||||
|
|
||||||
|
contentsInput.addEventListener('input', function() {{
|
||||||
|
updateLineCount();
|
||||||
|
if (status) {{
|
||||||
|
status.textContent = '(unsaved changes)';
|
||||||
|
status.style.color = '#c4820e';
|
||||||
|
}}
|
||||||
|
}});
|
||||||
|
|
||||||
|
updateLineCount();
|
||||||
|
}}
|
||||||
|
|
||||||
|
window.saveSeedUrls_{widget_id} = function() {{
|
||||||
|
if (!saveBtn) return;
|
||||||
|
saveBtn.disabled = true;
|
||||||
|
saveBtn.textContent = 'Saving...';
|
||||||
|
if (status) status.textContent = '';
|
||||||
|
|
||||||
|
fetch(window.location.pathname + 'save_seed_contents/', {{
|
||||||
|
method: 'POST',
|
||||||
|
headers: {{
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
'X-CSRFToken': document.querySelector('[name=csrfmiddlewaretoken]').value
|
||||||
|
}},
|
||||||
|
body: JSON.stringify({{ contents: contentsInput ? contentsInput.value : '' }})
|
||||||
|
}})
|
||||||
|
.then(function(response) {{ return response.json(); }})
|
||||||
|
.then(function(data) {{
|
||||||
|
if (data.success) {{
|
||||||
|
if (status) {{
|
||||||
|
status.textContent = '✓ ' + data.message;
|
||||||
|
status.style.color = '#28a745';
|
||||||
|
}}
|
||||||
|
}} else {{
|
||||||
|
if (status) {{
|
||||||
|
status.textContent = '✗ ' + data.error;
|
||||||
|
status.style.color = '#dc3545';
|
||||||
|
}}
|
||||||
|
}}
|
||||||
|
}})
|
||||||
|
.catch(function(err) {{
|
||||||
|
if (status) {{
|
||||||
|
status.textContent = '✗ Error: ' + err;
|
||||||
|
status.style.color = '#dc3545';
|
||||||
|
}}
|
||||||
|
}})
|
||||||
|
.finally(function() {{
|
||||||
|
saveBtn.disabled = false;
|
||||||
|
saveBtn.textContent = 'Save URLs';
|
||||||
|
}});
|
||||||
|
}};
|
||||||
|
}})();
|
||||||
|
</script>
|
||||||
|
</div>
|
||||||
|
'''
|
||||||
|
return mark_safe(html)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -143,14 +358,14 @@ class CrawlScheduleAdmin(BaseModelAdmin):
|
|||||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||||
(crawl.admin_change_url, crawl)
|
(crawl.admin_change_url, crawl)
|
||||||
for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
|
for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
|
||||||
)) or format_html('<i>No Crawls yet...</i>')
|
)) or mark_safe('<i>No Crawls yet...</i>')
|
||||||
|
|
||||||
def snapshots(self, obj):
|
def snapshots(self, obj):
|
||||||
crawl_ids = obj.crawl_set.values_list('pk', flat=True)
|
crawl_ids = obj.crawl_set.values_list('pk', flat=True)
|
||||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||||
(snapshot.admin_change_url, snapshot)
|
(snapshot.admin_change_url, snapshot)
|
||||||
for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
|
for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
|
||||||
)) or format_html('<i>No Snapshots yet...</i>')
|
)) or mark_safe('<i>No Snapshots yet...</i>')
|
||||||
|
|
||||||
|
|
||||||
def register_admin(admin_site):
|
def register_admin(admin_site):
|
||||||
|
|||||||
@@ -865,3 +865,189 @@ def export_plugin_config_to_env(
|
|||||||
return env
|
return env
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Plugin Template Discovery
|
||||||
|
# =============================================================================
|
||||||
|
#
|
||||||
|
# Plugins can provide custom templates for rendering their output in the UI.
|
||||||
|
# Templates are discovered by filename convention inside each plugin's templates/ dir:
|
||||||
|
#
|
||||||
|
# archivebox/plugins/<plugin_name>/
|
||||||
|
# templates/
|
||||||
|
# icon.html # Icon for admin table view (small inline HTML)
|
||||||
|
# thumbnail.html # Preview thumbnail for snapshot cards
|
||||||
|
# embed.html # Iframe embed content for main preview
|
||||||
|
# fullscreen.html # Fullscreen view template
|
||||||
|
#
|
||||||
|
# Template context variables available:
|
||||||
|
# {{ result }} - ArchiveResult object
|
||||||
|
# {{ snapshot }} - Parent Snapshot object
|
||||||
|
# {{ output_path }} - Path to output file/dir relative to snapshot dir
|
||||||
|
# {{ extractor }} - Extractor name (e.g., 'screenshot', 'singlefile')
|
||||||
|
#
|
||||||
|
|
||||||
|
# Default templates used when plugin doesn't provide one
|
||||||
|
DEFAULT_TEMPLATES = {
|
||||||
|
'icon': '''<span title="{{ extractor }}">{{ icon }}</span>''',
|
||||||
|
'thumbnail': '''
|
||||||
|
<img src="{{ output_path }}"
|
||||||
|
alt="{{ extractor }} output"
|
||||||
|
style="max-width: 100%; max-height: 100px; object-fit: cover;"
|
||||||
|
onerror="this.style.display='none'">
|
||||||
|
''',
|
||||||
|
'embed': '''
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
style="width: 100%; height: 100%; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts">
|
||||||
|
</iframe>
|
||||||
|
''',
|
||||||
|
'fullscreen': '''
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
style="width: 100%; height: 100vh; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||||
|
</iframe>
|
||||||
|
''',
|
||||||
|
}
|
||||||
|
|
||||||
|
# Default icons for known extractors (emoji or short HTML)
|
||||||
|
DEFAULT_EXTRACTOR_ICONS = {
|
||||||
|
'screenshot': '📷',
|
||||||
|
'pdf': '📄',
|
||||||
|
'singlefile': '📦',
|
||||||
|
'dom': '🌐',
|
||||||
|
'wget': '📥',
|
||||||
|
'media': '🎬',
|
||||||
|
'git': '📂',
|
||||||
|
'readability': '📖',
|
||||||
|
'mercury': '☿️',
|
||||||
|
'favicon': '⭐',
|
||||||
|
'title': '📝',
|
||||||
|
'headers': '📋',
|
||||||
|
'archive_org': '🏛️',
|
||||||
|
'htmltotext': '📃',
|
||||||
|
'warc': '🗄️',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_plugin_template(extractor: str, template_name: str) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Get a plugin template by extractor name and template type.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
extractor: Extractor name (e.g., 'screenshot', '15_singlefile')
|
||||||
|
template_name: One of 'icon', 'thumbnail', 'embed', 'fullscreen'
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Template content as string, or None if not found.
|
||||||
|
"""
|
||||||
|
base_name = get_extractor_name(extractor)
|
||||||
|
|
||||||
|
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||||
|
if not base_dir.exists():
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Look for plugin directory matching extractor name
|
||||||
|
for plugin_dir in base_dir.iterdir():
|
||||||
|
if not plugin_dir.is_dir():
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Match by directory name (exact or partial)
|
||||||
|
if plugin_dir.name == base_name or plugin_dir.name.endswith(f'_{base_name}'):
|
||||||
|
template_path = plugin_dir / 'templates' / f'{template_name}.html'
|
||||||
|
if template_path.exists():
|
||||||
|
return template_path.read_text()
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_extractor_template(extractor: str, template_name: str) -> str:
|
||||||
|
"""
|
||||||
|
Get template for an extractor, falling back to defaults.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
extractor: Extractor name (e.g., 'screenshot', '15_singlefile')
|
||||||
|
template_name: One of 'icon', 'thumbnail', 'embed', 'fullscreen'
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Template content as string (plugin template or default).
|
||||||
|
"""
|
||||||
|
# Try plugin-provided template first
|
||||||
|
template = get_plugin_template(extractor, template_name)
|
||||||
|
if template:
|
||||||
|
return template
|
||||||
|
|
||||||
|
# Fall back to default template
|
||||||
|
return DEFAULT_TEMPLATES.get(template_name, '')
|
||||||
|
|
||||||
|
|
||||||
|
def get_extractor_icon(extractor: str) -> str:
|
||||||
|
"""
|
||||||
|
Get the icon for an extractor.
|
||||||
|
|
||||||
|
First checks for plugin-provided icon.html template,
|
||||||
|
then falls back to DEFAULT_EXTRACTOR_ICONS.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
extractor: Extractor name (e.g., 'screenshot', '15_singlefile')
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Icon HTML/emoji string.
|
||||||
|
"""
|
||||||
|
base_name = get_extractor_name(extractor)
|
||||||
|
|
||||||
|
# Try plugin-provided icon template
|
||||||
|
icon_template = get_plugin_template(extractor, 'icon')
|
||||||
|
if icon_template:
|
||||||
|
return icon_template.strip()
|
||||||
|
|
||||||
|
# Fall back to default icon
|
||||||
|
return DEFAULT_EXTRACTOR_ICONS.get(base_name, '📁')
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_extractor_icons() -> Dict[str, str]:
|
||||||
|
"""
|
||||||
|
Get icons for all discovered extractors.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict mapping extractor base names to their icons.
|
||||||
|
"""
|
||||||
|
icons = {}
|
||||||
|
for extractor in get_extractors():
|
||||||
|
base_name = get_extractor_name(extractor)
|
||||||
|
icons[base_name] = get_extractor_icon(extractor)
|
||||||
|
return icons
|
||||||
|
|
||||||
|
|
||||||
|
def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
|
||||||
|
"""
|
||||||
|
Discover all plugin templates organized by extractor.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict mapping extractor names to dicts of template_name -> template_path.
|
||||||
|
e.g., {'screenshot': {'icon': '/path/to/icon.html', 'thumbnail': '/path/to/thumbnail.html'}}
|
||||||
|
"""
|
||||||
|
templates: Dict[str, Dict[str, str]] = {}
|
||||||
|
|
||||||
|
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||||
|
if not base_dir.exists():
|
||||||
|
continue
|
||||||
|
|
||||||
|
for plugin_dir in base_dir.iterdir():
|
||||||
|
if not plugin_dir.is_dir():
|
||||||
|
continue
|
||||||
|
|
||||||
|
templates_dir = plugin_dir / 'templates'
|
||||||
|
if not templates_dir.exists():
|
||||||
|
continue
|
||||||
|
|
||||||
|
plugin_templates = {}
|
||||||
|
for template_file in templates_dir.glob('*.html'):
|
||||||
|
template_name = template_file.stem # icon, thumbnail, embed, fullscreen
|
||||||
|
plugin_templates[template_name] = str(template_file)
|
||||||
|
|
||||||
|
if plugin_templates:
|
||||||
|
templates[plugin_dir.name] = plugin_templates
|
||||||
|
|
||||||
|
return templates
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -3,16 +3,16 @@ __package__ = 'archivebox.machine'
|
|||||||
from django.contrib import admin
|
from django.contrib import admin
|
||||||
from django.utils.html import format_html
|
from django.utils.html import format_html
|
||||||
|
|
||||||
from archivebox.base_models.admin import BaseModelAdmin
|
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||||
from machine.models import Machine, NetworkInterface, InstalledBinary
|
from machine.models import Machine, NetworkInterface, InstalledBinary, Dependency
|
||||||
|
|
||||||
|
|
||||||
class MachineAdmin(BaseModelAdmin):
|
class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||||
list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health')
|
list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health')
|
||||||
sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
|
sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
|
||||||
|
|
||||||
readonly_fields = ('guid', 'created_at', 'modified_at', 'ips')
|
readonly_fields = ('guid', 'created_at', 'modified_at', 'ips')
|
||||||
fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'num_uses_succeeded', 'num_uses_failed')
|
fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'config', 'num_uses_succeeded', 'num_uses_failed')
|
||||||
|
|
||||||
list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform')
|
list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform')
|
||||||
ordering = ['-created_at']
|
ordering = ['-created_at']
|
||||||
@@ -48,15 +48,43 @@ class NetworkInterfaceAdmin(BaseModelAdmin):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class DependencyAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||||
|
list_display = ('id', 'created_at', 'bin_name', 'bin_providers', 'is_installed', 'installed_count')
|
||||||
|
sort_fields = ('id', 'created_at', 'bin_name', 'bin_providers')
|
||||||
|
search_fields = ('id', 'bin_name', 'bin_providers')
|
||||||
|
|
||||||
|
readonly_fields = ('id', 'created_at', 'modified_at', 'is_installed', 'installed_count')
|
||||||
|
fields = ('bin_name', 'bin_providers', 'custom_cmds', 'config', *readonly_fields)
|
||||||
|
|
||||||
|
list_filter = ('bin_providers', 'created_at')
|
||||||
|
ordering = ['-created_at']
|
||||||
|
list_per_page = 100
|
||||||
|
actions = ["delete_selected"]
|
||||||
|
|
||||||
|
@admin.display(description='Installed', boolean=True)
|
||||||
|
def is_installed(self, dependency):
|
||||||
|
return dependency.is_installed
|
||||||
|
|
||||||
|
@admin.display(description='# Binaries')
|
||||||
|
def installed_count(self, dependency):
|
||||||
|
count = dependency.installed_binaries.count()
|
||||||
|
if count:
|
||||||
|
return format_html(
|
||||||
|
'<a href="/admin/machine/installedbinary/?dependency__id__exact={}">{}</a>',
|
||||||
|
dependency.id, count,
|
||||||
|
)
|
||||||
|
return '0'
|
||||||
|
|
||||||
|
|
||||||
class InstalledBinaryAdmin(BaseModelAdmin):
|
class InstalledBinaryAdmin(BaseModelAdmin):
|
||||||
list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'health')
|
list_display = ('id', 'created_at', 'machine_info', 'name', 'dependency_link', 'binprovider', 'version', 'abspath', 'sha256', 'health')
|
||||||
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256')
|
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256')
|
||||||
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256')
|
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'dependency__bin_name')
|
||||||
|
|
||||||
readonly_fields = ('created_at', 'modified_at')
|
readonly_fields = ('created_at', 'modified_at')
|
||||||
fields = ('machine', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed')
|
fields = ('machine', 'dependency', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed')
|
||||||
|
|
||||||
list_filter = ('name', 'binprovider', 'machine_id')
|
list_filter = ('name', 'binprovider', 'machine_id', 'dependency')
|
||||||
ordering = ['-created_at']
|
ordering = ['-created_at']
|
||||||
list_per_page = 100
|
list_per_page = 100
|
||||||
actions = ["delete_selected"]
|
actions = ["delete_selected"]
|
||||||
@@ -68,8 +96,18 @@ class InstalledBinaryAdmin(BaseModelAdmin):
|
|||||||
installed_binary.machine.id, str(installed_binary.machine.id)[:8], installed_binary.machine.hostname,
|
installed_binary.machine.id, str(installed_binary.machine.id)[:8], installed_binary.machine.hostname,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@admin.display(description='Dependency', ordering='dependency__bin_name')
|
||||||
|
def dependency_link(self, installed_binary):
|
||||||
|
if installed_binary.dependency:
|
||||||
|
return format_html(
|
||||||
|
'<a href="/admin/machine/dependency/{}/change">{}</a>',
|
||||||
|
installed_binary.dependency.id, installed_binary.dependency.bin_name,
|
||||||
|
)
|
||||||
|
return '-'
|
||||||
|
|
||||||
|
|
||||||
def register_admin(admin_site):
|
def register_admin(admin_site):
|
||||||
admin_site.register(Machine, MachineAdmin)
|
admin_site.register(Machine, MachineAdmin)
|
||||||
admin_site.register(NetworkInterface, NetworkInterfaceAdmin)
|
admin_site.register(NetworkInterface, NetworkInterfaceAdmin)
|
||||||
|
admin_site.register(Dependency, DependencyAdmin)
|
||||||
admin_site.register(InstalledBinary, InstalledBinaryAdmin)
|
admin_site.register(InstalledBinary, InstalledBinaryAdmin)
|
||||||
|
|||||||
@@ -37,15 +37,13 @@ def apply_migrations(out_dir: Path = DATA_DIR) -> List[str]:
|
|||||||
"""Apply pending Django migrations"""
|
"""Apply pending Django migrations"""
|
||||||
from django.core.management import call_command
|
from django.core.management import call_command
|
||||||
|
|
||||||
out1, out2 = StringIO(), StringIO()
|
out1 = StringIO()
|
||||||
|
|
||||||
call_command("migrate", interactive=False, database='default', stdout=out1)
|
call_command("migrate", interactive=False, database='default', stdout=out1)
|
||||||
out1.seek(0)
|
out1.seek(0)
|
||||||
call_command("migrate", "huey_monitor", interactive=False, database='queue', stdout=out2)
|
|
||||||
out2.seek(0)
|
|
||||||
|
|
||||||
return [
|
return [
|
||||||
line.strip() for line in out1.readlines() + out2.readlines() if line.strip()
|
line.strip() for line in out1.readlines() if line.strip()
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -480,6 +480,138 @@ def printable_filesize(num_bytes: Union[int, float]) -> str:
|
|||||||
return '%3.1f %s' % (num_bytes, 'TB')
|
return '%3.1f %s' % (num_bytes, 'TB')
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
|
def format_duration(seconds: float) -> str:
|
||||||
|
"""Format duration in human-readable form."""
|
||||||
|
if seconds < 1:
|
||||||
|
return f'{seconds*1000:.0f}ms'
|
||||||
|
elif seconds < 60:
|
||||||
|
return f'{seconds:.1f}s'
|
||||||
|
elif seconds < 3600:
|
||||||
|
minutes = int(seconds // 60)
|
||||||
|
secs = int(seconds % 60)
|
||||||
|
return f'{minutes}min {secs}s' if secs else f'{minutes}min'
|
||||||
|
else:
|
||||||
|
hours = int(seconds // 3600)
|
||||||
|
minutes = int((seconds % 3600) // 60)
|
||||||
|
return f'{hours}hr {minutes}min' if minutes else f'{hours}hr'
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
|
def truncate_url(url: str, max_length: int = 60) -> str:
|
||||||
|
"""Truncate URL to max_length, keeping domain and adding ellipsis."""
|
||||||
|
if len(url) <= max_length:
|
||||||
|
return url
|
||||||
|
# Try to keep the domain and beginning of path
|
||||||
|
if '://' in url:
|
||||||
|
protocol, rest = url.split('://', 1)
|
||||||
|
if '/' in rest:
|
||||||
|
domain, path = rest.split('/', 1)
|
||||||
|
available = max_length - len(protocol) - len(domain) - 6 # for "://", "/", "..."
|
||||||
|
if available > 10:
|
||||||
|
return f'{protocol}://{domain}/{path[:available]}...'
|
||||||
|
# Fallback: just truncate
|
||||||
|
return url[:max_length-3] + '...'
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
|
def log_worker_event(
|
||||||
|
worker_type: str,
|
||||||
|
event: str,
|
||||||
|
indent_level: int = 0,
|
||||||
|
pid: Optional[int] = None,
|
||||||
|
worker_id: Optional[str] = None,
|
||||||
|
url: Optional[str] = None,
|
||||||
|
extractor: Optional[str] = None,
|
||||||
|
metadata: Optional[Dict[str, Any]] = None,
|
||||||
|
error: Optional[Exception] = None,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Log a worker event with structured metadata and indentation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker, etc.)
|
||||||
|
event: Event name (Starting, Completed, Failed, etc.)
|
||||||
|
indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker, 3=ArchiveResultWorker)
|
||||||
|
pid: Process ID
|
||||||
|
worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, extractor for ArchiveResultWorker)
|
||||||
|
url: URL being processed (for SnapshotWorker/ArchiveResultWorker)
|
||||||
|
extractor: Extractor name (for ArchiveResultWorker)
|
||||||
|
metadata: Dict of metadata to show in curly braces
|
||||||
|
error: Exception if event is an error
|
||||||
|
"""
|
||||||
|
indent = ' ' * indent_level
|
||||||
|
|
||||||
|
# Build worker identifier
|
||||||
|
worker_parts = [worker_type]
|
||||||
|
if pid:
|
||||||
|
worker_parts.append(f'pid={pid}')
|
||||||
|
if worker_id and worker_type in ('CrawlWorker', 'Orchestrator'):
|
||||||
|
worker_parts.append(f'id={worker_id}')
|
||||||
|
if url and worker_type == 'SnapshotWorker':
|
||||||
|
worker_parts.append(f'url={truncate_url(url)}')
|
||||||
|
if extractor and worker_type == 'ArchiveResultWorker':
|
||||||
|
worker_parts.append(f'extractor={extractor}')
|
||||||
|
|
||||||
|
worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]'
|
||||||
|
|
||||||
|
# Build metadata string
|
||||||
|
metadata_str = ''
|
||||||
|
if metadata:
|
||||||
|
# Format metadata nicely
|
||||||
|
meta_parts = []
|
||||||
|
for k, v in metadata.items():
|
||||||
|
if isinstance(v, float):
|
||||||
|
# Format floats nicely (durations, sizes)
|
||||||
|
if 'duration' in k.lower():
|
||||||
|
meta_parts.append(f'{k}: {format_duration(v)}')
|
||||||
|
elif 'size' in k.lower():
|
||||||
|
meta_parts.append(f'{k}: {printable_filesize(int(v))}')
|
||||||
|
else:
|
||||||
|
meta_parts.append(f'{k}: {v:.2f}')
|
||||||
|
elif isinstance(v, int):
|
||||||
|
# Format integers - check if it's a size
|
||||||
|
if 'size' in k.lower() or 'bytes' in k.lower():
|
||||||
|
meta_parts.append(f'{k}: {printable_filesize(v)}')
|
||||||
|
else:
|
||||||
|
meta_parts.append(f'{k}: {v}')
|
||||||
|
elif isinstance(v, (list, tuple)):
|
||||||
|
meta_parts.append(f'{k}: {len(v)}')
|
||||||
|
else:
|
||||||
|
meta_parts.append(f'{k}: {v}')
|
||||||
|
metadata_str = ' {' + ', '.join(meta_parts) + '}'
|
||||||
|
|
||||||
|
# Determine color based on event
|
||||||
|
color = 'white'
|
||||||
|
if event in ('Starting...', 'Started', 'STARTED', 'Started in background'):
|
||||||
|
color = 'green'
|
||||||
|
elif event in ('Processing...', 'PROCESSING'):
|
||||||
|
color = 'blue'
|
||||||
|
elif event in ('Completed', 'COMPLETED', 'All work complete'):
|
||||||
|
color = 'blue'
|
||||||
|
elif event in ('Failed', 'ERROR', 'Failed to spawn worker'):
|
||||||
|
color = 'red'
|
||||||
|
elif event in ('Shutting down', 'SHUTDOWN'):
|
||||||
|
color = 'grey53'
|
||||||
|
|
||||||
|
# Build final message
|
||||||
|
error_str = f' {type(error).__name__}: {error}' if error else ''
|
||||||
|
# Build colored message - worker_label needs to be inside color tags
|
||||||
|
# But first we need to format the color tags separately from the worker label
|
||||||
|
from archivebox.misc.logging import CONSOLE
|
||||||
|
from rich.text import Text
|
||||||
|
|
||||||
|
# Create a Rich Text object for proper formatting
|
||||||
|
text = Text()
|
||||||
|
text.append(indent) # Indentation
|
||||||
|
# Append worker label and event with color
|
||||||
|
text.append(f'{worker_label} {event}{error_str}', style=color)
|
||||||
|
# Append metadata without color
|
||||||
|
text.append(metadata_str)
|
||||||
|
|
||||||
|
CONSOLE.print(text)
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def printable_folders(folders: Dict[str, Optional["Snapshot"]], with_headers: bool=False) -> str:
|
def printable_folders(folders: Dict[str, Optional["Snapshot"]], with_headers: bool=False) -> str:
|
||||||
return '\n'.join(
|
return '\n'.join(
|
||||||
|
|||||||
1
archivebox/plugins/archive_org/templates/icon.html
Normal file
1
archivebox/plugins/archive_org/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
🏛️
|
||||||
@@ -7,7 +7,7 @@ new plugin-based output structure to the legacy canonical output paths that
|
|||||||
ArchiveBox has historically used. This maintains backward compatibility with
|
ArchiveBox has historically used. This maintains backward compatibility with
|
||||||
existing tools and scripts that expect outputs at specific locations.
|
existing tools and scripts that expect outputs at specific locations.
|
||||||
|
|
||||||
Canonical output paths (from Snapshot.canonical_outputs()):
|
Canonical output paths:
|
||||||
- favicon.ico → favicon/favicon.ico
|
- favicon.ico → favicon/favicon.ico
|
||||||
- singlefile.html → singlefile/singlefile.html
|
- singlefile.html → singlefile/singlefile.html
|
||||||
- readability/content.html → readability/content.html
|
- readability/content.html → readability/content.html
|
||||||
@@ -27,27 +27,20 @@ New plugin outputs:
|
|||||||
- redirects.json → redirects/redirects.json
|
- redirects.json → redirects/redirects.json
|
||||||
- console.jsonl → consolelog/console.jsonl
|
- console.jsonl → consolelog/console.jsonl
|
||||||
|
|
||||||
Usage: on_Snapshot__91_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
|
Usage: on_Snapshot__92_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
|
||||||
|
|
||||||
Environment variables:
|
Environment variables:
|
||||||
SAVE_CANONICAL_SYMLINKS: Enable canonical symlinks (default: true)
|
SAVE_CANONICAL_SYMLINKS: Enable canonical symlinks (default: true)
|
||||||
|
DATA_DIR: ArchiveBox data directory
|
||||||
|
ARCHIVE_DIR: Archive output directory
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__package__ = 'archivebox.plugins.canonical_outputs'
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Optional
|
from datetime import datetime, timezone
|
||||||
|
from typing import Dict
|
||||||
# Configure Django if running standalone
|
|
||||||
if __name__ == '__main__':
|
|
||||||
parent_dir = str(Path(__file__).resolve().parent.parent.parent)
|
|
||||||
if parent_dir not in sys.path:
|
|
||||||
sys.path.insert(0, parent_dir)
|
|
||||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
|
|
||||||
import django
|
|
||||||
django.setup()
|
|
||||||
|
|
||||||
import rich_click as click
|
import rich_click as click
|
||||||
|
|
||||||
@@ -150,10 +143,7 @@ def create_canonical_symlinks(snapshot_dir: Path) -> Dict[str, bool]:
|
|||||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||||
def main(url: str, snapshot_id: str):
|
def main(url: str, snapshot_id: str):
|
||||||
"""Create symlinks from plugin outputs to canonical legacy locations."""
|
"""Create symlinks from plugin outputs to canonical legacy locations."""
|
||||||
from datetime import datetime
|
start_ts = datetime.now(timezone.utc)
|
||||||
from archivebox.core.models import Snapshot
|
|
||||||
|
|
||||||
start_ts = datetime.now()
|
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
output = None
|
output = None
|
||||||
error = ''
|
error = ''
|
||||||
@@ -161,31 +151,20 @@ def main(url: str, snapshot_id: str):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# Check if enabled
|
# Check if enabled
|
||||||
from archivebox.config import CONSTANTS
|
|
||||||
save_canonical = os.getenv('SAVE_CANONICAL_SYMLINKS', 'true').lower() in ('true', '1', 'yes', 'on')
|
save_canonical = os.getenv('SAVE_CANONICAL_SYMLINKS', 'true').lower() in ('true', '1', 'yes', 'on')
|
||||||
|
|
||||||
if not save_canonical:
|
if not save_canonical:
|
||||||
click.echo('Skipping canonical symlinks (SAVE_CANONICAL_SYMLINKS=False)')
|
|
||||||
status = 'skipped'
|
status = 'skipped'
|
||||||
end_ts = datetime.now()
|
click.echo(json.dumps({'status': status, 'output': 'SAVE_CANONICAL_SYMLINKS=false'}))
|
||||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
|
||||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
|
||||||
click.echo(f'STATUS={status}')
|
|
||||||
click.echo(f'RESULT_JSON={{"extractor": "canonical_outputs", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
|
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
# Get snapshot
|
# Working directory is the extractor output dir (e.g., <snapshot>/canonical_outputs/)
|
||||||
try:
|
# Parent is the snapshot directory
|
||||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
output_dir = Path.cwd()
|
||||||
except Snapshot.DoesNotExist:
|
snapshot_dir = output_dir.parent
|
||||||
error = f'Snapshot {snapshot_id} not found'
|
|
||||||
raise ValueError(error)
|
|
||||||
|
|
||||||
# Get snapshot directory
|
|
||||||
snapshot_dir = Path(snapshot.output_dir)
|
|
||||||
if not snapshot_dir.exists():
|
if not snapshot_dir.exists():
|
||||||
error = f'Snapshot directory not found: {snapshot_dir}'
|
raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')
|
||||||
raise FileNotFoundError(error)
|
|
||||||
|
|
||||||
# Create canonical symlinks
|
# Create canonical symlinks
|
||||||
results = create_canonical_symlinks(snapshot_dir)
|
results = create_canonical_symlinks(snapshot_dir)
|
||||||
@@ -203,37 +182,18 @@ def main(url: str, snapshot_id: str):
|
|||||||
status = 'failed'
|
status = 'failed'
|
||||||
click.echo(f'Error: {error}', err=True)
|
click.echo(f'Error: {error}', err=True)
|
||||||
|
|
||||||
end_ts = datetime.now()
|
end_ts = datetime.now(timezone.utc)
|
||||||
duration = (end_ts - start_ts).total_seconds()
|
|
||||||
|
|
||||||
# Print results
|
# Print JSON result for hook runner
|
||||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
result = {
|
||||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
|
||||||
click.echo(f'DURATION={duration:.2f}')
|
|
||||||
if output:
|
|
||||||
click.echo(f'OUTPUT={output}')
|
|
||||||
click.echo(f'STATUS={status}')
|
|
||||||
|
|
||||||
if error:
|
|
||||||
click.echo(f'ERROR={error}', err=True)
|
|
||||||
|
|
||||||
# Print JSON result
|
|
||||||
import json
|
|
||||||
result_json = {
|
|
||||||
'extractor': 'canonical_outputs',
|
|
||||||
'url': url,
|
|
||||||
'snapshot_id': snapshot_id,
|
|
||||||
'status': status,
|
'status': status,
|
||||||
'start_ts': start_ts.isoformat(),
|
|
||||||
'end_ts': end_ts.isoformat(),
|
|
||||||
'duration': round(duration, 2),
|
|
||||||
'output': output,
|
'output': output,
|
||||||
'symlinks_created': symlinks_created,
|
|
||||||
'error': error or None,
|
'error': error or None,
|
||||||
|
'symlinks_created': symlinks_created,
|
||||||
}
|
}
|
||||||
click.echo(f'RESULT_JSON={json.dumps(result_json)}')
|
click.echo(json.dumps(result))
|
||||||
|
|
||||||
sys.exit(0 if status == 'succeeded' else 1)
|
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
@@ -1,149 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Install Chrome/Chromium if not already available.
|
|
||||||
|
|
||||||
Runs at crawl start to ensure Chrome is installed.
|
|
||||||
Uses playwright to install chromium if no system Chrome found.
|
|
||||||
Outputs JSONL for InstalledBinary.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
import shutil
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
def find_chrome():
|
|
||||||
"""Try to find system Chrome/Chromium."""
|
|
||||||
# Comprehensive list of Chrome/Chromium binary names and paths
|
|
||||||
chromium_names_linux = [
|
|
||||||
'chromium',
|
|
||||||
'chromium-browser',
|
|
||||||
'chromium-browser-beta',
|
|
||||||
'chromium-browser-unstable',
|
|
||||||
'chromium-browser-canary',
|
|
||||||
'chromium-browser-dev',
|
|
||||||
]
|
|
||||||
|
|
||||||
chrome_names_linux = [
|
|
||||||
'google-chrome',
|
|
||||||
'google-chrome-stable',
|
|
||||||
'google-chrome-beta',
|
|
||||||
'google-chrome-canary',
|
|
||||||
'google-chrome-unstable',
|
|
||||||
'google-chrome-dev',
|
|
||||||
'chrome',
|
|
||||||
]
|
|
||||||
|
|
||||||
chrome_paths_macos = [
|
|
||||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
|
||||||
'/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
|
|
||||||
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
|
||||||
]
|
|
||||||
|
|
||||||
chrome_paths_linux = [
|
|
||||||
'/usr/bin/google-chrome',
|
|
||||||
'/usr/bin/google-chrome-stable',
|
|
||||||
'/usr/bin/chromium',
|
|
||||||
'/usr/bin/chromium-browser',
|
|
||||||
'/snap/bin/chromium',
|
|
||||||
'/opt/google/chrome/chrome',
|
|
||||||
]
|
|
||||||
|
|
||||||
all_chrome_names = chrome_names_linux + chromium_names_linux
|
|
||||||
all_chrome_paths = chrome_paths_macos + chrome_paths_linux
|
|
||||||
|
|
||||||
# Check env var first
|
|
||||||
env_path = os.environ.get('CHROME_BINARY', '')
|
|
||||||
if env_path and Path(env_path).is_file():
|
|
||||||
return env_path
|
|
||||||
|
|
||||||
# Try shutil.which for various names
|
|
||||||
for name in all_chrome_names:
|
|
||||||
abspath = shutil.which(name)
|
|
||||||
if abspath:
|
|
||||||
return abspath
|
|
||||||
|
|
||||||
# Check common paths
|
|
||||||
for path in all_chrome_paths:
|
|
||||||
if Path(path).is_file():
|
|
||||||
return path
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
try:
|
|
||||||
# First try to find system Chrome
|
|
||||||
system_chrome = find_chrome()
|
|
||||||
if system_chrome:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'InstalledBinary',
|
|
||||||
'name': 'chrome',
|
|
||||||
'abspath': str(system_chrome),
|
|
||||||
'version': None,
|
|
||||||
'sha256': None,
|
|
||||||
'binprovider': 'env',
|
|
||||||
}))
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
# If not found in system, try to install chromium via apt/brew
|
|
||||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
|
||||||
|
|
||||||
AptProvider.model_rebuild()
|
|
||||||
BrewProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# Try chromium-browser or chromium via system package managers
|
|
||||||
for binary_name in ['chromium', 'chromium-browser', 'google-chrome']:
|
|
||||||
try:
|
|
||||||
chrome_binary = Binary(
|
|
||||||
name=binary_name,
|
|
||||||
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try to load, install if not found
|
|
||||||
try:
|
|
||||||
loaded = chrome_binary.load()
|
|
||||||
if not loaded or not loaded.abspath:
|
|
||||||
raise Exception("Not loaded")
|
|
||||||
except Exception:
|
|
||||||
# Install via system package manager
|
|
||||||
loaded = chrome_binary.install()
|
|
||||||
|
|
||||||
if loaded and loaded.abspath:
|
|
||||||
# Output InstalledBinary JSONL
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'InstalledBinary',
|
|
||||||
'name': 'chrome',
|
|
||||||
'abspath': str(loaded.abspath),
|
|
||||||
'version': str(loaded.version) if loaded.version else None,
|
|
||||||
'sha256': loaded.sha256,
|
|
||||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
|
||||||
}))
|
|
||||||
sys.exit(0)
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# If all attempts failed
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'chrome',
|
|
||||||
'bin_providers': 'apt,brew,env',
|
|
||||||
}))
|
|
||||||
print("Failed to install Chrome/Chromium", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'chrome',
|
|
||||||
'bin_providers': 'apt,brew,env',
|
|
||||||
}))
|
|
||||||
print(f"Error installing Chrome: {e}", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
@@ -2,7 +2,7 @@
|
|||||||
Integration tests for chrome_session plugin
|
Integration tests for chrome_session plugin
|
||||||
|
|
||||||
Tests verify:
|
Tests verify:
|
||||||
1. Install hook finds system Chrome or installs chromium
|
1. Validate hook checks for Chrome/Chromium binary
|
||||||
2. Verify deps with abx-pkg
|
2. Verify deps with abx-pkg
|
||||||
3. Chrome session script exists
|
3. Chrome session script exists
|
||||||
"""
|
"""
|
||||||
@@ -14,7 +14,7 @@ from pathlib import Path
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
CHROME_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_chrome.py'
|
CHROME_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_chrome.py'
|
||||||
CHROME_SESSION_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_session.js'
|
CHROME_SESSION_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_session.js'
|
||||||
|
|
||||||
|
|
||||||
@@ -23,37 +23,50 @@ def test_hook_script_exists():
|
|||||||
assert CHROME_SESSION_HOOK.exists(), f"Hook not found: {CHROME_SESSION_HOOK}"
|
assert CHROME_SESSION_HOOK.exists(), f"Hook not found: {CHROME_SESSION_HOOK}"
|
||||||
|
|
||||||
|
|
||||||
def test_chrome_install_hook():
|
def test_chrome_validate_hook():
|
||||||
"""Test chrome install hook to find or install Chrome/Chromium."""
|
"""Test chrome validate hook checks for Chrome/Chromium binary."""
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[sys.executable, str(CHROME_INSTALL_HOOK)],
|
[sys.executable, str(CHROME_VALIDATE_HOOK)],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=600
|
timeout=30
|
||||||
)
|
)
|
||||||
|
|
||||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||||
|
if result.returncode == 0:
|
||||||
# Verify InstalledBinary JSONL output
|
# Binary found - verify InstalledBinary JSONL output
|
||||||
found_binary = False
|
found_binary = False
|
||||||
for line in result.stdout.strip().split('\n'):
|
for line in result.stdout.strip().split('\n'):
|
||||||
if line.strip():
|
if line.strip():
|
||||||
try:
|
try:
|
||||||
record = json.loads(line)
|
record = json.loads(line)
|
||||||
if record.get('type') == 'InstalledBinary':
|
if record.get('type') == 'InstalledBinary':
|
||||||
assert record['name'] == 'chrome'
|
assert record['name'] == 'chrome'
|
||||||
assert record['abspath']
|
assert record['abspath']
|
||||||
assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}"
|
assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}"
|
||||||
found_binary = True
|
found_binary = True
|
||||||
break
|
break
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
|
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||||
assert found_binary, "Should output InstalledBinary record"
|
else:
|
||||||
|
# Binary not found - verify Dependency JSONL output
|
||||||
|
found_dependency = False
|
||||||
|
for line in result.stdout.strip().split('\n'):
|
||||||
|
if line.strip():
|
||||||
|
try:
|
||||||
|
record = json.loads(line)
|
||||||
|
if record.get('type') == 'Dependency':
|
||||||
|
assert record['bin_name'] == 'chrome'
|
||||||
|
found_dependency = True
|
||||||
|
break
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
assert found_dependency, "Should output Dependency record when binary not found"
|
||||||
|
|
||||||
|
|
||||||
def test_verify_deps_with_abx_pkg():
|
def test_verify_deps_with_abx_pkg():
|
||||||
"""Verify chrome is available via abx-pkg after hook installation."""
|
"""Verify chrome is available via abx-pkg."""
|
||||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||||
|
|
||||||
AptProvider.model_rebuild()
|
AptProvider.model_rebuild()
|
||||||
@@ -75,10 +88,10 @@ def test_verify_deps_with_abx_pkg():
|
|||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# If we get here, chrome should still be available from system
|
# If we get here, chrome not available
|
||||||
import shutil
|
import shutil
|
||||||
assert shutil.which('chromium') or shutil.which('chrome') or shutil.which('google-chrome'), \
|
if not (shutil.which('chromium') or shutil.which('chrome') or shutil.which('google-chrome')):
|
||||||
"Chrome should be available after install hook"
|
pytest.skip("Chrome/Chromium not available - Dependency record should have been emitted")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
6
archivebox/plugins/dom/templates/embed.html
Normal file
6
archivebox/plugins/dom/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- DOM embed - full iframe of captured DOM HTML -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-embed dom-embed"
|
||||||
|
style="width: 100%; height: 100%; min-height: 500px; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||||
|
</iframe>
|
||||||
6
archivebox/plugins/dom/templates/fullscreen.html
Normal file
6
archivebox/plugins/dom/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- DOM fullscreen - full page iframe -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-fullscreen dom-fullscreen"
|
||||||
|
style="width: 100%; height: 100vh; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
|
||||||
|
</iframe>
|
||||||
1
archivebox/plugins/dom/templates/icon.html
Normal file
1
archivebox/plugins/dom/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
🌐
|
||||||
8
archivebox/plugins/dom/templates/thumbnail.html
Normal file
8
archivebox/plugins/dom/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<!-- DOM thumbnail - scaled down iframe preview of captured DOM HTML -->
|
||||||
|
<div class="extractor-thumbnail dom-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
|
||||||
|
loading="lazy"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
|
</div>
|
||||||
1
archivebox/plugins/favicon/templates/icon.html
Normal file
1
archivebox/plugins/favicon/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
⭐
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Install git if not already available.
|
|
||||||
|
|
||||||
Runs at crawl start to ensure git is installed.
|
|
||||||
Outputs JSONL for InstalledBinary.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
try:
|
|
||||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
|
||||||
|
|
||||||
AptProvider.model_rebuild()
|
|
||||||
BrewProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# git binary and package have same name
|
|
||||||
git_binary = Binary(
|
|
||||||
name='git',
|
|
||||||
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try to load, install if not found
|
|
||||||
try:
|
|
||||||
loaded = git_binary.load()
|
|
||||||
if not loaded or not loaded.abspath:
|
|
||||||
raise Exception("Not loaded")
|
|
||||||
except Exception:
|
|
||||||
# Install via system package manager
|
|
||||||
loaded = git_binary.install()
|
|
||||||
|
|
||||||
if loaded and loaded.abspath:
|
|
||||||
# Output InstalledBinary JSONL
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'InstalledBinary',
|
|
||||||
'name': 'git',
|
|
||||||
'abspath': str(loaded.abspath),
|
|
||||||
'version': str(loaded.version) if loaded.version else None,
|
|
||||||
'sha256': loaded.sha256,
|
|
||||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
|
||||||
}))
|
|
||||||
sys.exit(0)
|
|
||||||
else:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'git',
|
|
||||||
'bin_providers': 'apt,brew,env',
|
|
||||||
}))
|
|
||||||
print("Failed to install git", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'git',
|
|
||||||
'bin_providers': 'apt,brew,env',
|
|
||||||
}))
|
|
||||||
print(f"Error installing git: {e}", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
6
archivebox/plugins/git/templates/embed.html
Normal file
6
archivebox/plugins/git/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Git embed - directory listing of cloned repo -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-embed git-embed"
|
||||||
|
style="width: 100%; height: 100%; min-height: 400px; border: none; background: #fff;"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
6
archivebox/plugins/git/templates/fullscreen.html
Normal file
6
archivebox/plugins/git/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Git fullscreen - full directory listing -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-fullscreen git-fullscreen"
|
||||||
|
style="width: 100%; height: 100vh; border: none; background: #fff;"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
1
archivebox/plugins/git/templates/icon.html
Normal file
1
archivebox/plugins/git/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📂
|
||||||
5
archivebox/plugins/git/templates/thumbnail.html
Normal file
5
archivebox/plugins/git/templates/thumbnail.html
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
<!-- Git thumbnail - shows git repository icon and info -->
|
||||||
|
<div class="extractor-thumbnail git-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #f6f8fa; display: flex; flex-direction: column; align-items: center; justify-content: center; padding: 10px;">
|
||||||
|
<span style="font-size: 32px;">📂</span>
|
||||||
|
<span style="font-size: 11px; color: #586069; margin-top: 4px;">Git Repository</span>
|
||||||
|
</div>
|
||||||
@@ -2,7 +2,7 @@
|
|||||||
Integration tests for git plugin
|
Integration tests for git plugin
|
||||||
|
|
||||||
Tests verify:
|
Tests verify:
|
||||||
1. Install hook installs git via abx-pkg
|
1. Validate hook checks for git binary
|
||||||
2. Verify deps with abx-pkg
|
2. Verify deps with abx-pkg
|
||||||
3. Standalone git extractor execution
|
3. Standalone git extractor execution
|
||||||
"""
|
"""
|
||||||
@@ -17,50 +17,64 @@ import pytest
|
|||||||
|
|
||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
GIT_HOOK = PLUGIN_DIR / 'on_Snapshot__12_git.py'
|
GIT_HOOK = PLUGIN_DIR / 'on_Snapshot__12_git.py'
|
||||||
GIT_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_git.py'
|
GIT_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_git.py'
|
||||||
TEST_URL = 'https://github.com/example/repo.git'
|
TEST_URL = 'https://github.com/example/repo.git'
|
||||||
|
|
||||||
def test_hook_script_exists():
|
def test_hook_script_exists():
|
||||||
assert GIT_HOOK.exists()
|
assert GIT_HOOK.exists()
|
||||||
|
|
||||||
def test_git_install_hook():
|
def test_git_validate_hook():
|
||||||
"""Test git install hook to install git if needed."""
|
"""Test git validate hook checks for git binary."""
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[sys.executable, str(GIT_INSTALL_HOOK)],
|
[sys.executable, str(GIT_VALIDATE_HOOK)],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=600
|
timeout=30
|
||||||
)
|
)
|
||||||
|
|
||||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||||
|
if result.returncode == 0:
|
||||||
# Verify InstalledBinary JSONL output
|
# Binary found - verify InstalledBinary JSONL output
|
||||||
found_binary = False
|
found_binary = False
|
||||||
for line in result.stdout.strip().split('\n'):
|
for line in result.stdout.strip().split('\n'):
|
||||||
if line.strip():
|
if line.strip():
|
||||||
try:
|
try:
|
||||||
record = json.loads(line)
|
record = json.loads(line)
|
||||||
if record.get('type') == 'InstalledBinary':
|
if record.get('type') == 'InstalledBinary':
|
||||||
assert record['name'] == 'git'
|
assert record['name'] == 'git'
|
||||||
assert record['abspath']
|
assert record['abspath']
|
||||||
found_binary = True
|
found_binary = True
|
||||||
break
|
break
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
|
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||||
assert found_binary, "Should output InstalledBinary record"
|
else:
|
||||||
|
# Binary not found - verify Dependency JSONL output
|
||||||
|
found_dependency = False
|
||||||
|
for line in result.stdout.strip().split('\n'):
|
||||||
|
if line.strip():
|
||||||
|
try:
|
||||||
|
record = json.loads(line)
|
||||||
|
if record.get('type') == 'Dependency':
|
||||||
|
assert record['bin_name'] == 'git'
|
||||||
|
assert 'env' in record['bin_providers']
|
||||||
|
found_dependency = True
|
||||||
|
break
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
assert found_dependency, "Should output Dependency record when binary not found"
|
||||||
|
|
||||||
def test_verify_deps_with_abx_pkg():
|
def test_verify_deps_with_abx_pkg():
|
||||||
"""Verify git is available via abx-pkg after hook installation."""
|
"""Verify git is available via abx-pkg."""
|
||||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||||
|
|
||||||
AptProvider.model_rebuild()
|
|
||||||
BrewProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||||
git_loaded = git_binary.load()
|
git_loaded = git_binary.load()
|
||||||
assert git_loaded and git_loaded.abspath, "git should be available after install hook"
|
|
||||||
|
if git_loaded and git_loaded.abspath:
|
||||||
|
assert True, "git is available"
|
||||||
|
else:
|
||||||
|
pytest.skip("git not available - Dependency record should have been emitted")
|
||||||
|
|
||||||
def test_reports_missing_git():
|
def test_reports_missing_git():
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
|||||||
1
archivebox/plugins/headers/templates/icon.html
Normal file
1
archivebox/plugins/headers/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📋
|
||||||
1
archivebox/plugins/htmltotext/templates/icon.html
Normal file
1
archivebox/plugins/htmltotext/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📃
|
||||||
@@ -1,67 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Install yt-dlp if not already available.
|
|
||||||
|
|
||||||
Runs at crawl start to ensure yt-dlp is installed.
|
|
||||||
Outputs JSONL for InstalledBinary.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
try:
|
|
||||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
|
||||||
|
|
||||||
PipProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# yt-dlp binary and package have same name
|
|
||||||
ytdlp_binary = Binary(
|
|
||||||
name='yt-dlp',
|
|
||||||
binproviders=[PipProvider(), EnvProvider()]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try to load, install if not found
|
|
||||||
try:
|
|
||||||
loaded = ytdlp_binary.load()
|
|
||||||
if not loaded or not loaded.abspath:
|
|
||||||
raise Exception("Not loaded")
|
|
||||||
except Exception:
|
|
||||||
# Install via pip
|
|
||||||
loaded = ytdlp_binary.install()
|
|
||||||
|
|
||||||
if loaded and loaded.abspath:
|
|
||||||
# Output InstalledBinary JSONL
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'InstalledBinary',
|
|
||||||
'name': 'yt-dlp',
|
|
||||||
'abspath': str(loaded.abspath),
|
|
||||||
'version': str(loaded.version) if loaded.version else None,
|
|
||||||
'sha256': loaded.sha256,
|
|
||||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
|
||||||
}))
|
|
||||||
sys.exit(0)
|
|
||||||
else:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'yt-dlp',
|
|
||||||
'bin_providers': 'pip,brew,env',
|
|
||||||
}))
|
|
||||||
print("Failed to install yt-dlp", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'yt-dlp',
|
|
||||||
'bin_providers': 'pip,brew,env',
|
|
||||||
}))
|
|
||||||
print(f"Error installing yt-dlp: {e}", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
278
archivebox/plugins/media/on_Crawl__00_validate_ytdlp.py
Executable file
278
archivebox/plugins/media/on_Crawl__00_validate_ytdlp.py
Executable file
@@ -0,0 +1,278 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Validation hook for yt-dlp and its dependencies (node, ffmpeg).
|
||||||
|
|
||||||
|
Runs at crawl start to verify yt-dlp and required binaries are available.
|
||||||
|
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
import hashlib
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
|
||||||
|
"""Get version string from binary."""
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
[abspath, version_flag],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=5,
|
||||||
|
)
|
||||||
|
if result.returncode == 0 and result.stdout:
|
||||||
|
first_line = result.stdout.strip().split('\n')[0]
|
||||||
|
return first_line[:64]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_binary_hash(abspath: str) -> str | None:
|
||||||
|
"""Get SHA256 hash of binary."""
|
||||||
|
try:
|
||||||
|
with open(abspath, 'rb') as f:
|
||||||
|
return hashlib.sha256(f.read()).hexdigest()
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_ytdlp() -> dict | None:
|
||||||
|
"""Find yt-dlp binary."""
|
||||||
|
try:
|
||||||
|
from abx_pkg import Binary, PipProvider, EnvProvider
|
||||||
|
|
||||||
|
class YtdlpBinary(Binary):
|
||||||
|
name: str = 'yt-dlp'
|
||||||
|
binproviders_supported = [PipProvider(), EnvProvider()]
|
||||||
|
|
||||||
|
binary = YtdlpBinary()
|
||||||
|
loaded = binary.load()
|
||||||
|
if loaded and loaded.abspath:
|
||||||
|
return {
|
||||||
|
'name': 'yt-dlp',
|
||||||
|
'abspath': str(loaded.abspath),
|
||||||
|
'version': str(loaded.version) if loaded.version else None,
|
||||||
|
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||||
|
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||||
|
}
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Fallback to shutil.which
|
||||||
|
abspath = shutil.which('yt-dlp') or os.environ.get('YTDLP_BINARY', '')
|
||||||
|
if abspath and Path(abspath).is_file():
|
||||||
|
return {
|
||||||
|
'name': 'yt-dlp',
|
||||||
|
'abspath': abspath,
|
||||||
|
'version': get_binary_version(abspath),
|
||||||
|
'sha256': get_binary_hash(abspath),
|
||||||
|
'binprovider': 'env',
|
||||||
|
}
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_node() -> dict | None:
|
||||||
|
"""Find node binary."""
|
||||||
|
try:
|
||||||
|
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||||
|
|
||||||
|
class NodeBinary(Binary):
|
||||||
|
name: str = 'node'
|
||||||
|
binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
|
||||||
|
overrides: dict = {'apt': {'packages': ['nodejs']}}
|
||||||
|
|
||||||
|
binary = NodeBinary()
|
||||||
|
loaded = binary.load()
|
||||||
|
if loaded and loaded.abspath:
|
||||||
|
return {
|
||||||
|
'name': 'node',
|
||||||
|
'abspath': str(loaded.abspath),
|
||||||
|
'version': str(loaded.version) if loaded.version else None,
|
||||||
|
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||||
|
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||||
|
}
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Fallback to shutil.which
|
||||||
|
abspath = shutil.which('node') or os.environ.get('NODE_BINARY', '')
|
||||||
|
if abspath and Path(abspath).is_file():
|
||||||
|
return {
|
||||||
|
'name': 'node',
|
||||||
|
'abspath': abspath,
|
||||||
|
'version': get_binary_version(abspath),
|
||||||
|
'sha256': get_binary_hash(abspath),
|
||||||
|
'binprovider': 'env',
|
||||||
|
}
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_ffmpeg() -> dict | None:
|
||||||
|
"""Find ffmpeg binary."""
|
||||||
|
try:
|
||||||
|
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||||
|
|
||||||
|
class FfmpegBinary(Binary):
|
||||||
|
name: str = 'ffmpeg'
|
||||||
|
binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
|
||||||
|
|
||||||
|
binary = FfmpegBinary()
|
||||||
|
loaded = binary.load()
|
||||||
|
if loaded and loaded.abspath:
|
||||||
|
return {
|
||||||
|
'name': 'ffmpeg',
|
||||||
|
'abspath': str(loaded.abspath),
|
||||||
|
'version': str(loaded.version) if loaded.version else None,
|
||||||
|
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||||
|
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||||
|
}
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Fallback to shutil.which
|
||||||
|
abspath = shutil.which('ffmpeg') or os.environ.get('FFMPEG_BINARY', '')
|
||||||
|
if abspath and Path(abspath).is_file():
|
||||||
|
return {
|
||||||
|
'name': 'ffmpeg',
|
||||||
|
'abspath': abspath,
|
||||||
|
'version': get_binary_version(abspath),
|
||||||
|
'sha256': get_binary_hash(abspath),
|
||||||
|
'binprovider': 'env',
|
||||||
|
}
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# Check for yt-dlp (required)
|
||||||
|
ytdlp_result = find_ytdlp()
|
||||||
|
|
||||||
|
# Check for node (required for JS extraction)
|
||||||
|
node_result = find_node()
|
||||||
|
|
||||||
|
# Check for ffmpeg (required for video conversion)
|
||||||
|
ffmpeg_result = find_ffmpeg()
|
||||||
|
|
||||||
|
missing_deps = []
|
||||||
|
|
||||||
|
# Emit results for yt-dlp
|
||||||
|
if ytdlp_result and ytdlp_result.get('abspath'):
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'InstalledBinary',
|
||||||
|
'name': ytdlp_result['name'],
|
||||||
|
'abspath': ytdlp_result['abspath'],
|
||||||
|
'version': ytdlp_result['version'],
|
||||||
|
'sha256': ytdlp_result['sha256'],
|
||||||
|
'binprovider': ytdlp_result['binprovider'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/YTDLP_BINARY',
|
||||||
|
'value': ytdlp_result['abspath'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
if ytdlp_result['version']:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/YTDLP_VERSION',
|
||||||
|
'value': ytdlp_result['version'],
|
||||||
|
}))
|
||||||
|
else:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Dependency',
|
||||||
|
'bin_name': 'yt-dlp',
|
||||||
|
'bin_providers': 'pip,env',
|
||||||
|
}))
|
||||||
|
missing_deps.append('yt-dlp')
|
||||||
|
|
||||||
|
# Emit results for node
|
||||||
|
if node_result and node_result.get('abspath'):
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'InstalledBinary',
|
||||||
|
'name': node_result['name'],
|
||||||
|
'abspath': node_result['abspath'],
|
||||||
|
'version': node_result['version'],
|
||||||
|
'sha256': node_result['sha256'],
|
||||||
|
'binprovider': node_result['binprovider'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/NODE_BINARY',
|
||||||
|
'value': node_result['abspath'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
if node_result['version']:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/NODE_VERSION',
|
||||||
|
'value': node_result['version'],
|
||||||
|
}))
|
||||||
|
else:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Dependency',
|
||||||
|
'bin_name': 'node',
|
||||||
|
'bin_providers': 'apt,brew,env',
|
||||||
|
}))
|
||||||
|
missing_deps.append('node')
|
||||||
|
|
||||||
|
# Emit results for ffmpeg
|
||||||
|
if ffmpeg_result and ffmpeg_result.get('abspath'):
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'InstalledBinary',
|
||||||
|
'name': ffmpeg_result['name'],
|
||||||
|
'abspath': ffmpeg_result['abspath'],
|
||||||
|
'version': ffmpeg_result['version'],
|
||||||
|
'sha256': ffmpeg_result['sha256'],
|
||||||
|
'binprovider': ffmpeg_result['binprovider'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/FFMPEG_BINARY',
|
||||||
|
'value': ffmpeg_result['abspath'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
if ffmpeg_result['version']:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/FFMPEG_VERSION',
|
||||||
|
'value': ffmpeg_result['version'],
|
||||||
|
}))
|
||||||
|
else:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Dependency',
|
||||||
|
'bin_name': 'ffmpeg',
|
||||||
|
'bin_providers': 'apt,brew,env',
|
||||||
|
}))
|
||||||
|
missing_deps.append('ffmpeg')
|
||||||
|
|
||||||
|
if missing_deps:
|
||||||
|
print(f"Missing dependencies: {', '.join(missing_deps)}", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
else:
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
9
archivebox/plugins/media/templates/embed.html
Normal file
9
archivebox/plugins/media/templates/embed.html
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
<!-- Media embed - video/audio player -->
|
||||||
|
<div class="extractor-embed media-embed" style="width: 100%; height: 100%; min-height: 400px; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
|
||||||
|
<video src="{{ output_path }}"
|
||||||
|
style="max-width: 100%; max-height: 100%;"
|
||||||
|
controls
|
||||||
|
preload="metadata">
|
||||||
|
Your browser does not support the video tag.
|
||||||
|
</video>
|
||||||
|
</div>
|
||||||
10
archivebox/plugins/media/templates/fullscreen.html
Normal file
10
archivebox/plugins/media/templates/fullscreen.html
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
<!-- Media fullscreen - full video/audio player -->
|
||||||
|
<div class="extractor-fullscreen media-fullscreen" style="width: 100%; height: 100vh; background: #000; display: flex; align-items: center; justify-content: center;">
|
||||||
|
<video src="{{ output_path }}"
|
||||||
|
style="max-width: 100%; max-height: 100%;"
|
||||||
|
controls
|
||||||
|
autoplay
|
||||||
|
preload="auto">
|
||||||
|
Your browser does not support the video tag.
|
||||||
|
</video>
|
||||||
|
</div>
|
||||||
1
archivebox/plugins/media/templates/icon.html
Normal file
1
archivebox/plugins/media/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
🎬
|
||||||
14
archivebox/plugins/media/templates/thumbnail.html
Normal file
14
archivebox/plugins/media/templates/thumbnail.html
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
<!-- Media thumbnail - shows video/audio player or placeholder -->
|
||||||
|
<div class="extractor-thumbnail media-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
|
||||||
|
<video src="{{ output_path }}"
|
||||||
|
style="width: 100%; height: 100px; object-fit: contain;"
|
||||||
|
poster=""
|
||||||
|
preload="metadata"
|
||||||
|
muted
|
||||||
|
onerror="this.style.display='none'; this.nextElementSibling.style.display='flex';">
|
||||||
|
</video>
|
||||||
|
<div style="display: none; flex-direction: column; align-items: center; color: #888; font-size: 12px;">
|
||||||
|
<span style="font-size: 32px;">🎬</span>
|
||||||
|
<span>Media</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
@@ -21,7 +21,7 @@ import pytest
|
|||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||||
MEDIA_HOOK = PLUGIN_DIR / 'on_Snapshot__51_media.py'
|
MEDIA_HOOK = PLUGIN_DIR / 'on_Snapshot__51_media.py'
|
||||||
MEDIA_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_ytdlp.py'
|
MEDIA_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_ytdlp.py'
|
||||||
TEST_URL = 'https://example.com/video.mp4'
|
TEST_URL = 'https://example.com/video.mp4'
|
||||||
|
|
||||||
def test_hook_script_exists():
|
def test_hook_script_exists():
|
||||||
@@ -29,46 +29,72 @@ def test_hook_script_exists():
|
|||||||
assert MEDIA_HOOK.exists(), f"Hook not found: {MEDIA_HOOK}"
|
assert MEDIA_HOOK.exists(), f"Hook not found: {MEDIA_HOOK}"
|
||||||
|
|
||||||
|
|
||||||
def test_ytdlp_install_hook():
|
def test_ytdlp_validate_hook():
|
||||||
"""Test yt-dlp install hook to install yt-dlp if needed."""
|
"""Test yt-dlp validate hook checks for yt-dlp and dependencies (node, ffmpeg)."""
|
||||||
# Run yt-dlp install hook
|
# Run yt-dlp validate hook
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[sys.executable, str(MEDIA_INSTALL_HOOK)],
|
[sys.executable, str(MEDIA_VALIDATE_HOOK)],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=600
|
timeout=30
|
||||||
)
|
)
|
||||||
|
|
||||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
# Hook exits 0 if all binaries found, 1 if any not found
|
||||||
|
# Parse output for InstalledBinary and Dependency records
|
||||||
|
found_binaries = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
|
||||||
|
found_dependencies = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
|
||||||
|
|
||||||
# Verify InstalledBinary JSONL output
|
|
||||||
found_binary = False
|
|
||||||
for line in result.stdout.strip().split('\n'):
|
for line in result.stdout.strip().split('\n'):
|
||||||
if line.strip():
|
if line.strip():
|
||||||
try:
|
try:
|
||||||
record = json.loads(line)
|
record = json.loads(line)
|
||||||
if record.get('type') == 'InstalledBinary':
|
if record.get('type') == 'InstalledBinary':
|
||||||
assert record['name'] == 'yt-dlp'
|
name = record['name']
|
||||||
assert record['abspath']
|
if name in found_binaries:
|
||||||
found_binary = True
|
assert record['abspath'], f"{name} should have abspath"
|
||||||
break
|
found_binaries[name] = True
|
||||||
|
elif record.get('type') == 'Dependency':
|
||||||
|
name = record['bin_name']
|
||||||
|
if name in found_dependencies:
|
||||||
|
found_dependencies[name] = True
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
assert found_binary, "Should output InstalledBinary record"
|
# Each binary should either be found (InstalledBinary) or missing (Dependency)
|
||||||
|
for binary_name in ['yt-dlp', 'node', 'ffmpeg']:
|
||||||
|
assert found_binaries[binary_name] or found_dependencies[binary_name], \
|
||||||
|
f"{binary_name} should have either InstalledBinary or Dependency record"
|
||||||
|
|
||||||
|
|
||||||
def test_verify_deps_with_abx_pkg():
|
def test_verify_deps_with_abx_pkg():
|
||||||
"""Verify yt-dlp is available via abx-pkg after hook installation."""
|
"""Verify yt-dlp, node, and ffmpeg are available via abx-pkg."""
|
||||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
from abx_pkg import Binary, PipProvider, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||||
|
|
||||||
PipProvider.model_rebuild()
|
missing_binaries = []
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# Verify yt-dlp is available
|
# Verify yt-dlp is available
|
||||||
ytdlp_binary = Binary(name='yt-dlp', binproviders=[PipProvider(), EnvProvider()])
|
ytdlp_binary = Binary(name='yt-dlp', binproviders=[PipProvider(), EnvProvider()])
|
||||||
ytdlp_loaded = ytdlp_binary.load()
|
ytdlp_loaded = ytdlp_binary.load()
|
||||||
assert ytdlp_loaded and ytdlp_loaded.abspath, "yt-dlp should be available after install hook"
|
if not (ytdlp_loaded and ytdlp_loaded.abspath):
|
||||||
|
missing_binaries.append('yt-dlp')
|
||||||
|
|
||||||
|
# Verify node is available (yt-dlp needs it for JS extraction)
|
||||||
|
node_binary = Binary(
|
||||||
|
name='node',
|
||||||
|
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
||||||
|
)
|
||||||
|
node_loaded = node_binary.load()
|
||||||
|
if not (node_loaded and node_loaded.abspath):
|
||||||
|
missing_binaries.append('node')
|
||||||
|
|
||||||
|
# Verify ffmpeg is available (yt-dlp needs it for video conversion)
|
||||||
|
ffmpeg_binary = Binary(name='ffmpeg', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||||
|
ffmpeg_loaded = ffmpeg_binary.load()
|
||||||
|
if not (ffmpeg_loaded and ffmpeg_loaded.abspath):
|
||||||
|
missing_binaries.append('ffmpeg')
|
||||||
|
|
||||||
|
if missing_binaries:
|
||||||
|
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
|
||||||
|
|
||||||
def test_handles_non_media_url():
|
def test_handles_non_media_url():
|
||||||
"""Test that media extractor handles non-media URLs gracefully via hook."""
|
"""Test that media extractor handles non-media URLs gracefully via hook."""
|
||||||
|
|||||||
@@ -1,68 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Install mercury-parser if not already available.
|
|
||||||
|
|
||||||
Runs at crawl start to ensure mercury-parser is installed.
|
|
||||||
Outputs JSONL for InstalledBinary.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
try:
|
|
||||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
|
||||||
|
|
||||||
NpmProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# Note: npm package is @postlight/mercury-parser, binary is mercury-parser
|
|
||||||
mercury_binary = Binary(
|
|
||||||
name='mercury-parser',
|
|
||||||
binproviders=[NpmProvider(), EnvProvider()],
|
|
||||||
overrides={'npm': {'packages': ['@postlight/mercury-parser']}}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try to load, install if not found
|
|
||||||
try:
|
|
||||||
loaded = mercury_binary.load()
|
|
||||||
if not loaded or not loaded.abspath:
|
|
||||||
raise Exception("Not loaded")
|
|
||||||
except Exception:
|
|
||||||
# Install via npm
|
|
||||||
loaded = mercury_binary.install()
|
|
||||||
|
|
||||||
if loaded and loaded.abspath:
|
|
||||||
# Output InstalledBinary JSONL
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'InstalledBinary',
|
|
||||||
'name': 'mercury-parser',
|
|
||||||
'abspath': str(loaded.abspath),
|
|
||||||
'version': str(loaded.version) if loaded.version else None,
|
|
||||||
'sha256': loaded.sha256,
|
|
||||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
|
||||||
}))
|
|
||||||
sys.exit(0)
|
|
||||||
else:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'mercury-parser',
|
|
||||||
'bin_providers': 'npm,env',
|
|
||||||
}))
|
|
||||||
print("Failed to install mercury-parser", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'mercury-parser',
|
|
||||||
'bin_providers': 'npm,env',
|
|
||||||
}))
|
|
||||||
print(f"Error installing mercury-parser: {e}", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
123
archivebox/plugins/mercury/on_Crawl__00_validate_mercury.py
Executable file
123
archivebox/plugins/mercury/on_Crawl__00_validate_mercury.py
Executable file
@@ -0,0 +1,123 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Validation hook for postlight-parser binary.
|
||||||
|
|
||||||
|
Runs at crawl start to verify postlight-parser is available.
|
||||||
|
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
import hashlib
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def get_binary_version(abspath: str) -> str | None:
|
||||||
|
"""Get version string from binary."""
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
[abspath, '--version'],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=5,
|
||||||
|
)
|
||||||
|
if result.returncode == 0 and result.stdout:
|
||||||
|
first_line = result.stdout.strip().split('\n')[0]
|
||||||
|
return first_line[:64]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_binary_hash(abspath: str) -> str | None:
|
||||||
|
"""Get SHA256 hash of binary."""
|
||||||
|
try:
|
||||||
|
with open(abspath, 'rb') as f:
|
||||||
|
return hashlib.sha256(f.read()).hexdigest()
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_mercury() -> dict | None:
|
||||||
|
"""Find postlight-parser binary."""
|
||||||
|
try:
|
||||||
|
from abx_pkg import Binary, NpmProvider, EnvProvider
|
||||||
|
|
||||||
|
class MercuryBinary(Binary):
|
||||||
|
name: str = 'postlight-parser'
|
||||||
|
binproviders_supported = [NpmProvider(), EnvProvider()]
|
||||||
|
overrides: dict = {'npm': {'packages': ['@postlight/parser']}}
|
||||||
|
|
||||||
|
binary = MercuryBinary()
|
||||||
|
loaded = binary.load()
|
||||||
|
if loaded and loaded.abspath:
|
||||||
|
return {
|
||||||
|
'name': 'postlight-parser',
|
||||||
|
'abspath': str(loaded.abspath),
|
||||||
|
'version': str(loaded.version) if loaded.version else None,
|
||||||
|
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||||
|
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||||
|
}
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Fallback to shutil.which
|
||||||
|
abspath = shutil.which('postlight-parser') or os.environ.get('MERCURY_BINARY', '')
|
||||||
|
if abspath and Path(abspath).is_file():
|
||||||
|
return {
|
||||||
|
'name': 'postlight-parser',
|
||||||
|
'abspath': abspath,
|
||||||
|
'version': get_binary_version(abspath),
|
||||||
|
'sha256': get_binary_hash(abspath),
|
||||||
|
'binprovider': 'env',
|
||||||
|
}
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
result = find_mercury()
|
||||||
|
|
||||||
|
if result and result.get('abspath'):
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'InstalledBinary',
|
||||||
|
'name': result['name'],
|
||||||
|
'abspath': result['abspath'],
|
||||||
|
'version': result['version'],
|
||||||
|
'sha256': result['sha256'],
|
||||||
|
'binprovider': result['binprovider'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/MERCURY_BINARY',
|
||||||
|
'value': result['abspath'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
if result['version']:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/MERCURY_VERSION',
|
||||||
|
'value': result['version'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
else:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Dependency',
|
||||||
|
'bin_name': 'postlight-parser',
|
||||||
|
'bin_providers': 'npm,env',
|
||||||
|
}))
|
||||||
|
print(f"postlight-parser binary not found", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
@@ -6,10 +6,10 @@ Usage: on_Snapshot__mercury.py --url=<url> --snapshot-id=<uuid>
|
|||||||
Output: Creates mercury/ directory with content.html, content.txt, article.json
|
Output: Creates mercury/ directory with content.html, content.txt, article.json
|
||||||
|
|
||||||
Environment variables:
|
Environment variables:
|
||||||
MERCURY_BINARY: Path to mercury-parser binary
|
MERCURY_BINARY: Path to postlight-parser binary
|
||||||
TIMEOUT: Timeout in seconds (default: 60)
|
TIMEOUT: Timeout in seconds (default: 60)
|
||||||
|
|
||||||
Note: Requires mercury-parser: npm install -g @postlight/mercury-parser
|
Note: Requires postlight-parser: npm install -g @postlight/parser
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
@@ -25,7 +25,7 @@ import rich_click as click
|
|||||||
|
|
||||||
# Extractor metadata
|
# Extractor metadata
|
||||||
EXTRACTOR_NAME = 'mercury'
|
EXTRACTOR_NAME = 'mercury'
|
||||||
BIN_NAME = 'mercury-parser'
|
BIN_NAME = 'postlight-parser'
|
||||||
BIN_PROVIDERS = 'npm,env'
|
BIN_PROVIDERS = 'npm,env'
|
||||||
OUTPUT_DIR = 'mercury'
|
OUTPUT_DIR = 'mercury'
|
||||||
|
|
||||||
@@ -42,12 +42,12 @@ def get_env_int(name: str, default: int = 0) -> int:
|
|||||||
|
|
||||||
|
|
||||||
def find_mercury() -> str | None:
|
def find_mercury() -> str | None:
|
||||||
"""Find mercury-parser binary."""
|
"""Find postlight-parser binary."""
|
||||||
mercury = get_env('MERCURY_BINARY')
|
mercury = get_env('MERCURY_BINARY')
|
||||||
if mercury and os.path.isfile(mercury):
|
if mercury and os.path.isfile(mercury):
|
||||||
return mercury
|
return mercury
|
||||||
|
|
||||||
for name in ['mercury-parser', 'mercury']:
|
for name in ['postlight-parser']:
|
||||||
binary = shutil.which(name)
|
binary = shutil.which(name)
|
||||||
if binary:
|
if binary:
|
||||||
return binary
|
return binary
|
||||||
@@ -56,7 +56,7 @@ def find_mercury() -> str | None:
|
|||||||
|
|
||||||
|
|
||||||
def get_version(binary: str) -> str:
|
def get_version(binary: str) -> str:
|
||||||
"""Get mercury-parser version."""
|
"""Get postlight-parser version."""
|
||||||
try:
|
try:
|
||||||
result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
|
result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
|
||||||
return result.stdout.strip()[:64]
|
return result.stdout.strip()[:64]
|
||||||
@@ -83,12 +83,12 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
|
|||||||
|
|
||||||
if result_text.returncode != 0:
|
if result_text.returncode != 0:
|
||||||
stderr = result_text.stderr.decode('utf-8', errors='replace')
|
stderr = result_text.stderr.decode('utf-8', errors='replace')
|
||||||
return False, None, f'mercury-parser failed: {stderr[:200]}'
|
return False, None, f'postlight-parser failed: {stderr[:200]}'
|
||||||
|
|
||||||
try:
|
try:
|
||||||
text_json = json.loads(result_text.stdout)
|
text_json = json.loads(result_text.stdout)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
return False, None, 'mercury-parser returned invalid JSON'
|
return False, None, 'postlight-parser returned invalid JSON'
|
||||||
|
|
||||||
if text_json.get('failed'):
|
if text_json.get('failed'):
|
||||||
return False, None, 'Mercury was not able to extract article'
|
return False, None, 'Mercury was not able to extract article'
|
||||||
@@ -139,7 +139,7 @@ def main(url: str, snapshot_id: str):
|
|||||||
# Find binary
|
# Find binary
|
||||||
binary = find_mercury()
|
binary = find_mercury()
|
||||||
if not binary:
|
if not binary:
|
||||||
print(f'ERROR: mercury-parser binary not found', file=sys.stderr)
|
print(f'ERROR: postlight-parser binary not found', file=sys.stderr)
|
||||||
print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
|
print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
|
||||||
print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
|
print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|||||||
6
archivebox/plugins/mercury/templates/embed.html
Normal file
6
archivebox/plugins/mercury/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Mercury embed - Mercury parser article view -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-embed mercury-embed"
|
||||||
|
style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
6
archivebox/plugins/mercury/templates/fullscreen.html
Normal file
6
archivebox/plugins/mercury/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Mercury fullscreen - full Mercury parser article -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-fullscreen mercury-fullscreen"
|
||||||
|
style="width: 100%; height: 100vh; border: none; background: #fefefe;"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
1
archivebox/plugins/mercury/templates/icon.html
Normal file
1
archivebox/plugins/mercury/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
☿️
|
||||||
8
archivebox/plugins/mercury/templates/thumbnail.html
Normal file
8
archivebox/plugins/mercury/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<!-- Mercury thumbnail - shows Mercury parser extracted article content -->
|
||||||
|
<div class="extractor-thumbnail mercury-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fefefe; padding: 8px; font-family: Georgia, serif; font-size: 11px; line-height: 1.4; color: #333;">
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
style="width: 100%; height: 300px; border: none; pointer-events: none;"
|
||||||
|
loading="lazy"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
|
</div>
|
||||||
@@ -21,7 +21,7 @@ import pytest
|
|||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||||
MERCURY_HOOK = PLUGIN_DIR / 'on_Snapshot__53_mercury.py'
|
MERCURY_HOOK = PLUGIN_DIR / 'on_Snapshot__53_mercury.py'
|
||||||
MERCURY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_mercury.py'
|
MERCURY_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_mercury.py'
|
||||||
TEST_URL = 'https://example.com'
|
TEST_URL = 'https://example.com'
|
||||||
|
|
||||||
def test_hook_script_exists():
|
def test_hook_script_exists():
|
||||||
@@ -29,53 +29,70 @@ def test_hook_script_exists():
|
|||||||
assert MERCURY_HOOK.exists(), f"Hook not found: {MERCURY_HOOK}"
|
assert MERCURY_HOOK.exists(), f"Hook not found: {MERCURY_HOOK}"
|
||||||
|
|
||||||
|
|
||||||
def test_mercury_install_hook():
|
def test_mercury_validate_hook():
|
||||||
"""Test mercury install hook to install mercury-parser if needed."""
|
"""Test mercury validate hook checks for postlight-parser."""
|
||||||
# Run mercury install hook
|
# Run mercury validate hook
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[sys.executable, str(MERCURY_INSTALL_HOOK)],
|
[sys.executable, str(MERCURY_VALIDATE_HOOK)],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=600
|
timeout=30
|
||||||
)
|
)
|
||||||
|
|
||||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||||
|
if result.returncode == 0:
|
||||||
# Verify InstalledBinary JSONL output
|
# Binary found - verify InstalledBinary JSONL output
|
||||||
found_binary = False
|
found_binary = False
|
||||||
for line in result.stdout.strip().split('\n'):
|
for line in result.stdout.strip().split('\n'):
|
||||||
if line.strip():
|
if line.strip():
|
||||||
try:
|
try:
|
||||||
record = json.loads(line)
|
record = json.loads(line)
|
||||||
if record.get('type') == 'InstalledBinary':
|
if record.get('type') == 'InstalledBinary':
|
||||||
assert record['name'] == 'mercury-parser'
|
assert record['name'] == 'postlight-parser'
|
||||||
assert record['abspath']
|
assert record['abspath']
|
||||||
found_binary = True
|
found_binary = True
|
||||||
break
|
break
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
|
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||||
assert found_binary, "Should output InstalledBinary record"
|
else:
|
||||||
|
# Binary not found - verify Dependency JSONL output
|
||||||
|
found_dependency = False
|
||||||
|
for line in result.stdout.strip().split('\n'):
|
||||||
|
if line.strip():
|
||||||
|
try:
|
||||||
|
record = json.loads(line)
|
||||||
|
if record.get('type') == 'Dependency':
|
||||||
|
assert record['bin_name'] == 'postlight-parser'
|
||||||
|
assert 'npm' in record['bin_providers']
|
||||||
|
found_dependency = True
|
||||||
|
break
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
assert found_dependency, "Should output Dependency record when binary not found"
|
||||||
|
|
||||||
|
|
||||||
def test_verify_deps_with_abx_pkg():
|
def test_verify_deps_with_abx_pkg():
|
||||||
"""Verify mercury-parser is available via abx-pkg after hook installation."""
|
"""Verify postlight-parser is available via abx-pkg."""
|
||||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
||||||
|
|
||||||
NpmProvider.model_rebuild()
|
# Verify postlight-parser is available
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# Verify mercury-parser is available
|
|
||||||
mercury_binary = Binary(
|
mercury_binary = Binary(
|
||||||
name='mercury-parser',
|
name='postlight-parser',
|
||||||
binproviders=[NpmProvider(), EnvProvider()],
|
binproviders=[NpmProvider(), EnvProvider()],
|
||||||
overrides={'npm': {'packages': ['@postlight/mercury-parser']}}
|
overrides={'npm': {'packages': ['@postlight/parser']}}
|
||||||
)
|
)
|
||||||
mercury_loaded = mercury_binary.load()
|
mercury_loaded = mercury_binary.load()
|
||||||
assert mercury_loaded and mercury_loaded.abspath, "mercury-parser should be available after install hook"
|
|
||||||
|
# If validate hook found it (exit 0), this should succeed
|
||||||
|
# If validate hook didn't find it (exit 1), this may fail unless binprovider installed it
|
||||||
|
if mercury_loaded and mercury_loaded.abspath:
|
||||||
|
assert True, "postlight-parser is available"
|
||||||
|
else:
|
||||||
|
pytest.skip("postlight-parser not available - Dependency record should have been emitted")
|
||||||
|
|
||||||
def test_extracts_with_mercury_parser():
|
def test_extracts_with_mercury_parser():
|
||||||
"""Test full workflow: extract with mercury-parser from real HTML via hook."""
|
"""Test full workflow: extract with postlight-parser from real HTML via hook."""
|
||||||
# Prerequisites checked by earlier test
|
# Prerequisites checked by earlier test
|
||||||
|
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
|||||||
@@ -2,46 +2,28 @@
|
|||||||
"""
|
"""
|
||||||
Create a Merkle tree of all archived outputs.
|
Create a Merkle tree of all archived outputs.
|
||||||
|
|
||||||
This plugin runs after all extractors and post-processing complete (priority 92)
|
This plugin runs after all extractors complete (priority 93) and generates
|
||||||
and generates a cryptographic Merkle tree of all files in the snapshot directory.
|
a cryptographic Merkle tree of all files in the snapshot directory.
|
||||||
This provides:
|
|
||||||
- Tamper detection: verify archive integrity
|
|
||||||
- Efficient updates: only re-hash changed files
|
|
||||||
- Compact proofs: prove file inclusion without sending all files
|
|
||||||
- Deduplication: identify identical content across snapshots
|
|
||||||
|
|
||||||
Output: merkletree/merkletree.json containing:
|
Output: merkletree.json containing root_hash, tree structure, file list, metadata
|
||||||
- root_hash: SHA256 hash of the Merkle root
|
|
||||||
- tree: Full tree structure with internal nodes
|
|
||||||
- files: List of all files with their hashes
|
|
||||||
- metadata: Timestamp, file count, total size
|
|
||||||
|
|
||||||
Usage: on_Snapshot__92_merkletree.py --url=<url> --snapshot-id=<uuid>
|
Usage: on_Snapshot__93_merkletree.py --url=<url> --snapshot-id=<uuid>
|
||||||
|
|
||||||
Environment variables:
|
Environment variables:
|
||||||
SAVE_MERKLETREE: Enable merkle tree generation (default: true)
|
SAVE_MERKLETREE: Enable merkle tree generation (default: true)
|
||||||
|
DATA_DIR: ArchiveBox data directory
|
||||||
|
ARCHIVE_DIR: Archive output directory
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__package__ = 'archivebox.plugins.merkletree'
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import json
|
import json
|
||||||
import hashlib
|
import hashlib
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from datetime import datetime
|
from datetime import datetime, timezone
|
||||||
from typing import Dict, List, Optional, Tuple, Any
|
from typing import Dict, List, Optional, Tuple, Any
|
||||||
|
|
||||||
# Configure Django if running standalone
|
import click
|
||||||
if __name__ == '__main__':
|
|
||||||
parent_dir = str(Path(__file__).resolve().parent.parent.parent)
|
|
||||||
if parent_dir not in sys.path:
|
|
||||||
sys.path.insert(0, parent_dir)
|
|
||||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
|
|
||||||
import django
|
|
||||||
django.setup()
|
|
||||||
|
|
||||||
import rich_click as click
|
|
||||||
|
|
||||||
|
|
||||||
def sha256_file(filepath: Path) -> str:
|
def sha256_file(filepath: Path) -> str:
|
||||||
@@ -49,12 +31,10 @@ def sha256_file(filepath: Path) -> str:
|
|||||||
h = hashlib.sha256()
|
h = hashlib.sha256()
|
||||||
try:
|
try:
|
||||||
with open(filepath, 'rb') as f:
|
with open(filepath, 'rb') as f:
|
||||||
# Read in 64kb chunks
|
|
||||||
while chunk := f.read(65536):
|
while chunk := f.read(65536):
|
||||||
h.update(chunk)
|
h.update(chunk)
|
||||||
return h.hexdigest()
|
return h.hexdigest()
|
||||||
except (OSError, PermissionError):
|
except (OSError, PermissionError):
|
||||||
# If we can't read the file, return a null hash
|
|
||||||
return '0' * 64
|
return '0' * 64
|
||||||
|
|
||||||
|
|
||||||
@@ -64,74 +44,45 @@ def sha256_data(data: bytes) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
|
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
|
||||||
"""
|
"""Recursively collect all files in snapshot directory."""
|
||||||
Recursively collect all files in snapshot directory.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
snapshot_dir: Root directory to scan
|
|
||||||
exclude_dirs: Directory names to exclude (e.g., ['merkletree', '.git'])
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of (relative_path, sha256_hash, file_size) tuples
|
|
||||||
"""
|
|
||||||
exclude_dirs = exclude_dirs or ['merkletree', '.git', '__pycache__']
|
exclude_dirs = exclude_dirs or ['merkletree', '.git', '__pycache__']
|
||||||
files = []
|
files = []
|
||||||
|
|
||||||
for root, dirs, filenames in os.walk(snapshot_dir):
|
for root, dirs, filenames in os.walk(snapshot_dir):
|
||||||
# Filter out excluded directories
|
|
||||||
dirs[:] = [d for d in dirs if d not in exclude_dirs]
|
dirs[:] = [d for d in dirs if d not in exclude_dirs]
|
||||||
|
|
||||||
for filename in filenames:
|
for filename in filenames:
|
||||||
filepath = Path(root) / filename
|
filepath = Path(root) / filename
|
||||||
rel_path = filepath.relative_to(snapshot_dir)
|
rel_path = filepath.relative_to(snapshot_dir)
|
||||||
|
|
||||||
# Skip symlinks (we hash the target, not the link)
|
|
||||||
if filepath.is_symlink():
|
if filepath.is_symlink():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Compute hash and size
|
|
||||||
file_hash = sha256_file(filepath)
|
file_hash = sha256_file(filepath)
|
||||||
file_size = filepath.stat().st_size if filepath.exists() else 0
|
file_size = filepath.stat().st_size if filepath.exists() else 0
|
||||||
|
|
||||||
files.append((rel_path, file_hash, file_size))
|
files.append((rel_path, file_hash, file_size))
|
||||||
|
|
||||||
# Sort by path for deterministic tree
|
|
||||||
files.sort(key=lambda x: str(x[0]))
|
files.sort(key=lambda x: str(x[0]))
|
||||||
return files
|
return files
|
||||||
|
|
||||||
|
|
||||||
def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
|
def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
|
||||||
"""
|
"""Build a Merkle tree from a list of leaf hashes."""
|
||||||
Build a Merkle tree from a list of leaf hashes.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
file_hashes: List of SHA256 hashes (leaves)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
(root_hash, tree_levels) where tree_levels is a list of hash lists per level
|
|
||||||
"""
|
|
||||||
if not file_hashes:
|
if not file_hashes:
|
||||||
# Empty tree
|
|
||||||
return sha256_data(b''), [[]]
|
return sha256_data(b''), [[]]
|
||||||
|
|
||||||
# Initialize with leaf level
|
|
||||||
tree_levels = [file_hashes.copy()]
|
tree_levels = [file_hashes.copy()]
|
||||||
|
|
||||||
# Build tree bottom-up
|
|
||||||
while len(tree_levels[-1]) > 1:
|
while len(tree_levels[-1]) > 1:
|
||||||
current_level = tree_levels[-1]
|
current_level = tree_levels[-1]
|
||||||
next_level = []
|
next_level = []
|
||||||
|
|
||||||
# Process pairs
|
|
||||||
for i in range(0, len(current_level), 2):
|
for i in range(0, len(current_level), 2):
|
||||||
left = current_level[i]
|
left = current_level[i]
|
||||||
|
|
||||||
if i + 1 < len(current_level):
|
if i + 1 < len(current_level):
|
||||||
# Combine left + right
|
|
||||||
right = current_level[i + 1]
|
right = current_level[i + 1]
|
||||||
combined = left + right
|
combined = left + right
|
||||||
else:
|
else:
|
||||||
# Odd number of nodes: duplicate the last one
|
|
||||||
combined = left + left
|
combined = left + left
|
||||||
|
|
||||||
parent_hash = sha256_data(combined.encode('utf-8'))
|
parent_hash = sha256_data(combined.encode('utf-8'))
|
||||||
@@ -139,67 +90,41 @@ def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
|
|||||||
|
|
||||||
tree_levels.append(next_level)
|
tree_levels.append(next_level)
|
||||||
|
|
||||||
# Root is the single hash at the top level
|
|
||||||
root_hash = tree_levels[-1][0]
|
root_hash = tree_levels[-1][0]
|
||||||
return root_hash, tree_levels
|
return root_hash, tree_levels
|
||||||
|
|
||||||
|
|
||||||
def create_merkle_tree(snapshot_dir: Path) -> Dict[str, Any]:
|
def create_merkle_tree(snapshot_dir: Path) -> Dict[str, Any]:
|
||||||
"""
|
"""Create a complete Merkle tree of all files in snapshot directory."""
|
||||||
Create a complete Merkle tree of all files in snapshot directory.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
snapshot_dir: The snapshot directory to scan
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dict containing root_hash, tree structure, file list, and metadata
|
|
||||||
"""
|
|
||||||
# Collect all files
|
|
||||||
files = collect_files(snapshot_dir)
|
files = collect_files(snapshot_dir)
|
||||||
|
|
||||||
# Extract just the hashes for tree building
|
|
||||||
file_hashes = [file_hash for _, file_hash, _ in files]
|
file_hashes = [file_hash for _, file_hash, _ in files]
|
||||||
|
|
||||||
# Build Merkle tree
|
|
||||||
root_hash, tree_levels = build_merkle_tree(file_hashes)
|
root_hash, tree_levels = build_merkle_tree(file_hashes)
|
||||||
|
|
||||||
# Calculate total size
|
|
||||||
total_size = sum(size for _, _, size in files)
|
total_size = sum(size for _, _, size in files)
|
||||||
|
|
||||||
# Prepare file list with metadata
|
|
||||||
file_list = [
|
file_list = [
|
||||||
{
|
{'path': str(path), 'hash': file_hash, 'size': size}
|
||||||
'path': str(path),
|
|
||||||
'hash': file_hash,
|
|
||||||
'size': size,
|
|
||||||
}
|
|
||||||
for path, file_hash, size in files
|
for path, file_hash, size in files
|
||||||
]
|
]
|
||||||
|
|
||||||
# Prepare result
|
return {
|
||||||
result = {
|
|
||||||
'root_hash': root_hash,
|
'root_hash': root_hash,
|
||||||
'tree_levels': tree_levels,
|
'tree_levels': tree_levels,
|
||||||
'files': file_list,
|
'files': file_list,
|
||||||
'metadata': {
|
'metadata': {
|
||||||
'timestamp': datetime.now().isoformat(),
|
'timestamp': datetime.now(timezone.utc).isoformat(),
|
||||||
'file_count': len(files),
|
'file_count': len(files),
|
||||||
'total_size': total_size,
|
'total_size': total_size,
|
||||||
'tree_depth': len(tree_levels),
|
'tree_depth': len(tree_levels),
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option('--url', required=True, help='URL being archived')
|
@click.option('--url', required=True, help='URL being archived')
|
||||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||||
def main(url: str, snapshot_id: str):
|
def main(url: str, snapshot_id: str):
|
||||||
"""Generate Merkle tree of all archived outputs."""
|
"""Generate Merkle tree of all archived outputs."""
|
||||||
from archivebox.core.models import Snapshot
|
start_ts = datetime.now(timezone.utc)
|
||||||
|
|
||||||
start_ts = datetime.now()
|
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
output = None
|
output = None
|
||||||
error = ''
|
error = ''
|
||||||
@@ -211,30 +136,19 @@ def main(url: str, snapshot_id: str):
|
|||||||
save_merkletree = os.getenv('SAVE_MERKLETREE', 'true').lower() in ('true', '1', 'yes', 'on')
|
save_merkletree = os.getenv('SAVE_MERKLETREE', 'true').lower() in ('true', '1', 'yes', 'on')
|
||||||
|
|
||||||
if not save_merkletree:
|
if not save_merkletree:
|
||||||
click.echo('Skipping merkle tree (SAVE_MERKLETREE=False)')
|
|
||||||
status = 'skipped'
|
status = 'skipped'
|
||||||
end_ts = datetime.now()
|
click.echo(json.dumps({'status': status, 'output': 'SAVE_MERKLETREE=false'}))
|
||||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
|
||||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
|
||||||
click.echo(f'STATUS={status}')
|
|
||||||
click.echo(f'RESULT_JSON={{"extractor": "merkletree", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
|
|
||||||
sys.exit(0)
|
sys.exit(0)
|
||||||
|
|
||||||
# Get snapshot
|
# Working directory is the extractor output dir (e.g., <snapshot>/merkletree/)
|
||||||
try:
|
# Parent is the snapshot directory
|
||||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
output_dir = Path.cwd()
|
||||||
except Snapshot.DoesNotExist:
|
snapshot_dir = output_dir.parent
|
||||||
error = f'Snapshot {snapshot_id} not found'
|
|
||||||
raise ValueError(error)
|
|
||||||
|
|
||||||
# Get snapshot directory
|
|
||||||
snapshot_dir = Path(snapshot.output_dir)
|
|
||||||
if not snapshot_dir.exists():
|
if not snapshot_dir.exists():
|
||||||
error = f'Snapshot directory not found: {snapshot_dir}'
|
raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')
|
||||||
raise FileNotFoundError(error)
|
|
||||||
|
|
||||||
# Create output directory
|
# Ensure output directory exists
|
||||||
output_dir = snapshot_dir / 'merkletree'
|
|
||||||
output_dir.mkdir(exist_ok=True)
|
output_dir.mkdir(exist_ok=True)
|
||||||
output_path = output_dir / 'merkletree.json'
|
output_path = output_dir / 'merkletree.json'
|
||||||
|
|
||||||
@@ -246,49 +160,31 @@ def main(url: str, snapshot_id: str):
|
|||||||
json.dump(merkle_data, f, indent=2)
|
json.dump(merkle_data, f, indent=2)
|
||||||
|
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
output = str(output_path)
|
output = 'merkletree.json'
|
||||||
root_hash = merkle_data['root_hash']
|
root_hash = merkle_data['root_hash']
|
||||||
file_count = merkle_data['metadata']['file_count']
|
file_count = merkle_data['metadata']['file_count']
|
||||||
total_size = merkle_data['metadata']['total_size']
|
total_size = merkle_data['metadata']['total_size']
|
||||||
|
|
||||||
click.echo(f'Merkle tree created: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes')
|
click.echo(f'Merkle tree: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes')
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error = f'{type(e).__name__}: {e}'
|
error = f'{type(e).__name__}: {e}'
|
||||||
status = 'failed'
|
status = 'failed'
|
||||||
click.echo(f'Error: {error}', err=True)
|
click.echo(f'Error: {error}', err=True)
|
||||||
|
|
||||||
end_ts = datetime.now()
|
end_ts = datetime.now(timezone.utc)
|
||||||
duration = (end_ts - start_ts).total_seconds()
|
|
||||||
|
|
||||||
# Print results
|
# Print JSON result for hook runner
|
||||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
result = {
|
||||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
|
||||||
click.echo(f'DURATION={duration:.2f}')
|
|
||||||
if output:
|
|
||||||
click.echo(f'OUTPUT={output}')
|
|
||||||
click.echo(f'STATUS={status}')
|
|
||||||
|
|
||||||
if error:
|
|
||||||
click.echo(f'ERROR={error}', err=True)
|
|
||||||
|
|
||||||
# Print JSON result
|
|
||||||
result_json = {
|
|
||||||
'extractor': 'merkletree',
|
|
||||||
'url': url,
|
|
||||||
'snapshot_id': snapshot_id,
|
|
||||||
'status': status,
|
'status': status,
|
||||||
'start_ts': start_ts.isoformat(),
|
|
||||||
'end_ts': end_ts.isoformat(),
|
|
||||||
'duration': round(duration, 2),
|
|
||||||
'output': output,
|
'output': output,
|
||||||
|
'error': error or None,
|
||||||
'root_hash': root_hash,
|
'root_hash': root_hash,
|
||||||
'file_count': file_count,
|
'file_count': file_count,
|
||||||
'error': error or None,
|
|
||||||
}
|
}
|
||||||
click.echo(f'RESULT_JSON={json.dumps(result_json)}')
|
click.echo(json.dumps(result))
|
||||||
|
|
||||||
sys.exit(0 if status == 'succeeded' else 1)
|
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|||||||
@@ -0,0 +1 @@
|
|||||||
|
🔗
|
||||||
@@ -133,7 +133,8 @@ def fetch_content(url: str) -> str:
|
|||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option('--url', required=True, help='HTML URL to parse')
|
@click.option('--url', required=True, help='HTML URL to parse')
|
||||||
def main(url: str):
|
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||||
|
def main(url: str, snapshot_id: str = None):
|
||||||
"""Parse HTML and extract href URLs."""
|
"""Parse HTML and extract href URLs."""
|
||||||
|
|
||||||
# Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage)
|
# Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage)
|
||||||
|
|||||||
1
archivebox/plugins/parse_html_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_html_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
🔗
|
||||||
@@ -127,7 +127,8 @@ def fetch_content(url: str) -> str:
|
|||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option('--url', required=True, help='JSONL file URL to parse')
|
@click.option('--url', required=True, help='JSONL file URL to parse')
|
||||||
def main(url: str):
|
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||||
|
def main(url: str, snapshot_id: str = None):
|
||||||
"""Parse JSONL bookmark file and extract URLs."""
|
"""Parse JSONL bookmark file and extract URLs."""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
1
archivebox/plugins/parse_jsonl_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_jsonl_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📋
|
||||||
@@ -52,7 +52,8 @@ def fetch_content(url: str) -> str:
|
|||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option('--url', required=True, help='Netscape bookmark file URL to parse')
|
@click.option('--url', required=True, help='Netscape bookmark file URL to parse')
|
||||||
def main(url: str):
|
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||||
|
def main(url: str, snapshot_id: str = None):
|
||||||
"""Parse Netscape bookmark HTML and extract URLs."""
|
"""Parse Netscape bookmark HTML and extract URLs."""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -0,0 +1 @@
|
|||||||
|
🔖
|
||||||
@@ -51,7 +51,8 @@ def fetch_content(url: str) -> str:
|
|||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option('--url', required=True, help='RSS/Atom feed URL to parse')
|
@click.option('--url', required=True, help='RSS/Atom feed URL to parse')
|
||||||
def main(url: str):
|
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||||
|
def main(url: str, snapshot_id: str = None):
|
||||||
"""Parse RSS/Atom feed and extract article URLs."""
|
"""Parse RSS/Atom feed and extract article URLs."""
|
||||||
|
|
||||||
if feedparser is None:
|
if feedparser is None:
|
||||||
|
|||||||
1
archivebox/plugins/parse_rss_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_rss_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📡
|
||||||
@@ -100,7 +100,8 @@ def fetch_content(url: str) -> str:
|
|||||||
|
|
||||||
@click.command()
|
@click.command()
|
||||||
@click.option('--url', required=True, help='URL to parse (file:// or https://)')
|
@click.option('--url', required=True, help='URL to parse (file:// or https://)')
|
||||||
def main(url: str):
|
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||||
|
def main(url: str, snapshot_id: str = None):
|
||||||
"""Parse plain text and extract URLs."""
|
"""Parse plain text and extract URLs."""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|||||||
1
archivebox/plugins/parse_txt_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_txt_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📃
|
||||||
5
archivebox/plugins/pdf/templates/embed.html
Normal file
5
archivebox/plugins/pdf/templates/embed.html
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
<!-- PDF embed - full PDF viewer -->
|
||||||
|
<embed src="{{ output_path }}#toolbar=1&navpanes=1"
|
||||||
|
type="application/pdf"
|
||||||
|
class="extractor-embed pdf-embed"
|
||||||
|
style="width: 100%; height: 100%; min-height: 500px;">
|
||||||
5
archivebox/plugins/pdf/templates/fullscreen.html
Normal file
5
archivebox/plugins/pdf/templates/fullscreen.html
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
<!-- PDF fullscreen - full PDF viewer -->
|
||||||
|
<embed src="{{ output_path }}#toolbar=1&navpanes=1&view=FitH"
|
||||||
|
type="application/pdf"
|
||||||
|
class="extractor-fullscreen pdf-fullscreen"
|
||||||
|
style="width: 100%; height: 100vh;">
|
||||||
1
archivebox/plugins/pdf/templates/icon.html
Normal file
1
archivebox/plugins/pdf/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📄
|
||||||
6
archivebox/plugins/pdf/templates/thumbnail.html
Normal file
6
archivebox/plugins/pdf/templates/thumbnail.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- PDF thumbnail - shows first page preview -->
|
||||||
|
<div class="extractor-thumbnail pdf-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #f5f5f5;">
|
||||||
|
<embed src="{{ output_path }}#toolbar=0&navpanes=0&scrollbar=0&page=1&view=FitH"
|
||||||
|
type="application/pdf"
|
||||||
|
style="width: 100%; height: 200px; margin-top: -20px; pointer-events: none;">
|
||||||
|
</div>
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Install readability-extractor if not already available.
|
|
||||||
|
|
||||||
Runs at crawl start to ensure readability-extractor is installed.
|
|
||||||
Outputs JSONL for InstalledBinary.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
try:
|
|
||||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
|
||||||
|
|
||||||
NpmProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# Note: npm package is from github:ArchiveBox/readability-extractor
|
|
||||||
readability_binary = Binary(
|
|
||||||
name='readability-extractor',
|
|
||||||
binproviders=[NpmProvider(), EnvProvider()],
|
|
||||||
overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try to load, install if not found
|
|
||||||
try:
|
|
||||||
loaded = readability_binary.load()
|
|
||||||
if not loaded or not loaded.abspath:
|
|
||||||
raise Exception("Not loaded")
|
|
||||||
except Exception:
|
|
||||||
# Install via npm from GitHub repo
|
|
||||||
loaded = readability_binary.install()
|
|
||||||
|
|
||||||
if loaded and loaded.abspath:
|
|
||||||
# Output InstalledBinary JSONL
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'InstalledBinary',
|
|
||||||
'name': 'readability-extractor',
|
|
||||||
'abspath': str(loaded.abspath),
|
|
||||||
'version': str(loaded.version) if loaded.version else None,
|
|
||||||
'sha256': loaded.sha256,
|
|
||||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
|
||||||
}))
|
|
||||||
sys.exit(0)
|
|
||||||
else:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'readability-extractor',
|
|
||||||
'bin_providers': 'npm,env',
|
|
||||||
}))
|
|
||||||
print("Failed to install readability-extractor", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'readability-extractor',
|
|
||||||
'bin_providers': 'npm,env',
|
|
||||||
}))
|
|
||||||
print(f"Error installing readability-extractor: {e}", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
123
archivebox/plugins/readability/on_Crawl__00_validate_readability.py
Executable file
123
archivebox/plugins/readability/on_Crawl__00_validate_readability.py
Executable file
@@ -0,0 +1,123 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Validation hook for readability-extractor binary.
|
||||||
|
|
||||||
|
Runs at crawl start to verify readability-extractor is available.
|
||||||
|
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
import hashlib
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def get_binary_version(abspath: str) -> str | None:
|
||||||
|
"""Get version string from binary."""
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
[abspath, '--version'],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=5,
|
||||||
|
)
|
||||||
|
if result.returncode == 0 and result.stdout:
|
||||||
|
first_line = result.stdout.strip().split('\n')[0]
|
||||||
|
return first_line[:64]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_binary_hash(abspath: str) -> str | None:
|
||||||
|
"""Get SHA256 hash of binary."""
|
||||||
|
try:
|
||||||
|
with open(abspath, 'rb') as f:
|
||||||
|
return hashlib.sha256(f.read()).hexdigest()
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def find_readability() -> dict | None:
|
||||||
|
"""Find readability-extractor binary."""
|
||||||
|
try:
|
||||||
|
from abx_pkg import Binary, NpmProvider, EnvProvider
|
||||||
|
|
||||||
|
class ReadabilityBinary(Binary):
|
||||||
|
name: str = 'readability-extractor'
|
||||||
|
binproviders_supported = [NpmProvider(), EnvProvider()]
|
||||||
|
overrides: dict = {'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
||||||
|
|
||||||
|
binary = ReadabilityBinary()
|
||||||
|
loaded = binary.load()
|
||||||
|
if loaded and loaded.abspath:
|
||||||
|
return {
|
||||||
|
'name': 'readability-extractor',
|
||||||
|
'abspath': str(loaded.abspath),
|
||||||
|
'version': str(loaded.version) if loaded.version else None,
|
||||||
|
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||||
|
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||||
|
}
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Fallback to shutil.which
|
||||||
|
abspath = shutil.which('readability-extractor') or os.environ.get('READABILITY_BINARY', '')
|
||||||
|
if abspath and Path(abspath).is_file():
|
||||||
|
return {
|
||||||
|
'name': 'readability-extractor',
|
||||||
|
'abspath': abspath,
|
||||||
|
'version': get_binary_version(abspath),
|
||||||
|
'sha256': get_binary_hash(abspath),
|
||||||
|
'binprovider': 'env',
|
||||||
|
}
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
result = find_readability()
|
||||||
|
|
||||||
|
if result and result.get('abspath'):
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'InstalledBinary',
|
||||||
|
'name': result['name'],
|
||||||
|
'abspath': result['abspath'],
|
||||||
|
'version': result['version'],
|
||||||
|
'sha256': result['sha256'],
|
||||||
|
'binprovider': result['binprovider'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/READABILITY_BINARY',
|
||||||
|
'value': result['abspath'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
if result['version']:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/READABILITY_VERSION',
|
||||||
|
'value': result['version'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
else:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Dependency',
|
||||||
|
'bin_name': 'readability-extractor',
|
||||||
|
'bin_providers': 'npm,env',
|
||||||
|
}))
|
||||||
|
print(f"readability-extractor binary not found", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
6
archivebox/plugins/readability/templates/embed.html
Normal file
6
archivebox/plugins/readability/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Readability embed - reader-mode article view -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-embed readability-embed"
|
||||||
|
style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
6
archivebox/plugins/readability/templates/fullscreen.html
Normal file
6
archivebox/plugins/readability/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Readability fullscreen - full reader-mode article -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-fullscreen readability-fullscreen"
|
||||||
|
style="width: 100%; height: 100vh; border: none; background: #fefefe;"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
1
archivebox/plugins/readability/templates/icon.html
Normal file
1
archivebox/plugins/readability/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📖
|
||||||
8
archivebox/plugins/readability/templates/thumbnail.html
Normal file
8
archivebox/plugins/readability/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<!-- Readability thumbnail - shows reader-mode extracted article content -->
|
||||||
|
<div class="extractor-thumbnail readability-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fefefe; padding: 8px; font-family: Georgia, serif; font-size: 11px; line-height: 1.4; color: #333;">
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
style="width: 100%; height: 300px; border: none; pointer-events: none;"
|
||||||
|
loading="lazy"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
|
</div>
|
||||||
@@ -2,7 +2,7 @@
|
|||||||
Integration tests for readability plugin
|
Integration tests for readability plugin
|
||||||
|
|
||||||
Tests verify:
|
Tests verify:
|
||||||
1. Install hook installs readability-extractor via abx-pkg
|
1. Validate hook checks for readability-extractor binary
|
||||||
2. Verify deps with abx-pkg
|
2. Verify deps with abx-pkg
|
||||||
3. Plugin reports missing dependency correctly
|
3. Plugin reports missing dependency correctly
|
||||||
4. Extraction works against real example.com content
|
4. Extraction works against real example.com content
|
||||||
@@ -21,7 +21,7 @@ import pytest
|
|||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||||
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py'))
|
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py'))
|
||||||
READABILITY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_readability.py'
|
READABILITY_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_readability.py'
|
||||||
TEST_URL = 'https://example.com'
|
TEST_URL = 'https://example.com'
|
||||||
|
|
||||||
|
|
||||||
@@ -101,48 +101,63 @@ def test_reports_missing_dependency_when_not_installed():
|
|||||||
assert 'readability-extractor' in combined or 'BIN_NAME' in combined, "Should mention readability-extractor"
|
assert 'readability-extractor' in combined or 'BIN_NAME' in combined, "Should mention readability-extractor"
|
||||||
|
|
||||||
|
|
||||||
def test_readability_install_hook():
|
def test_readability_validate_hook():
|
||||||
"""Test readability install hook to install readability-extractor if needed."""
|
"""Test readability validate hook checks for readability-extractor binary."""
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[sys.executable, str(READABILITY_INSTALL_HOOK)],
|
[sys.executable, str(READABILITY_VALIDATE_HOOK)],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=600
|
timeout=30
|
||||||
)
|
)
|
||||||
|
|
||||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||||
|
if result.returncode == 0:
|
||||||
# Verify InstalledBinary JSONL output
|
# Binary found - verify InstalledBinary JSONL output
|
||||||
found_binary = False
|
found_binary = False
|
||||||
for line in result.stdout.strip().split('\n'):
|
for line in result.stdout.strip().split('\n'):
|
||||||
if line.strip():
|
if line.strip():
|
||||||
try:
|
try:
|
||||||
record = json.loads(line)
|
record = json.loads(line)
|
||||||
if record.get('type') == 'InstalledBinary':
|
if record.get('type') == 'InstalledBinary':
|
||||||
assert record['name'] == 'readability-extractor'
|
assert record['name'] == 'readability-extractor'
|
||||||
assert record['abspath']
|
assert record['abspath']
|
||||||
found_binary = True
|
found_binary = True
|
||||||
break
|
break
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
|
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||||
assert found_binary, "Should output InstalledBinary record"
|
else:
|
||||||
|
# Binary not found - verify Dependency JSONL output
|
||||||
|
found_dependency = False
|
||||||
|
for line in result.stdout.strip().split('\n'):
|
||||||
|
if line.strip():
|
||||||
|
try:
|
||||||
|
record = json.loads(line)
|
||||||
|
if record.get('type') == 'Dependency':
|
||||||
|
assert record['bin_name'] == 'readability-extractor'
|
||||||
|
assert 'npm' in record['bin_providers']
|
||||||
|
found_dependency = True
|
||||||
|
break
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
assert found_dependency, "Should output Dependency record when binary not found"
|
||||||
|
|
||||||
|
|
||||||
def test_verify_deps_with_abx_pkg():
|
def test_verify_deps_with_abx_pkg():
|
||||||
"""Verify readability-extractor is available via abx-pkg after hook installation."""
|
"""Verify readability-extractor is available via abx-pkg."""
|
||||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
||||||
|
|
||||||
NpmProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
readability_binary = Binary(
|
readability_binary = Binary(
|
||||||
name='readability-extractor',
|
name='readability-extractor',
|
||||||
binproviders=[NpmProvider(), EnvProvider()],
|
binproviders=[NpmProvider(), EnvProvider()],
|
||||||
overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
||||||
)
|
)
|
||||||
readability_loaded = readability_binary.load()
|
readability_loaded = readability_binary.load()
|
||||||
assert readability_loaded and readability_loaded.abspath, "readability-extractor should be available after install hook"
|
|
||||||
|
if readability_loaded and readability_loaded.abspath:
|
||||||
|
assert True, "readability-extractor is available"
|
||||||
|
else:
|
||||||
|
pytest.skip("readability-extractor not available - Dependency record should have been emitted")
|
||||||
|
|
||||||
|
|
||||||
def test_extracts_article_after_installation():
|
def test_extracts_article_after_installation():
|
||||||
|
|||||||
5
archivebox/plugins/screenshot/templates/embed.html
Normal file
5
archivebox/plugins/screenshot/templates/embed.html
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
<!-- Screenshot embed - full image view -->
|
||||||
|
<img src="{{ output_path }}"
|
||||||
|
alt="Screenshot of page"
|
||||||
|
class="extractor-embed screenshot-embed"
|
||||||
|
style="max-width: 100%; height: auto;">
|
||||||
8
archivebox/plugins/screenshot/templates/fullscreen.html
Normal file
8
archivebox/plugins/screenshot/templates/fullscreen.html
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<!-- Screenshot fullscreen - zoomable image -->
|
||||||
|
<div style="width: 100%; height: 100vh; overflow: auto; background: #222; display: flex; align-items: start; justify-content: center;">
|
||||||
|
<img src="{{ output_path }}"
|
||||||
|
alt="Screenshot of page"
|
||||||
|
class="extractor-fullscreen screenshot-fullscreen"
|
||||||
|
style="max-width: 100%; cursor: zoom-in;"
|
||||||
|
onclick="this.style.maxWidth = this.style.maxWidth === 'none' ? '100%' : 'none'; this.style.cursor = this.style.maxWidth === 'none' ? 'zoom-out' : 'zoom-in';">
|
||||||
|
</div>
|
||||||
1
archivebox/plugins/screenshot/templates/icon.html
Normal file
1
archivebox/plugins/screenshot/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📷
|
||||||
8
archivebox/plugins/screenshot/templates/thumbnail.html
Normal file
8
archivebox/plugins/screenshot/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<!-- Screenshot thumbnail - shows the captured screenshot image -->
|
||||||
|
<img src="{{ output_path }}"
|
||||||
|
alt="Screenshot of page"
|
||||||
|
class="extractor-thumbnail screenshot-thumbnail"
|
||||||
|
style="width: 100%; height: 100px; object-fit: cover; object-position: top center; background: #333;"
|
||||||
|
loading="lazy"
|
||||||
|
onerror="this.style.display='none'; this.nextElementSibling.style.display='block';">
|
||||||
|
<div style="display: none; text-align: center; padding: 20px; color: #999;">📷 Screenshot</div>
|
||||||
6
archivebox/plugins/singlefile/templates/embed.html
Normal file
6
archivebox/plugins/singlefile/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Singlefile embed - full iframe of archived HTML -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-embed singlefile-embed"
|
||||||
|
style="width: 100%; height: 100%; min-height: 500px; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||||
|
</iframe>
|
||||||
6
archivebox/plugins/singlefile/templates/fullscreen.html
Normal file
6
archivebox/plugins/singlefile/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Singlefile fullscreen - full page iframe -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-fullscreen singlefile-fullscreen"
|
||||||
|
style="width: 100%; height: 100vh; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
|
||||||
|
</iframe>
|
||||||
1
archivebox/plugins/singlefile/templates/icon.html
Normal file
1
archivebox/plugins/singlefile/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📦
|
||||||
8
archivebox/plugins/singlefile/templates/thumbnail.html
Normal file
8
archivebox/plugins/singlefile/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<!-- Singlefile thumbnail - scaled down iframe preview of archived HTML -->
|
||||||
|
<div class="extractor-thumbnail singlefile-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
|
||||||
|
loading="lazy"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
|
</div>
|
||||||
1
archivebox/plugins/staticfile/templates/icon.html
Normal file
1
archivebox/plugins/staticfile/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📁
|
||||||
1
archivebox/plugins/title/templates/icon.html
Normal file
1
archivebox/plugins/title/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📝
|
||||||
@@ -1,68 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Install wget if not already available.
|
|
||||||
|
|
||||||
Runs at crawl start to ensure wget is installed.
|
|
||||||
Outputs JSONL for InstalledBinary.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
try:
|
|
||||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
|
||||||
|
|
||||||
AptProvider.model_rebuild()
|
|
||||||
BrewProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
# wget binary and package have same name
|
|
||||||
wget_binary = Binary(
|
|
||||||
name='wget',
|
|
||||||
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try to load, install if not found
|
|
||||||
try:
|
|
||||||
loaded = wget_binary.load()
|
|
||||||
if not loaded or not loaded.abspath:
|
|
||||||
raise Exception("Not loaded")
|
|
||||||
except Exception:
|
|
||||||
# Install via system package manager
|
|
||||||
loaded = wget_binary.install()
|
|
||||||
|
|
||||||
if loaded and loaded.abspath:
|
|
||||||
# Output InstalledBinary JSONL
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'InstalledBinary',
|
|
||||||
'name': 'wget',
|
|
||||||
'abspath': str(loaded.abspath),
|
|
||||||
'version': str(loaded.version) if loaded.version else None,
|
|
||||||
'sha256': loaded.sha256,
|
|
||||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
|
||||||
}))
|
|
||||||
sys.exit(0)
|
|
||||||
else:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'wget',
|
|
||||||
'bin_providers': 'apt,brew,env',
|
|
||||||
}))
|
|
||||||
print("Failed to install wget", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(json.dumps({
|
|
||||||
'type': 'Dependency',
|
|
||||||
'bin_name': 'wget',
|
|
||||||
'bin_providers': 'apt,brew,env',
|
|
||||||
}))
|
|
||||||
print(f"Error installing wget: {e}", file=sys.stderr)
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
6
archivebox/plugins/wget/templates/embed.html
Normal file
6
archivebox/plugins/wget/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Wget embed - full iframe of mirrored site -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-embed wget-embed"
|
||||||
|
style="width: 100%; height: 100%; min-height: 500px; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||||
|
</iframe>
|
||||||
6
archivebox/plugins/wget/templates/fullscreen.html
Normal file
6
archivebox/plugins/wget/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<!-- Wget fullscreen - full page iframe of mirrored site -->
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
class="extractor-fullscreen wget-fullscreen"
|
||||||
|
style="width: 100%; height: 100vh; border: none;"
|
||||||
|
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
|
||||||
|
</iframe>
|
||||||
1
archivebox/plugins/wget/templates/icon.html
Normal file
1
archivebox/plugins/wget/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
|||||||
|
📥
|
||||||
8
archivebox/plugins/wget/templates/thumbnail.html
Normal file
8
archivebox/plugins/wget/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<!-- Wget thumbnail - scaled down iframe preview of mirrored site -->
|
||||||
|
<div class="extractor-thumbnail wget-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
|
||||||
|
<iframe src="{{ output_path }}"
|
||||||
|
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
|
||||||
|
loading="lazy"
|
||||||
|
sandbox="allow-same-origin">
|
||||||
|
</iframe>
|
||||||
|
</div>
|
||||||
@@ -2,8 +2,8 @@
|
|||||||
Integration tests for wget plugin
|
Integration tests for wget plugin
|
||||||
|
|
||||||
Tests verify:
|
Tests verify:
|
||||||
1. Plugin reports missing dependency correctly
|
1. Validate hook checks for wget binary
|
||||||
2. wget can be installed via brew/apt provider hooks
|
2. Verify deps with abx-pkg
|
||||||
3. Config options work (SAVE_WGET, SAVE_WARC, etc.)
|
3. Config options work (SAVE_WGET, SAVE_WARC, etc.)
|
||||||
4. Extraction works against real example.com
|
4. Extraction works against real example.com
|
||||||
5. Output files contain actual page content
|
5. Output files contain actual page content
|
||||||
@@ -26,7 +26,7 @@ import pytest
|
|||||||
PLUGIN_DIR = Path(__file__).parent.parent
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||||
WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
|
WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
|
||||||
WGET_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_wget.py'
|
WGET_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_wget.py'
|
||||||
BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
|
BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
|
||||||
APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
|
APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
|
||||||
TEST_URL = 'https://example.com'
|
TEST_URL = 'https://example.com'
|
||||||
@@ -37,45 +37,59 @@ def test_hook_script_exists():
|
|||||||
assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"
|
assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"
|
||||||
|
|
||||||
|
|
||||||
def test_wget_install_hook():
|
def test_wget_validate_hook():
|
||||||
"""Test wget install hook to install wget if needed."""
|
"""Test wget validate hook checks for wget binary."""
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[sys.executable, str(WGET_INSTALL_HOOK)],
|
[sys.executable, str(WGET_VALIDATE_HOOK)],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=600
|
timeout=30
|
||||||
)
|
)
|
||||||
|
|
||||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||||
|
if result.returncode == 0:
|
||||||
# Verify InstalledBinary JSONL output
|
# Binary found - verify InstalledBinary JSONL output
|
||||||
found_binary = False
|
found_binary = False
|
||||||
for line in result.stdout.strip().split('\n'):
|
for line in result.stdout.strip().split('\n'):
|
||||||
if line.strip():
|
if line.strip():
|
||||||
try:
|
try:
|
||||||
record = json.loads(line)
|
record = json.loads(line)
|
||||||
if record.get('type') == 'InstalledBinary':
|
if record.get('type') == 'InstalledBinary':
|
||||||
assert record['name'] == 'wget'
|
assert record['name'] == 'wget'
|
||||||
assert record['abspath']
|
assert record['abspath']
|
||||||
found_binary = True
|
found_binary = True
|
||||||
break
|
break
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
|
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||||
assert found_binary, "Should output InstalledBinary record"
|
else:
|
||||||
|
# Binary not found - verify Dependency JSONL output
|
||||||
|
found_dependency = False
|
||||||
|
for line in result.stdout.strip().split('\n'):
|
||||||
|
if line.strip():
|
||||||
|
try:
|
||||||
|
record = json.loads(line)
|
||||||
|
if record.get('type') == 'Dependency':
|
||||||
|
assert record['bin_name'] == 'wget'
|
||||||
|
assert 'env' in record['bin_providers']
|
||||||
|
found_dependency = True
|
||||||
|
break
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
assert found_dependency, "Should output Dependency record when binary not found"
|
||||||
|
|
||||||
|
|
||||||
def test_verify_deps_with_abx_pkg():
|
def test_verify_deps_with_abx_pkg():
|
||||||
"""Verify wget is available via abx-pkg after hook installation."""
|
"""Verify wget is available via abx-pkg."""
|
||||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||||
|
|
||||||
AptProvider.model_rebuild()
|
|
||||||
BrewProvider.model_rebuild()
|
|
||||||
EnvProvider.model_rebuild()
|
|
||||||
|
|
||||||
wget_binary = Binary(name='wget', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
wget_binary = Binary(name='wget', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||||
wget_loaded = wget_binary.load()
|
wget_loaded = wget_binary.load()
|
||||||
assert wget_loaded and wget_loaded.abspath, "wget should be available after install hook"
|
|
||||||
|
if wget_loaded and wget_loaded.abspath:
|
||||||
|
assert True, "wget is available"
|
||||||
|
else:
|
||||||
|
pytest.skip("wget not available - Dependency record should have been emitted")
|
||||||
|
|
||||||
|
|
||||||
def test_reports_missing_dependency_when_not_installed():
|
def test_reports_missing_dependency_when_not_installed():
|
||||||
|
|||||||
@@ -110,6 +110,10 @@
|
|||||||
{% block nav-global %}{% endblock %}
|
{% block nav-global %}{% endblock %}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{% if has_permission %}
|
||||||
|
{% include 'admin/progress_monitor.html' %}
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
{% block breadcrumbs %}
|
{% block breadcrumbs %}
|
||||||
<div class="breadcrumbs">
|
<div class="breadcrumbs">
|
||||||
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
|
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
|
||||||
|
|||||||
648
archivebox/templates/admin/progress_monitor.html
Normal file
648
archivebox/templates/admin/progress_monitor.html
Normal file
@@ -0,0 +1,648 @@
|
|||||||
|
<style>
|
||||||
|
/* Progress Monitor Container */
|
||||||
|
#progress-monitor {
|
||||||
|
background: linear-gradient(135deg, #0d1117 0%, #161b22 100%);
|
||||||
|
color: #c9d1d9;
|
||||||
|
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Noto Sans', Helvetica, Arial, sans-serif;
|
||||||
|
font-size: 12px;
|
||||||
|
border-bottom: 1px solid #30363d;
|
||||||
|
position: relative;
|
||||||
|
z-index: 100;
|
||||||
|
}
|
||||||
|
#progress-monitor.hidden {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
#progress-monitor .tree-container {
|
||||||
|
max-height: 350px;
|
||||||
|
overflow-y: auto;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Header Bar */
|
||||||
|
#progress-monitor .header-bar {
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
align-items: center;
|
||||||
|
padding: 8px 16px;
|
||||||
|
background: rgba(0,0,0,0.2);
|
||||||
|
border-bottom: 1px solid #30363d;
|
||||||
|
position: sticky;
|
||||||
|
top: 0;
|
||||||
|
z-index: 10;
|
||||||
|
}
|
||||||
|
#progress-monitor .header-left {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 16px;
|
||||||
|
}
|
||||||
|
#progress-monitor .header-right {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 12px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Orchestrator Status */
|
||||||
|
#progress-monitor .orchestrator-status {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 6px;
|
||||||
|
}
|
||||||
|
#progress-monitor .status-dot {
|
||||||
|
width: 8px;
|
||||||
|
height: 8px;
|
||||||
|
border-radius: 50%;
|
||||||
|
flex-shrink: 0;
|
||||||
|
}
|
||||||
|
#progress-monitor .status-dot.running {
|
||||||
|
background: #3fb950;
|
||||||
|
box-shadow: 0 0 8px #3fb950;
|
||||||
|
animation: pulse 2s infinite;
|
||||||
|
}
|
||||||
|
#progress-monitor .status-dot.stopped {
|
||||||
|
background: #f85149;
|
||||||
|
}
|
||||||
|
@keyframes pulse {
|
||||||
|
0%, 100% { opacity: 1; box-shadow: 0 0 8px #3fb950; }
|
||||||
|
50% { opacity: 0.6; box-shadow: 0 0 4px #3fb950; }
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Stats */
|
||||||
|
#progress-monitor .stats {
|
||||||
|
display: flex;
|
||||||
|
gap: 16px;
|
||||||
|
}
|
||||||
|
#progress-monitor .stat {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 4px;
|
||||||
|
}
|
||||||
|
#progress-monitor .stat-label {
|
||||||
|
color: #8b949e;
|
||||||
|
font-size: 10px;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.5px;
|
||||||
|
}
|
||||||
|
#progress-monitor .stat-value {
|
||||||
|
font-weight: 600;
|
||||||
|
font-variant-numeric: tabular-nums;
|
||||||
|
}
|
||||||
|
#progress-monitor .stat-value.success { color: #3fb950; }
|
||||||
|
#progress-monitor .stat-value.error { color: #f85149; }
|
||||||
|
#progress-monitor .stat-value.warning { color: #d29922; }
|
||||||
|
#progress-monitor .stat-value.info { color: #58a6ff; }
|
||||||
|
|
||||||
|
/* Toggle Button */
|
||||||
|
#progress-monitor .toggle-btn {
|
||||||
|
background: transparent;
|
||||||
|
border: 1px solid #30363d;
|
||||||
|
color: #8b949e;
|
||||||
|
cursor: pointer;
|
||||||
|
padding: 4px 8px;
|
||||||
|
border-radius: 6px;
|
||||||
|
font-size: 11px;
|
||||||
|
transition: all 0.2s;
|
||||||
|
}
|
||||||
|
#progress-monitor .toggle-btn:hover {
|
||||||
|
background: #21262d;
|
||||||
|
color: #c9d1d9;
|
||||||
|
border-color: #8b949e;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Tree Container */
|
||||||
|
#progress-monitor .tree-container {
|
||||||
|
padding: 12px 16px;
|
||||||
|
}
|
||||||
|
#progress-monitor.collapsed .tree-container {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Idle Message */
|
||||||
|
#progress-monitor .idle-message {
|
||||||
|
color: #8b949e;
|
||||||
|
font-style: italic;
|
||||||
|
padding: 8px 0;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Crawl Item */
|
||||||
|
#progress-monitor .crawl-item {
|
||||||
|
background: #161b22;
|
||||||
|
border: 1px solid #30363d;
|
||||||
|
border-radius: 8px;
|
||||||
|
margin-bottom: 12px;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-header {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 12px;
|
||||||
|
padding: 10px 14px;
|
||||||
|
background: rgba(0,0,0,0.2);
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-header:hover {
|
||||||
|
background: rgba(88, 166, 255, 0.1);
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-icon {
|
||||||
|
font-size: 16px;
|
||||||
|
width: 20px;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-info {
|
||||||
|
flex: 1;
|
||||||
|
min-width: 0;
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-label {
|
||||||
|
font-weight: 600;
|
||||||
|
color: #58a6ff;
|
||||||
|
white-space: nowrap;
|
||||||
|
overflow: hidden;
|
||||||
|
text-overflow: ellipsis;
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-meta {
|
||||||
|
font-size: 11px;
|
||||||
|
color: #8b949e;
|
||||||
|
margin-top: 2px;
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-stats {
|
||||||
|
display: flex;
|
||||||
|
gap: 12px;
|
||||||
|
font-size: 11px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Progress Bar */
|
||||||
|
#progress-monitor .progress-bar-container {
|
||||||
|
height: 4px;
|
||||||
|
background: #21262d;
|
||||||
|
border-radius: 2px;
|
||||||
|
overflow: hidden;
|
||||||
|
position: relative;
|
||||||
|
}
|
||||||
|
#progress-monitor .progress-bar {
|
||||||
|
height: 100%;
|
||||||
|
border-radius: 2px;
|
||||||
|
transition: width 0.5s ease-out;
|
||||||
|
position: relative;
|
||||||
|
}
|
||||||
|
#progress-monitor .progress-bar.crawl {
|
||||||
|
background: linear-gradient(90deg, #238636 0%, #3fb950 100%);
|
||||||
|
}
|
||||||
|
#progress-monitor .progress-bar.snapshot {
|
||||||
|
background: linear-gradient(90deg, #1f6feb 0%, #58a6ff 100%);
|
||||||
|
}
|
||||||
|
#progress-monitor .progress-bar.extractor {
|
||||||
|
background: linear-gradient(90deg, #8957e5 0%, #a371f7 100%);
|
||||||
|
}
|
||||||
|
#progress-monitor .progress-bar.indeterminate {
|
||||||
|
background: linear-gradient(90deg, transparent 0%, #58a6ff 50%, transparent 100%);
|
||||||
|
animation: indeterminate 1.5s infinite linear;
|
||||||
|
width: 30% !important;
|
||||||
|
}
|
||||||
|
@keyframes indeterminate {
|
||||||
|
0% { transform: translateX(-100%); }
|
||||||
|
100% { transform: translateX(400%); }
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Crawl Body */
|
||||||
|
#progress-monitor .crawl-body {
|
||||||
|
padding: 0 14px 14px;
|
||||||
|
}
|
||||||
|
#progress-monitor .crawl-progress {
|
||||||
|
padding: 10px 14px;
|
||||||
|
border-bottom: 1px solid #21262d;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Snapshot List */
|
||||||
|
#progress-monitor .snapshot-list {
|
||||||
|
margin-top: 8px;
|
||||||
|
}
|
||||||
|
#progress-monitor .snapshot-item {
|
||||||
|
background: #0d1117;
|
||||||
|
border: 1px solid #21262d;
|
||||||
|
border-radius: 6px;
|
||||||
|
margin-bottom: 8px;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
#progress-monitor .snapshot-header {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 10px;
|
||||||
|
padding: 8px 12px;
|
||||||
|
cursor: pointer;
|
||||||
|
}
|
||||||
|
/* Snapshot rows inside the progress-monitor crawl tree */
#progress-monitor .snapshot-header:hover {
    background: rgba(88, 166, 255, 0.05);
}
#progress-monitor .snapshot-icon {
    font-size: 14px;
    width: 18px;
    text-align: center;
    color: #58a6ff;
}
#progress-monitor .snapshot-info {
    flex: 1;
    min-width: 0; /* allow the URL to shrink/ellipsize inside the flex row */
}
#progress-monitor .snapshot-url {
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    font-size: 11px;
    color: #c9d1d9;
    white-space: nowrap;
    overflow: hidden;
    text-overflow: ellipsis;
}
#progress-monitor .snapshot-meta {
    font-size: 10px;
    color: #8b949e;
    margin-top: 2px;
}
#progress-monitor .snapshot-progress {
    padding: 0 12px 8px;
}

/* Extractor List */
#progress-monitor .extractor-list {
    padding: 8px 12px;
    background: rgba(0,0,0,0.2);
    border-top: 1px solid #21262d;
}
#progress-monitor .extractor-item {
    display: flex;
    align-items: center;
    gap: 8px;
    padding: 4px 0;
}
#progress-monitor .extractor-icon {
    font-size: 12px;
    width: 16px;
    text-align: center;
}
/* Per-status icon colors; "running" also spins (see @keyframes spin) */
#progress-monitor .extractor-icon.running {
    color: #d29922;
    animation: spin 1s linear infinite;
}
#progress-monitor .extractor-icon.success {
    color: #3fb950;
}
#progress-monitor .extractor-icon.failed {
    color: #f85149;
}
#progress-monitor .extractor-icon.pending {
    color: #8b949e;
}
@keyframes spin {
    from { transform: rotate(0deg); }
    to { transform: rotate(360deg); }
}
#progress-monitor .extractor-name {
    flex: 1;
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    font-size: 11px;
}
#progress-monitor .extractor-progress {
    width: 60px;
}

/* Status Badge */
#progress-monitor .status-badge {
    font-size: 10px;
    padding: 2px 6px;
    border-radius: 10px;
    font-weight: 500;
    text-transform: uppercase;
    letter-spacing: 0.3px;
}
#progress-monitor .status-badge.queued {
    background: #21262d;
    color: #8b949e;
}
#progress-monitor .status-badge.started {
    background: rgba(210, 153, 34, 0.2);
    color: #d29922;
}
/* "sealed" and "succeeded" are both terminal-success states */
#progress-monitor .status-badge.sealed,
#progress-monitor .status-badge.succeeded {
    background: rgba(63, 185, 80, 0.2);
    color: #3fb950;
}
#progress-monitor .status-badge.failed {
    background: rgba(248, 81, 73, 0.2);
    color: #f85149;
}

/* Expand/Collapse Icons */
#progress-monitor .expand-icon {
    color: #8b949e;
    font-size: 10px;
    transition: transform 0.2s;
}
#progress-monitor .expand-icon.expanded {
    transform: rotate(90deg); /* ▶ rotated to point down when expanded */
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<div id="progress-monitor">
    <!-- Top bar: orchestrator state dot, aggregate counters, and a details toggle.
         All counter values are placeholders updated by the polling script below. -->
    <div class="header-bar">
        <div class="header-left">
            <div class="orchestrator-status">
                <span class="status-dot stopped" id="orchestrator-dot"></span>
                <span id="orchestrator-text">Stopped</span>
            </div>
            <div class="stats">
                <div class="stat">
                    <span class="stat-label">Workers</span>
                    <span class="stat-value info" id="worker-count">0</span>
                </div>
                <div class="stat">
                    <span class="stat-label">Queued</span>
                    <span class="stat-value warning" id="total-queued">0</span>
                </div>
                <div class="stat">
                    <span class="stat-label">Done</span>
                    <span class="stat-value success" id="total-succeeded">0</span>
                </div>
                <div class="stat">
                    <span class="stat-label">Failed</span>
                    <span class="stat-value error" id="total-failed">0</span>
                </div>
            </div>
        </div>
        <div class="header-right">
            <button class="toggle-btn" id="progress-collapse" title="Toggle details">Details</button>
        </div>
    </div>

    <!-- Body: crawl > snapshot > extractor tree, rendered by the polling script -->
    <div class="tree-container" id="tree-container">
        <div class="idle-message" id="idle-message">No active crawls</div>
        <div id="crawl-tree"></div>
    </div>
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
(function() {
    // Cached DOM references for the progress-monitor widget.
    const monitor = document.getElementById('progress-monitor');
    const collapseBtn = document.getElementById('progress-collapse');
    const treeContainer = document.getElementById('tree-container');
    const crawlTree = document.getElementById('crawl-tree');
    const idleMessage = document.getElementById('idle-message');

    // Handle returned by setInterval; null whenever polling is stopped.
    let pollInterval = null;
    // UI state persisted across page loads via localStorage.
    let isCollapsed = localStorage.getItem('progress-monitor-collapsed') === 'true';
    let expandedCrawls = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-crawls') || '[]'));
    let expandedSnapshots = new Set(JSON.parse(localStorage.getItem('progress-monitor-expanded-snapshots') || '[]'));
|
||||||
|
// Shorten a URL for single-line display: hostname plus a truncated path.
// Falls back to a raw 50-char prefix when the string is not a parseable URL.
function formatUrl(url) {
    let display;
    try {
        const parsed = new URL(url);
        const path = parsed.pathname;
        const suffix = path.length > 30 ? '...' : '';
        display = parsed.hostname + path.substring(0, 30) + suffix;
    } catch {
        const suffix = url.length > 50 ? '...' : '';
        display = url.substring(0, 50) + suffix;
    }
    return display;
}
|
||||||
|
|
||||||
|
// Render one extractor row (icon + name + mini progress bar) as an HTML string.
// Status maps to a CSS class and a glyph; unknown statuses render as "pending".
function renderExtractor(extractor) {
    const STATUS_LOOKUP = {
        started:   ['running', '↻'],
        succeeded: ['success', '✓'],
        failed:    ['failed',  '✗'],
    };
    const [iconClass, icon] = STATUS_LOOKUP[extractor.status] || ['pending', '○'];

    // Finished extractors (success or failure) always show a full bar.
    const isFinished = extractor.status === 'succeeded' || extractor.status === 'failed';
    const barWidth = isFinished ? '100' : extractor.progress;
    const barClass = extractor.status === 'started' ? 'indeterminate' : '';

    return `
        <div class="extractor-item">
            <span class="extractor-icon ${iconClass}">${icon}</span>
            <span class="extractor-name">${extractor.extractor}</span>
            <div class="extractor-progress">
                <div class="progress-bar-container">
                    <div class="progress-bar extractor ${barClass}"
                         style="width: ${barWidth}%"></div>
                </div>
            </div>
        </div>
    `;
}
|
||||||
|
|
||||||
|
// Render one snapshot row: clickable header, progress bar, and (when present)
// a collapsible list of per-extractor progress rows.
// SECURITY FIX: snapshot.url is untrusted (it is the archived page's URL) and
// was previously interpolated into innerHTML unescaped, allowing stored XSS
// via a crafted URL. It is now HTML-escaped before interpolation.
function renderSnapshot(snapshot, crawlId) {
    const escapeHtml = (s) => String(s)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');

    const snapshotKey = `${crawlId}-${snapshot.id}`;
    const isExpanded = expandedSnapshots.has(snapshotKey);
    const statusIcon = snapshot.status === 'started' ? '↻' : '📄';

    let extractorHtml = '';
    if (snapshot.active_extractors && snapshot.active_extractors.length > 0) {
        extractorHtml = `
            <div class="extractor-list" style="${isExpanded ? '' : 'display:none'}">
                ${snapshot.active_extractors.map(e => renderExtractor(e)).join('')}
            </div>
        `;
    }

    return `
        <div class="snapshot-item" data-snapshot-key="${snapshotKey}">
            <div class="snapshot-header" onclick="window.toggleSnapshot('${snapshotKey}')">
                <span class="expand-icon ${isExpanded ? 'expanded' : ''}">${snapshot.active_extractors?.length ? '▶' : ''}</span>
                <span class="snapshot-icon">${statusIcon}</span>
                <div class="snapshot-info">
                    <div class="snapshot-url">${escapeHtml(formatUrl(snapshot.url))}</div>
                    <div class="snapshot-meta">
                        ${snapshot.completed_extractors}/${snapshot.total_extractors} extractors
                        ${snapshot.failed_extractors > 0 ? `<span style="color:#f85149">(${snapshot.failed_extractors} failed)</span>` : ''}
                    </div>
                </div>
                <span class="status-badge ${snapshot.status}">${snapshot.status}</span>
            </div>
            <div class="snapshot-progress">
                <div class="progress-bar-container">
                    <div class="progress-bar snapshot ${snapshot.status === 'started' && snapshot.progress === 0 ? 'indeterminate' : ''}"
                         style="width: ${snapshot.progress}%"></div>
                </div>
            </div>
            ${extractorHtml}
        </div>
    `;
}
|
||||||
|
|
||||||
|
// Render one crawl row: header with label/stats, progress bar, and a
// collapsible body containing its snapshot rows.
// SECURITY FIX: crawl.label may derive from user-supplied seed URLs/tags and
// was previously interpolated into innerHTML unescaped (XSS risk); it is now
// HTML-escaped before interpolation.
function renderCrawl(crawl) {
    const escapeHtml = (s) => String(s)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;');

    const isExpanded = expandedCrawls.has(crawl.id);
    const statusIcon = crawl.status === 'started' ? '↻' : '🔍';

    let snapshotsHtml = '';
    if (crawl.active_snapshots && crawl.active_snapshots.length > 0) {
        snapshotsHtml = crawl.active_snapshots.map(s => renderSnapshot(s, crawl.id)).join('');
    }

    return `
        <div class="crawl-item" data-crawl-id="${crawl.id}">
            <div class="crawl-header" onclick="window.toggleCrawl('${crawl.id}')">
                <span class="expand-icon ${isExpanded ? 'expanded' : ''}">${crawl.active_snapshots?.length ? '▶' : ''}</span>
                <span class="crawl-icon">${statusIcon}</span>
                <div class="crawl-info">
                    <div class="crawl-label">${escapeHtml(crawl.label)}</div>
                    <div class="crawl-meta">depth: ${crawl.max_depth} | ${crawl.total_snapshots} snapshots</div>
                </div>
                <div class="crawl-stats">
                    <span style="color:#3fb950">${crawl.completed_snapshots} done</span>
                    <span style="color:#8b949e">${crawl.pending_snapshots} pending</span>
                </div>
                <span class="status-badge ${crawl.status}">${crawl.status}</span>
            </div>
            <div class="crawl-progress">
                <div class="progress-bar-container">
                    <div class="progress-bar crawl ${crawl.status === 'started' && crawl.progress === 0 ? 'indeterminate' : ''}"
                         style="width: ${crawl.progress}%"></div>
                </div>
            </div>
            <div class="crawl-body" style="${isExpanded ? '' : 'display:none'}">
                <div class="snapshot-list">
                    ${snapshotsHtml}
                </div>
            </div>
        </div>
    `;
}
|
||||||
|
|
||||||
|
// Expand or collapse one crawl's snapshot list (invoked from inline onclick
// handlers in renderCrawl) and persist the expanded set to localStorage.
window.toggleCrawl = function(crawlId) {
    const row = document.querySelector(`[data-crawl-id="${crawlId}"]`);
    const body = row.querySelector('.crawl-body');
    const arrow = row.querySelector('.expand-icon');

    const nowExpanded = !expandedCrawls.has(crawlId);
    if (nowExpanded) {
        expandedCrawls.add(crawlId);
    } else {
        expandedCrawls.delete(crawlId);
    }
    body.style.display = nowExpanded ? '' : 'none';
    arrow.classList.toggle('expanded', nowExpanded);

    localStorage.setItem('progress-monitor-expanded-crawls', JSON.stringify([...expandedCrawls]));
};
|
||||||
|
|
||||||
|
// Expand or collapse one snapshot's extractor list (invoked from inline
// onclick handlers in renderSnapshot) and persist the expanded set.
// No-op for snapshots that have no extractor list rendered.
window.toggleSnapshot = function(snapshotKey) {
    const row = document.querySelector(`[data-snapshot-key="${snapshotKey}"]`);
    const extractorList = row.querySelector('.extractor-list');
    const arrow = row.querySelector('.expand-icon');

    if (!extractorList) return;

    const nowExpanded = !expandedSnapshots.has(snapshotKey);
    if (nowExpanded) {
        expandedSnapshots.add(snapshotKey);
    } else {
        expandedSnapshots.delete(snapshotKey);
    }
    extractorList.style.display = nowExpanded ? '' : 'none';
    arrow.classList.toggle('expanded', nowExpanded);

    localStorage.setItem('progress-monitor-expanded-snapshots', JSON.stringify([...expandedSnapshots]));
};
|
||||||
|
|
||||||
|
// Re-render the entire widget from one /admin/live-progress/ JSON payload.
// Expects aggregate counters (crawls_pending, snapshots_started, ...) plus an
// active_crawls array of crawl objects (shape consumed by renderCrawl).
function updateProgress(data) {
    // Calculate if there's activity
    const hasActivity = data.active_crawls.length > 0 ||
                        data.crawls_pending > 0 || data.crawls_started > 0 ||
                        data.snapshots_pending > 0 || data.snapshots_started > 0 ||
                        data.archiveresults_pending > 0 || data.archiveresults_started > 0;

    // Update orchestrator status
    const dot = document.getElementById('orchestrator-dot');
    const text = document.getElementById('orchestrator-text');
    if (data.orchestrator_running) {
        dot.classList.remove('stopped');
        dot.classList.add('running');
        text.textContent = 'Running';
    } else {
        dot.classList.remove('running');
        dot.classList.add('stopped');
        text.textContent = 'Stopped';
    }

    // Update stats (queued is the sum of the three pending counters)
    document.getElementById('worker-count').textContent = data.total_workers;
    document.getElementById('total-queued').textContent =
        data.crawls_pending + data.snapshots_pending + data.archiveresults_pending;
    document.getElementById('total-succeeded').textContent = data.archiveresults_succeeded;
    document.getElementById('total-failed').textContent = data.archiveresults_failed;

    // Render crawl tree
    if (data.active_crawls.length > 0) {
        idleMessage.style.display = 'none';
        crawlTree.innerHTML = data.active_crawls.map(c => renderCrawl(c)).join('');
    } else if (hasActivity) {
        // Work is queued/running but no crawl detail available: show a summary line
        idleMessage.style.display = 'none';
        crawlTree.innerHTML = `
            <div class="idle-message">
                ${data.snapshots_started} snapshots processing, ${data.archiveresults_started} extractors running
            </div>
        `;
    } else {
        idleMessage.style.display = '';
        // Build the URL for recent crawls (last 24 hours)
        var yesterday = new Date(Date.now() - 24*60*60*1000).toISOString().split('T')[0];
        var recentUrl = '/admin/crawls/crawl/?created_at__gte=' + yesterday + '&o=-1';
        idleMessage.innerHTML = `No active crawls (${data.crawls_pending} pending, ${data.crawls_started} started, <a href="${recentUrl}" style="color: #58a6ff;">${data.crawls_recent} recent</a>)`;
        crawlTree.innerHTML = '';
    }
}
|
||||||
|
|
||||||
|
// Fetch one progress snapshot from the admin endpoint and render it.
// On API or network failure, surface the error in the idle-message slot
// (in red) and log details to the console.
function fetchProgress() {
    const showError = (prefix, detail) => {
        idleMessage.textContent = prefix + detail;
        idleMessage.style.color = '#f85149';
    };

    fetch('/admin/live-progress/')
        .then((response) => response.json())
        .then((data) => {
            if (data.error) {
                console.error('Progress API error:', data.error, data.traceback);
                showError('API Error: ', data.error);
            }
            updateProgress(data);
        })
        .catch((error) => {
            console.error('Progress fetch error:', error);
            showError('Fetch Error: ', error.message);
        });
}
|
||||||
|
|
||||||
|
// Kick off an immediate fetch, then refresh once per second.
// Idempotent: calling while a timer is already live is a no-op.
function startPolling() {
    if (pollInterval !== null) {
        return;
    }
    fetchProgress();
    pollInterval = setInterval(fetchProgress, 1000);
}
|
||||||
|
|
||||||
|
// Cancel the refresh timer, if one is running, and clear the handle so
// startPolling() can restart it later.
function stopPolling() {
    if (!pollInterval) {
        return;
    }
    clearInterval(pollInterval);
    pollInterval = null;
}
|
||||||
|
|
||||||
|
// Collapse toggle: flip the details panel and persist the preference
collapseBtn.addEventListener('click', function() {
    isCollapsed = !isCollapsed;
    localStorage.setItem('progress-monitor-collapsed', isCollapsed);
    if (isCollapsed) {
        monitor.classList.add('collapsed');
        collapseBtn.textContent = 'Expand';
    } else {
        monitor.classList.remove('collapsed');
        collapseBtn.textContent = 'Details';
    }
});

// Apply initial state (collapsed preference restored from localStorage)
if (isCollapsed) {
    monitor.classList.add('collapsed');
    collapseBtn.textContent = 'Expand';
}

// Start polling when page loads
startPolling();

// Pause polling when tab is hidden (avoids pointless requests); resume on return
document.addEventListener('visibilitychange', function() {
    if (document.hidden) {
        stopPolling();
    } else {
        startPolling();
    }
});
})();
|
||||||
|
</script>
|
||||||
@@ -192,6 +192,42 @@
|
|||||||
border: 0px;
|
border: 0px;
|
||||||
border-top: 3px solid #aa1e55;
|
border-top: 3px solid #aa1e55;
|
||||||
}
|
}
|
||||||
|
/* Full-height wrapper for plugin-rendered fullscreen previews */
#main-frame-wrapper {
    width: 100%;
    height: calc(100vh - 210px);
    border-top: 3px solid #aa1e55;
    overflow: hidden;
}
#main-frame-wrapper iframe {
    width: 100%;
    height: 100%;
    border: none;
}
.full-page-wrapper {
    width: 100%;
    height: calc(100vh - 210px);
}
/* Card thumbnails: fixed-height, non-interactive preview region */
.thumbnail-wrapper {
    height: 100px;
    overflow: hidden;
    background-color: #333;
    pointer-events: none;
}
/* Shrink a full-size page iframe down to thumbnail size via scale(0.25);
   the oversize width/height + negative margins compensate for the scaling */
.thumbnail-wrapper iframe {
    width: 405%;
    height: 430px;
    margin-bottom: -330px;
    margin-left: -1%;
    transform: scale(0.25);
    transform-origin: 0 0;
    border: none;
}
.thumbnail-wrapper img {
    width: 100%;
    height: 100%;
    object-fit: cover;
    object-position: top center;
}
|
||||||
.card.selected-card {
|
.card.selected-card {
|
||||||
border: 2px solid orange;
|
border: 2px solid orange;
|
||||||
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
|
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
|
||||||
@@ -403,12 +439,18 @@
|
|||||||
{# One result card per extractor output; the first card starts selected #}
<div class="card {% if forloop.first %}selected-card{% endif %}">
    <div class="card-body">
        <a href="{{result.path|urlencode}}" target="preview" title="./{{result.path}} (downloaded {{result.ts}})">
            <h4>{% extractor_icon result.name %} {{result.name|extractor_name|truncatechars:20}} <small>({{result.size|filesizeformat}})</small></h4>
        </a>
    </div>
    {% if result.result %}
        {# Use plugin-specific thumbnail template when ArchiveResult is available #}
        <div class="card-img-top thumbnail-wrapper">
            {% extractor_thumbnail result.result %}
        </div>
    {% else %}
        {# Fall back to generic iframe for filesystem-discovered files #}
        <iframe class="card-img-top" src="{{result.path|urlencode}}?autoplay=0" allow="autoplay 'none'; fullscreen 'none'; navigation-override 'none'; " sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
    {% endif %}
</div>
</div>
{% endfor %}
|
||||||
@@ -431,7 +473,15 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
<iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_result.path|urlencode}}" name="preview"></iframe>
|
{% if best_result.result %}
    {# Use plugin-specific fullscreen template when ArchiveResult is available #}
    <div id="main-frame-wrapper" class="full-page-wrapper">
        {% extractor_fullscreen best_result.result %}
    </div>
{% else %}
    {# Fall back to generic iframe #}
    <iframe id="main-frame" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_result.path|urlencode}}" name="preview"></iframe>
{% endif %}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,23 +1,13 @@
|
|||||||
|
"""
|
||||||
|
Workers admin module.
|
||||||
|
|
||||||
|
The orchestrator/worker system doesn't need Django admin registration
|
||||||
|
as workers are managed via CLI commands and the orchestrator.
|
||||||
|
"""
|
||||||
|
|
||||||
__package__ = 'archivebox.workers'
|
__package__ = 'archivebox.workers'
|
||||||
|
|
||||||
from django.contrib.auth import get_permission_codename
|
|
||||||
|
|
||||||
from huey_monitor.apps import HueyMonitorConfig
|
|
||||||
from huey_monitor.admin import TaskModel, TaskModelAdmin, SignalInfoModel, SignalInfoModelAdmin
|
|
||||||
|
|
||||||
|
|
||||||
HueyMonitorConfig.verbose_name = 'Background Workers'
|
|
||||||
|
|
||||||
|
|
||||||
class CustomTaskModelAdmin(TaskModelAdmin):
|
|
||||||
actions = ["delete_selected"]
|
|
||||||
|
|
||||||
def has_delete_permission(self, request, obj=None):
|
|
||||||
codename = get_permission_codename("delete", self.opts)
|
|
||||||
return request.user.has_perm("%s.%s" % (self.opts.app_label, codename))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def register_admin(admin_site):
    """No models to register - workers are process-based, not Django models.

    Kept as a no-op so the app-registry hook that calls ``register_admin``
    on every app continues to work after the huey removal.
    """
    return None
|
||||||
|
|||||||
0
archivebox/workers/management/__init__.py
Normal file
0
archivebox/workers/management/__init__.py
Normal file
0
archivebox/workers/management/commands/__init__.py
Normal file
0
archivebox/workers/management/commands/__init__.py
Normal file
15
archivebox/workers/management/commands/orchestrator.py
Normal file
15
archivebox/workers/management/commands/orchestrator.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
|
||||||
|
from workers.orchestrator import Orchestrator
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """Django management command that runs the ArchiveBox orchestrator loop."""

    help = 'Run the archivebox orchestrator'

    def add_arguments(self, parser):
        # --daemon keeps the orchestrator alive even when all queues are idle
        parser.add_argument('--daemon', '-d', action='store_true', help="Run forever (don't exit on idle)")

    def handle(self, *args, **kwargs):
        daemon = kwargs.get('daemon', False)
        # daemon mode inverts exit_on_idle: stay resident waiting for new work
        orchestrator = Orchestrator(exit_on_idle=not daemon)
        orchestrator.runloop()
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user