From d95f0dc1867d1d3705abc563ed2a57c528a6cea9 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 24 Dec 2025 23:40:18 -0800 Subject: [PATCH] remove huey --- archivebox/api/v1_api.py | 1 + archivebox/api/v1_cli.py | 5 +- archivebox/api/v1_machine.py | 206 ++++++ archivebox/api/v1_workers.py | 206 +++--- archivebox/base_models/admin.py | 268 ++++++-- archivebox/cli/archivebox_add.py | 3 +- archivebox/config/configset.py | 67 +- archivebox/config/views.py | 2 +- archivebox/core/admin_archiveresults.py | 32 +- archivebox/core/admin_site.py | 19 +- archivebox/core/admin_snapshots.py | 40 +- archivebox/core/apps.py | 39 +- archivebox/core/models.py | 83 ++- archivebox/core/settings.py | 69 +- archivebox/core/templatetags/core_tags.py | 118 +++- archivebox/core/urls.py | 6 +- archivebox/core/views.py | 186 ++++- archivebox/crawls/admin.py | 277 +++++++- archivebox/hooks.py | 186 +++++ archivebox/machine/admin.py | 54 +- archivebox/misc/db.py | 6 +- archivebox/misc/logging_util.py | 132 ++++ .../plugins/archive_org/templates/icon.html | 1 + .../on_Snapshot__92_canonical_outputs.py | 80 +-- .../on_Crawl__00_install_chrome.py | 149 ---- .../tests/test_chrome_session.py | 69 +- archivebox/plugins/dom/templates/embed.html | 6 + .../plugins/dom/templates/fullscreen.html | 6 + archivebox/plugins/dom/templates/icon.html | 1 + .../plugins/dom/templates/thumbnail.html | 8 + .../plugins/favicon/templates/icon.html | 1 + .../plugins/git/on_Crawl__00_install_git.py | 68 -- archivebox/plugins/git/templates/embed.html | 6 + .../plugins/git/templates/fullscreen.html | 6 + archivebox/plugins/git/templates/icon.html | 1 + .../plugins/git/templates/thumbnail.html | 5 + archivebox/plugins/git/tests/test_git.py | 74 +- .../plugins/headers/templates/icon.html | 1 + .../plugins/htmltotext/templates/icon.html | 1 + .../media/on_Crawl__00_install_ytdlp.py | 67 -- .../media/on_Crawl__00_validate_ytdlp.py | 278 ++++++++ archivebox/plugins/media/templates/embed.html | 9 + .../plugins/media/templates/fullscreen.html | 10 + archivebox/plugins/media/templates/icon.html | 1 + .../plugins/media/templates/thumbnail.html | 14 + archivebox/plugins/media/tests/test_media.py | 64 +- .../mercury/on_Crawl__00_install_mercury.py | 68 -- .../mercury/on_Crawl__00_validate_mercury.py | 123 ++++ .../mercury/on_Snapshot__53_mercury.py | 18 +- .../plugins/mercury/templates/embed.html | 6 + .../plugins/mercury/templates/fullscreen.html | 6 + .../plugins/mercury/templates/icon.html | 1 + .../plugins/mercury/templates/thumbnail.html | 8 + .../plugins/mercury/tests/test_mercury.py | 81 ++- .../merkletree/on_Snapshot__93_merkletree.py | 164 +---- .../parse_dom_outlinks/templates/icon.html | 1 + .../on_Snapshot__60_parse_html_urls.py | 3 +- .../parse_html_urls/templates/icon.html | 1 + .../on_Snapshot__64_parse_jsonl_urls.py | 3 +- .../parse_jsonl_urls/templates/icon.html | 1 + .../on_Snapshot__63_parse_netscape_urls.py | 3 +- .../parse_netscape_urls/templates/icon.html | 1 + .../on_Snapshot__61_parse_rss_urls.py | 3 +- .../parse_rss_urls/templates/icon.html | 1 + .../on_Snapshot__62_parse_txt_urls.py | 3 +- .../parse_txt_urls/templates/icon.html | 1 + archivebox/plugins/pdf/templates/embed.html | 5 + .../plugins/pdf/templates/fullscreen.html | 5 + archivebox/plugins/pdf/templates/icon.html | 1 + .../plugins/pdf/templates/thumbnail.html | 6 + .../on_Crawl__00_install_readability.py | 68 -- .../on_Crawl__00_validate_readability.py | 123 ++++ .../plugins/readability/templates/embed.html | 6 + .../readability/templates/fullscreen.html | 6 + 
.../plugins/readability/templates/icon.html | 1 + .../readability/templates/thumbnail.html | 8 + .../readability/tests/test_readability.py | 71 +- .../plugins/screenshot/templates/embed.html | 5 + .../screenshot/templates/fullscreen.html | 8 + .../plugins/screenshot/templates/icon.html | 1 + .../screenshot/templates/thumbnail.html | 8 + .../plugins/singlefile/templates/embed.html | 6 + .../singlefile/templates/fullscreen.html | 6 + .../plugins/singlefile/templates/icon.html | 1 + .../singlefile/templates/thumbnail.html | 8 + .../plugins/staticfile/templates/icon.html | 1 + archivebox/plugins/title/templates/icon.html | 1 + .../plugins/wget/on_Crawl__00_install_wget.py | 68 -- archivebox/plugins/wget/templates/embed.html | 6 + .../plugins/wget/templates/fullscreen.html | 6 + archivebox/plugins/wget/templates/icon.html | 1 + .../plugins/wget/templates/thumbnail.html | 8 + archivebox/plugins/wget/tests/test_wget.py | 76 +- archivebox/templates/admin/base.html | 4 + .../templates/admin/progress_monitor.html | 648 ++++++++++++++++++ archivebox/templates/core/snapshot_live.html | 60 +- archivebox/workers/admin.py | 28 +- archivebox/workers/management/__init__.py | 0 .../workers/management/commands/__init__.py | 0 .../management/commands/orchestrator.py | 15 + archivebox/workers/orchestrator.py | 82 ++- archivebox/workers/supervisord_util.py | 28 +- archivebox/workers/tasks.py | 120 ++-- archivebox/workers/worker.py | 113 ++- pyproject.toml | 2 - 105 files changed, 3635 insertions(+), 1402 deletions(-) create mode 100644 archivebox/api/v1_machine.py create mode 100644 archivebox/plugins/archive_org/templates/icon.html delete mode 100755 archivebox/plugins/chrome_session/on_Crawl__00_install_chrome.py create mode 100644 archivebox/plugins/dom/templates/embed.html create mode 100644 archivebox/plugins/dom/templates/fullscreen.html create mode 100644 archivebox/plugins/dom/templates/icon.html create mode 100644 archivebox/plugins/dom/templates/thumbnail.html create mode 100644 archivebox/plugins/favicon/templates/icon.html delete mode 100755 archivebox/plugins/git/on_Crawl__00_install_git.py create mode 100644 archivebox/plugins/git/templates/embed.html create mode 100644 archivebox/plugins/git/templates/fullscreen.html create mode 100644 archivebox/plugins/git/templates/icon.html create mode 100644 archivebox/plugins/git/templates/thumbnail.html create mode 100644 archivebox/plugins/headers/templates/icon.html create mode 100644 archivebox/plugins/htmltotext/templates/icon.html delete mode 100755 archivebox/plugins/media/on_Crawl__00_install_ytdlp.py create mode 100755 archivebox/plugins/media/on_Crawl__00_validate_ytdlp.py create mode 100644 archivebox/plugins/media/templates/embed.html create mode 100644 archivebox/plugins/media/templates/fullscreen.html create mode 100644 archivebox/plugins/media/templates/icon.html create mode 100644 archivebox/plugins/media/templates/thumbnail.html delete mode 100755 archivebox/plugins/mercury/on_Crawl__00_install_mercury.py create mode 100755 archivebox/plugins/mercury/on_Crawl__00_validate_mercury.py create mode 100644 archivebox/plugins/mercury/templates/embed.html create mode 100644 archivebox/plugins/mercury/templates/fullscreen.html create mode 100644 archivebox/plugins/mercury/templates/icon.html create mode 100644 archivebox/plugins/mercury/templates/thumbnail.html create mode 100644 archivebox/plugins/parse_dom_outlinks/templates/icon.html create mode 100644 archivebox/plugins/parse_html_urls/templates/icon.html create mode 100644 
archivebox/plugins/parse_jsonl_urls/templates/icon.html create mode 100644 archivebox/plugins/parse_netscape_urls/templates/icon.html create mode 100644 archivebox/plugins/parse_rss_urls/templates/icon.html create mode 100644 archivebox/plugins/parse_txt_urls/templates/icon.html create mode 100644 archivebox/plugins/pdf/templates/embed.html create mode 100644 archivebox/plugins/pdf/templates/fullscreen.html create mode 100644 archivebox/plugins/pdf/templates/icon.html create mode 100644 archivebox/plugins/pdf/templates/thumbnail.html delete mode 100755 archivebox/plugins/readability/on_Crawl__00_install_readability.py create mode 100755 archivebox/plugins/readability/on_Crawl__00_validate_readability.py create mode 100644 archivebox/plugins/readability/templates/embed.html create mode 100644 archivebox/plugins/readability/templates/fullscreen.html create mode 100644 archivebox/plugins/readability/templates/icon.html create mode 100644 archivebox/plugins/readability/templates/thumbnail.html create mode 100644 archivebox/plugins/screenshot/templates/embed.html create mode 100644 archivebox/plugins/screenshot/templates/fullscreen.html create mode 100644 archivebox/plugins/screenshot/templates/icon.html create mode 100644 archivebox/plugins/screenshot/templates/thumbnail.html create mode 100644 archivebox/plugins/singlefile/templates/embed.html create mode 100644 archivebox/plugins/singlefile/templates/fullscreen.html create mode 100644 archivebox/plugins/singlefile/templates/icon.html create mode 100644 archivebox/plugins/singlefile/templates/thumbnail.html create mode 100644 archivebox/plugins/staticfile/templates/icon.html create mode 100644 archivebox/plugins/title/templates/icon.html delete mode 100755 archivebox/plugins/wget/on_Crawl__00_install_wget.py create mode 100644 archivebox/plugins/wget/templates/embed.html create mode 100644 archivebox/plugins/wget/templates/fullscreen.html create mode 100644 archivebox/plugins/wget/templates/icon.html create mode 100644 archivebox/plugins/wget/templates/thumbnail.html create mode 100644 archivebox/templates/admin/progress_monitor.html create mode 100644 archivebox/workers/management/__init__.py create mode 100644 archivebox/workers/management/commands/__init__.py create mode 100644 archivebox/workers/management/commands/orchestrator.py diff --git a/archivebox/api/v1_api.py b/archivebox/api/v1_api.py index b1b7ff2c..524b5da5 100644 --- a/archivebox/api/v1_api.py +++ b/archivebox/api/v1_api.py @@ -42,6 +42,7 @@ def register_urls(api: NinjaAPI) -> NinjaAPI: api.add_router('/crawls/', 'api.v1_crawls.router') api.add_router('/cli/', 'api.v1_cli.router') api.add_router('/workers/', 'api.v1_workers.router') + api.add_router('/machine/', 'api.v1_machine.router') return api diff --git a/archivebox/api/v1_cli.py b/archivebox/api/v1_cli.py index 15e8a984..9282acce 100644 --- a/archivebox/api/v1_cli.py +++ b/archivebox/api/v1_cli.py @@ -107,7 +107,7 @@ class RemoveCommandSchema(Schema): @router.post("/add", response=CLICommandResponseSchema, summary='archivebox add [args] [urls]') def cli_add(request, args: AddCommandSchema): from archivebox.cli.archivebox_add import add - + result = add( urls=args.urls, tag=args.tag, @@ -115,8 +115,9 @@ def cli_add(request, args: AddCommandSchema): update=args.update, index_only=args.index_only, overwrite=args.overwrite, - extract=args.extract, + plugins=args.extract, # extract in API maps to plugins param parser=args.parser, + bg=True, # Always run in background for API calls ) return { diff --git 
a/archivebox/api/v1_machine.py b/archivebox/api/v1_machine.py new file mode 100644 index 00000000..fa8a6ad8 --- /dev/null +++ b/archivebox/api/v1_machine.py @@ -0,0 +1,206 @@ +__package__ = 'archivebox.api' + +from uuid import UUID +from typing import List, Optional +from datetime import datetime + +from ninja import Router, Schema, FilterSchema, Field, Query +from ninja.pagination import paginate + +from api.v1_core import CustomPagination + + +router = Router(tags=['Machine and Dependencies']) + + +# ============================================================================ +# Machine Schemas +# ============================================================================ + +class MachineSchema(Schema): + """Schema for Machine model.""" + TYPE: str = 'machine.Machine' + id: UUID + created_at: datetime + modified_at: datetime + guid: str + hostname: str + hw_in_docker: bool + hw_in_vm: bool + hw_manufacturer: str + hw_product: str + hw_uuid: str + os_arch: str + os_family: str + os_platform: str + os_release: str + os_kernel: str + stats: dict + num_uses_succeeded: int + num_uses_failed: int + + +class MachineFilterSchema(FilterSchema): + id: Optional[str] = Field(None, q='id__startswith') + hostname: Optional[str] = Field(None, q='hostname__icontains') + os_platform: Optional[str] = Field(None, q='os_platform__icontains') + os_arch: Optional[str] = Field(None, q='os_arch') + hw_in_docker: Optional[bool] = Field(None, q='hw_in_docker') + hw_in_vm: Optional[bool] = Field(None, q='hw_in_vm') + + +# ============================================================================ +# Dependency Schemas +# ============================================================================ + +class DependencySchema(Schema): + """Schema for Dependency model.""" + TYPE: str = 'machine.Dependency' + id: UUID + created_at: datetime + modified_at: datetime + bin_name: str + bin_providers: str + custom_cmds: dict + config: dict + is_installed: bool + installed_count: int + + @staticmethod + def resolve_is_installed(obj) -> bool: + return obj.is_installed + + @staticmethod + def resolve_installed_count(obj) -> int: + return obj.installed_binaries.count() + + +class DependencyFilterSchema(FilterSchema): + id: Optional[str] = Field(None, q='id__startswith') + bin_name: Optional[str] = Field(None, q='bin_name__icontains') + bin_providers: Optional[str] = Field(None, q='bin_providers__icontains') + + +# ============================================================================ +# InstalledBinary Schemas +# ============================================================================ + +class InstalledBinarySchema(Schema): + """Schema for InstalledBinary model.""" + TYPE: str = 'machine.InstalledBinary' + id: UUID + created_at: datetime + modified_at: datetime + machine_id: UUID + machine_hostname: str + dependency_id: Optional[UUID] + dependency_bin_name: Optional[str] + name: str + binprovider: str + abspath: str + version: str + sha256: str + is_valid: bool + num_uses_succeeded: int + num_uses_failed: int + + @staticmethod + def resolve_machine_hostname(obj) -> str: + return obj.machine.hostname + + @staticmethod + def resolve_dependency_id(obj) -> Optional[UUID]: + return obj.dependency_id + + @staticmethod + def resolve_dependency_bin_name(obj) -> Optional[str]: + return obj.dependency.bin_name if obj.dependency else None + + @staticmethod + def resolve_is_valid(obj) -> bool: + return obj.is_valid + + +class InstalledBinaryFilterSchema(FilterSchema): + id: Optional[str] = Field(None, q='id__startswith') + name: 
Optional[str] = Field(None, q='name__icontains') + binprovider: Optional[str] = Field(None, q='binprovider') + machine_id: Optional[str] = Field(None, q='machine_id__startswith') + dependency_id: Optional[str] = Field(None, q='dependency_id__startswith') + version: Optional[str] = Field(None, q='version__icontains') + + +# ============================================================================ +# Machine Endpoints +# ============================================================================ + +@router.get("/machines", response=List[MachineSchema], url_name="get_machines") +@paginate(CustomPagination) +def get_machines(request, filters: MachineFilterSchema = Query(...)): + """List all machines.""" + from machine.models import Machine + return filters.filter(Machine.objects.all()).distinct() + + +@router.get("/machine/{machine_id}", response=MachineSchema, url_name="get_machine") +def get_machine(request, machine_id: str): + """Get a specific machine by ID.""" + from machine.models import Machine + from django.db.models import Q + return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id)) + + +@router.get("/machine/current", response=MachineSchema, url_name="get_current_machine") +def get_current_machine(request): + """Get the current machine.""" + from machine.models import Machine + return Machine.current() + + +# ============================================================================ +# Dependency Endpoints +# ============================================================================ + +@router.get("/dependencies", response=List[DependencySchema], url_name="get_dependencies") +@paginate(CustomPagination) +def get_dependencies(request, filters: DependencyFilterSchema = Query(...)): + """List all dependencies.""" + from machine.models import Dependency + return filters.filter(Dependency.objects.all()).distinct() + + +@router.get("/dependency/{dependency_id}", response=DependencySchema, url_name="get_dependency") +def get_dependency(request, dependency_id: str): + """Get a specific dependency by ID or bin_name.""" + from machine.models import Dependency + from django.db.models import Q + try: + return Dependency.objects.get(Q(id__startswith=dependency_id)) + except Dependency.DoesNotExist: + return Dependency.objects.get(bin_name__iexact=dependency_id) + + +# ============================================================================ +# InstalledBinary Endpoints +# ============================================================================ + +@router.get("/binaries", response=List[InstalledBinarySchema], url_name="get_binaries") +@paginate(CustomPagination) +def get_binaries(request, filters: InstalledBinaryFilterSchema = Query(...)): + """List all installed binaries.""" + from machine.models import InstalledBinary + return filters.filter(InstalledBinary.objects.all().select_related('machine', 'dependency')).distinct() + + +@router.get("/binary/{binary_id}", response=InstalledBinarySchema, url_name="get_binary") +def get_binary(request, binary_id: str): + """Get a specific installed binary by ID.""" + from machine.models import InstalledBinary + return InstalledBinary.objects.select_related('machine', 'dependency').get(id__startswith=binary_id) + + +@router.get("/binary/by-name/{name}", response=List[InstalledBinarySchema], url_name="get_binaries_by_name") +def get_binaries_by_name(request, name: str): + """Get all installed binaries with the given name.""" + from machine.models import InstalledBinary + return 
list(InstalledBinary.objects.filter(name__iexact=name).select_related('machine', 'dependency')) diff --git a/archivebox/api/v1_workers.py b/archivebox/api/v1_workers.py index 11b258cb..d95c6ff6 100644 --- a/archivebox/api/v1_workers.py +++ b/archivebox/api/v1_workers.py @@ -4,125 +4,157 @@ from uuid import UUID from typing import List, Any from datetime import datetime - from ninja import Router, Schema router = Router(tags=['Workers and Tasks']) -class TaskSchema(Schema): +class QueueItemSchema(Schema): + """Schema for a single item in a worker's queue.""" TYPE: str - id: UUID - description: str - status: str retry_at: datetime | None - created_at: datetime modified_at: datetime - created_by_id: int - + description: str + + @staticmethod + def resolve_TYPE(obj) -> str: + return f'{obj._meta.app_label}.{obj._meta.model_name}' + @staticmethod def resolve_description(obj) -> str: return str(obj) -class ActorSchema(Schema): - # TYPE: str = 'workers.actor.ActorType' - - # name: str - #pid: int | None - idle_count: int - launch_kwargs: dict[str, Any] - mode: str - +class WorkerSchema(Schema): + """Schema for a Worker type.""" + name: str model: str - statemachine: str - ACTIVE_STATE: str - EVENT_NAME: str - CLAIM_ORDER: list[str] - CLAIM_FROM_TOP_N: int - CLAIM_ATOMIC: bool - MAX_TICK_TIME: int - MAX_CONCURRENT_ACTORS: int - - future: list[TaskSchema] - pending: list[TaskSchema] - stalled: list[TaskSchema] - active: list[TaskSchema] - past: list[TaskSchema] - + max_tick_time: int + max_concurrent_tasks: int + poll_interval: float + idle_timeout: int + running_count: int + running_workers: List[dict[str, Any]] + queue_count: int + queue: List[QueueItemSchema] + @staticmethod def resolve_model(obj) -> str: - return obj.Model.__name__ - - @staticmethod - def resolve_statemachine(obj) -> str: - return obj.StateMachineClass.__name__ - - @staticmethod - def resolve_name(obj) -> str: - return str(obj) + Model = obj.get_model() + return f'{Model._meta.app_label}.{Model._meta.model_name}' @staticmethod - def resolve_ACTIVE_STATE(obj) -> str: - return str(obj.ACTIVE_STATE) - - @staticmethod - def resolve_FINAL_STATES(obj) -> list[str]: - return [str(state) for state in obj.FINAL_STATES] - - @staticmethod - def resolve_future(obj) -> list[TaskSchema]: - return [obj for obj in obj.qs.filter(obj.future_q).order_by('-retry_at')] - - @staticmethod - def resolve_pending(obj) -> list[TaskSchema]: - return [obj for obj in obj.qs.filter(obj.pending_q).order_by('-retry_at')] - - @staticmethod - def resolve_stalled(obj) -> list[TaskSchema]: - return [obj for obj in obj.qs.filter(obj.stalled_q).order_by('-retry_at')] - - @staticmethod - def resolve_active(obj) -> list[TaskSchema]: - return [obj for obj in obj.qs.filter(obj.active_q).order_by('-retry_at')] + def resolve_max_tick_time(obj) -> int: + return obj.MAX_TICK_TIME @staticmethod - def resolve_past(obj) -> list[TaskSchema]: - return [obj for obj in obj.qs.filter(obj.final_q).order_by('-modified_at')] + def resolve_max_concurrent_tasks(obj) -> int: + return obj.MAX_CONCURRENT_TASKS + + @staticmethod + def resolve_poll_interval(obj) -> float: + return obj.POLL_INTERVAL + + @staticmethod + def resolve_idle_timeout(obj) -> int: + return obj.IDLE_TIMEOUT + + @staticmethod + def resolve_running_count(obj) -> int: + return len(obj.get_running_workers()) + + @staticmethod + def resolve_running_workers(obj) -> List[dict[str, Any]]: + return obj.get_running_workers() + + @staticmethod + def resolve_queue_count(obj) -> int: + return obj.get_queue().count() + + 
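For illustration, the machine/dependency endpoints registered under /machine/ above can be queried like any other ninja route. A minimal sketch, assuming the API is mounted at /api/v1/ (as in api.v1_api.register_urls), the dev server is reachable on localhost:8000, and 'wget' is a registered dependency bin_name; the exact response envelope for the paginated list endpoints depends on CustomPagination:

import requests

BASE = 'http://localhost:8000/api/v1'  # assumed local dev server address

# list all machines (response shape comes from CustomPagination)
print(requests.get(f'{BASE}/machine/machines').json())

# look up a dependency by id prefix or by bin_name (get_dependency accepts either)
print(requests.get(f'{BASE}/machine/dependency/wget').json())

# list every installed binary named "wget" across machines (this endpoint is not paginated)
for binary in requests.get(f'{BASE}/machine/binary/by-name/wget').json():
    print(binary['machine_hostname'], binary['abspath'], binary['version'])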
@staticmethod + def resolve_queue(obj) -> List[QueueItemSchema]: + return list(obj.get_queue()[:50]) # Limit to 50 items class OrchestratorSchema(Schema): - # TYPE: str = 'workers.orchestrator.Orchestrator' - - #pid: int | None - exit_on_idle: bool - mode: str - - actors: list[ActorSchema] - - @staticmethod - def resolve_actors(obj) -> list[ActorSchema]: - return [actor() for actor in obj.actor_types.values()] + """Schema for the Orchestrator.""" + is_running: bool + poll_interval: float + idle_timeout: int + max_workers_per_type: int + max_total_workers: int + total_worker_count: int + workers: List[WorkerSchema] -@router.get("/orchestrators", response=List[OrchestratorSchema], url_name="get_orchestrators") -def get_orchestrators(request): - """List all the task orchestrators (aka Orchestrators) that are currently running""" - +@router.get("/orchestrator", response=OrchestratorSchema, url_name="get_orchestrator") +def get_orchestrator(request): + """Get the orchestrator status and all worker queues.""" from workers.orchestrator import Orchestrator + from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker + orchestrator = Orchestrator() - return [orchestrator] + # Create temporary worker instances to query their queues + workers = [ + CrawlWorker(worker_id=-1), + SnapshotWorker(worker_id=-1), + ArchiveResultWorker(worker_id=-1), + ] + + return { + 'is_running': orchestrator.is_running(), + 'poll_interval': orchestrator.POLL_INTERVAL, + 'idle_timeout': orchestrator.IDLE_TIMEOUT, + 'max_workers_per_type': orchestrator.MAX_WORKERS_PER_TYPE, + 'max_total_workers': orchestrator.MAX_TOTAL_WORKERS, + 'total_worker_count': orchestrator.get_total_worker_count(), + 'workers': workers, + } -@router.get("/actors", response=List[ActorSchema], url_name="get_actors") -def get_actors(request): - """List all the task consumer workers (aka Actors) that are currently running""" +@router.get("/workers", response=List[WorkerSchema], url_name="get_workers") +def get_workers(request): + """List all worker types and their current status.""" + from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker - from workers.orchestrator import Orchestrator - orchestrator = Orchestrator() - return orchestrator.actor_types.values() + # Create temporary instances to query their queues + return [ + CrawlWorker(worker_id=-1), + SnapshotWorker(worker_id=-1), + ArchiveResultWorker(worker_id=-1), + ] + + +@router.get("/worker/{worker_name}", response=WorkerSchema, url_name="get_worker") +def get_worker(request, worker_name: str): + """Get status and queue for a specific worker type.""" + from workers.worker import WORKER_TYPES + + if worker_name not in WORKER_TYPES: + from ninja.errors import HttpError + raise HttpError(404, f"Unknown worker type: {worker_name}. Valid types: {list(WORKER_TYPES.keys())}") + + WorkerClass = WORKER_TYPES[worker_name] + return WorkerClass(worker_id=-1) + + +@router.get("/worker/{worker_name}/queue", response=List[QueueItemSchema], url_name="get_worker_queue") +def get_worker_queue(request, worker_name: str, limit: int = 100): + """Get the current queue for a specific worker type.""" + from workers.worker import WORKER_TYPES + + if worker_name not in WORKER_TYPES: + from ninja.errors import HttpError + raise HttpError(404, f"Unknown worker type: {worker_name}. 
Valid types: {list(WORKER_TYPES.keys())}") + + WorkerClass = WORKER_TYPES[worker_name] + worker = WorkerClass(worker_id=-1) + return list(worker.get_queue()[:limit]) + + +# Progress endpoint moved to core.views.live_progress_view for simplicity diff --git a/archivebox/base_models/admin.py b/archivebox/base_models/admin.py index e2028db9..3c4fa643 100644 --- a/archivebox/base_models/admin.py +++ b/archivebox/base_models/admin.py @@ -2,76 +2,226 @@ __package__ = 'archivebox.base_models' +import json + +from django import forms from django.contrib import admin from django.utils.html import format_html, mark_safe from django_object_actions import DjangoObjectActions +class KeyValueWidget(forms.Widget): + """ + A widget that renders JSON dict as editable key-value input fields + with + and - buttons to add/remove rows. + Includes autocomplete for available config keys from the plugin system. + """ + template_name = None # We render manually + + class Media: + css = { + 'all': [] + } + js = [] + + def _get_config_options(self): + """Get available config options from plugins.""" + try: + from archivebox.hooks import discover_plugin_configs + plugin_configs = discover_plugin_configs() + options = {} + for plugin_name, schema in plugin_configs.items(): + for key, prop in schema.get('properties', {}).items(): + options[key] = { + 'plugin': plugin_name, + 'type': prop.get('type', 'string'), + 'default': prop.get('default', ''), + 'description': prop.get('description', ''), + } + return options + except Exception: + return {} + + def render(self, name, value, attrs=None, renderer=None): + # Parse JSON value to dict + if value is None: + data = {} + elif isinstance(value, str): + try: + data = json.loads(value) if value else {} + except json.JSONDecodeError: + data = {} + elif isinstance(value, dict): + data = value + else: + data = {} + + widget_id = attrs.get('id', name) if attrs else name + config_options = self._get_config_options() + + # Build datalist options + datalist_options = '\n'.join( + f'' + for key, opt in sorted(config_options.items()) + ) + + # Build config metadata as JSON for JS + config_meta_json = json.dumps(config_options) + + html = f''' +
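The worker endpoints above instantiate throwaway workers with worker_id=-1 purely to read their queues, and the same pattern works interactively. A rough sketch, assuming Django is already configured (e.g. inside an archivebox shell session):

from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker

for WorkerClass in (CrawlWorker, SnapshotWorker, ArchiveResultWorker):
    worker = WorkerClass(worker_id=-1)   # temporary instance, never actually started
    queue = worker.get_queue()           # queryset of objects waiting to be processed
    print(WorkerClass.__name__, queue.count(), 'items queued')
    for obj in queue[:5]:
        print('   ', obj)                # str(obj) is what QueueItemSchema.description exposes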
+ + {datalist_options} + +
+ ''' + + # Render existing key-value pairs + row_idx = 0 + for key, val in data.items(): + val_str = json.dumps(val) if not isinstance(val, str) else val + html += self._render_row(widget_id, row_idx, key, val_str) + row_idx += 1 + + # Always add one empty row for new entries + html += self._render_row(widget_id, row_idx, '', '') + + html += f''' +
+
+ + +
+ + +
+ ''' + return mark_safe(html) + + def _render_row(self, widget_id, idx, key, value): + return f''' +
+ + + +
+ ''' + + def _escape(self, s): + """Escape HTML special chars in attribute values.""" + if not s: + return '' + return str(s).replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"') + + def value_from_datadict(self, data, files, name): + value = data.get(name, '{}') + return value + + class ConfigEditorMixin: """ Mixin for admin classes with a config JSON field. - Provides a readonly field that shows available config options - from all discovered plugin schemas. + Provides a key-value editor widget with autocomplete for available config keys. """ - @admin.display(description='Available Config Options') - def available_config_options(self, obj): - """Show documentation for available config keys.""" - try: - from archivebox.hooks import discover_plugin_configs - plugin_configs = discover_plugin_configs() - except ImportError: - return format_html('Plugin config system not available') - - html_parts = [ - '
', - '', - 'Click to see available config keys ({})'.format( - sum(len(s.get('properties', {})) for s in plugin_configs.values()) - ), - '
', - ] - - for plugin_name, schema in sorted(plugin_configs.items()): - properties = schema.get('properties', {}) - if not properties: - continue - - html_parts.append(f'
{plugin_name}
') - html_parts.append('') - html_parts.append('') - - for key, prop in sorted(properties.items()): - prop_type = prop.get('type', 'string') - default = prop.get('default', '') - description = prop.get('description', '') - - # Truncate long defaults - default_str = str(default) - if len(default_str) > 30: - default_str = default_str[:27] + '...' - - html_parts.append( - f'' - f'' - f'' - f'' - f'' - f'' - ) - - html_parts.append('
KeyTypeDefaultDescription
{key}{prop_type}{default_str}{description}
') - - html_parts.append('
') - html_parts.append( - '

' - 'Usage: Add key-value pairs in JSON format, e.g., ' - '{"SAVE_WGET": false, "WGET_TIMEOUT": 120}' - '

' - ) - - return mark_safe(''.join(html_parts)) + def formfield_for_dbfield(self, db_field, request, **kwargs): + """Use KeyValueWidget for the config JSON field.""" + if db_field.name == 'config': + kwargs['widget'] = KeyValueWidget() + return super().formfield_for_dbfield(db_field, request, **kwargs) class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin): diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 051f9f72..e9bcc53e 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -72,9 +72,10 @@ def add(urls: str | list[str], cli_args[0] = 'archivebox' cmd_str = ' '.join(cli_args) + timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S") seed = Seed.from_file( sources_file, - label=f'{USER}@{HOSTNAME} $ {cmd_str}', + label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]', parser=parser, tag=tag, created_by=created_by_id, diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py index f483d991..aeadbbca 100644 --- a/archivebox/config/configset.py +++ b/archivebox/config/configset.py @@ -11,21 +11,53 @@ __package__ = "archivebox.config" import os import json from pathlib import Path -from typing import Any, Dict, Optional, List, Type, TYPE_CHECKING, cast +from typing import Any, Dict, Optional, List, Type, Tuple, TYPE_CHECKING, cast from configparser import ConfigParser from pydantic import Field -from pydantic_settings import BaseSettings +from pydantic_settings import BaseSettings, PydanticBaseSettingsSource + + +class IniConfigSettingsSource(PydanticBaseSettingsSource): + """ + Custom settings source that reads from ArchiveBox.conf (INI format). + Flattens all sections into a single namespace. + """ + + def get_field_value(self, field: Any, field_name: str) -> Tuple[Any, str, bool]: + config_vals = self._load_config_file() + field_value = config_vals.get(field_name.upper()) + return field_value, field_name, False + + def __call__(self) -> Dict[str, Any]: + return self._load_config_file() + + def _load_config_file(self) -> Dict[str, Any]: + try: + from archivebox.config.constants import CONSTANTS + config_path = CONSTANTS.CONFIG_FILE + except ImportError: + return {} + + if not config_path.exists(): + return {} + + parser = ConfigParser() + parser.optionxform = lambda x: x # preserve case + parser.read(config_path) + + # Flatten all sections into single namespace (ignore section headers) + return {key.upper(): value for section in parser.sections() for key, value in parser.items(section)} class BaseConfigSet(BaseSettings): """ Base class for config sections. - Automatically loads values from: - 1. Environment variables (highest priority) - 2. ArchiveBox.conf file (if exists) - 3. Default values (lowest priority) + Automatically loads values from (highest to lowest priority): + 1. Environment variables + 2. ArchiveBox.conf file (INI format, flattened) + 3. Default values Subclasses define fields with defaults and types: @@ -35,11 +67,30 @@ class BaseConfigSet(BaseSettings): """ class Config: - # Use env vars with ARCHIVEBOX_ prefix or raw name env_prefix = "" extra = "ignore" validate_default = True + @classmethod + def settings_customise_sources( + cls, + settings_cls: Type[BaseSettings], + init_settings: PydanticBaseSettingsSource, + env_settings: PydanticBaseSettingsSource, + dotenv_settings: PydanticBaseSettingsSource, + file_secret_settings: PydanticBaseSettingsSource, + ) -> Tuple[PydanticBaseSettingsSource, ...]: + """ + Define the order of settings sources (first = highest priority). 
+ """ + return ( + init_settings, # 1. Passed to __init__ + env_settings, # 2. Environment variables + IniConfigSettingsSource(settings_cls), # 3. ArchiveBox.conf file + # dotenv_settings, # Skip .env files + # file_secret_settings, # Skip secrets files + ) + @classmethod def load_from_file(cls, config_path: Path) -> Dict[str, str]: """Load config values from INI file.""" @@ -47,7 +98,7 @@ class BaseConfigSet(BaseSettings): return {} parser = ConfigParser() - parser.optionxform = lambda x: x # type: ignore # preserve case + parser.optionxform = lambda x: x # preserve case parser.read(config_path) # Flatten all sections into single namespace diff --git a/archivebox/config/views.py b/archivebox/config/views.py index 5cfb0190..0f1c33b6 100644 --- a/archivebox/config/views.py +++ b/archivebox/config/views.py @@ -256,7 +256,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext: # Show a helpful message when no plugins found rows['Name'].append('(no plugins found)') rows['Source'].append('-') - rows['Path'].append(format_html('archivebox/plugins/ or data/plugins/')) + rows['Path'].append(mark_safe('archivebox/plugins/ or data/plugins/')) rows['Hooks'].append('-') return TableContext( diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py index 1e3b9be4..5497d2a6 100644 --- a/archivebox/core/admin_archiveresults.py +++ b/archivebox/core/admin_archiveresults.py @@ -9,25 +9,17 @@ from django.core.exceptions import ValidationError from django.urls import reverse, resolve from django.utils import timezone -from huey_monitor.admin import TaskModel - from archivebox.config import DATA_DIR from archivebox.config.common import SERVER_CONFIG from archivebox.misc.paginators import AccelleratedPaginator from archivebox.base_models.admin import BaseModelAdmin +from archivebox.hooks import get_extractor_icon from core.models import ArchiveResult, Snapshot - -def result_url(result: TaskModel) -> str: - url = reverse("admin:huey_monitor_taskmodel_change", args=[str(result.id)]) - return format_html('See progress...'.format(url=url)) - - - class ArchiveResultInline(admin.TabularInline): name = 'Archive Results Log' model = ArchiveResult @@ -101,9 +93,9 @@ class ArchiveResultInline(admin.TabularInline): class ArchiveResultAdmin(BaseModelAdmin): - list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor', 'cmd_str', 'output_str') + list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str') sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status') - readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary') + readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon') search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields) autocomplete_fields = ['snapshot'] @@ -144,17 +136,29 @@ class ArchiveResultAdmin(BaseModelAdmin): def tags_str(self, result): return result.snapshot.tags_str() + @admin.display(description='Extractor', ordering='extractor') + def extractor_with_icon(self, result): + icon = get_extractor_icon(result.extractor) + return format_html( + '{} {}', + result.extractor, + icon, + result.extractor, + 
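To make the source ordering defined in settings_customise_sources above concrete, here is a small hypothetical config section; the WgetConfig class, its fields, and the values used are illustrative, not taken from this patch:

import os
from archivebox.config.configset import BaseConfigSet

class WgetConfig(BaseConfigSet):        # hypothetical section for illustration only
    SAVE_WGET: bool = True              # class defaults are the lowest priority
    WGET_TIMEOUT: int = 60

# A value in ArchiveBox.conf (under any [section] header) overrides the default,
# because IniConfigSettingsSource flattens every section into one namespace:
#   [ARCHIVE_METHOD_TOGGLES]
#   WGET_TIMEOUT = 90

os.environ['WGET_TIMEOUT'] = '120'      # environment variables beat the conf file
print(WgetConfig().WGET_TIMEOUT)        # -> 120

print(WgetConfig(WGET_TIMEOUT=30).WGET_TIMEOUT)   # kwargs passed to __init__ beat everything -> 30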
) + def cmd_str(self, result): return format_html( '
{}
', ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd), ) - + def output_str(self, result): + # Determine output link path - use output if file exists, otherwise link to index + output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html' return format_html( '↗️
{}
', result.snapshot.timestamp, - result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html', + output_path, result.output, ) @@ -185,7 +189,7 @@ class ArchiveResultAdmin(BaseModelAdmin): is_hidden = filename.startswith('.') output_str += format_html('{}{}
', int(not is_hidden), indentation_str, filename.strip()) - return output_str + format_html('') + return output_str + mark_safe('') diff --git a/archivebox/core/admin_site.py b/archivebox/core/admin_site.py index 0159b9bb..67e074ac 100644 --- a/archivebox/core/admin_site.py +++ b/archivebox/core/admin_site.py @@ -35,8 +35,19 @@ def register_admin_site(): admin.site = archivebox_admin sites.site = archivebox_admin - - # Plugin admin registration is now handled by individual app admins - # No longer using archivebox.pm.hook.register_admin() - + + # Register admin views for each app + # (Previously handled by ABX plugin system, now called directly) + from core.admin import register_admin as register_core_admin + from crawls.admin import register_admin as register_crawls_admin + from api.admin import register_admin as register_api_admin + from machine.admin import register_admin as register_machine_admin + from workers.admin import register_admin as register_workers_admin + + register_core_admin(archivebox_admin) + register_crawls_admin(archivebox_admin) + register_api_admin(archivebox_admin) + register_machine_admin(archivebox_admin) + register_workers_admin(archivebox_admin) + return archivebox_admin diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py index a50d7b03..d1917e52 100644 --- a/archivebox/core/admin_snapshots.py +++ b/archivebox/core/admin_snapshots.py @@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add from core.models import Tag from core.admin_tags import TagInline -from core.admin_archiveresults import ArchiveResultInline, result_url +from core.admin_archiveresults import ArchiveResultInline # GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False} @@ -54,10 +54,10 @@ class SnapshotActionForm(ActionForm): class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str') sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl') - readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'link_dir', 'available_config_options') + readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir') search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name') list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name') - fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', 'available_config_options', *readonly_fields[:-1]) + fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields) ordering = ['-created_at'] actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] inlines = [TagInline, ArchiveResultInline] @@ -93,12 +93,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): # self.request = request # return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult')) - @admin.action( - description="Imported Timestamp" - ) + @admin.display(description="Imported Timestamp") def imported_timestamp(self, obj): context = RequestContext(self.request, { - 'bookmarked_date': obj.bookmarked, + 'bookmarked_date': 
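register_admin_site() above now imports a register_admin(admin_site) function from each app instead of going through the old ABX plugin hook. Those function bodies are not part of this hunk, but each is expected to follow roughly this shape (the model/admin pairings below are illustrative):

# rough shape of e.g. core/admin.py's register_admin (illustrative, not the literal file contents)
def register_admin(admin_site):
    from core.models import Snapshot, ArchiveResult, Tag
    from core.admin_snapshots import SnapshotAdmin
    from core.admin_archiveresults import ArchiveResultAdmin

    admin_site.register(Snapshot, SnapshotAdmin)
    admin_site.register(ArchiveResult, ArchiveResultAdmin)
    admin_site.register(Tag)   # falls back to the default ModelAdmin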
obj.bookmarked_at, 'timestamp': obj.timestamp, }) @@ -145,22 +143,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): def status_info(self, obj): return format_html( - # URL Hash: {}
''' Archived: {} ({} files {})     Favicon:     - Status code: {}    
- Server: {}     - Content type: {}     Extension: {}     ''', '✅' if obj.is_archived else '❌', obj.num_outputs, self.size(obj) or '0kb', f'/archive/{obj.timestamp}/favicon.ico', - obj.status_code or '-', - obj.headers and obj.headers.get('Server') or '-', - obj.headers and obj.headers.get('Content-Type') or '-', obj.extension or '-', ) @@ -184,8 +175,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): obj.archive_path, obj.archive_path, obj.archive_path, - 'fetched' if obj.latest_title or obj.title else 'pending', - urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...' + 'fetched' if obj.title else 'pending', + urldecode(htmldecode(obj.title or ''))[:128] or 'Pending...' ) + mark_safe(f' {tags}') @admin.display( @@ -259,14 +250,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): description="ℹ️ Get Title" ) def update_titles(self, request, queryset): - from core.models import Snapshot count = queryset.count() # Queue snapshots for archiving via the state machine system - result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR}) + queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR}) messages.success( request, - mark_safe(f"Title and favicon are updating in the background for {count} URLs. {result_url(result)}"), + f"Queued {queued} snapshots for title/favicon update. The orchestrator will process them in the background.", ) @admin.action( @@ -275,11 +265,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): def update_snapshots(self, request, queryset): count = queryset.count() - result = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR}) + queued = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR}) messages.success( request, - mark_safe(f"Re-trying any previously failed methods for {count} URLs in the background. {result_url(result)}"), + f"Queued {queued} snapshots for re-archiving. The orchestrator will process them in the background.", ) @@ -291,11 +281,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): timestamp = timezone.now().isoformat('T', 'seconds') new_url = snapshot.url.split('#')[0] + f'#{timestamp}' - result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()}) + bg_add({'urls': new_url, 'tag': snapshot.tags_str()}) messages.success( request, - mark_safe(f"Creating new fresh snapshots for {queryset.count()} URLs in the background. {result_url(result)}"), + f"Creating {queryset.count()} new fresh snapshots. The orchestrator will process them in the background.", ) @admin.action( @@ -304,11 +294,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): def overwrite_snapshots(self, request, queryset): count = queryset.count() - result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR}) + queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR}) messages.success( request, - mark_safe(f"Clearing all previous results and re-downloading {count} URLs in the background. {result_url(result)}"), + f"Queued {queued} snapshots for full re-archive (overwriting existing). 
The orchestrator will process them in the background.", ) @admin.action( diff --git a/archivebox/core/apps.py b/archivebox/core/apps.py index 981edc52..5193166d 100644 --- a/archivebox/core/apps.py +++ b/archivebox/core/apps.py @@ -1,5 +1,7 @@ __package__ = 'archivebox.core' +import sys + from django.apps import AppConfig @@ -10,6 +12,41 @@ class CoreConfig(AppConfig): """Register the archivebox.core.admin_site as the main django admin site""" from core.admin_site import register_admin_site register_admin_site() - + # Auto-start the orchestrator when running the web server + self._maybe_start_orchestrator() + def _maybe_start_orchestrator(self): + """Start the orchestrator if we're running a web server.""" + import os + + # Don't start orchestrator during migrations, shell, tests, etc. + # Only start when running: runserver, daphne, gunicorn, uwsgi + if not self._is_web_server(): + return + + # Don't start if RUN_ORCHESTRATOR env var is explicitly set to false + if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'): + return + + # Don't start in autoreload child process (avoid double-start) + if os.environ.get('RUN_MAIN') != 'true' and 'runserver' in sys.argv: + return + + try: + from workers.orchestrator import Orchestrator + + if not Orchestrator.is_running(): + # Start orchestrator as daemon (won't exit on idle when started by server) + orchestrator = Orchestrator(exit_on_idle=False) + orchestrator.start() + except Exception as e: + # Don't crash the server if orchestrator fails to start + import logging + logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}') + + def _is_web_server(self) -> bool: + """Check if we're running a web server command.""" + # Check for common web server indicators + server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server') + return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index e746c221..543435aa 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -23,7 +23,11 @@ from archivebox.config import CONSTANTS from archivebox.misc.system import get_dir_size, atomic_write from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode from archivebox.misc.hashing import get_dir_info -from archivebox.hooks import ARCHIVE_METHODS_INDEXING_PRECEDENCE +from archivebox.hooks import ( + ARCHIVE_METHODS_INDEXING_PRECEDENCE, + get_extractors, get_extractor_name, get_extractor_icon, + DEFAULT_EXTRACTOR_ICONS, +) from archivebox.base_models.models import ( ModelWithUUID, ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, @@ -343,45 +347,37 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea def icons(self) -> str: """Generate HTML icons showing which extractors have succeeded for this snapshot""" from django.utils.html import format_html, mark_safe - from collections import defaultdict cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}' def calc_icons(): if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache: - archive_results = [r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output] + archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output} else: - 
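With huey removed, background processing now relies on the orchestrator that CoreConfig auto-starts above. It can also be controlled manually; a sketch under the assumption that the new orchestrator management command added in this patch wraps roughly the same calls (its full body is not shown in this hunk):

# disable the web server auto-start entirely:
#   RUN_ORCHESTRATOR=false archivebox server

# or run/inspect the orchestrator by hand from a Django shell:
from workers.orchestrator import Orchestrator

if not Orchestrator.is_running():
    orchestrator = Orchestrator(exit_on_idle=True)   # exit once all queues drain
    orchestrator.start()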
archive_results = self.archiveresult_set.filter(status="succeeded", output__isnull=False) + archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)} path = self.archive_path canon = self.canonical_outputs() output = "" output_template = '{}  ' - icons = { - "singlefile": "❶", "wget": "🆆", "dom": "🅷", "pdf": "📄", - "screenshot": "💻", "media": "📼", "git": "🅶", "archive_org": "🏛", - "readability": "🆁", "mercury": "🅼", "warc": "📦" - } - exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"] - extractor_outputs = defaultdict(lambda: None) - for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES: - for result in archive_results: - if result.extractor == extractor: - extractor_outputs[extractor] = result + # Get all extractors from hooks system (sorted by numeric prefix) + all_extractors = [get_extractor_name(e) for e in get_extractors()] - for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES: - if extractor not in exclude: - existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output - output += format_html(output_template, path, canon.get(extractor, ''), str(bool(existing)), extractor, icons.get(extractor, "?")) - if extractor == "wget": - exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output - output += format_html(output_template, path, canon.get("warc", "warc/"), str(bool(exists)), "warc", icons.get("warc", "?")) - if extractor == "archive_org": - exists = extractor in extractor_outputs and extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output - output += '{} '.format(canon.get("archive_org", ""), str(exists), "archive_org", icons.get("archive_org", "?")) + for extractor in all_extractors: + result = archive_results.get(extractor) + existing = result and result.status == 'succeeded' and result.output + icon = get_extractor_icon(extractor) + output += format_html( + output_template, + path, + canon.get(extractor, extractor + '/'), + str(bool(existing)), + extractor, + icon + ) - return format_html('{}', mark_safe(output)) + return format_html('{}', mark_safe(output)) cache_result = cache.get(cache_key) if cache_result: @@ -767,12 +763,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi FAILED = 'failed', 'Failed' SKIPPED = 'skipped', 'Skipped' - EXTRACTOR_CHOICES = ( - ('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'), - ('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'), - ('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'), - ('dom', 'dom'), ('title', 'title'), ('wget', 'wget'), - ) + @classmethod + def get_extractor_choices(cls): + """Get extractor choices from discovered hooks (for forms/admin).""" + extractors = [get_extractor_name(e) for e in get_extractors()] + return tuple((e, e) for e in extractors) # Keep AutoField for backward compatibility with 0.7.x databases # UUID field is added separately by migration for new records @@ -783,7 +778,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi modified_at = models.DateTimeField(auto_now=True) snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore - extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32, 
blank=False, null=False, db_index=True) + # No choices= constraint - extractor names come from plugin system and can be any string + extractor = models.CharField(max_length=32, blank=False, null=False, db_index=True) pwd = models.CharField(max_length=256, default=None, null=True, blank=True) cmd = models.JSONField(default=None, null=True, blank=True) cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True) @@ -835,6 +831,25 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi def output_exists(self) -> bool: return os.path.exists(Path(self.snapshot_dir) / self.extractor) + def embed_path(self) -> Optional[str]: + """ + Get the relative path to the embeddable output file for this result. + + Returns the output field if set and file exists, otherwise tries to + find a reasonable default based on the extractor type. + """ + if self.output: + return self.output + + # Try to find output file based on extractor's canonical output path + canonical = self.snapshot.canonical_outputs() + extractor_key = f'{self.extractor}_path' + if extractor_key in canonical: + return canonical[extractor_key] + + # Fallback to extractor directory + return f'{self.extractor}/' + def create_output_dir(self): output_dir = Path(self.snapshot_dir) / self.extractor output_dir.mkdir(parents=True, exist_ok=True) @@ -891,6 +906,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi output_dir=extractor_dir, config_objects=config_objects, url=self.snapshot.url, + snapshot_id=str(self.snapshot.id), ) end_ts = timezone.now() @@ -1000,6 +1016,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi hook, output_dir=self.output_dir, config_objects=config_objects, + url=self.snapshot.url, snapshot_id=str(self.snapshot.id), extractor=self.extractor, ) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index c3a67d09..d051229d 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -68,9 +68,6 @@ INSTALLED_APPS = [ # 3rd-party apps from PyPI that need to be loaded last "admin_data_views", # handles rendering some convenient automatic read-only views of data in Django admin "django_extensions", # provides Django Debug Toolbar (and other non-debug helpers) - "django_huey", # provides multi-queue support for django huey https://github.com/gaiacoop/django-huey - "bx_django_utils", # needed for huey_monitor https://github.com/boxine/bx_django_utils - "huey_monitor", # adds an admin UI for monitoring background huey tasks https://github.com/boxine/django-huey-monitor ] @@ -215,70 +212,6 @@ MIGRATION_MODULES = {"signal_webhooks": None} # as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0 DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" -HUEY = { - "huey_class": "huey.SqliteHuey", - "filename": CONSTANTS.QUEUE_DATABASE_FILENAME, - "name": "commands", - "results": True, - "store_none": True, - "immediate": False, - "utc": True, - "consumer": { - "workers": 1, - "worker_type": "thread", - "initial_delay": 0.1, # Smallest polling interval, same as -d. - "backoff": 1.15, # Exponential backoff using this rate, -b. - "max_delay": 10.0, # Max possible polling interval, -m. - "scheduler_interval": 1, # Check schedule every second, -s. - "periodic": True, # Enable crontab feature. - "check_worker_health": True, # Enable worker health checks. - "health_check_interval": 1, # Check worker health every second. 
- }, -} - -# https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up -# https://github.com/gaiacoop/django-huey -DJANGO_HUEY = { - "default": "commands", - "queues": { - HUEY["name"]: HUEY.copy(), - # more registered here at plugin import-time by BaseQueue.register() - # Additional huey queues configured via settings - }, -} - - -class HueyDBRouter: - """ - A router to store all the Huey result k:v / Huey Monitor models in the queue.sqlite3 database. - We keep the databases separate because the queue database receives many more reads/writes per second - and we want to avoid single-write lock contention with the main database. Also all the in-progress task - data is ephemeral/not-important-long-term. This makes it easier to for the user to clear non-critical - temp data by just deleting queue.sqlite3 and leaving index.sqlite3. - """ - - route_app_labels = {"huey_monitor", "django_huey", "djhuey"} - db_name = "queue" - - def db_for_read(self, model, **hints): - if model._meta.app_label in self.route_app_labels: - return self.db_name - return "default" - - def db_for_write(self, model, **hints): - if model._meta.app_label in self.route_app_labels: - return self.db_name - return "default" - - def allow_relation(self, obj1, obj2, **hints): - if obj1._meta.app_label in self.route_app_labels or obj2._meta.app_label in self.route_app_labels: - return obj1._meta.app_label == obj2._meta.app_label - return None - - def allow_migrate(self, db, app_label, model_name=None, **hints): - if app_label in self.route_app_labels: - return db == self.db_name - return db == "default" # class FilestoreDBRouter: @@ -311,7 +244,7 @@ class HueyDBRouter: # return db == self.db_name # return db == "default" -DATABASE_ROUTERS = ["core.settings.HueyDBRouter"] +DATABASE_ROUTERS = [] CACHES = { "default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"}, diff --git a/archivebox/core/templatetags/core_tags.py b/archivebox/core/templatetags/core_tags.py index 2de610bc..b2c126cd 100644 --- a/archivebox/core/templatetags/core_tags.py +++ b/archivebox/core/templatetags/core_tags.py @@ -1,9 +1,13 @@ from django import template from django.contrib.admin.templatetags.base import InclusionAdminNode - +from django.utils.safestring import mark_safe from typing import Union +from archivebox.hooks import ( + get_extractor_icon, get_extractor_template, get_extractor_name, +) + register = template.Library() @@ -44,3 +48,115 @@ def url_replace(context, **kwargs): dict_ = context['request'].GET.copy() dict_.update(**kwargs) return dict_.urlencode() + + +@register.simple_tag +def extractor_icon(extractor: str) -> str: + """ + Render the icon for an extractor. + + Usage: {% extractor_icon "screenshot" %} + """ + return mark_safe(get_extractor_icon(extractor)) + + +@register.simple_tag(takes_context=True) +def extractor_thumbnail(context, result) -> str: + """ + Render the thumbnail template for an archive result. 
+ + Usage: {% extractor_thumbnail result %} + + Context variables passed to template: + - result: ArchiveResult object + - snapshot: Parent Snapshot object + - output_path: Path to output relative to snapshot dir (from embed_path()) + - extractor: Extractor base name + """ + extractor = get_extractor_name(result.extractor) + template_str = get_extractor_template(extractor, 'thumbnail') + + if not template_str: + return '' + + # Use embed_path() for the display path (includes canonical paths) + output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '') + + # Create a mini template and render it with context + try: + tpl = template.Template(template_str) + ctx = template.Context({ + 'result': result, + 'snapshot': result.snapshot, + 'output_path': output_path, + 'extractor': extractor, + }) + return mark_safe(tpl.render(ctx)) + except Exception: + return '' + + +@register.simple_tag(takes_context=True) +def extractor_embed(context, result) -> str: + """ + Render the embed iframe template for an archive result. + + Usage: {% extractor_embed result %} + """ + extractor = get_extractor_name(result.extractor) + template_str = get_extractor_template(extractor, 'embed') + + if not template_str: + return '' + + output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '') + + try: + tpl = template.Template(template_str) + ctx = template.Context({ + 'result': result, + 'snapshot': result.snapshot, + 'output_path': output_path, + 'extractor': extractor, + }) + return mark_safe(tpl.render(ctx)) + except Exception: + return '' + + +@register.simple_tag(takes_context=True) +def extractor_fullscreen(context, result) -> str: + """ + Render the fullscreen template for an archive result. + + Usage: {% extractor_fullscreen result %} + """ + extractor = get_extractor_name(result.extractor) + template_str = get_extractor_template(extractor, 'fullscreen') + + if not template_str: + return '' + + output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '') + + try: + tpl = template.Template(template_str) + ctx = template.Context({ + 'result': result, + 'snapshot': result.snapshot, + 'output_path': output_path, + 'extractor': extractor, + }) + return mark_safe(tpl.render(ctx)) + except Exception: + return '' + + +@register.filter +def extractor_name(value: str) -> str: + """ + Get the base name of an extractor (strips numeric prefix). 
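+    e.g. '15_singlefile' -> 'singlefile', 'screenshot' -> 'screenshot'.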
+ + Usage: {{ result.extractor|extractor_name }} + """ + return get_extractor_name(value) diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py index c8b3bed9..910d59ee 100644 --- a/archivebox/core/urls.py +++ b/archivebox/core/urls.py @@ -8,7 +8,7 @@ from django.views.generic.base import RedirectView from archivebox.misc.serve_static import serve_static from core.admin_site import archivebox_admin -from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView +from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view from workers.views import JobsDashboardView @@ -43,8 +43,10 @@ urlpatterns = [ path('accounts/', include('django.contrib.auth.urls')), + + path('admin/live-progress/', live_progress_view, name='live_progress'), path('admin/', archivebox_admin.urls), - + path("api/", include('api.urls'), name='api'), path('health/', HealthCheckView.as_view(), name='healthcheck'), diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 6a662d04..43110364 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -34,6 +34,7 @@ from archivebox.search import query_search_index from core.models import Snapshot from core.forms import AddLinkForm from crawls.models import Seed, Crawl +from archivebox.hooks import get_extractors, get_extractor_name @@ -54,8 +55,10 @@ class SnapshotView(View): @staticmethod def render_live_index(request, snapshot): TITLE_LOADING_MSG = 'Not yet archived...' - HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org') + # Dict of extractor -> ArchiveResult object + archiveresult_objects = {} + # Dict of extractor -> result info dict (for template compatibility) archiveresults = {} results = snapshot.archiveresult_set.all() @@ -65,18 +68,21 @@ class SnapshotView(View): abs_path = result.snapshot_dir / (embed_path or 'None') if (result.status == 'succeeded' - and (result.extractor not in HIDDEN_RESULTS) and embed_path and os.access(abs_path, os.R_OK) and abs_path.exists()): if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')): continue + # Store the full ArchiveResult object for template tags + archiveresult_objects[result.extractor] = result + result_info = { 'name': result.extractor, 'path': embed_path, 'ts': ts_to_date_str(result.end_ts), 'size': abs_path.stat().st_size or '?', + 'result': result, # Include the full object for template tags } archiveresults[result.extractor] = result_info @@ -101,11 +107,11 @@ class SnapshotView(View): } - # iterate through all the files in the snapshot dir and add the biggest ones to1 the result list + # iterate through all the files in the snapshot dir and add the biggest ones to the result list snap_dir = Path(snapshot.output_dir) if not os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK): return {} - + for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')): extension = result_file.suffix.lstrip('.').lower() if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions: @@ -121,12 +127,16 @@ class SnapshotView(View): 'path': result_file.relative_to(snap_dir), 'ts': ts_to_date_str(result_file.stat().st_mtime or 0), 'size': file_size, + 'result': None, # No ArchiveResult object for filesystem-discovered files } - preferred_types = ('singlefile', 'screenshot', 'wget', 'dom', 'media', 'pdf', 'readability', 'mercury') + # Get available extractors from hooks (sorted by numeric prefix for ordering) + # Convert to base names for 
display ordering + all_extractors = [get_extractor_name(e) for e in get_extractors()] + preferred_types = tuple(all_extractors) all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types) - best_result = {'path': 'None'} + best_result = {'path': 'None', 'result': None} for result_type in preferred_types: if result_type in archiveresults: best_result = archiveresults[result_type] @@ -157,6 +167,7 @@ class SnapshotView(View): 'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS, 'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']), 'best_result': best_result, + 'snapshot': snapshot, # Pass the snapshot object for template tags } return render(template_name='core/snapshot_live.html', request=request, context=context) @@ -436,7 +447,7 @@ class AddView(UserPassesTestMixin, FormView): def form_valid(self, form): urls = form.cleaned_data["url"] print(f'[+] Adding URL: {urls}') - parser = form.cleaned_data["parser"] + parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser tag = form.cleaned_data["tag"] depth = 0 if form.cleaned_data["depth"] == "0" else 1 extractors = ','.join(form.cleaned_data["archive_methods"]) @@ -452,18 +463,19 @@ class AddView(UserPassesTestMixin, FormView): if extractors: input_kwargs.update({"extractors": extractors}) - + from archivebox.config.permissions import HOSTNAME - - + + # 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_.txt sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt' sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls)) - + # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_.txt + timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S") seed = Seed.from_file( sources_file, - label=f'{self.request.user.username}@{HOSTNAME}{self.request.path}', + label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}', parser=parser, tag=tag, created_by=self.request.user.pk, @@ -472,7 +484,7 @@ class AddView(UserPassesTestMixin, FormView): # 'INDEX_ONLY': index_only, # 'OVERWRITE': False, 'DEPTH': depth, - 'EXTRACTORS': parser, + 'EXTRACTORS': extractors or '', # 'DEFAULT_PERSONA': persona or 'Default', }) # 3. create a new Crawl pointing to the Seed @@ -490,10 +502,15 @@ class AddView(UserPassesTestMixin, FormView): self.request, mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"), ) - # if not bg: - # from workers.orchestrator import Orchestrator - # orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4) - # orchestrator.start() + + # Start orchestrator in background to process the queued crawl + try: + from archivebox.workers.tasks import ensure_orchestrator_running + ensure_orchestrator_running() + except Exception as e: + # Orchestrator may already be running via supervisord, or fail to start + # This is not fatal - the crawl will be processed when orchestrator runs + print(f'[!] 
Failed to start orchestrator: {e}') return redirect(crawl.admin_change_url) @@ -513,6 +530,141 @@ class HealthCheckView(View): ) +import json +from django.http import JsonResponse + +def live_progress_view(request): + """Simple JSON endpoint for live progress status - used by admin progress monitor.""" + try: + from workers.orchestrator import Orchestrator + from crawls.models import Crawl + from core.models import Snapshot, ArchiveResult + + # Get orchestrator status + orchestrator_running = Orchestrator.is_running() + total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0 + + # Get model counts by status + crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count() + crawls_started = Crawl.objects.filter(status=Crawl.StatusChoices.STARTED).count() + + # Get recent crawls (last 24 hours) + from datetime import timedelta + one_day_ago = timezone.now() - timedelta(days=1) + crawls_recent = Crawl.objects.filter(created_at__gte=one_day_ago).count() + + snapshots_pending = Snapshot.objects.filter(status=Snapshot.StatusChoices.QUEUED).count() + snapshots_started = Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED).count() + + archiveresults_pending = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.QUEUED).count() + archiveresults_started = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.STARTED).count() + archiveresults_succeeded = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count() + archiveresults_failed = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.FAILED).count() + + # Build hierarchical active crawls with nested snapshots and archive results + active_crawls = [] + for crawl in Crawl.objects.filter( + status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED] + ).order_by('-modified_at')[:10]: + # Get snapshots for this crawl + crawl_snapshots = Snapshot.objects.filter(crawl=crawl) + total_snapshots = crawl_snapshots.count() + completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count() + pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count() + + # Calculate crawl progress + crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0 + + # Get active snapshots for this crawl + active_snapshots_for_crawl = [] + for snapshot in crawl_snapshots.filter( + status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED] + ).order_by('-modified_at')[:5]: + # Get archive results for this snapshot + snapshot_results = ArchiveResult.objects.filter(snapshot=snapshot) + total_extractors = snapshot_results.count() + completed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count() + failed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.FAILED).count() + pending_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.QUEUED).count() + + # Calculate snapshot progress + snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0 + + # Get active extractors for this snapshot + active_extractors = [ + { + 'id': str(ar.id), + 'extractor': ar.extractor, + 'status': ar.status, + 'started': ar.start_ts.isoformat() if ar.start_ts else None, + 'progress': 50, + } + for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5] + ] + + 
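+                # Each entry appended below mirrors the crawl-level shape:
+                # id, url, status, started, progress, plus per-extractor counts
+                # and the list of currently-running extractors built above.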
active_snapshots_for_crawl.append({ + 'id': str(snapshot.id), + 'url': snapshot.url[:80], + 'status': snapshot.status, + 'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None, + 'progress': snapshot_progress, + 'total_extractors': total_extractors, + 'completed_extractors': completed_extractors, + 'failed_extractors': failed_extractors, + 'pending_extractors': pending_extractors, + 'active_extractors': active_extractors, + }) + + active_crawls.append({ + 'id': str(crawl.id), + 'label': str(crawl)[:60], + 'status': crawl.status, + 'started': crawl.modified_at.isoformat() if crawl.modified_at else None, + 'progress': crawl_progress, + 'max_depth': crawl.max_depth, + 'total_snapshots': total_snapshots, + 'completed_snapshots': completed_snapshots, + 'failed_snapshots': 0, + 'pending_snapshots': pending_snapshots, + 'active_snapshots': active_snapshots_for_crawl, + }) + + return JsonResponse({ + 'orchestrator_running': orchestrator_running, + 'total_workers': total_workers, + 'crawls_pending': crawls_pending, + 'crawls_started': crawls_started, + 'crawls_recent': crawls_recent, + 'snapshots_pending': snapshots_pending, + 'snapshots_started': snapshots_started, + 'archiveresults_pending': archiveresults_pending, + 'archiveresults_started': archiveresults_started, + 'archiveresults_succeeded': archiveresults_succeeded, + 'archiveresults_failed': archiveresults_failed, + 'active_crawls': active_crawls, + 'server_time': timezone.now().isoformat(), + }) + except Exception as e: + import traceback + return JsonResponse({ + 'error': str(e), + 'traceback': traceback.format_exc(), + 'orchestrator_running': False, + 'total_workers': 0, + 'crawls_pending': 0, + 'crawls_started': 0, + 'crawls_recent': 0, + 'snapshots_pending': 0, + 'snapshots_started': 0, + 'archiveresults_pending': 0, + 'archiveresults_started': 0, + 'archiveresults_succeeded': 0, + 'archiveresults_failed': 0, + 'active_crawls': [], + 'server_time': timezone.now().isoformat(), + }, status=500) + + def find_config_section(key: str) -> str: CONFIGS = get_all_configs() diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py index 3b6453c7..611a80bc 100644 --- a/archivebox/crawls/admin.py +++ b/archivebox/crawls/admin.py @@ -1,10 +1,18 @@ __package__ = 'archivebox.crawls' -from django.utils.html import format_html, format_html_join -from django.contrib import admin +import json +from pathlib import Path + +from django.utils.html import format_html, format_html_join, mark_safe +from django.contrib import admin, messages +from django.urls import path +from django.http import JsonResponse +from django.views.decorators.http import require_POST from archivebox import DATA_DIR +from django_object_actions import action + from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin from core.models import Snapshot @@ -16,8 +24,8 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin): sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str') search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str') - readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents', 'available_config_options') - fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'available_config_options', 'created_by', *readonly_fields[:-1]) + readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents') + fields = ('label', 'notes', 'uri', 'extractor', 
'tags_str', 'config', 'created_by', *readonly_fields) list_filter = ('extractor', 'created_by') ordering = ['-created_at'] @@ -34,19 +42,19 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin): return format_html_join('
', ' - {}', ( (scheduledcrawl.admin_change_url, scheduledcrawl) for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20] - )) or format_html('No Scheduled Crawls yet...') + )) or mark_safe('No Scheduled Crawls yet...') def crawls(self, obj): return format_html_join('
', ' - {}', ( (crawl.admin_change_url, crawl) for crawl in obj.crawl_set.all().order_by('-created_at')[:20] - )) or format_html('No Crawls yet...') + )) or mark_safe('No Crawls yet...') def snapshots(self, obj): return format_html_join('
', ' - {}', ( (snapshot.admin_change_url, snapshot) for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20] - )) or format_html('No Snapshots yet...') + )) or mark_safe('No Snapshots yet...') def contents(self, obj): if obj.uri.startswith('file:///data/'): @@ -69,14 +77,81 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at') search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri') - readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_contents', 'available_config_options') - fields = ('label', 'notes', 'urls', 'config', 'available_config_options', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields[:-1]) + readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor') + fields = ('label', 'notes', 'seed_urls_editor', 'config', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', 'created_at', 'modified_at', 'snapshots') list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at') ordering = ['-created_at', '-retry_at'] list_per_page = 100 actions = ["delete_selected"] - + change_actions = ['recrawl'] + + @action(label='Recrawl', description='Create a new crawl with the same settings') + def recrawl(self, request, obj): + """Duplicate this crawl as a new crawl with the same seed and settings.""" + from django.utils import timezone + + new_crawl = Crawl.objects.create( + seed=obj.seed, + urls=obj.urls, + max_depth=obj.max_depth, + config=obj.config, + schedule=obj.schedule, + label=f"{obj.label} (recrawl)" if obj.label else "", + notes=obj.notes, + created_by=request.user, + status=Crawl.StatusChoices.QUEUED, + retry_at=timezone.now(), + ) + + messages.success( + request, + f'Created new crawl {new_crawl.id} with the same settings. ' + f'It will start processing shortly.' 
+ ) + + # Redirect to the new crawl's change page + from django.shortcuts import redirect + return redirect('admin:crawls_crawl_change', new_crawl.id) + + def get_urls(self): + urls = super().get_urls() + custom_urls = [ + path('/save_seed_contents/', + self.admin_site.admin_view(self.save_seed_contents_view), + name='crawls_crawl_save_seed_contents'), + ] + return custom_urls + urls + + def save_seed_contents_view(self, request, object_id): + """Handle saving seed file contents via AJAX.""" + if request.method != 'POST': + return JsonResponse({'success': False, 'error': 'POST required'}, status=405) + + try: + crawl = Crawl.objects.get(pk=object_id) + except Crawl.DoesNotExist: + return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404) + + if not (crawl.seed and crawl.seed.uri and crawl.seed.uri.startswith('file:///data/')): + return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400) + + try: + data = json.loads(request.body) + contents = data.get('contents', '') + except json.JSONDecodeError: + return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400) + + source_file = DATA_DIR / crawl.seed.uri.replace('file:///data/', '', 1) + + try: + # Ensure parent directory exists + source_file.parent.mkdir(parents=True, exist_ok=True) + source_file.write_text(contents) + return JsonResponse({'success': True, 'message': f'Saved {len(contents)} bytes to {source_file.name}'}) + except Exception as e: + return JsonResponse({'success': False, 'error': str(e)}, status=500) + def num_snapshots(self, obj): return obj.snapshot_set.count() @@ -84,35 +159,175 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): return format_html_join('
', '{}', ( (snapshot.admin_change_url, snapshot) for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20] - )) or format_html('No Snapshots yet...') - + )) or mark_safe('No Snapshots yet...') + @admin.display(description='Schedule', ordering='schedule') def schedule_str(self, obj): if not obj.schedule: - return format_html('None') + return mark_safe('None') return format_html('{}', obj.schedule.admin_change_url, obj.schedule) - + @admin.display(description='Seed', ordering='seed') def seed_str(self, obj): if not obj.seed: - return format_html('None') + return mark_safe('None') return format_html('{}', obj.seed.admin_change_url, obj.seed) - - def seed_contents(self, obj): - if not (obj.seed and obj.seed.uri): - return format_html('None') - - if obj.seed.uri.startswith('file:///data/'): - source_file = DATA_DIR / obj.seed.uri.replace('file:///data/', '', 1) - contents = "" + + @admin.display(description='URLs') + def seed_urls_editor(self, obj): + """Combined editor showing seed URL and file contents.""" + widget_id = f'seed_urls_{obj.pk}' + + # Get the seed URI (or use urls field if no seed) + seed_uri = '' + if obj.seed and obj.seed.uri: + seed_uri = obj.seed.uri + elif obj.urls: + seed_uri = obj.urls + + # Check if it's a local file we can edit + is_file = seed_uri.startswith('file:///data/') + contents = "" + error = None + source_file = None + + if is_file: + source_file = DATA_DIR / seed_uri.replace('file:///data/', '', 1) try: - contents = source_file.read_text().strip()[:14_000] + contents = source_file.read_text().strip() except Exception as e: - contents = f'Error reading {source_file}: {e}' - - return format_html('{}:
{}
', source_file, contents) - - return format_html('See URLs here: {}', obj.seed.uri, obj.seed.uri) + error = f'Error reading {source_file}: {e}' + + # Escape for safe HTML embedding + escaped_uri = seed_uri.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"') + escaped_contents = (contents or '').replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"') + + # Count lines for auto-expand logic + line_count = len(contents.split('\n')) if contents else 0 + uri_rows = min(max(1, seed_uri.count('\n') + 1), 3) + + html = f''' +
+ +
+ + +
+ + {"" if not is_file else f''' + +
+ + {"
" + error + "
" if error else ""} + +
+ +
+ + + +
+ '''} + + {"" if is_file else f''' + + '''} + + +
+ ''' + return mark_safe(html) @@ -143,14 +358,14 @@ class CrawlScheduleAdmin(BaseModelAdmin): return format_html_join('
', ' - {}', ( (crawl.admin_change_url, crawl) for crawl in obj.crawl_set.all().order_by('-created_at')[:20] - )) or format_html('No Crawls yet...') + )) or mark_safe('No Crawls yet...') def snapshots(self, obj): crawl_ids = obj.crawl_set.values_list('pk', flat=True) return format_html_join('
', ' - {}', ( (snapshot.admin_change_url, snapshot) for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20] - )) or format_html('No Snapshots yet...') + )) or mark_safe('No Snapshots yet...') def register_admin(admin_site): diff --git a/archivebox/hooks.py b/archivebox/hooks.py index fac8bca2..4c2bdd09 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -865,3 +865,189 @@ def export_plugin_config_to_env( return env +# ============================================================================= +# Plugin Template Discovery +# ============================================================================= +# +# Plugins can provide custom templates for rendering their output in the UI. +# Templates are discovered by filename convention inside each plugin's templates/ dir: +# +# archivebox/plugins// +# templates/ +# icon.html # Icon for admin table view (small inline HTML) +# thumbnail.html # Preview thumbnail for snapshot cards +# embed.html # Iframe embed content for main preview +# fullscreen.html # Fullscreen view template +# +# Template context variables available: +# {{ result }} - ArchiveResult object +# {{ snapshot }} - Parent Snapshot object +# {{ output_path }} - Path to output file/dir relative to snapshot dir +# {{ extractor }} - Extractor name (e.g., 'screenshot', 'singlefile') +# + +# Default templates used when plugin doesn't provide one +DEFAULT_TEMPLATES = { + 'icon': '''{{ icon }}''', + 'thumbnail': ''' + {{ extractor }} output + ''', + 'embed': ''' + + ''', + 'fullscreen': ''' + + ''', +} + +# Default icons for known extractors (emoji or short HTML) +DEFAULT_EXTRACTOR_ICONS = { + 'screenshot': '📷', + 'pdf': '📄', + 'singlefile': '📦', + 'dom': '🌐', + 'wget': '📥', + 'media': '🎬', + 'git': '📂', + 'readability': '📖', + 'mercury': '☿️', + 'favicon': '⭐', + 'title': '📝', + 'headers': '📋', + 'archive_org': '🏛️', + 'htmltotext': '📃', + 'warc': '🗄️', +} + + +def get_plugin_template(extractor: str, template_name: str) -> Optional[str]: + """ + Get a plugin template by extractor name and template type. + + Args: + extractor: Extractor name (e.g., 'screenshot', '15_singlefile') + template_name: One of 'icon', 'thumbnail', 'embed', 'fullscreen' + + Returns: + Template content as string, or None if not found. + """ + base_name = get_extractor_name(extractor) + + for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR): + if not base_dir.exists(): + continue + + # Look for plugin directory matching extractor name + for plugin_dir in base_dir.iterdir(): + if not plugin_dir.is_dir(): + continue + + # Match by directory name (exact or partial) + if plugin_dir.name == base_name or plugin_dir.name.endswith(f'_{base_name}'): + template_path = plugin_dir / 'templates' / f'{template_name}.html' + if template_path.exists(): + return template_path.read_text() + + return None + + +def get_extractor_template(extractor: str, template_name: str) -> str: + """ + Get template for an extractor, falling back to defaults. + + Args: + extractor: Extractor name (e.g., 'screenshot', '15_singlefile') + template_name: One of 'icon', 'thumbnail', 'embed', 'fullscreen' + + Returns: + Template content as string (plugin template or default). + """ + # Try plugin-provided template first + template = get_plugin_template(extractor, template_name) + if template: + return template + + # Fall back to default template + return DEFAULT_TEMPLATES.get(template_name, '') + + +def get_extractor_icon(extractor: str) -> str: + """ + Get the icon for an extractor. 
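+    e.g. get_extractor_icon('15_screenshot') returns the screenshot plugin's
+    icon.html contents if present, or the default '📷' otherwise (illustrative).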
+ + First checks for plugin-provided icon.html template, + then falls back to DEFAULT_EXTRACTOR_ICONS. + + Args: + extractor: Extractor name (e.g., 'screenshot', '15_singlefile') + + Returns: + Icon HTML/emoji string. + """ + base_name = get_extractor_name(extractor) + + # Try plugin-provided icon template + icon_template = get_plugin_template(extractor, 'icon') + if icon_template: + return icon_template.strip() + + # Fall back to default icon + return DEFAULT_EXTRACTOR_ICONS.get(base_name, '📁') + + +def get_all_extractor_icons() -> Dict[str, str]: + """ + Get icons for all discovered extractors. + + Returns: + Dict mapping extractor base names to their icons. + """ + icons = {} + for extractor in get_extractors(): + base_name = get_extractor_name(extractor) + icons[base_name] = get_extractor_icon(extractor) + return icons + + +def discover_plugin_templates() -> Dict[str, Dict[str, str]]: + """ + Discover all plugin templates organized by extractor. + + Returns: + Dict mapping extractor names to dicts of template_name -> template_path. + e.g., {'screenshot': {'icon': '/path/to/icon.html', 'thumbnail': '/path/to/thumbnail.html'}} + """ + templates: Dict[str, Dict[str, str]] = {} + + for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR): + if not base_dir.exists(): + continue + + for plugin_dir in base_dir.iterdir(): + if not plugin_dir.is_dir(): + continue + + templates_dir = plugin_dir / 'templates' + if not templates_dir.exists(): + continue + + plugin_templates = {} + for template_file in templates_dir.glob('*.html'): + template_name = template_file.stem # icon, thumbnail, embed, fullscreen + plugin_templates[template_name] = str(template_file) + + if plugin_templates: + templates[plugin_dir.name] = plugin_templates + + return templates + + diff --git a/archivebox/machine/admin.py b/archivebox/machine/admin.py index d126d6b4..b1796025 100644 --- a/archivebox/machine/admin.py +++ b/archivebox/machine/admin.py @@ -3,16 +3,16 @@ __package__ = 'archivebox.machine' from django.contrib import admin from django.utils.html import format_html -from archivebox.base_models.admin import BaseModelAdmin -from machine.models import Machine, NetworkInterface, InstalledBinary +from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin +from machine.models import Machine, NetworkInterface, InstalledBinary, Dependency -class MachineAdmin(BaseModelAdmin): +class MachineAdmin(ConfigEditorMixin, BaseModelAdmin): list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health') sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid') readonly_fields = ('guid', 'created_at', 'modified_at', 'ips') - fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'num_uses_succeeded', 'num_uses_failed') + fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'config', 'num_uses_succeeded', 'num_uses_failed') list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform') ordering = ['-created_at'] @@ -48,15 +48,43 @@ class NetworkInterfaceAdmin(BaseModelAdmin): ) +class 
DependencyAdmin(ConfigEditorMixin, BaseModelAdmin): + list_display = ('id', 'created_at', 'bin_name', 'bin_providers', 'is_installed', 'installed_count') + sort_fields = ('id', 'created_at', 'bin_name', 'bin_providers') + search_fields = ('id', 'bin_name', 'bin_providers') + + readonly_fields = ('id', 'created_at', 'modified_at', 'is_installed', 'installed_count') + fields = ('bin_name', 'bin_providers', 'custom_cmds', 'config', *readonly_fields) + + list_filter = ('bin_providers', 'created_at') + ordering = ['-created_at'] + list_per_page = 100 + actions = ["delete_selected"] + + @admin.display(description='Installed', boolean=True) + def is_installed(self, dependency): + return dependency.is_installed + + @admin.display(description='# Binaries') + def installed_count(self, dependency): + count = dependency.installed_binaries.count() + if count: + return format_html( + '{}', + dependency.id, count, + ) + return '0' + + class InstalledBinaryAdmin(BaseModelAdmin): - list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'health') + list_display = ('id', 'created_at', 'machine_info', 'name', 'dependency_link', 'binprovider', 'version', 'abspath', 'sha256', 'health') sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256') - search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256') + search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'dependency__bin_name') readonly_fields = ('created_at', 'modified_at') - fields = ('machine', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed') + fields = ('machine', 'dependency', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed') - list_filter = ('name', 'binprovider', 'machine_id') + list_filter = ('name', 'binprovider', 'machine_id', 'dependency') ordering = ['-created_at'] list_per_page = 100 actions = ["delete_selected"] @@ -68,8 +96,18 @@ class InstalledBinaryAdmin(BaseModelAdmin): installed_binary.machine.id, str(installed_binary.machine.id)[:8], installed_binary.machine.hostname, ) + @admin.display(description='Dependency', ordering='dependency__bin_name') + def dependency_link(self, installed_binary): + if installed_binary.dependency: + return format_html( + '{}', + installed_binary.dependency.id, installed_binary.dependency.bin_name, + ) + return '-' + def register_admin(admin_site): admin_site.register(Machine, MachineAdmin) admin_site.register(NetworkInterface, NetworkInterfaceAdmin) + admin_site.register(Dependency, DependencyAdmin) admin_site.register(InstalledBinary, InstalledBinaryAdmin) diff --git a/archivebox/misc/db.py b/archivebox/misc/db.py index f549f493..7f2c7247 100644 --- a/archivebox/misc/db.py +++ b/archivebox/misc/db.py @@ -37,15 +37,13 @@ def apply_migrations(out_dir: Path = DATA_DIR) -> List[str]: """Apply pending Django migrations""" from django.core.management import call_command - out1, out2 = StringIO(), StringIO() + out1 = StringIO() call_command("migrate", interactive=False, database='default', stdout=out1) out1.seek(0) - call_command("migrate", "huey_monitor", interactive=False, database='queue', stdout=out2) - out2.seek(0) return [ - line.strip() for line in out1.readlines() + out2.readlines() if line.strip() + line.strip() for line in out1.readlines() if line.strip() ] diff --git a/archivebox/misc/logging_util.py 
b/archivebox/misc/logging_util.py index 72879741..469b705b 100644 --- a/archivebox/misc/logging_util.py +++ b/archivebox/misc/logging_util.py @@ -480,6 +480,138 @@ def printable_filesize(num_bytes: Union[int, float]) -> str: return '%3.1f %s' % (num_bytes, 'TB') +@enforce_types +def format_duration(seconds: float) -> str: + """Format duration in human-readable form.""" + if seconds < 1: + return f'{seconds*1000:.0f}ms' + elif seconds < 60: + return f'{seconds:.1f}s' + elif seconds < 3600: + minutes = int(seconds // 60) + secs = int(seconds % 60) + return f'{minutes}min {secs}s' if secs else f'{minutes}min' + else: + hours = int(seconds // 3600) + minutes = int((seconds % 3600) // 60) + return f'{hours}hr {minutes}min' if minutes else f'{hours}hr' + + +@enforce_types +def truncate_url(url: str, max_length: int = 60) -> str: + """Truncate URL to max_length, keeping domain and adding ellipsis.""" + if len(url) <= max_length: + return url + # Try to keep the domain and beginning of path + if '://' in url: + protocol, rest = url.split('://', 1) + if '/' in rest: + domain, path = rest.split('/', 1) + available = max_length - len(protocol) - len(domain) - 6 # for "://", "/", "..." + if available > 10: + return f'{protocol}://{domain}/{path[:available]}...' + # Fallback: just truncate + return url[:max_length-3] + '...' + + +@enforce_types +def log_worker_event( + worker_type: str, + event: str, + indent_level: int = 0, + pid: Optional[int] = None, + worker_id: Optional[str] = None, + url: Optional[str] = None, + extractor: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + error: Optional[Exception] = None, +) -> None: + """ + Log a worker event with structured metadata and indentation. + + Args: + worker_type: Type of worker (Orchestrator, CrawlWorker, SnapshotWorker, etc.) + event: Event name (Starting, Completed, Failed, etc.) 
+ indent_level: Indentation level (0=Orchestrator, 1=CrawlWorker, 2=SnapshotWorker, 3=ArchiveResultWorker) + pid: Process ID + worker_id: Worker ID (UUID for CrawlWorker, url for SnapshotWorker, extractor for ArchiveResultWorker) + url: URL being processed (for SnapshotWorker/ArchiveResultWorker) + extractor: Extractor name (for ArchiveResultWorker) + metadata: Dict of metadata to show in curly braces + error: Exception if event is an error + """ + indent = ' ' * indent_level + + # Build worker identifier + worker_parts = [worker_type] + if pid: + worker_parts.append(f'pid={pid}') + if worker_id and worker_type in ('CrawlWorker', 'Orchestrator'): + worker_parts.append(f'id={worker_id}') + if url and worker_type == 'SnapshotWorker': + worker_parts.append(f'url={truncate_url(url)}') + if extractor and worker_type == 'ArchiveResultWorker': + worker_parts.append(f'extractor={extractor}') + + worker_label = f'{worker_parts[0]}[{", ".join(worker_parts[1:])}]' + + # Build metadata string + metadata_str = '' + if metadata: + # Format metadata nicely + meta_parts = [] + for k, v in metadata.items(): + if isinstance(v, float): + # Format floats nicely (durations, sizes) + if 'duration' in k.lower(): + meta_parts.append(f'{k}: {format_duration(v)}') + elif 'size' in k.lower(): + meta_parts.append(f'{k}: {printable_filesize(int(v))}') + else: + meta_parts.append(f'{k}: {v:.2f}') + elif isinstance(v, int): + # Format integers - check if it's a size + if 'size' in k.lower() or 'bytes' in k.lower(): + meta_parts.append(f'{k}: {printable_filesize(v)}') + else: + meta_parts.append(f'{k}: {v}') + elif isinstance(v, (list, tuple)): + meta_parts.append(f'{k}: {len(v)}') + else: + meta_parts.append(f'{k}: {v}') + metadata_str = ' {' + ', '.join(meta_parts) + '}' + + # Determine color based on event + color = 'white' + if event in ('Starting...', 'Started', 'STARTED', 'Started in background'): + color = 'green' + elif event in ('Processing...', 'PROCESSING'): + color = 'blue' + elif event in ('Completed', 'COMPLETED', 'All work complete'): + color = 'blue' + elif event in ('Failed', 'ERROR', 'Failed to spawn worker'): + color = 'red' + elif event in ('Shutting down', 'SHUTDOWN'): + color = 'grey53' + + # Build final message + error_str = f' {type(error).__name__}: {error}' if error else '' + # Build colored message - worker_label needs to be inside color tags + # But first we need to format the color tags separately from the worker label + from archivebox.misc.logging import CONSOLE + from rich.text import Text + + # Create a Rich Text object for proper formatting + text = Text() + text.append(indent) # Indentation + # Append worker label and event with color + text.append(f'{worker_label} {event}{error_str}', style=color) + # Append metadata without color + text.append(metadata_str) + + CONSOLE.print(text) + + @enforce_types def printable_folders(folders: Dict[str, Optional["Snapshot"]], with_headers: bool=False) -> str: return '\n'.join( diff --git a/archivebox/plugins/archive_org/templates/icon.html b/archivebox/plugins/archive_org/templates/icon.html new file mode 100644 index 00000000..09f24b76 --- /dev/null +++ b/archivebox/plugins/archive_org/templates/icon.html @@ -0,0 +1 @@ +🏛️ \ No newline at end of file diff --git a/archivebox/plugins/canonical_outputs/on_Snapshot__92_canonical_outputs.py b/archivebox/plugins/canonical_outputs/on_Snapshot__92_canonical_outputs.py index 88a0e090..0bbb9008 100755 --- a/archivebox/plugins/canonical_outputs/on_Snapshot__92_canonical_outputs.py +++ 
b/archivebox/plugins/canonical_outputs/on_Snapshot__92_canonical_outputs.py @@ -7,7 +7,7 @@ new plugin-based output structure to the legacy canonical output paths that ArchiveBox has historically used. This maintains backward compatibility with existing tools and scripts that expect outputs at specific locations. -Canonical output paths (from Snapshot.canonical_outputs()): +Canonical output paths: - favicon.ico → favicon/favicon.ico - singlefile.html → singlefile/singlefile.html - readability/content.html → readability/content.html @@ -27,27 +27,20 @@ New plugin outputs: - redirects.json → redirects/redirects.json - console.jsonl → consolelog/console.jsonl -Usage: on_Snapshot__91_canonical_outputs.py --url= --snapshot-id= +Usage: on_Snapshot__92_canonical_outputs.py --url= --snapshot-id= Environment variables: SAVE_CANONICAL_SYMLINKS: Enable canonical symlinks (default: true) + DATA_DIR: ArchiveBox data directory + ARCHIVE_DIR: Archive output directory """ -__package__ = 'archivebox.plugins.canonical_outputs' - import os import sys +import json from pathlib import Path -from typing import Dict, Optional - -# Configure Django if running standalone -if __name__ == '__main__': - parent_dir = str(Path(__file__).resolve().parent.parent.parent) - if parent_dir not in sys.path: - sys.path.insert(0, parent_dir) - os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings') - import django - django.setup() +from datetime import datetime, timezone +from typing import Dict import rich_click as click @@ -150,10 +143,7 @@ def create_canonical_symlinks(snapshot_dir: Path) -> Dict[str, bool]: @click.option('--snapshot-id', required=True, help='Snapshot UUID') def main(url: str, snapshot_id: str): """Create symlinks from plugin outputs to canonical legacy locations.""" - from datetime import datetime - from archivebox.core.models import Snapshot - - start_ts = datetime.now() + start_ts = datetime.now(timezone.utc) status = 'failed' output = None error = '' @@ -161,31 +151,20 @@ def main(url: str, snapshot_id: str): try: # Check if enabled - from archivebox.config import CONSTANTS save_canonical = os.getenv('SAVE_CANONICAL_SYMLINKS', 'true').lower() in ('true', '1', 'yes', 'on') if not save_canonical: - click.echo('Skipping canonical symlinks (SAVE_CANONICAL_SYMLINKS=False)') status = 'skipped' - end_ts = datetime.now() - click.echo(f'START_TS={start_ts.isoformat()}') - click.echo(f'END_TS={end_ts.isoformat()}') - click.echo(f'STATUS={status}') - click.echo(f'RESULT_JSON={{"extractor": "canonical_outputs", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}') + click.echo(json.dumps({'status': status, 'output': 'SAVE_CANONICAL_SYMLINKS=false'})) sys.exit(0) - # Get snapshot - try: - snapshot = Snapshot.objects.get(id=snapshot_id) - except Snapshot.DoesNotExist: - error = f'Snapshot {snapshot_id} not found' - raise ValueError(error) + # Working directory is the extractor output dir (e.g., /canonical_outputs/) + # Parent is the snapshot directory + output_dir = Path.cwd() + snapshot_dir = output_dir.parent - # Get snapshot directory - snapshot_dir = Path(snapshot.output_dir) if not snapshot_dir.exists(): - error = f'Snapshot directory not found: {snapshot_dir}' - raise FileNotFoundError(error) + raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}') # Create canonical symlinks results = create_canonical_symlinks(snapshot_dir) @@ -203,37 +182,18 @@ def main(url: str, snapshot_id: str): status = 'failed' click.echo(f'Error: {error}', err=True) - end_ts = 
datetime.now() - duration = (end_ts - start_ts).total_seconds() + end_ts = datetime.now(timezone.utc) - # Print results - click.echo(f'START_TS={start_ts.isoformat()}') - click.echo(f'END_TS={end_ts.isoformat()}') - click.echo(f'DURATION={duration:.2f}') - if output: - click.echo(f'OUTPUT={output}') - click.echo(f'STATUS={status}') - - if error: - click.echo(f'ERROR={error}', err=True) - - # Print JSON result - import json - result_json = { - 'extractor': 'canonical_outputs', - 'url': url, - 'snapshot_id': snapshot_id, + # Print JSON result for hook runner + result = { 'status': status, - 'start_ts': start_ts.isoformat(), - 'end_ts': end_ts.isoformat(), - 'duration': round(duration, 2), 'output': output, - 'symlinks_created': symlinks_created, 'error': error or None, + 'symlinks_created': symlinks_created, } - click.echo(f'RESULT_JSON={json.dumps(result_json)}') + click.echo(json.dumps(result)) - sys.exit(0 if status == 'succeeded' else 1) + sys.exit(0 if status in ('succeeded', 'skipped') else 1) if __name__ == '__main__': diff --git a/archivebox/plugins/chrome_session/on_Crawl__00_install_chrome.py b/archivebox/plugins/chrome_session/on_Crawl__00_install_chrome.py deleted file mode 100755 index 62de95d2..00000000 --- a/archivebox/plugins/chrome_session/on_Crawl__00_install_chrome.py +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/bin/env python3 -""" -Install Chrome/Chromium if not already available. - -Runs at crawl start to ensure Chrome is installed. -Uses playwright to install chromium if no system Chrome found. -Outputs JSONL for InstalledBinary. -""" - -import json -import sys -import os -import shutil -from pathlib import Path - - -def find_chrome(): - """Try to find system Chrome/Chromium.""" - # Comprehensive list of Chrome/Chromium binary names and paths - chromium_names_linux = [ - 'chromium', - 'chromium-browser', - 'chromium-browser-beta', - 'chromium-browser-unstable', - 'chromium-browser-canary', - 'chromium-browser-dev', - ] - - chrome_names_linux = [ - 'google-chrome', - 'google-chrome-stable', - 'google-chrome-beta', - 'google-chrome-canary', - 'google-chrome-unstable', - 'google-chrome-dev', - 'chrome', - ] - - chrome_paths_macos = [ - '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', - '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary', - '/Applications/Chromium.app/Contents/MacOS/Chromium', - ] - - chrome_paths_linux = [ - '/usr/bin/google-chrome', - '/usr/bin/google-chrome-stable', - '/usr/bin/chromium', - '/usr/bin/chromium-browser', - '/snap/bin/chromium', - '/opt/google/chrome/chrome', - ] - - all_chrome_names = chrome_names_linux + chromium_names_linux - all_chrome_paths = chrome_paths_macos + chrome_paths_linux - - # Check env var first - env_path = os.environ.get('CHROME_BINARY', '') - if env_path and Path(env_path).is_file(): - return env_path - - # Try shutil.which for various names - for name in all_chrome_names: - abspath = shutil.which(name) - if abspath: - return abspath - - # Check common paths - for path in all_chrome_paths: - if Path(path).is_file(): - return path - - return None - - -def main(): - try: - # First try to find system Chrome - system_chrome = find_chrome() - if system_chrome: - print(json.dumps({ - 'type': 'InstalledBinary', - 'name': 'chrome', - 'abspath': str(system_chrome), - 'version': None, - 'sha256': None, - 'binprovider': 'env', - })) - sys.exit(0) - - # If not found in system, try to install chromium via apt/brew - from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, 
BinProviderOverrides - - AptProvider.model_rebuild() - BrewProvider.model_rebuild() - EnvProvider.model_rebuild() - - # Try chromium-browser or chromium via system package managers - for binary_name in ['chromium', 'chromium-browser', 'google-chrome']: - try: - chrome_binary = Binary( - name=binary_name, - binproviders=[AptProvider(), BrewProvider(), EnvProvider()] - ) - - # Try to load, install if not found - try: - loaded = chrome_binary.load() - if not loaded or not loaded.abspath: - raise Exception("Not loaded") - except Exception: - # Install via system package manager - loaded = chrome_binary.install() - - if loaded and loaded.abspath: - # Output InstalledBinary JSONL - print(json.dumps({ - 'type': 'InstalledBinary', - 'name': 'chrome', - 'abspath': str(loaded.abspath), - 'version': str(loaded.version) if loaded.version else None, - 'sha256': loaded.sha256, - 'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown', - })) - sys.exit(0) - except Exception: - continue - - # If all attempts failed - print(json.dumps({ - 'type': 'Dependency', - 'bin_name': 'chrome', - 'bin_providers': 'apt,brew,env', - })) - print("Failed to install Chrome/Chromium", file=sys.stderr) - sys.exit(1) - - except Exception as e: - print(json.dumps({ - 'type': 'Dependency', - 'bin_name': 'chrome', - 'bin_providers': 'apt,brew,env', - })) - print(f"Error installing Chrome: {e}", file=sys.stderr) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/chrome_session/tests/test_chrome_session.py b/archivebox/plugins/chrome_session/tests/test_chrome_session.py index f61bb42e..96f3a380 100644 --- a/archivebox/plugins/chrome_session/tests/test_chrome_session.py +++ b/archivebox/plugins/chrome_session/tests/test_chrome_session.py @@ -2,7 +2,7 @@ Integration tests for chrome_session plugin Tests verify: -1. Install hook finds system Chrome or installs chromium +1. Validate hook checks for Chrome/Chromium binary 2. Verify deps with abx-pkg 3. 
Chrome session script exists """ @@ -14,7 +14,7 @@ from pathlib import Path import pytest PLUGIN_DIR = Path(__file__).parent.parent -CHROME_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_chrome.py' +CHROME_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_chrome.py' CHROME_SESSION_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_session.js' @@ -23,37 +23,50 @@ def test_hook_script_exists(): assert CHROME_SESSION_HOOK.exists(), f"Hook not found: {CHROME_SESSION_HOOK}" -def test_chrome_install_hook(): - """Test chrome install hook to find or install Chrome/Chromium.""" +def test_chrome_validate_hook(): + """Test chrome validate hook checks for Chrome/Chromium binary.""" result = subprocess.run( - [sys.executable, str(CHROME_INSTALL_HOOK)], + [sys.executable, str(CHROME_VALIDATE_HOOK)], capture_output=True, text=True, - timeout=600 + timeout=30 ) - assert result.returncode == 0, f"Install hook failed: {result.stderr}" - - # Verify InstalledBinary JSONL output - found_binary = False - for line in result.stdout.strip().split('\n'): - if line.strip(): - try: - record = json.loads(line) - if record.get('type') == 'InstalledBinary': - assert record['name'] == 'chrome' - assert record['abspath'] - assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}" - found_binary = True - break - except json.JSONDecodeError: - pass - - assert found_binary, "Should output InstalledBinary record" + # Hook exits 0 if binary found, 1 if not found (with Dependency record) + if result.returncode == 0: + # Binary found - verify InstalledBinary JSONL output + found_binary = False + for line in result.stdout.strip().split('\n'): + if line.strip(): + try: + record = json.loads(line) + if record.get('type') == 'InstalledBinary': + assert record['name'] == 'chrome' + assert record['abspath'] + assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}" + found_binary = True + break + except json.JSONDecodeError: + pass + assert found_binary, "Should output InstalledBinary record when binary found" + else: + # Binary not found - verify Dependency JSONL output + found_dependency = False + for line in result.stdout.strip().split('\n'): + if line.strip(): + try: + record = json.loads(line) + if record.get('type') == 'Dependency': + assert record['bin_name'] == 'chrome' + found_dependency = True + break + except json.JSONDecodeError: + pass + assert found_dependency, "Should output Dependency record when binary not found" def test_verify_deps_with_abx_pkg(): - """Verify chrome is available via abx-pkg after hook installation.""" + """Verify chrome is available via abx-pkg.""" from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides AptProvider.model_rebuild() @@ -75,10 +88,10 @@ def test_verify_deps_with_abx_pkg(): except Exception: continue - # If we get here, chrome should still be available from system + # If we get here, chrome not available import shutil - assert shutil.which('chromium') or shutil.which('chrome') or shutil.which('google-chrome'), \ - "Chrome should be available after install hook" + if not (shutil.which('chromium') or shutil.which('chrome') or shutil.which('google-chrome')): + pytest.skip("Chrome/Chromium not available - Dependency record should have been emitted") if __name__ == '__main__': diff --git a/archivebox/plugins/dom/templates/embed.html b/archivebox/plugins/dom/templates/embed.html new file mode 100644 index 00000000..d6edc0fd --- /dev/null +++ b/archivebox/plugins/dom/templates/embed.html @@ 
-0,0 +1,6 @@ + + diff --git a/archivebox/plugins/dom/templates/fullscreen.html b/archivebox/plugins/dom/templates/fullscreen.html new file mode 100644 index 00000000..32b003aa --- /dev/null +++ b/archivebox/plugins/dom/templates/fullscreen.html @@ -0,0 +1,6 @@ + + diff --git a/archivebox/plugins/dom/templates/icon.html b/archivebox/plugins/dom/templates/icon.html new file mode 100644 index 00000000..f8995a81 --- /dev/null +++ b/archivebox/plugins/dom/templates/icon.html @@ -0,0 +1 @@ +🌐 \ No newline at end of file diff --git a/archivebox/plugins/dom/templates/thumbnail.html b/archivebox/plugins/dom/templates/thumbnail.html new file mode 100644 index 00000000..88f126df --- /dev/null +++ b/archivebox/plugins/dom/templates/thumbnail.html @@ -0,0 +1,8 @@ + +
+ +
diff --git a/archivebox/plugins/favicon/templates/icon.html b/archivebox/plugins/favicon/templates/icon.html new file mode 100644 index 00000000..ec6acc11 --- /dev/null +++ b/archivebox/plugins/favicon/templates/icon.html @@ -0,0 +1 @@ +⭐ \ No newline at end of file diff --git a/archivebox/plugins/git/on_Crawl__00_install_git.py b/archivebox/plugins/git/on_Crawl__00_install_git.py deleted file mode 100755 index 795b047f..00000000 --- a/archivebox/plugins/git/on_Crawl__00_install_git.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python3 -""" -Install git if not already available. - -Runs at crawl start to ensure git is installed. -Outputs JSONL for InstalledBinary. -""" - -import json -import sys -from pathlib import Path - - -def main(): - try: - from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides - - AptProvider.model_rebuild() - BrewProvider.model_rebuild() - EnvProvider.model_rebuild() - - # git binary and package have same name - git_binary = Binary( - name='git', - binproviders=[AptProvider(), BrewProvider(), EnvProvider()] - ) - - # Try to load, install if not found - try: - loaded = git_binary.load() - if not loaded or not loaded.abspath: - raise Exception("Not loaded") - except Exception: - # Install via system package manager - loaded = git_binary.install() - - if loaded and loaded.abspath: - # Output InstalledBinary JSONL - print(json.dumps({ - 'type': 'InstalledBinary', - 'name': 'git', - 'abspath': str(loaded.abspath), - 'version': str(loaded.version) if loaded.version else None, - 'sha256': loaded.sha256, - 'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown', - })) - sys.exit(0) - else: - print(json.dumps({ - 'type': 'Dependency', - 'bin_name': 'git', - 'bin_providers': 'apt,brew,env', - })) - print("Failed to install git", file=sys.stderr) - sys.exit(1) - - except Exception as e: - print(json.dumps({ - 'type': 'Dependency', - 'bin_name': 'git', - 'bin_providers': 'apt,brew,env', - })) - print(f"Error installing git: {e}", file=sys.stderr) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/git/templates/embed.html b/archivebox/plugins/git/templates/embed.html new file mode 100644 index 00000000..6170f4c0 --- /dev/null +++ b/archivebox/plugins/git/templates/embed.html @@ -0,0 +1,6 @@ + + diff --git a/archivebox/plugins/git/templates/fullscreen.html b/archivebox/plugins/git/templates/fullscreen.html new file mode 100644 index 00000000..8428d4f5 --- /dev/null +++ b/archivebox/plugins/git/templates/fullscreen.html @@ -0,0 +1,6 @@ + + diff --git a/archivebox/plugins/git/templates/icon.html b/archivebox/plugins/git/templates/icon.html new file mode 100644 index 00000000..de2a340a --- /dev/null +++ b/archivebox/plugins/git/templates/icon.html @@ -0,0 +1 @@ +📂 \ No newline at end of file diff --git a/archivebox/plugins/git/templates/thumbnail.html b/archivebox/plugins/git/templates/thumbnail.html new file mode 100644 index 00000000..3148d5b9 --- /dev/null +++ b/archivebox/plugins/git/templates/thumbnail.html @@ -0,0 +1,5 @@ + +
+ 📂 + Git Repository +
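As a rough sketch of how per-plugin template files like the ones above are consumed (assuming Django settings are already configured, a hypothetical 'myplugin' extractor, and an existing ArchiveResult instance; the helpers come from archivebox/hooks.py and the rendering mirrors core_tags.extractor_thumbnail):

    from django.template import Template, Context
    from archivebox.hooks import get_extractor_template, get_extractor_name

    extractor = get_extractor_name('15_myplugin')                   # hypothetical prefixed name -> 'myplugin'
    template_str = get_extractor_template(extractor, 'thumbnail')   # plugin template, or built-in default
    html = Template(template_str).render(Context({
        'result': archiveresult,                                    # assumed: an existing ArchiveResult
        'snapshot': archiveresult.snapshot,
        'output_path': archiveresult.embed_path(),
        'extractor': extractor,
    }))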
diff --git a/archivebox/plugins/git/tests/test_git.py b/archivebox/plugins/git/tests/test_git.py index 811826ee..4a1029ad 100644 --- a/archivebox/plugins/git/tests/test_git.py +++ b/archivebox/plugins/git/tests/test_git.py @@ -2,7 +2,7 @@ Integration tests for git plugin Tests verify: -1. Install hook installs git via abx-pkg +1. Validate hook checks for git binary 2. Verify deps with abx-pkg 3. Standalone git extractor execution """ @@ -17,50 +17,64 @@ import pytest PLUGIN_DIR = Path(__file__).parent.parent GIT_HOOK = PLUGIN_DIR / 'on_Snapshot__12_git.py' -GIT_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_git.py' +GIT_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_git.py' TEST_URL = 'https://github.com/example/repo.git' def test_hook_script_exists(): assert GIT_HOOK.exists() -def test_git_install_hook(): - """Test git install hook to install git if needed.""" +def test_git_validate_hook(): + """Test git validate hook checks for git binary.""" result = subprocess.run( - [sys.executable, str(GIT_INSTALL_HOOK)], + [sys.executable, str(GIT_VALIDATE_HOOK)], capture_output=True, text=True, - timeout=600 + timeout=30 ) - assert result.returncode == 0, f"Install hook failed: {result.stderr}" - - # Verify InstalledBinary JSONL output - found_binary = False - for line in result.stdout.strip().split('\n'): - if line.strip(): - try: - record = json.loads(line) - if record.get('type') == 'InstalledBinary': - assert record['name'] == 'git' - assert record['abspath'] - found_binary = True - break - except json.JSONDecodeError: - pass - - assert found_binary, "Should output InstalledBinary record" + # Hook exits 0 if binary found, 1 if not found (with Dependency record) + if result.returncode == 0: + # Binary found - verify InstalledBinary JSONL output + found_binary = False + for line in result.stdout.strip().split('\n'): + if line.strip(): + try: + record = json.loads(line) + if record.get('type') == 'InstalledBinary': + assert record['name'] == 'git' + assert record['abspath'] + found_binary = True + break + except json.JSONDecodeError: + pass + assert found_binary, "Should output InstalledBinary record when binary found" + else: + # Binary not found - verify Dependency JSONL output + found_dependency = False + for line in result.stdout.strip().split('\n'): + if line.strip(): + try: + record = json.loads(line) + if record.get('type') == 'Dependency': + assert record['bin_name'] == 'git' + assert 'env' in record['bin_providers'] + found_dependency = True + break + except json.JSONDecodeError: + pass + assert found_dependency, "Should output Dependency record when binary not found" def test_verify_deps_with_abx_pkg(): - """Verify git is available via abx-pkg after hook installation.""" - from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider - - AptProvider.model_rebuild() - BrewProvider.model_rebuild() - EnvProvider.model_rebuild() + """Verify git is available via abx-pkg.""" + from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()]) git_loaded = git_binary.load() - assert git_loaded and git_loaded.abspath, "git should be available after install hook" + + if git_loaded and git_loaded.abspath: + assert True, "git is available" + else: + pytest.skip("git not available - Dependency record should have been emitted") def test_reports_missing_git(): with tempfile.TemporaryDirectory() as tmpdir: diff --git a/archivebox/plugins/headers/templates/icon.html 
b/archivebox/plugins/headers/templates/icon.html new file mode 100644 index 00000000..e74c28f8 --- /dev/null +++ b/archivebox/plugins/headers/templates/icon.html @@ -0,0 +1 @@ +📋 \ No newline at end of file diff --git a/archivebox/plugins/htmltotext/templates/icon.html b/archivebox/plugins/htmltotext/templates/icon.html new file mode 100644 index 00000000..070c6ec4 --- /dev/null +++ b/archivebox/plugins/htmltotext/templates/icon.html @@ -0,0 +1 @@ +📃 \ No newline at end of file diff --git a/archivebox/plugins/media/on_Crawl__00_install_ytdlp.py b/archivebox/plugins/media/on_Crawl__00_install_ytdlp.py deleted file mode 100755 index 497cd684..00000000 --- a/archivebox/plugins/media/on_Crawl__00_install_ytdlp.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python3 -""" -Install yt-dlp if not already available. - -Runs at crawl start to ensure yt-dlp is installed. -Outputs JSONL for InstalledBinary. -""" - -import json -import sys -from pathlib import Path - - -def main(): - try: - from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides - - PipProvider.model_rebuild() - EnvProvider.model_rebuild() - - # yt-dlp binary and package have same name - ytdlp_binary = Binary( - name='yt-dlp', - binproviders=[PipProvider(), EnvProvider()] - ) - - # Try to load, install if not found - try: - loaded = ytdlp_binary.load() - if not loaded or not loaded.abspath: - raise Exception("Not loaded") - except Exception: - # Install via pip - loaded = ytdlp_binary.install() - - if loaded and loaded.abspath: - # Output InstalledBinary JSONL - print(json.dumps({ - 'type': 'InstalledBinary', - 'name': 'yt-dlp', - 'abspath': str(loaded.abspath), - 'version': str(loaded.version) if loaded.version else None, - 'sha256': loaded.sha256, - 'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown', - })) - sys.exit(0) - else: - print(json.dumps({ - 'type': 'Dependency', - 'bin_name': 'yt-dlp', - 'bin_providers': 'pip,brew,env', - })) - print("Failed to install yt-dlp", file=sys.stderr) - sys.exit(1) - - except Exception as e: - print(json.dumps({ - 'type': 'Dependency', - 'bin_name': 'yt-dlp', - 'bin_providers': 'pip,brew,env', - })) - print(f"Error installing yt-dlp: {e}", file=sys.stderr) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/media/on_Crawl__00_validate_ytdlp.py b/archivebox/plugins/media/on_Crawl__00_validate_ytdlp.py new file mode 100755 index 00000000..798d6b60 --- /dev/null +++ b/archivebox/plugins/media/on_Crawl__00_validate_ytdlp.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +""" +Validation hook for yt-dlp and its dependencies (node, ffmpeg). + +Runs at crawl start to verify yt-dlp and required binaries are available. +Outputs JSONL for InstalledBinary and Machine config updates. 
+""" + +import os +import sys +import json +import shutil +import hashlib +import subprocess +from pathlib import Path + + +def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None: + """Get version string from binary.""" + try: + result = subprocess.run( + [abspath, version_flag], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode == 0 and result.stdout: + first_line = result.stdout.strip().split('\n')[0] + return first_line[:64] + except Exception: + pass + return None + + +def get_binary_hash(abspath: str) -> str | None: + """Get SHA256 hash of binary.""" + try: + with open(abspath, 'rb') as f: + return hashlib.sha256(f.read()).hexdigest() + except Exception: + return None + + +def find_ytdlp() -> dict | None: + """Find yt-dlp binary.""" + try: + from abx_pkg import Binary, PipProvider, EnvProvider + + class YtdlpBinary(Binary): + name: str = 'yt-dlp' + binproviders_supported = [PipProvider(), EnvProvider()] + + binary = YtdlpBinary() + loaded = binary.load() + if loaded and loaded.abspath: + return { + 'name': 'yt-dlp', + 'abspath': str(loaded.abspath), + 'version': str(loaded.version) if loaded.version else None, + 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, + 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', + } + except ImportError: + pass + except Exception: + pass + + # Fallback to shutil.which + abspath = shutil.which('yt-dlp') or os.environ.get('YTDLP_BINARY', '') + if abspath and Path(abspath).is_file(): + return { + 'name': 'yt-dlp', + 'abspath': abspath, + 'version': get_binary_version(abspath), + 'sha256': get_binary_hash(abspath), + 'binprovider': 'env', + } + + return None + + +def find_node() -> dict | None: + """Find node binary.""" + try: + from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider + + class NodeBinary(Binary): + name: str = 'node' + binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()] + overrides: dict = {'apt': {'packages': ['nodejs']}} + + binary = NodeBinary() + loaded = binary.load() + if loaded and loaded.abspath: + return { + 'name': 'node', + 'abspath': str(loaded.abspath), + 'version': str(loaded.version) if loaded.version else None, + 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, + 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', + } + except ImportError: + pass + except Exception: + pass + + # Fallback to shutil.which + abspath = shutil.which('node') or os.environ.get('NODE_BINARY', '') + if abspath and Path(abspath).is_file(): + return { + 'name': 'node', + 'abspath': abspath, + 'version': get_binary_version(abspath), + 'sha256': get_binary_hash(abspath), + 'binprovider': 'env', + } + + return None + + +def find_ffmpeg() -> dict | None: + """Find ffmpeg binary.""" + try: + from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider + + class FfmpegBinary(Binary): + name: str = 'ffmpeg' + binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()] + + binary = FfmpegBinary() + loaded = binary.load() + if loaded and loaded.abspath: + return { + 'name': 'ffmpeg', + 'abspath': str(loaded.abspath), + 'version': str(loaded.version) if loaded.version else None, + 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, + 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', + } + except ImportError: + pass + except Exception: + pass + + # Fallback to shutil.which + abspath = shutil.which('ffmpeg') or os.environ.get('FFMPEG_BINARY', '') + if 
abspath and Path(abspath).is_file(): + return { + 'name': 'ffmpeg', + 'abspath': abspath, + 'version': get_binary_version(abspath), + 'sha256': get_binary_hash(abspath), + 'binprovider': 'env', + } + + return None + + +def main(): + # Check for yt-dlp (required) + ytdlp_result = find_ytdlp() + + # Check for node (required for JS extraction) + node_result = find_node() + + # Check for ffmpeg (required for video conversion) + ffmpeg_result = find_ffmpeg() + + missing_deps = [] + + # Emit results for yt-dlp + if ytdlp_result and ytdlp_result.get('abspath'): + print(json.dumps({ + 'type': 'InstalledBinary', + 'name': ytdlp_result['name'], + 'abspath': ytdlp_result['abspath'], + 'version': ytdlp_result['version'], + 'sha256': ytdlp_result['sha256'], + 'binprovider': ytdlp_result['binprovider'], + })) + + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/YTDLP_BINARY', + 'value': ytdlp_result['abspath'], + })) + + if ytdlp_result['version']: + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/YTDLP_VERSION', + 'value': ytdlp_result['version'], + })) + else: + print(json.dumps({ + 'type': 'Dependency', + 'bin_name': 'yt-dlp', + 'bin_providers': 'pip,env', + })) + missing_deps.append('yt-dlp') + + # Emit results for node + if node_result and node_result.get('abspath'): + print(json.dumps({ + 'type': 'InstalledBinary', + 'name': node_result['name'], + 'abspath': node_result['abspath'], + 'version': node_result['version'], + 'sha256': node_result['sha256'], + 'binprovider': node_result['binprovider'], + })) + + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/NODE_BINARY', + 'value': node_result['abspath'], + })) + + if node_result['version']: + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/NODE_VERSION', + 'value': node_result['version'], + })) + else: + print(json.dumps({ + 'type': 'Dependency', + 'bin_name': 'node', + 'bin_providers': 'apt,brew,env', + })) + missing_deps.append('node') + + # Emit results for ffmpeg + if ffmpeg_result and ffmpeg_result.get('abspath'): + print(json.dumps({ + 'type': 'InstalledBinary', + 'name': ffmpeg_result['name'], + 'abspath': ffmpeg_result['abspath'], + 'version': ffmpeg_result['version'], + 'sha256': ffmpeg_result['sha256'], + 'binprovider': ffmpeg_result['binprovider'], + })) + + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/FFMPEG_BINARY', + 'value': ffmpeg_result['abspath'], + })) + + if ffmpeg_result['version']: + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/FFMPEG_VERSION', + 'value': ffmpeg_result['version'], + })) + else: + print(json.dumps({ + 'type': 'Dependency', + 'bin_name': 'ffmpeg', + 'bin_providers': 'apt,brew,env', + })) + missing_deps.append('ffmpeg') + + if missing_deps: + print(f"Missing dependencies: {', '.join(missing_deps)}", file=sys.stderr) + sys.exit(1) + else: + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/media/templates/embed.html b/archivebox/plugins/media/templates/embed.html new file mode 100644 index 00000000..bc556b49 --- /dev/null +++ b/archivebox/plugins/media/templates/embed.html @@ -0,0 +1,9 @@ + +
+ +
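For reference, when the validate hook above finds yt-dlp but not ffmpeg, its stdout is a JSONL stream using the record shapes defined in the hook; the path and version values below are placeholders:

{"type": "InstalledBinary", "name": "yt-dlp", "abspath": "/usr/local/bin/yt-dlp", "version": "<yt-dlp version>", "sha256": "<sha256 hex>", "binprovider": "pip"}
{"type": "Machine", "_method": "update", "key": "config/YTDLP_BINARY", "value": "/usr/local/bin/yt-dlp"}
{"type": "Machine", "_method": "update", "key": "config/YTDLP_VERSION", "value": "<yt-dlp version>"}
{"type": "Dependency", "bin_name": "ffmpeg", "bin_providers": "apt,brew,env"}

The hook then prints "Missing dependencies: ffmpeg" to stderr and exits 1.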
diff --git a/archivebox/plugins/media/templates/fullscreen.html b/archivebox/plugins/media/templates/fullscreen.html new file mode 100644 index 00000000..3a7b0f6f --- /dev/null +++ b/archivebox/plugins/media/templates/fullscreen.html @@ -0,0 +1,10 @@ + +
+ +
diff --git a/archivebox/plugins/media/templates/icon.html b/archivebox/plugins/media/templates/icon.html new file mode 100644 index 00000000..b17d15b8 --- /dev/null +++ b/archivebox/plugins/media/templates/icon.html @@ -0,0 +1 @@ +🎬 \ No newline at end of file diff --git a/archivebox/plugins/media/templates/thumbnail.html b/archivebox/plugins/media/templates/thumbnail.html new file mode 100644 index 00000000..8cfe3ef5 --- /dev/null +++ b/archivebox/plugins/media/templates/thumbnail.html @@ -0,0 +1,14 @@ + +
+ +
+ 🎬 + Media +
+
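The find_ytdlp/find_node/find_ffmpeg helpers above repeat the same lookup strategy: try abx-pkg first, then fall back to shutil.which plus a per-binary env-var override. A generic version of that fallback, shown only as a sketch and not part of this change:

import os
import shutil
from pathlib import Path

def find_binary_fallback(name: str, env_var: str) -> str | None:
    """Shared PATH/env-var fallback used when abx-pkg is unavailable or finds nothing."""
    abspath = shutil.which(name) or os.environ.get(env_var, '')
    if abspath and Path(abspath).is_file():
        return abspath
    return None

# e.g. find_binary_fallback('node', 'NODE_BINARY') or find_binary_fallback('ffmpeg', 'FFMPEG_BINARY')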
diff --git a/archivebox/plugins/media/tests/test_media.py b/archivebox/plugins/media/tests/test_media.py index f2db82b4..a669a549 100644 --- a/archivebox/plugins/media/tests/test_media.py +++ b/archivebox/plugins/media/tests/test_media.py @@ -21,7 +21,7 @@ import pytest PLUGIN_DIR = Path(__file__).parent.parent PLUGINS_ROOT = PLUGIN_DIR.parent MEDIA_HOOK = PLUGIN_DIR / 'on_Snapshot__51_media.py' -MEDIA_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_ytdlp.py' +MEDIA_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_ytdlp.py' TEST_URL = 'https://example.com/video.mp4' def test_hook_script_exists(): @@ -29,46 +29,72 @@ def test_hook_script_exists(): assert MEDIA_HOOK.exists(), f"Hook not found: {MEDIA_HOOK}" -def test_ytdlp_install_hook(): - """Test yt-dlp install hook to install yt-dlp if needed.""" - # Run yt-dlp install hook +def test_ytdlp_validate_hook(): + """Test yt-dlp validate hook checks for yt-dlp and dependencies (node, ffmpeg).""" + # Run yt-dlp validate hook result = subprocess.run( - [sys.executable, str(MEDIA_INSTALL_HOOK)], + [sys.executable, str(MEDIA_VALIDATE_HOOK)], capture_output=True, text=True, - timeout=600 + timeout=30 ) - assert result.returncode == 0, f"Install hook failed: {result.stderr}" + # Hook exits 0 if all binaries found, 1 if any not found + # Parse output for InstalledBinary and Dependency records + found_binaries = {'node': False, 'ffmpeg': False, 'yt-dlp': False} + found_dependencies = {'node': False, 'ffmpeg': False, 'yt-dlp': False} - # Verify InstalledBinary JSONL output - found_binary = False for line in result.stdout.strip().split('\n'): if line.strip(): try: record = json.loads(line) if record.get('type') == 'InstalledBinary': - assert record['name'] == 'yt-dlp' - assert record['abspath'] - found_binary = True - break + name = record['name'] + if name in found_binaries: + assert record['abspath'], f"{name} should have abspath" + found_binaries[name] = True + elif record.get('type') == 'Dependency': + name = record['bin_name'] + if name in found_dependencies: + found_dependencies[name] = True except json.JSONDecodeError: pass - assert found_binary, "Should output InstalledBinary record" + # Each binary should either be found (InstalledBinary) or missing (Dependency) + for binary_name in ['yt-dlp', 'node', 'ffmpeg']: + assert found_binaries[binary_name] or found_dependencies[binary_name], \ + f"{binary_name} should have either InstalledBinary or Dependency record" def test_verify_deps_with_abx_pkg(): - """Verify yt-dlp is available via abx-pkg after hook installation.""" - from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides + """Verify yt-dlp, node, and ffmpeg are available via abx-pkg.""" + from abx_pkg import Binary, PipProvider, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides - PipProvider.model_rebuild() - EnvProvider.model_rebuild() + missing_binaries = [] # Verify yt-dlp is available ytdlp_binary = Binary(name='yt-dlp', binproviders=[PipProvider(), EnvProvider()]) ytdlp_loaded = ytdlp_binary.load() - assert ytdlp_loaded and ytdlp_loaded.abspath, "yt-dlp should be available after install hook" + if not (ytdlp_loaded and ytdlp_loaded.abspath): + missing_binaries.append('yt-dlp') + + # Verify node is available (yt-dlp needs it for JS extraction) + node_binary = Binary( + name='node', + binproviders=[AptProvider(), BrewProvider(), EnvProvider()] + ) + node_loaded = node_binary.load() + if not (node_loaded and node_loaded.abspath): + missing_binaries.append('node') + + # Verify ffmpeg is available (yt-dlp 
needs it for video conversion) + ffmpeg_binary = Binary(name='ffmpeg', binproviders=[AptProvider(), BrewProvider(), EnvProvider()]) + ffmpeg_loaded = ffmpeg_binary.load() + if not (ffmpeg_loaded and ffmpeg_loaded.abspath): + missing_binaries.append('ffmpeg') + + if missing_binaries: + pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted") def test_handles_non_media_url(): """Test that media extractor handles non-media URLs gracefully via hook.""" diff --git a/archivebox/plugins/mercury/on_Crawl__00_install_mercury.py b/archivebox/plugins/mercury/on_Crawl__00_install_mercury.py deleted file mode 100755 index e7f86995..00000000 --- a/archivebox/plugins/mercury/on_Crawl__00_install_mercury.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python3 -""" -Install mercury-parser if not already available. - -Runs at crawl start to ensure mercury-parser is installed. -Outputs JSONL for InstalledBinary. -""" - -import json -import sys -from pathlib import Path - - -def main(): - try: - from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides - - NpmProvider.model_rebuild() - EnvProvider.model_rebuild() - - # Note: npm package is @postlight/mercury-parser, binary is mercury-parser - mercury_binary = Binary( - name='mercury-parser', - binproviders=[NpmProvider(), EnvProvider()], - overrides={'npm': {'packages': ['@postlight/mercury-parser']}} - ) - - # Try to load, install if not found - try: - loaded = mercury_binary.load() - if not loaded or not loaded.abspath: - raise Exception("Not loaded") - except Exception: - # Install via npm - loaded = mercury_binary.install() - - if loaded and loaded.abspath: - # Output InstalledBinary JSONL - print(json.dumps({ - 'type': 'InstalledBinary', - 'name': 'mercury-parser', - 'abspath': str(loaded.abspath), - 'version': str(loaded.version) if loaded.version else None, - 'sha256': loaded.sha256, - 'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown', - })) - sys.exit(0) - else: - print(json.dumps({ - 'type': 'Dependency', - 'bin_name': 'mercury-parser', - 'bin_providers': 'npm,env', - })) - print("Failed to install mercury-parser", file=sys.stderr) - sys.exit(1) - - except Exception as e: - print(json.dumps({ - 'type': 'Dependency', - 'bin_name': 'mercury-parser', - 'bin_providers': 'npm,env', - })) - print(f"Error installing mercury-parser: {e}", file=sys.stderr) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/mercury/on_Crawl__00_validate_mercury.py b/archivebox/plugins/mercury/on_Crawl__00_validate_mercury.py new file mode 100755 index 00000000..21e46225 --- /dev/null +++ b/archivebox/plugins/mercury/on_Crawl__00_validate_mercury.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +""" +Validation hook for postlight-parser binary. + +Runs at crawl start to verify postlight-parser is available. +Outputs JSONL for InstalledBinary and Machine config updates. 
+""" + +import os +import sys +import json +import shutil +import hashlib +import subprocess +from pathlib import Path + + +def get_binary_version(abspath: str) -> str | None: + """Get version string from binary.""" + try: + result = subprocess.run( + [abspath, '--version'], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode == 0 and result.stdout: + first_line = result.stdout.strip().split('\n')[0] + return first_line[:64] + except Exception: + pass + return None + + +def get_binary_hash(abspath: str) -> str | None: + """Get SHA256 hash of binary.""" + try: + with open(abspath, 'rb') as f: + return hashlib.sha256(f.read()).hexdigest() + except Exception: + return None + + +def find_mercury() -> dict | None: + """Find postlight-parser binary.""" + try: + from abx_pkg import Binary, NpmProvider, EnvProvider + + class MercuryBinary(Binary): + name: str = 'postlight-parser' + binproviders_supported = [NpmProvider(), EnvProvider()] + overrides: dict = {'npm': {'packages': ['@postlight/parser']}} + + binary = MercuryBinary() + loaded = binary.load() + if loaded and loaded.abspath: + return { + 'name': 'postlight-parser', + 'abspath': str(loaded.abspath), + 'version': str(loaded.version) if loaded.version else None, + 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, + 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', + } + except ImportError: + pass + except Exception: + pass + + # Fallback to shutil.which + abspath = shutil.which('postlight-parser') or os.environ.get('MERCURY_BINARY', '') + if abspath and Path(abspath).is_file(): + return { + 'name': 'postlight-parser', + 'abspath': abspath, + 'version': get_binary_version(abspath), + 'sha256': get_binary_hash(abspath), + 'binprovider': 'env', + } + + return None + + +def main(): + result = find_mercury() + + if result and result.get('abspath'): + print(json.dumps({ + 'type': 'InstalledBinary', + 'name': result['name'], + 'abspath': result['abspath'], + 'version': result['version'], + 'sha256': result['sha256'], + 'binprovider': result['binprovider'], + })) + + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/MERCURY_BINARY', + 'value': result['abspath'], + })) + + if result['version']: + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/MERCURY_VERSION', + 'value': result['version'], + })) + + sys.exit(0) + else: + print(json.dumps({ + 'type': 'Dependency', + 'bin_name': 'postlight-parser', + 'bin_providers': 'npm,env', + })) + print(f"postlight-parser binary not found", file=sys.stderr) + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/mercury/on_Snapshot__53_mercury.py b/archivebox/plugins/mercury/on_Snapshot__53_mercury.py index 89be4a4d..e9b5f63a 100644 --- a/archivebox/plugins/mercury/on_Snapshot__53_mercury.py +++ b/archivebox/plugins/mercury/on_Snapshot__53_mercury.py @@ -6,10 +6,10 @@ Usage: on_Snapshot__mercury.py --url= --snapshot-id= Output: Creates mercury/ directory with content.html, content.txt, article.json Environment variables: - MERCURY_BINARY: Path to mercury-parser binary + MERCURY_BINARY: Path to postlight-parser binary TIMEOUT: Timeout in seconds (default: 60) -Note: Requires mercury-parser: npm install -g @postlight/mercury-parser +Note: Requires postlight-parser: npm install -g @postlight/parser """ import json @@ -25,7 +25,7 @@ import rich_click as click # Extractor metadata EXTRACTOR_NAME = 'mercury' -BIN_NAME = 'mercury-parser' +BIN_NAME = 'postlight-parser' 
BIN_PROVIDERS = 'npm,env' OUTPUT_DIR = 'mercury' @@ -42,12 +42,12 @@ def get_env_int(name: str, default: int = 0) -> int: def find_mercury() -> str | None: - """Find mercury-parser binary.""" + """Find postlight-parser binary.""" mercury = get_env('MERCURY_BINARY') if mercury and os.path.isfile(mercury): return mercury - for name in ['mercury-parser', 'mercury']: + for name in ['postlight-parser']: binary = shutil.which(name) if binary: return binary @@ -56,7 +56,7 @@ def find_mercury() -> str | None: def get_version(binary: str) -> str: - """Get mercury-parser version.""" + """Get postlight-parser version.""" try: result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10) return result.stdout.strip()[:64] @@ -83,12 +83,12 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]: if result_text.returncode != 0: stderr = result_text.stderr.decode('utf-8', errors='replace') - return False, None, f'mercury-parser failed: {stderr[:200]}' + return False, None, f'postlight-parser failed: {stderr[:200]}' try: text_json = json.loads(result_text.stdout) except json.JSONDecodeError: - return False, None, 'mercury-parser returned invalid JSON' + return False, None, 'postlight-parser returned invalid JSON' if text_json.get('failed'): return False, None, 'Mercury was not able to extract article' @@ -139,7 +139,7 @@ def main(url: str, snapshot_id: str): # Find binary binary = find_mercury() if not binary: - print(f'ERROR: mercury-parser binary not found', file=sys.stderr) + print(f'ERROR: postlight-parser binary not found', file=sys.stderr) print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr) print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr) sys.exit(1) diff --git a/archivebox/plugins/mercury/templates/embed.html b/archivebox/plugins/mercury/templates/embed.html new file mode 100644 index 00000000..29b52d02 --- /dev/null +++ b/archivebox/plugins/mercury/templates/embed.html @@ -0,0 +1,6 @@ + + diff --git a/archivebox/plugins/mercury/templates/fullscreen.html b/archivebox/plugins/mercury/templates/fullscreen.html new file mode 100644 index 00000000..6cf4dd70 --- /dev/null +++ b/archivebox/plugins/mercury/templates/fullscreen.html @@ -0,0 +1,6 @@ + + diff --git a/archivebox/plugins/mercury/templates/icon.html b/archivebox/plugins/mercury/templates/icon.html new file mode 100644 index 00000000..776ed9b1 --- /dev/null +++ b/archivebox/plugins/mercury/templates/icon.html @@ -0,0 +1 @@ +☿️ \ No newline at end of file diff --git a/archivebox/plugins/mercury/templates/thumbnail.html b/archivebox/plugins/mercury/templates/thumbnail.html new file mode 100644 index 00000000..cf7cdb40 --- /dev/null +++ b/archivebox/plugins/mercury/templates/thumbnail.html @@ -0,0 +1,8 @@ + +
+ +
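The mercury hunks above and below track the upstream rename: the npm package @postlight/mercury-parser is now @postlight/parser, and the installed binary is postlight-parser instead of mercury-parser. Because the package name no longer matches the binary name, resolving it through abx-pkg needs an npm override, as in the updated test below; for illustration:

from abx_pkg import Binary, NpmProvider, EnvProvider

mercury_binary = Binary(
    name='postlight-parser',
    binproviders=[NpmProvider(), EnvProvider()],
    overrides={'npm': {'packages': ['@postlight/parser']}},  # npm package name differs from the binary name
)
loaded = mercury_binary.load()  # .abspath is set if the binary was found

Installing it manually is npm install -g @postlight/parser, per the updated docstring in on_Snapshot__53_mercury.py.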
diff --git a/archivebox/plugins/mercury/tests/test_mercury.py b/archivebox/plugins/mercury/tests/test_mercury.py index 45de57a4..1a15cc5d 100644 --- a/archivebox/plugins/mercury/tests/test_mercury.py +++ b/archivebox/plugins/mercury/tests/test_mercury.py @@ -21,7 +21,7 @@ import pytest PLUGIN_DIR = Path(__file__).parent.parent PLUGINS_ROOT = PLUGIN_DIR.parent MERCURY_HOOK = PLUGIN_DIR / 'on_Snapshot__53_mercury.py' -MERCURY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_mercury.py' +MERCURY_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_mercury.py' TEST_URL = 'https://example.com' def test_hook_script_exists(): @@ -29,53 +29,70 @@ def test_hook_script_exists(): assert MERCURY_HOOK.exists(), f"Hook not found: {MERCURY_HOOK}" -def test_mercury_install_hook(): - """Test mercury install hook to install mercury-parser if needed.""" - # Run mercury install hook +def test_mercury_validate_hook(): + """Test mercury validate hook checks for postlight-parser.""" + # Run mercury validate hook result = subprocess.run( - [sys.executable, str(MERCURY_INSTALL_HOOK)], + [sys.executable, str(MERCURY_VALIDATE_HOOK)], capture_output=True, text=True, - timeout=600 + timeout=30 ) - assert result.returncode == 0, f"Install hook failed: {result.stderr}" - - # Verify InstalledBinary JSONL output - found_binary = False - for line in result.stdout.strip().split('\n'): - if line.strip(): - try: - record = json.loads(line) - if record.get('type') == 'InstalledBinary': - assert record['name'] == 'mercury-parser' - assert record['abspath'] - found_binary = True - break - except json.JSONDecodeError: - pass - - assert found_binary, "Should output InstalledBinary record" + # Hook exits 0 if binary found, 1 if not found (with Dependency record) + if result.returncode == 0: + # Binary found - verify InstalledBinary JSONL output + found_binary = False + for line in result.stdout.strip().split('\n'): + if line.strip(): + try: + record = json.loads(line) + if record.get('type') == 'InstalledBinary': + assert record['name'] == 'postlight-parser' + assert record['abspath'] + found_binary = True + break + except json.JSONDecodeError: + pass + assert found_binary, "Should output InstalledBinary record when binary found" + else: + # Binary not found - verify Dependency JSONL output + found_dependency = False + for line in result.stdout.strip().split('\n'): + if line.strip(): + try: + record = json.loads(line) + if record.get('type') == 'Dependency': + assert record['bin_name'] == 'postlight-parser' + assert 'npm' in record['bin_providers'] + found_dependency = True + break + except json.JSONDecodeError: + pass + assert found_dependency, "Should output Dependency record when binary not found" def test_verify_deps_with_abx_pkg(): - """Verify mercury-parser is available via abx-pkg after hook installation.""" + """Verify postlight-parser is available via abx-pkg.""" from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides - NpmProvider.model_rebuild() - EnvProvider.model_rebuild() - - # Verify mercury-parser is available + # Verify postlight-parser is available mercury_binary = Binary( - name='mercury-parser', + name='postlight-parser', binproviders=[NpmProvider(), EnvProvider()], - overrides={'npm': {'packages': ['@postlight/mercury-parser']}} + overrides={'npm': {'packages': ['@postlight/parser']}} ) mercury_loaded = mercury_binary.load() - assert mercury_loaded and mercury_loaded.abspath, "mercury-parser should be available after install hook" + + # If validate hook found it (exit 0), this should succeed 
+ # If validate hook didn't find it (exit 1), this may fail unless binprovider installed it + if mercury_loaded and mercury_loaded.abspath: + assert True, "postlight-parser is available" + else: + pytest.skip("postlight-parser not available - Dependency record should have been emitted") def test_extracts_with_mercury_parser(): - """Test full workflow: extract with mercury-parser from real HTML via hook.""" + """Test full workflow: extract with postlight-parser from real HTML via hook.""" # Prerequisites checked by earlier test with tempfile.TemporaryDirectory() as tmpdir: diff --git a/archivebox/plugins/merkletree/on_Snapshot__93_merkletree.py b/archivebox/plugins/merkletree/on_Snapshot__93_merkletree.py index 97757d3f..7ebd39c4 100755 --- a/archivebox/plugins/merkletree/on_Snapshot__93_merkletree.py +++ b/archivebox/plugins/merkletree/on_Snapshot__93_merkletree.py @@ -2,46 +2,28 @@ """ Create a Merkle tree of all archived outputs. -This plugin runs after all extractors and post-processing complete (priority 92) -and generates a cryptographic Merkle tree of all files in the snapshot directory. -This provides: - - Tamper detection: verify archive integrity - - Efficient updates: only re-hash changed files - - Compact proofs: prove file inclusion without sending all files - - Deduplication: identify identical content across snapshots +This plugin runs after all extractors complete (priority 93) and generates +a cryptographic Merkle tree of all files in the snapshot directory. -Output: merkletree/merkletree.json containing: - - root_hash: SHA256 hash of the Merkle root - - tree: Full tree structure with internal nodes - - files: List of all files with their hashes - - metadata: Timestamp, file count, total size +Output: merkletree.json containing root_hash, tree structure, file list, metadata -Usage: on_Snapshot__92_merkletree.py --url= --snapshot-id= +Usage: on_Snapshot__93_merkletree.py --url= --snapshot-id= Environment variables: SAVE_MERKLETREE: Enable merkle tree generation (default: true) + DATA_DIR: ArchiveBox data directory + ARCHIVE_DIR: Archive output directory """ -__package__ = 'archivebox.plugins.merkletree' - import os import sys import json import hashlib from pathlib import Path -from datetime import datetime +from datetime import datetime, timezone from typing import Dict, List, Optional, Tuple, Any -# Configure Django if running standalone -if __name__ == '__main__': - parent_dir = str(Path(__file__).resolve().parent.parent.parent) - if parent_dir not in sys.path: - sys.path.insert(0, parent_dir) - os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings') - import django - django.setup() - -import rich_click as click +import click def sha256_file(filepath: Path) -> str: @@ -49,12 +31,10 @@ def sha256_file(filepath: Path) -> str: h = hashlib.sha256() try: with open(filepath, 'rb') as f: - # Read in 64kb chunks while chunk := f.read(65536): h.update(chunk) return h.hexdigest() except (OSError, PermissionError): - # If we can't read the file, return a null hash return '0' * 64 @@ -64,74 +44,45 @@ def sha256_data(data: bytes) -> str: def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]: - """ - Recursively collect all files in snapshot directory. 
- - Args: - snapshot_dir: Root directory to scan - exclude_dirs: Directory names to exclude (e.g., ['merkletree', '.git']) - - Returns: - List of (relative_path, sha256_hash, file_size) tuples - """ + """Recursively collect all files in snapshot directory.""" exclude_dirs = exclude_dirs or ['merkletree', '.git', '__pycache__'] files = [] for root, dirs, filenames in os.walk(snapshot_dir): - # Filter out excluded directories dirs[:] = [d for d in dirs if d not in exclude_dirs] for filename in filenames: filepath = Path(root) / filename rel_path = filepath.relative_to(snapshot_dir) - # Skip symlinks (we hash the target, not the link) if filepath.is_symlink(): continue - # Compute hash and size file_hash = sha256_file(filepath) file_size = filepath.stat().st_size if filepath.exists() else 0 - files.append((rel_path, file_hash, file_size)) - # Sort by path for deterministic tree files.sort(key=lambda x: str(x[0])) return files def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]: - """ - Build a Merkle tree from a list of leaf hashes. - - Args: - file_hashes: List of SHA256 hashes (leaves) - - Returns: - (root_hash, tree_levels) where tree_levels is a list of hash lists per level - """ + """Build a Merkle tree from a list of leaf hashes.""" if not file_hashes: - # Empty tree return sha256_data(b''), [[]] - # Initialize with leaf level tree_levels = [file_hashes.copy()] - # Build tree bottom-up while len(tree_levels[-1]) > 1: current_level = tree_levels[-1] next_level = [] - # Process pairs for i in range(0, len(current_level), 2): left = current_level[i] - if i + 1 < len(current_level): - # Combine left + right right = current_level[i + 1] combined = left + right else: - # Odd number of nodes: duplicate the last one combined = left + left parent_hash = sha256_data(combined.encode('utf-8')) @@ -139,67 +90,41 @@ def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]: tree_levels.append(next_level) - # Root is the single hash at the top level root_hash = tree_levels[-1][0] return root_hash, tree_levels def create_merkle_tree(snapshot_dir: Path) -> Dict[str, Any]: - """ - Create a complete Merkle tree of all files in snapshot directory. 
- - Args: - snapshot_dir: The snapshot directory to scan - - Returns: - Dict containing root_hash, tree structure, file list, and metadata - """ - # Collect all files + """Create a complete Merkle tree of all files in snapshot directory.""" files = collect_files(snapshot_dir) - - # Extract just the hashes for tree building file_hashes = [file_hash for _, file_hash, _ in files] - - # Build Merkle tree root_hash, tree_levels = build_merkle_tree(file_hashes) - - # Calculate total size total_size = sum(size for _, _, size in files) - # Prepare file list with metadata file_list = [ - { - 'path': str(path), - 'hash': file_hash, - 'size': size, - } + {'path': str(path), 'hash': file_hash, 'size': size} for path, file_hash, size in files ] - # Prepare result - result = { + return { 'root_hash': root_hash, 'tree_levels': tree_levels, 'files': file_list, 'metadata': { - 'timestamp': datetime.now().isoformat(), + 'timestamp': datetime.now(timezone.utc).isoformat(), 'file_count': len(files), 'total_size': total_size, 'tree_depth': len(tree_levels), }, } - return result - @click.command() @click.option('--url', required=True, help='URL being archived') @click.option('--snapshot-id', required=True, help='Snapshot UUID') def main(url: str, snapshot_id: str): """Generate Merkle tree of all archived outputs.""" - from archivebox.core.models import Snapshot - - start_ts = datetime.now() + start_ts = datetime.now(timezone.utc) status = 'failed' output = None error = '' @@ -211,30 +136,19 @@ def main(url: str, snapshot_id: str): save_merkletree = os.getenv('SAVE_MERKLETREE', 'true').lower() in ('true', '1', 'yes', 'on') if not save_merkletree: - click.echo('Skipping merkle tree (SAVE_MERKLETREE=False)') status = 'skipped' - end_ts = datetime.now() - click.echo(f'START_TS={start_ts.isoformat()}') - click.echo(f'END_TS={end_ts.isoformat()}') - click.echo(f'STATUS={status}') - click.echo(f'RESULT_JSON={{"extractor": "merkletree", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}') + click.echo(json.dumps({'status': status, 'output': 'SAVE_MERKLETREE=false'})) sys.exit(0) - # Get snapshot - try: - snapshot = Snapshot.objects.get(id=snapshot_id) - except Snapshot.DoesNotExist: - error = f'Snapshot {snapshot_id} not found' - raise ValueError(error) + # Working directory is the extractor output dir (e.g., /merkletree/) + # Parent is the snapshot directory + output_dir = Path.cwd() + snapshot_dir = output_dir.parent - # Get snapshot directory - snapshot_dir = Path(snapshot.output_dir) if not snapshot_dir.exists(): - error = f'Snapshot directory not found: {snapshot_dir}' - raise FileNotFoundError(error) + raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}') - # Create output directory - output_dir = snapshot_dir / 'merkletree' + # Ensure output directory exists output_dir.mkdir(exist_ok=True) output_path = output_dir / 'merkletree.json' @@ -246,49 +160,31 @@ def main(url: str, snapshot_id: str): json.dump(merkle_data, f, indent=2) status = 'succeeded' - output = str(output_path) + output = 'merkletree.json' root_hash = merkle_data['root_hash'] file_count = merkle_data['metadata']['file_count'] total_size = merkle_data['metadata']['total_size'] - click.echo(f'Merkle tree created: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes') + click.echo(f'Merkle tree: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes') except Exception as e: error = f'{type(e).__name__}: {e}' status = 'failed' click.echo(f'Error: {error}', err=True) - end_ts = 
datetime.now() - duration = (end_ts - start_ts).total_seconds() + end_ts = datetime.now(timezone.utc) - # Print results - click.echo(f'START_TS={start_ts.isoformat()}') - click.echo(f'END_TS={end_ts.isoformat()}') - click.echo(f'DURATION={duration:.2f}') - if output: - click.echo(f'OUTPUT={output}') - click.echo(f'STATUS={status}') - - if error: - click.echo(f'ERROR={error}', err=True) - - # Print JSON result - result_json = { - 'extractor': 'merkletree', - 'url': url, - 'snapshot_id': snapshot_id, + # Print JSON result for hook runner + result = { 'status': status, - 'start_ts': start_ts.isoformat(), - 'end_ts': end_ts.isoformat(), - 'duration': round(duration, 2), 'output': output, + 'error': error or None, 'root_hash': root_hash, 'file_count': file_count, - 'error': error or None, } - click.echo(f'RESULT_JSON={json.dumps(result_json)}') + click.echo(json.dumps(result)) - sys.exit(0 if status == 'succeeded' else 1) + sys.exit(0 if status in ('succeeded', 'skipped') else 1) if __name__ == '__main__': diff --git a/archivebox/plugins/parse_dom_outlinks/templates/icon.html b/archivebox/plugins/parse_dom_outlinks/templates/icon.html new file mode 100644 index 00000000..f77458fd --- /dev/null +++ b/archivebox/plugins/parse_dom_outlinks/templates/icon.html @@ -0,0 +1 @@ +🔗 diff --git a/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py b/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py index 430f87ae..b295f79f 100755 --- a/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py +++ b/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py @@ -133,7 +133,8 @@ def fetch_content(url: str) -> str: @click.command() @click.option('--url', required=True, help='HTML URL to parse') -def main(url: str): +@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)') +def main(url: str, snapshot_id: str = None): """Parse HTML and extract href URLs.""" # Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage) diff --git a/archivebox/plugins/parse_html_urls/templates/icon.html b/archivebox/plugins/parse_html_urls/templates/icon.html new file mode 100644 index 00000000..f77458fd --- /dev/null +++ b/archivebox/plugins/parse_html_urls/templates/icon.html @@ -0,0 +1 @@ +🔗 diff --git a/archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py b/archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py index 57979504..e75a9a4f 100755 --- a/archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py +++ b/archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py @@ -127,7 +127,8 @@ def fetch_content(url: str) -> str: @click.command() @click.option('--url', required=True, help='JSONL file URL to parse') -def main(url: str): +@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)') +def main(url: str, snapshot_id: str = None): """Parse JSONL bookmark file and extract URLs.""" try: diff --git a/archivebox/plugins/parse_jsonl_urls/templates/icon.html b/archivebox/plugins/parse_jsonl_urls/templates/icon.html new file mode 100644 index 00000000..98c76c15 --- /dev/null +++ b/archivebox/plugins/parse_jsonl_urls/templates/icon.html @@ -0,0 +1 @@ +📋 diff --git a/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py b/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py index 6f1dd512..a11e9bc9 100755 --- 
a/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py +++ b/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py @@ -52,7 +52,8 @@ def fetch_content(url: str) -> str: @click.command() @click.option('--url', required=True, help='Netscape bookmark file URL to parse') -def main(url: str): +@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)') +def main(url: str, snapshot_id: str = None): """Parse Netscape bookmark HTML and extract URLs.""" try: diff --git a/archivebox/plugins/parse_netscape_urls/templates/icon.html b/archivebox/plugins/parse_netscape_urls/templates/icon.html new file mode 100644 index 00000000..0cc8da81 --- /dev/null +++ b/archivebox/plugins/parse_netscape_urls/templates/icon.html @@ -0,0 +1 @@ +🔖 diff --git a/archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py b/archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py index 64310810..554eb8ef 100755 --- a/archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py +++ b/archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py @@ -51,7 +51,8 @@ def fetch_content(url: str) -> str: @click.command() @click.option('--url', required=True, help='RSS/Atom feed URL to parse') -def main(url: str): +@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)') +def main(url: str, snapshot_id: str = None): """Parse RSS/Atom feed and extract article URLs.""" if feedparser is None: diff --git a/archivebox/plugins/parse_rss_urls/templates/icon.html b/archivebox/plugins/parse_rss_urls/templates/icon.html new file mode 100644 index 00000000..81de8a1a --- /dev/null +++ b/archivebox/plugins/parse_rss_urls/templates/icon.html @@ -0,0 +1 @@ +📡 diff --git a/archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py b/archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py index c3c5c8d0..9b94d35a 100755 --- a/archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py +++ b/archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py @@ -100,7 +100,8 @@ def fetch_content(url: str) -> str: @click.command() @click.option('--url', required=True, help='URL to parse (file:// or https://)') -def main(url: str): +@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)') +def main(url: str, snapshot_id: str = None): """Parse plain text and extract URLs.""" try: diff --git a/archivebox/plugins/parse_txt_urls/templates/icon.html b/archivebox/plugins/parse_txt_urls/templates/icon.html new file mode 100644 index 00000000..0351b8bf --- /dev/null +++ b/archivebox/plugins/parse_txt_urls/templates/icon.html @@ -0,0 +1 @@ +📃 diff --git a/archivebox/plugins/pdf/templates/embed.html b/archivebox/plugins/pdf/templates/embed.html new file mode 100644 index 00000000..732a01bc --- /dev/null +++ b/archivebox/plugins/pdf/templates/embed.html @@ -0,0 +1,5 @@ + + diff --git a/archivebox/plugins/pdf/templates/fullscreen.html b/archivebox/plugins/pdf/templates/fullscreen.html new file mode 100644 index 00000000..240b7cea --- /dev/null +++ b/archivebox/plugins/pdf/templates/fullscreen.html @@ -0,0 +1,5 @@ + + diff --git a/archivebox/plugins/pdf/templates/icon.html b/archivebox/plugins/pdf/templates/icon.html new file mode 100644 index 00000000..063530f3 --- /dev/null +++ b/archivebox/plugins/pdf/templates/icon.html @@ -0,0 +1 @@ +📄 \ No newline at end of file diff --git 
a/archivebox/plugins/pdf/templates/thumbnail.html b/archivebox/plugins/pdf/templates/thumbnail.html new file mode 100644 index 00000000..32895d04 --- /dev/null +++ b/archivebox/plugins/pdf/templates/thumbnail.html @@ -0,0 +1,6 @@ + +
+ +
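A few hunks up, the merkletree hook builds the tree bottom-up over the sorted per-file SHA-256 hashes: each level concatenates hex digests pairwise (duplicating the last hash when the level has an odd count) and hashes the concatenation. A short sketch of re-deriving the root from a saved merkletree.json to verify integrity, following the hook's output format:

import hashlib
import json

def recompute_root(merkletree_path: str) -> bool:
    """Recompute the Merkle root from the stored file hashes and compare it to root_hash."""
    with open(merkletree_path) as f:
        data = json.load(f)
    level = [entry['hash'] for entry in data['files']]  # leaf hashes, already sorted by path
    if not level:
        return data['root_hash'] == hashlib.sha256(b'').hexdigest()
    while len(level) > 1:
        next_level = []
        for i in range(0, len(level), 2):
            left = level[i]
            right = level[i + 1] if i + 1 < len(level) else left  # odd count: pair the last hash with itself
            next_level.append(hashlib.sha256((left + right).encode('utf-8')).hexdigest())
        level = next_level
    return level[0] == data['root_hash']

recompute_root('archive/<timestamp>/merkletree/merkletree.json')  # True if no file changed since archiving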
diff --git a/archivebox/plugins/readability/on_Crawl__00_install_readability.py b/archivebox/plugins/readability/on_Crawl__00_install_readability.py deleted file mode 100755 index 0a1cb077..00000000 --- a/archivebox/plugins/readability/on_Crawl__00_install_readability.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python3 -""" -Install readability-extractor if not already available. - -Runs at crawl start to ensure readability-extractor is installed. -Outputs JSONL for InstalledBinary. -""" - -import json -import sys -from pathlib import Path - - -def main(): - try: - from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides - - NpmProvider.model_rebuild() - EnvProvider.model_rebuild() - - # Note: npm package is from github:ArchiveBox/readability-extractor - readability_binary = Binary( - name='readability-extractor', - binproviders=[NpmProvider(), EnvProvider()], - overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}} - ) - - # Try to load, install if not found - try: - loaded = readability_binary.load() - if not loaded or not loaded.abspath: - raise Exception("Not loaded") - except Exception: - # Install via npm from GitHub repo - loaded = readability_binary.install() - - if loaded and loaded.abspath: - # Output InstalledBinary JSONL - print(json.dumps({ - 'type': 'InstalledBinary', - 'name': 'readability-extractor', - 'abspath': str(loaded.abspath), - 'version': str(loaded.version) if loaded.version else None, - 'sha256': loaded.sha256, - 'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown', - })) - sys.exit(0) - else: - print(json.dumps({ - 'type': 'Dependency', - 'bin_name': 'readability-extractor', - 'bin_providers': 'npm,env', - })) - print("Failed to install readability-extractor", file=sys.stderr) - sys.exit(1) - - except Exception as e: - print(json.dumps({ - 'type': 'Dependency', - 'bin_name': 'readability-extractor', - 'bin_providers': 'npm,env', - })) - print(f"Error installing readability-extractor: {e}", file=sys.stderr) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/readability/on_Crawl__00_validate_readability.py b/archivebox/plugins/readability/on_Crawl__00_validate_readability.py new file mode 100755 index 00000000..d82a795f --- /dev/null +++ b/archivebox/plugins/readability/on_Crawl__00_validate_readability.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +""" +Validation hook for readability-extractor binary. + +Runs at crawl start to verify readability-extractor is available. +Outputs JSONL for InstalledBinary and Machine config updates. 
+""" + +import os +import sys +import json +import shutil +import hashlib +import subprocess +from pathlib import Path + + +def get_binary_version(abspath: str) -> str | None: + """Get version string from binary.""" + try: + result = subprocess.run( + [abspath, '--version'], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode == 0 and result.stdout: + first_line = result.stdout.strip().split('\n')[0] + return first_line[:64] + except Exception: + pass + return None + + +def get_binary_hash(abspath: str) -> str | None: + """Get SHA256 hash of binary.""" + try: + with open(abspath, 'rb') as f: + return hashlib.sha256(f.read()).hexdigest() + except Exception: + return None + + +def find_readability() -> dict | None: + """Find readability-extractor binary.""" + try: + from abx_pkg import Binary, NpmProvider, EnvProvider + + class ReadabilityBinary(Binary): + name: str = 'readability-extractor' + binproviders_supported = [NpmProvider(), EnvProvider()] + overrides: dict = {'npm': {'packages': ['github:ArchiveBox/readability-extractor']}} + + binary = ReadabilityBinary() + loaded = binary.load() + if loaded and loaded.abspath: + return { + 'name': 'readability-extractor', + 'abspath': str(loaded.abspath), + 'version': str(loaded.version) if loaded.version else None, + 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, + 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', + } + except ImportError: + pass + except Exception: + pass + + # Fallback to shutil.which + abspath = shutil.which('readability-extractor') or os.environ.get('READABILITY_BINARY', '') + if abspath and Path(abspath).is_file(): + return { + 'name': 'readability-extractor', + 'abspath': abspath, + 'version': get_binary_version(abspath), + 'sha256': get_binary_hash(abspath), + 'binprovider': 'env', + } + + return None + + +def main(): + result = find_readability() + + if result and result.get('abspath'): + print(json.dumps({ + 'type': 'InstalledBinary', + 'name': result['name'], + 'abspath': result['abspath'], + 'version': result['version'], + 'sha256': result['sha256'], + 'binprovider': result['binprovider'], + })) + + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/READABILITY_BINARY', + 'value': result['abspath'], + })) + + if result['version']: + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/READABILITY_VERSION', + 'value': result['version'], + })) + + sys.exit(0) + else: + print(json.dumps({ + 'type': 'Dependency', + 'bin_name': 'readability-extractor', + 'bin_providers': 'npm,env', + })) + print(f"readability-extractor binary not found", file=sys.stderr) + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/readability/templates/embed.html b/archivebox/plugins/readability/templates/embed.html new file mode 100644 index 00000000..bea7dd13 --- /dev/null +++ b/archivebox/plugins/readability/templates/embed.html @@ -0,0 +1,6 @@ + + diff --git a/archivebox/plugins/readability/templates/fullscreen.html b/archivebox/plugins/readability/templates/fullscreen.html new file mode 100644 index 00000000..4e842fb6 --- /dev/null +++ b/archivebox/plugins/readability/templates/fullscreen.html @@ -0,0 +1,6 @@ + + diff --git a/archivebox/plugins/readability/templates/icon.html b/archivebox/plugins/readability/templates/icon.html new file mode 100644 index 00000000..66336e65 --- /dev/null +++ b/archivebox/plugins/readability/templates/icon.html @@ -0,0 +1 @@ +📖 \ No newline at end of file diff 
--git a/archivebox/plugins/readability/templates/thumbnail.html b/archivebox/plugins/readability/templates/thumbnail.html new file mode 100644 index 00000000..5e118e55 --- /dev/null +++ b/archivebox/plugins/readability/templates/thumbnail.html @@ -0,0 +1,8 @@ + +
+ +
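The readability validate hook above records the resolved path as a Machine config update (config/READABILITY_BINARY), mirroring the MERCURY_BINARY and YTDLP_BINARY keys earlier in this patch. Snapshot-time extractors resolve their binary from an environment variable, as on_Snapshot__53_mercury.py does with MERCURY_BINARY; assuming the readability extractor follows the same convention, the lookup is roughly:

import os
import shutil

# Env-var override first (populated from the Machine config key), then PATH:
binary = os.environ.get('READABILITY_BINARY') or shutil.which('readability-extractor')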
diff --git a/archivebox/plugins/readability/tests/test_readability.py b/archivebox/plugins/readability/tests/test_readability.py index 403bfa3a..eede2939 100644 --- a/archivebox/plugins/readability/tests/test_readability.py +++ b/archivebox/plugins/readability/tests/test_readability.py @@ -2,7 +2,7 @@ Integration tests for readability plugin Tests verify: -1. Install hook installs readability-extractor via abx-pkg +1. Validate hook checks for readability-extractor binary 2. Verify deps with abx-pkg 3. Plugin reports missing dependency correctly 4. Extraction works against real example.com content @@ -21,7 +21,7 @@ import pytest PLUGIN_DIR = Path(__file__).parent.parent PLUGINS_ROOT = PLUGIN_DIR.parent READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py')) -READABILITY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_readability.py' +READABILITY_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_readability.py' TEST_URL = 'https://example.com' @@ -101,48 +101,63 @@ def test_reports_missing_dependency_when_not_installed(): assert 'readability-extractor' in combined or 'BIN_NAME' in combined, "Should mention readability-extractor" -def test_readability_install_hook(): - """Test readability install hook to install readability-extractor if needed.""" +def test_readability_validate_hook(): + """Test readability validate hook checks for readability-extractor binary.""" result = subprocess.run( - [sys.executable, str(READABILITY_INSTALL_HOOK)], + [sys.executable, str(READABILITY_VALIDATE_HOOK)], capture_output=True, text=True, - timeout=600 + timeout=30 ) - assert result.returncode == 0, f"Install hook failed: {result.stderr}" - - # Verify InstalledBinary JSONL output - found_binary = False - for line in result.stdout.strip().split('\n'): - if line.strip(): - try: - record = json.loads(line) - if record.get('type') == 'InstalledBinary': - assert record['name'] == 'readability-extractor' - assert record['abspath'] - found_binary = True - break - except json.JSONDecodeError: - pass - - assert found_binary, "Should output InstalledBinary record" + # Hook exits 0 if binary found, 1 if not found (with Dependency record) + if result.returncode == 0: + # Binary found - verify InstalledBinary JSONL output + found_binary = False + for line in result.stdout.strip().split('\n'): + if line.strip(): + try: + record = json.loads(line) + if record.get('type') == 'InstalledBinary': + assert record['name'] == 'readability-extractor' + assert record['abspath'] + found_binary = True + break + except json.JSONDecodeError: + pass + assert found_binary, "Should output InstalledBinary record when binary found" + else: + # Binary not found - verify Dependency JSONL output + found_dependency = False + for line in result.stdout.strip().split('\n'): + if line.strip(): + try: + record = json.loads(line) + if record.get('type') == 'Dependency': + assert record['bin_name'] == 'readability-extractor' + assert 'npm' in record['bin_providers'] + found_dependency = True + break + except json.JSONDecodeError: + pass + assert found_dependency, "Should output Dependency record when binary not found" def test_verify_deps_with_abx_pkg(): - """Verify readability-extractor is available via abx-pkg after hook installation.""" + """Verify readability-extractor is available via abx-pkg.""" from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides - NpmProvider.model_rebuild() - EnvProvider.model_rebuild() - readability_binary = Binary( name='readability-extractor', binproviders=[NpmProvider(), 
EnvProvider()], overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}} ) readability_loaded = readability_binary.load() - assert readability_loaded and readability_loaded.abspath, "readability-extractor should be available after install hook" + + if readability_loaded and readability_loaded.abspath: + assert True, "readability-extractor is available" + else: + pytest.skip("readability-extractor not available - Dependency record should have been emitted") def test_extracts_article_after_installation(): diff --git a/archivebox/plugins/screenshot/templates/embed.html b/archivebox/plugins/screenshot/templates/embed.html new file mode 100644 index 00000000..097a8aa6 --- /dev/null +++ b/archivebox/plugins/screenshot/templates/embed.html @@ -0,0 +1,5 @@ + +Screenshot of page diff --git a/archivebox/plugins/screenshot/templates/fullscreen.html b/archivebox/plugins/screenshot/templates/fullscreen.html new file mode 100644 index 00000000..b5f8901a --- /dev/null +++ b/archivebox/plugins/screenshot/templates/fullscreen.html @@ -0,0 +1,8 @@ + +
+ Screenshot of page +
diff --git a/archivebox/plugins/screenshot/templates/icon.html b/archivebox/plugins/screenshot/templates/icon.html new file mode 100644 index 00000000..e76b5f98 --- /dev/null +++ b/archivebox/plugins/screenshot/templates/icon.html @@ -0,0 +1 @@ +📷 \ No newline at end of file diff --git a/archivebox/plugins/screenshot/templates/thumbnail.html b/archivebox/plugins/screenshot/templates/thumbnail.html new file mode 100644 index 00000000..5d49374d --- /dev/null +++ b/archivebox/plugins/screenshot/templates/thumbnail.html @@ -0,0 +1,8 @@ + +Screenshot of page +
📷 Screenshot
diff --git a/archivebox/plugins/singlefile/templates/embed.html b/archivebox/plugins/singlefile/templates/embed.html new file mode 100644 index 00000000..e6982391 --- /dev/null +++ b/archivebox/plugins/singlefile/templates/embed.html @@ -0,0 +1,6 @@ + + diff --git a/archivebox/plugins/singlefile/templates/fullscreen.html b/archivebox/plugins/singlefile/templates/fullscreen.html new file mode 100644 index 00000000..1a671579 --- /dev/null +++ b/archivebox/plugins/singlefile/templates/fullscreen.html @@ -0,0 +1,6 @@ + + diff --git a/archivebox/plugins/singlefile/templates/icon.html b/archivebox/plugins/singlefile/templates/icon.html new file mode 100644 index 00000000..31f4673e --- /dev/null +++ b/archivebox/plugins/singlefile/templates/icon.html @@ -0,0 +1 @@ +📦 \ No newline at end of file diff --git a/archivebox/plugins/singlefile/templates/thumbnail.html b/archivebox/plugins/singlefile/templates/thumbnail.html new file mode 100644 index 00000000..5d7e5614 --- /dev/null +++ b/archivebox/plugins/singlefile/templates/thumbnail.html @@ -0,0 +1,8 @@ + +
+ +
diff --git a/archivebox/plugins/staticfile/templates/icon.html b/archivebox/plugins/staticfile/templates/icon.html new file mode 100644 index 00000000..54431735 --- /dev/null +++ b/archivebox/plugins/staticfile/templates/icon.html @@ -0,0 +1 @@ +📁 diff --git a/archivebox/plugins/title/templates/icon.html b/archivebox/plugins/title/templates/icon.html new file mode 100644 index 00000000..5a051312 --- /dev/null +++ b/archivebox/plugins/title/templates/icon.html @@ -0,0 +1 @@ +📝 \ No newline at end of file diff --git a/archivebox/plugins/wget/on_Crawl__00_install_wget.py b/archivebox/plugins/wget/on_Crawl__00_install_wget.py deleted file mode 100755 index ae79f6e8..00000000 --- a/archivebox/plugins/wget/on_Crawl__00_install_wget.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env python3 -""" -Install wget if not already available. - -Runs at crawl start to ensure wget is installed. -Outputs JSONL for InstalledBinary. -""" - -import json -import sys -from pathlib import Path - - -def main(): - try: - from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides - - AptProvider.model_rebuild() - BrewProvider.model_rebuild() - EnvProvider.model_rebuild() - - # wget binary and package have same name - wget_binary = Binary( - name='wget', - binproviders=[AptProvider(), BrewProvider(), EnvProvider()] - ) - - # Try to load, install if not found - try: - loaded = wget_binary.load() - if not loaded or not loaded.abspath: - raise Exception("Not loaded") - except Exception: - # Install via system package manager - loaded = wget_binary.install() - - if loaded and loaded.abspath: - # Output InstalledBinary JSONL - print(json.dumps({ - 'type': 'InstalledBinary', - 'name': 'wget', - 'abspath': str(loaded.abspath), - 'version': str(loaded.version) if loaded.version else None, - 'sha256': loaded.sha256, - 'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown', - })) - sys.exit(0) - else: - print(json.dumps({ - 'type': 'Dependency', - 'bin_name': 'wget', - 'bin_providers': 'apt,brew,env', - })) - print("Failed to install wget", file=sys.stderr) - sys.exit(1) - - except Exception as e: - print(json.dumps({ - 'type': 'Dependency', - 'bin_name': 'wget', - 'bin_providers': 'apt,brew,env', - })) - print(f"Error installing wget: {e}", file=sys.stderr) - sys.exit(1) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/wget/templates/embed.html b/archivebox/plugins/wget/templates/embed.html new file mode 100644 index 00000000..07f733ca --- /dev/null +++ b/archivebox/plugins/wget/templates/embed.html @@ -0,0 +1,6 @@ + + diff --git a/archivebox/plugins/wget/templates/fullscreen.html b/archivebox/plugins/wget/templates/fullscreen.html new file mode 100644 index 00000000..0c2b553a --- /dev/null +++ b/archivebox/plugins/wget/templates/fullscreen.html @@ -0,0 +1,6 @@ + + diff --git a/archivebox/plugins/wget/templates/icon.html b/archivebox/plugins/wget/templates/icon.html new file mode 100644 index 00000000..fdf8df21 --- /dev/null +++ b/archivebox/plugins/wget/templates/icon.html @@ -0,0 +1 @@ +📥 \ No newline at end of file diff --git a/archivebox/plugins/wget/templates/thumbnail.html b/archivebox/plugins/wget/templates/thumbnail.html new file mode 100644 index 00000000..550db449 --- /dev/null +++ b/archivebox/plugins/wget/templates/thumbnail.html @@ -0,0 +1,8 @@ + +
+ +
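The deleted install hook above and the new validate hook exercised by the tests below share one JSONL protocol: the hook prints an InstalledBinary record (and exits 0) when the binary is found, or a Dependency record (and exits 1) when it is not. A minimal consumer of that output, using the record fields shown in this patch; the hook path matches the one referenced by the tests, everything else is just a sketch:

    # Sketch: run the wget validate hook and parse its JSONL records.
    import json
    import subprocess
    import sys

    proc = subprocess.run(
        [sys.executable, 'archivebox/plugins/wget/on_Crawl__00_validate_wget.py'],
        capture_output=True, text=True, timeout=30,
    )

    for line in proc.stdout.splitlines():
        if not line.strip():
            continue
        record = json.loads(line)
        if record.get('type') == 'InstalledBinary':
            print('found', record['name'], 'at', record['abspath'])
        elif record.get('type') == 'Dependency':
            print('missing', record['bin_name'], 'providers:', record['bin_providers'])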
diff --git a/archivebox/plugins/wget/tests/test_wget.py b/archivebox/plugins/wget/tests/test_wget.py index 0b257628..e1686333 100644 --- a/archivebox/plugins/wget/tests/test_wget.py +++ b/archivebox/plugins/wget/tests/test_wget.py @@ -2,8 +2,8 @@ Integration tests for wget plugin Tests verify: -1. Plugin reports missing dependency correctly -2. wget can be installed via brew/apt provider hooks +1. Validate hook checks for wget binary +2. Verify deps with abx-pkg 3. Config options work (SAVE_WGET, SAVE_WARC, etc.) 4. Extraction works against real example.com 5. Output files contain actual page content @@ -26,7 +26,7 @@ import pytest PLUGIN_DIR = Path(__file__).parent.parent PLUGINS_ROOT = PLUGIN_DIR.parent WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py')) -WGET_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_wget.py' +WGET_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_wget.py' BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py' APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py' TEST_URL = 'https://example.com' @@ -37,45 +37,59 @@ def test_hook_script_exists(): assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}" -def test_wget_install_hook(): - """Test wget install hook to install wget if needed.""" +def test_wget_validate_hook(): + """Test wget validate hook checks for wget binary.""" result = subprocess.run( - [sys.executable, str(WGET_INSTALL_HOOK)], + [sys.executable, str(WGET_VALIDATE_HOOK)], capture_output=True, text=True, - timeout=600 + timeout=30 ) - assert result.returncode == 0, f"Install hook failed: {result.stderr}" - - # Verify InstalledBinary JSONL output - found_binary = False - for line in result.stdout.strip().split('\n'): - if line.strip(): - try: - record = json.loads(line) - if record.get('type') == 'InstalledBinary': - assert record['name'] == 'wget' - assert record['abspath'] - found_binary = True - break - except json.JSONDecodeError: - pass - - assert found_binary, "Should output InstalledBinary record" + # Hook exits 0 if binary found, 1 if not found (with Dependency record) + if result.returncode == 0: + # Binary found - verify InstalledBinary JSONL output + found_binary = False + for line in result.stdout.strip().split('\n'): + if line.strip(): + try: + record = json.loads(line) + if record.get('type') == 'InstalledBinary': + assert record['name'] == 'wget' + assert record['abspath'] + found_binary = True + break + except json.JSONDecodeError: + pass + assert found_binary, "Should output InstalledBinary record when binary found" + else: + # Binary not found - verify Dependency JSONL output + found_dependency = False + for line in result.stdout.strip().split('\n'): + if line.strip(): + try: + record = json.loads(line) + if record.get('type') == 'Dependency': + assert record['bin_name'] == 'wget' + assert 'env' in record['bin_providers'] + found_dependency = True + break + except json.JSONDecodeError: + pass + assert found_dependency, "Should output Dependency record when binary not found" def test_verify_deps_with_abx_pkg(): - """Verify wget is available via abx-pkg after hook installation.""" - from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider - - AptProvider.model_rebuild() - BrewProvider.model_rebuild() - EnvProvider.model_rebuild() + """Verify wget is available via abx-pkg.""" + from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides wget_binary = Binary(name='wget', binproviders=[AptProvider(), BrewProvider(), 
EnvProvider()]) wget_loaded = wget_binary.load() - assert wget_loaded and wget_loaded.abspath, "wget should be available after install hook" + + if wget_loaded and wget_loaded.abspath: + assert wget_loaded.abspath, "wget is available" + else: + pytest.skip("wget not available - Dependency record should have been emitted") def test_reports_missing_dependency_when_not_installed(): diff --git a/archivebox/templates/admin/base.html b/archivebox/templates/admin/base.html index 192cc323..8c580cc5 100644 --- a/archivebox/templates/admin/base.html +++ b/archivebox/templates/admin/base.html @@ -110,6 +110,10 @@ {% block nav-global %}{% endblock %} + {% if has_permission %} + {% include 'admin/progress_monitor.html' %} + {% endif %} + {% block breadcrumbs %} {% endfor %} @@ -431,7 +473,15 @@ - + {% if best_result.result %} + {# Use plugin-specific fullscreen template when ArchiveResult is available #}
+ {% extractor_fullscreen best_result.result %} +
+ {% else %} + {# Fall back to generic iframe #} + + {% endif %} diff --git a/archivebox/workers/admin.py b/archivebox/workers/admin.py index 40aaeade..bf7a8e7c 100644 --- a/archivebox/workers/admin.py +++ b/archivebox/workers/admin.py @@ -1,23 +1,13 @@ +""" +Workers admin module. + +The orchestrator/worker system doesn't need Django admin registration +as workers are managed via CLI commands and the orchestrator. +""" + __package__ = 'archivebox.workers' -from django.contrib.auth import get_permission_codename - -from huey_monitor.apps import HueyMonitorConfig -from huey_monitor.admin import TaskModel, TaskModelAdmin, SignalInfoModel, SignalInfoModelAdmin - - -HueyMonitorConfig.verbose_name = 'Background Workers' - - -class CustomTaskModelAdmin(TaskModelAdmin): - actions = ["delete_selected"] - - def has_delete_permission(self, request, obj=None): - codename = get_permission_codename("delete", self.opts) - return request.user.has_perm("%s.%s" % (self.opts.app_label, codename)) - - def register_admin(admin_site): - admin_site.register(TaskModel, CustomTaskModelAdmin) - admin_site.register(SignalInfoModel, SignalInfoModelAdmin) + """No models to register - workers are process-based, not Django models.""" + pass diff --git a/archivebox/workers/management/__init__.py b/archivebox/workers/management/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/workers/management/commands/__init__.py b/archivebox/workers/management/commands/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/workers/management/commands/orchestrator.py b/archivebox/workers/management/commands/orchestrator.py new file mode 100644 index 00000000..27ef11d0 --- /dev/null +++ b/archivebox/workers/management/commands/orchestrator.py @@ -0,0 +1,15 @@ +from django.core.management.base import BaseCommand + +from workers.orchestrator import Orchestrator + + +class Command(BaseCommand): + help = 'Run the archivebox orchestrator' + + def add_arguments(self, parser): + parser.add_argument('--daemon', '-d', action='store_true', help="Run forever (don't exit on idle)") + + def handle(self, *args, **kwargs): + daemon = kwargs.get('daemon', False) + orchestrator = Orchestrator(exit_on_idle=not daemon) + orchestrator.runloop() diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py index 68a13628..a4c1a390 100644 --- a/archivebox/workers/orchestrator.py +++ b/archivebox/workers/orchestrator.py @@ -35,6 +35,7 @@ from django.utils import timezone from rich import print +from archivebox.misc.logging_util import log_worker_event from .worker import Worker, CrawlWorker, SnapshotWorker, ArchiveResultWorker from .pid_utils import ( write_pid_file, @@ -82,22 +83,39 @@ class Orchestrator: """Called when orchestrator starts.""" self.pid = os.getpid() self.pid_file = write_pid_file('orchestrator', worker_id=0) - print(f'[green]👨‍✈️ {self} STARTED[/green]') - + # Clean up any stale PID files from previous runs stale_count = cleanup_stale_pid_files() + + # Collect startup metadata + metadata = { + 'max_workers_per_type': self.MAX_WORKERS_PER_TYPE, + 'max_total_workers': self.MAX_TOTAL_WORKERS, + 'poll_interval': self.POLL_INTERVAL, + } if stale_count: - print(f'[yellow]👨‍✈️ {self} cleaned up {stale_count} stale PID files[/yellow]') + metadata['cleaned_stale_pids'] = stale_count + + log_worker_event( + worker_type='Orchestrator', + event='Starting...', + indent_level=0, + pid=self.pid, + metadata=metadata, + ) def on_shutdown(self, error: BaseException | None = 
None) -> None: """Called when orchestrator shuts down.""" if self.pid_file: remove_pid_file(self.pid_file) - - if error and not isinstance(error, KeyboardInterrupt): - print(f'[red]👨‍✈️ {self} SHUTDOWN with error:[/red] {type(error).__name__}: {error}') - else: - print(f'[grey53]👨‍✈️ {self} SHUTDOWN[/grey53]') + + log_worker_event( + worker_type='Orchestrator', + event='Shutting down', + indent_level=0, + pid=self.pid, + error=error if error and not isinstance(error, KeyboardInterrupt) else None, + ) def get_total_worker_count(self) -> int: """Get total count of running workers across all types.""" @@ -129,10 +147,17 @@ class Orchestrator: """Spawn a new worker process. Returns PID or None if spawn failed.""" try: pid = WorkerClass.start(daemon=False) - print(f'[blue]👨‍✈️ {self} spawned {WorkerClass.name} worker[/blue] pid={pid}') + # Worker spawning is logged by the worker itself in on_startup() return pid except Exception as e: - print(f'[red]👨‍✈️ {self} failed to spawn {WorkerClass.name} worker:[/red] {e}') + log_worker_event( + worker_type='Orchestrator', + event='Failed to spawn worker', + indent_level=0, + pid=self.pid, + metadata={'worker_type': WorkerClass.name}, + error=e, + ) return None def check_queues_and_spawn_workers(self) -> dict[str, int]: @@ -181,26 +206,13 @@ class Orchestrator: def on_tick(self, queue_sizes: dict[str, int]) -> None: """Called each orchestrator tick. Override for custom behavior.""" - total_queued = sum(queue_sizes.values()) - total_workers = self.get_total_worker_count() - - if total_queued > 0 or total_workers > 0: - # Build status line - status_parts = [] - for WorkerClass in self.WORKER_TYPES: - name = WorkerClass.name - queued = queue_sizes.get(name, 0) - workers = len(WorkerClass.get_running_workers()) - if queued > 0 or workers > 0: - status_parts.append(f'{name}={queued}q/{workers}w') - - if status_parts: - print(f'[grey53]👨‍✈️ {self} tick:[/grey53] {" ".join(status_parts)}') + # Tick logging suppressed to reduce noise + pass def on_idle(self) -> None: """Called when orchestrator is idle (no work, no workers).""" - if self.idle_count == 1: - print(f'[grey53]👨‍✈️ {self} idle, waiting for work...[/grey53]') + # Idle logging suppressed to reduce noise + pass def should_exit(self, queue_sizes: dict[str, int]) -> bool: """Determine if orchestrator should exit.""" @@ -242,7 +254,12 @@ class Orchestrator: # Check if we should exit if self.should_exit(queue_sizes): - print(f'[green]👨‍✈️ {self} all work complete, exiting[/green]') + log_worker_event( + worker_type='Orchestrator', + event='All work complete', + indent_level=0, + pid=self.pid, + ) break time.sleep(self.POLL_INTERVAL) @@ -267,9 +284,14 @@ class Orchestrator: proc = Process(target=run_orchestrator, name='orchestrator') proc.start() - + assert proc.pid is not None - print(f'[green]👨‍✈️ Orchestrator started in background[/green] pid={proc.pid}') + log_worker_event( + worker_type='Orchestrator', + event='Started in background', + indent_level=0, + pid=proc.pid, + ) return proc.pid @classmethod diff --git a/archivebox/workers/supervisord_util.py b/archivebox/workers/supervisord_util.py index bd443569..69b440c4 100644 --- a/archivebox/workers/supervisord_util.py +++ b/archivebox/workers/supervisord_util.py @@ -26,22 +26,6 @@ CONFIG_FILE_NAME = "supervisord.conf" PID_FILE_NAME = "supervisord.pid" WORKERS_DIR_NAME = "workers" -SCHEDULER_WORKER = { - "name": "worker_scheduler", - "command": "archivebox manage djangohuey --queue system_tasks -w 4 -k thread --disable-health-check --flush-locks", - 
"autostart": "true", - "autorestart": "true", - "stdout_logfile": "logs/worker_scheduler.log", - "redirect_stderr": "true", -} -COMMAND_WORKER = { - "name": "worker_commands", - "command": "archivebox manage djangohuey --queue commands -w 4 -k thread --no-periodic --disable-health-check", - "autostart": "true", - "autorestart": "true", - "stdout_logfile": "logs/worker_commands.log", - "redirect_stderr": "true", -} ORCHESTRATOR_WORKER = { "name": "worker_orchestrator", "command": "archivebox manage orchestrator", @@ -391,10 +375,8 @@ def watch_worker(supervisor, daemon_name, interval=5): def start_server_workers(host='0.0.0.0', port='8000', daemonize=False): supervisor = get_or_create_supervisord_process(daemonize=daemonize) - + bg_workers = [ - SCHEDULER_WORKER, - COMMAND_WORKER, ORCHESTRATOR_WORKER, ] @@ -422,8 +404,7 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False): def start_cli_workers(watch=False): supervisor = get_or_create_supervisord_process(daemonize=False) - - start_worker(supervisor, COMMAND_WORKER) + start_worker(supervisor, ORCHESTRATOR_WORKER) if watch: @@ -434,13 +415,12 @@ def start_cli_workers(watch=False): except SystemExit: pass except BaseException as e: - STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...") + STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping orchestrator gracefully...") raise finally: - stop_worker(supervisor, COMMAND_WORKER['name']) stop_worker(supervisor, ORCHESTRATOR_WORKER['name']) time.sleep(0.5) - return [COMMAND_WORKER, ORCHESTRATOR_WORKER] + return [ORCHESTRATOR_WORKER] # def main(daemons): diff --git a/archivebox/workers/tasks.py b/archivebox/workers/tasks.py index 9bf2f200..01858e7f 100644 --- a/archivebox/workers/tasks.py +++ b/archivebox/workers/tasks.py @@ -1,89 +1,60 @@ +""" +Background task functions for queuing work to the orchestrator. + +These functions queue Snapshots/Crawls for processing by setting their status +to QUEUED, which the orchestrator workers will pick up and process. 
+""" + __package__ = 'archivebox.workers' -from functools import wraps -# from django.utils import timezone +from django.utils import timezone -from django_huey import db_task, task -from huey_monitor.models import TaskModel -from huey_monitor.tqdm import ProcessInfo +def ensure_orchestrator_running(): + """Ensure the orchestrator is running to process queued items.""" + from .orchestrator import Orchestrator -from .supervisord_util import get_or_create_supervisord_process + if not Orchestrator.is_running(): + # Start orchestrator in background + orchestrator = Orchestrator(exit_on_idle=True) + orchestrator.start() -# @db_task(queue="commands", context=True, schedule=1) -# def scheduler_tick(): -# print('SCHEDULER TICK', timezone.now().isoformat()) -# # abx.archivebox.events.on_scheduler_runloop_start(timezone.now(), machine=Machine.objects.get_current_machine()) -# # abx.archivebox.events.on_scheduler_tick_start(timezone.now(), machine=Machine.objects.get_current_machine()) - -# scheduled_crawls = CrawlSchedule.objects.filter(is_enabled=True) -# scheduled_crawls_due = scheduled_crawls.filter(next_run_at__lte=timezone.now()) - -# for scheduled_crawl in scheduled_crawls_due: -# try: -# abx.archivebox.events.on_crawl_schedule_tick(scheduled_crawl) -# except Exception as e: -# abx.archivebox.events.on_crawl_schedule_failure(timezone.now(), machine=Machine.objects.get_current_machine(), error=e, schedule=scheduled_crawl) - -# # abx.archivebox.events.on_scheduler_tick_end(timezone.now(), machine=Machine.objects.get_current_machine(), tasks=scheduled_tasks_due) +def bg_add(add_kwargs: dict) -> int: + """ + Add URLs and queue them for archiving. -def db_task_with_parent(func): - """Decorator for db_task that sets the parent task for the db_task""" - - @wraps(func) - def wrapper(*args, **kwargs): - task = kwargs.get('task') - parent_task_id = kwargs.get('parent_task_id') - - if task and parent_task_id: - TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id) - - return func(*args, **kwargs) - - return wrapper - -@db_task(queue="commands", context=True) -def bg_add(add_kwargs, task=None, parent_task_id=None): - get_or_create_supervisord_process(daemonize=False) - - from ..main import add - - if task and parent_task_id: - TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id) + Returns the number of snapshots created. + """ + from archivebox.cli.archivebox_add import add assert add_kwargs and add_kwargs.get("urls") - rough_url_count = add_kwargs["urls"].count("://") - process_info = ProcessInfo(task, desc="add", parent_task_id=parent_task_id, total=rough_url_count) + # When called as background task, always run in background mode + add_kwargs = add_kwargs.copy() + add_kwargs['bg'] = True result = add(**add_kwargs) - process_info.update(n=rough_url_count) - return result + + # Ensure orchestrator is running to process the new snapshots + ensure_orchestrator_running() + + return len(result) if result else 0 -@task(queue="commands", context=True) -def bg_archive_snapshots(snapshots, kwargs=None, task=None, parent_task_id=None): +def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int: """ Queue multiple snapshots for archiving via the state machine system. This sets snapshots to 'queued' status so the orchestrator workers pick them up. - The actual archiving happens through ArchiveResult.run(). - """ - get_or_create_supervisord_process(daemonize=False) + The actual archiving happens through the worker's process_item() method. 
- from django.utils import timezone + Returns the number of snapshots queued. + """ from core.models import Snapshot - if task and parent_task_id: - TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id) - - assert snapshots kwargs = kwargs or {} - rough_count = len(snapshots) if hasattr(snapshots, '__len__') else snapshots.count() - process_info = ProcessInfo(task, desc="archive_snapshots", parent_task_id=parent_task_id, total=rough_count) - # Queue snapshots by setting status to queued with immediate retry_at queued_count = 0 for snapshot in snapshots: @@ -95,36 +66,33 @@ def bg_archive_snapshots(snapshots, kwargs=None, task=None, parent_task_id=None) ) queued_count += 1 - process_info.update(n=queued_count) + # Ensure orchestrator is running to process the queued snapshots + if queued_count > 0: + ensure_orchestrator_running() + return queued_count -@task(queue="commands", context=True) -def bg_archive_snapshot(snapshot, overwrite=False, methods=None, task=None, parent_task_id=None): +def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None = None) -> int: """ Queue a single snapshot for archiving via the state machine system. This sets the snapshot to 'queued' status so the orchestrator workers pick it up. - The actual archiving happens through ArchiveResult.run(). + The actual archiving happens through the worker's process_item() method. + + Returns 1 if queued, 0 otherwise. """ - get_or_create_supervisord_process(daemonize=False) - - from django.utils import timezone from core.models import Snapshot - if task and parent_task_id: - TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id) - - process_info = ProcessInfo(task, desc="archive_snapshot", parent_task_id=parent_task_id, total=1) - # Queue the snapshot by setting status to queued if hasattr(snapshot, 'id'): Snapshot.objects.filter(id=snapshot.id).update( status=Snapshot.StatusChoices.QUEUED, retry_at=timezone.now(), ) - process_info.update(n=1) + + # Ensure orchestrator is running to process the queued snapshot + ensure_orchestrator_running() return 1 return 0 - diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py index f1949b63..78e062da 100644 --- a/archivebox/workers/worker.py +++ b/archivebox/workers/worker.py @@ -25,6 +25,7 @@ from django.conf import settings from rich import print +from archivebox.misc.logging_util import log_worker_event from .pid_utils import ( write_pid_file, remove_pid_file, @@ -126,7 +127,7 @@ class Worker: obj.sm.tick() return True except Exception as e: - print(f'[red]{self} error processing {obj.pk}:[/red] {e}') + # Error will be logged in runloop's completion event traceback.print_exc() return False @@ -134,7 +135,28 @@ class Worker: """Called when worker starts.""" self.pid = os.getpid() self.pid_file = write_pid_file(self.name, self.worker_id) - print(f'[green]{self} STARTED[/green] pid_file={self.pid_file}') + + # Determine worker type for logging + worker_type_name = self.__class__.__name__ + indent_level = 1 # Default for most workers + + # Adjust indent level based on worker type + if 'Snapshot' in worker_type_name: + indent_level = 2 + elif 'ArchiveResult' in worker_type_name: + indent_level = 3 + + log_worker_event( + worker_type=worker_type_name, + event='Starting...', + indent_level=indent_level, + pid=self.pid, + worker_id=str(self.worker_id), + metadata={ + 'max_concurrent': self.MAX_CONCURRENT_TASKS, + 'poll_interval': self.POLL_INTERVAL, + }, + ) def on_shutdown(self, error: 
BaseException | None = None) -> None: """Called when worker shuts down.""" @@ -142,10 +164,23 @@ class Worker: if self.pid_file: remove_pid_file(self.pid_file) - if error and not isinstance(error, KeyboardInterrupt): - print(f'[red]{self} SHUTDOWN with error:[/red] {type(error).__name__}: {error}') - else: - print(f'[grey53]{self} SHUTDOWN[/grey53]') + # Determine worker type for logging + worker_type_name = self.__class__.__name__ + indent_level = 1 + + if 'Snapshot' in worker_type_name: + indent_level = 2 + elif 'ArchiveResult' in worker_type_name: + indent_level = 3 + + log_worker_event( + worker_type=worker_type_name, + event='Shutting down', + indent_level=indent_level, + pid=self.pid, + worker_id=str(self.worker_id), + error=error if error and not isinstance(error, KeyboardInterrupt) else None, + ) def should_exit(self) -> bool: """Check if worker should exit due to idle timeout.""" @@ -161,6 +196,15 @@ class Worker: """Main worker loop - polls queue, processes items.""" self.on_startup() + # Determine worker type for logging + worker_type_name = self.__class__.__name__ + indent_level = 1 + + if 'Snapshot' in worker_type_name: + indent_level = 2 + elif 'ArchiveResult' in worker_type_name: + indent_level = 3 + try: while True: # Try to claim and process an item @@ -168,25 +212,62 @@ class Worker: if obj is not None: self.idle_count = 0 - print(f'[blue]{self} processing:[/blue] {obj.pk}') + + # Build metadata for task start + start_metadata = {'task_id': str(obj.pk)} + if hasattr(obj, 'url'): + # SnapshotWorker + url = str(obj.url) if obj.url else None + else: + url = None + + extractor = None + if hasattr(obj, 'extractor'): + # ArchiveResultWorker + extractor = obj.extractor + start_metadata['extractor'] = extractor + + log_worker_event( + worker_type=worker_type_name, + event='Processing...', + indent_level=indent_level, + pid=self.pid, + worker_id=str(self.worker_id), + url=url, + extractor=extractor, + metadata=start_metadata, + ) start_time = time.time() success = self.process_item(obj) elapsed = time.time() - start_time - if success: - print(f'[green]{self} completed ({elapsed:.1f}s):[/green] {obj.pk}') - else: - print(f'[red]{self} failed ({elapsed:.1f}s):[/red] {obj.pk}') + # Build metadata for task completion + complete_metadata = { + 'task_id': str(obj.pk), + 'duration': elapsed, + 'status': 'success' if success else 'failed', + } + if hasattr(obj, 'status'): + complete_metadata['final_status'] = str(obj.status) + + log_worker_event( + worker_type=worker_type_name, + event='Completed' if success else 'Failed', + indent_level=indent_level, + pid=self.pid, + worker_id=str(self.worker_id), + url=url, + extractor=extractor, + metadata=complete_metadata, + ) else: - # No work available + # No work available - idle logging suppressed self.idle_count += 1 - if self.idle_count == 1: - print(f'[grey53]{self} idle, waiting for work...[/grey53]') # Check if we should exit if self.should_exit(): - print(f'[grey53]{self} idle timeout reached, exiting[/grey53]') + # Exit logging suppressed - shutdown will be logged by on_shutdown() break time.sleep(self.POLL_INTERVAL) @@ -293,7 +374,7 @@ class ArchiveResultWorker(Worker): obj.sm.tick() return True except Exception as e: - print(f'[red]{self} error processing {obj.pk}:[/red] {e}') + # Error will be logged in runloop's completion event traceback.print_exc() return False diff --git a/pyproject.toml b/pyproject.toml index 34bbf601..c78d8fb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,8 +43,6 @@ dependencies = [ 
"channels[daphne]>=4.1.0", "django-ninja>=1.5.1", "django-extensions>=3.2.3", - "django-huey>=1.2.1", - "django-huey-monitor>=0.9.0", "django-signal-webhooks>=0.3.0", "django-admin-data-views>=0.4.1", "django-object-actions>=4.3.0",