remove huey

This commit is contained in:
Nick Sweeting
2025-12-24 23:40:18 -08:00
parent 6c769d831c
commit d95f0dc186
105 changed files with 3635 additions and 1402 deletions

View File

@@ -9,25 +9,17 @@ from django.core.exceptions import ValidationError
from django.urls import reverse, resolve
from django.utils import timezone
from huey_monitor.admin import TaskModel
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
from archivebox.hooks import get_extractor_icon
from core.models import ArchiveResult, Snapshot
def result_url(result: TaskModel) -> str:
url = reverse("admin:huey_monitor_taskmodel_change", args=[str(result.id)])
return format_html('<a href="{url}" class="fade-in-progress-url">See progress...</a>'.format(url=url))
class ArchiveResultInline(admin.TabularInline):
name = 'Archive Results Log'
model = ArchiveResult
@@ -101,9 +93,9 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor', 'cmd_str', 'output_str')
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
autocomplete_fields = ['snapshot']
@@ -144,17 +136,29 @@ class ArchiveResultAdmin(BaseModelAdmin):
def tags_str(self, result):
return result.snapshot.tags_str()
@admin.display(description='Extractor', ordering='extractor')
def extractor_with_icon(self, result):
    """Admin list column: the extractor's icon (with a tooltip) followed by its name."""
    # Icon HTML/emoji comes from the plugin hooks registry.
    icon = get_extractor_icon(result.extractor)
    return format_html(
        '<span title="{}">{}</span> {}',
        result.extractor,
        icon,
        result.extractor,
    )
def cmd_str(self, result):
    """Render the result's command as monospace text (joins list-form commands with spaces)."""
    return format_html(
        '<pre>{}</pre>',
        ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
    )
def output_str(self, result):
# Determine output link path - use output if file exists, otherwise link to index
output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html'
return format_html(
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
result.snapshot.timestamp,
result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html',
output_path,
result.output,
)
@@ -185,7 +189,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
is_hidden = filename.startswith('.')
output_str += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
return output_str + format_html('</code></pre>')
return output_str + mark_safe('</code></pre>')

View File

@@ -35,8 +35,19 @@ def register_admin_site():
admin.site = archivebox_admin
sites.site = archivebox_admin
# Plugin admin registration is now handled by individual app admins
# No longer using archivebox.pm.hook.register_admin()
# Register admin views for each app
# (Previously handled by ABX plugin system, now called directly)
from core.admin import register_admin as register_core_admin
from crawls.admin import register_admin as register_crawls_admin
from api.admin import register_admin as register_api_admin
from machine.admin import register_admin as register_machine_admin
from workers.admin import register_admin as register_workers_admin
register_core_admin(archivebox_admin)
register_crawls_admin(archivebox_admin)
register_api_admin(archivebox_admin)
register_machine_admin(archivebox_admin)
register_workers_admin(archivebox_admin)
return archivebox_admin

View File

@@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from core.models import Tag
from core.admin_tags import TagInline
from core.admin_archiveresults import ArchiveResultInline, result_url
from core.admin_archiveresults import ArchiveResultInline
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -54,10 +54,10 @@ class SnapshotActionForm(ActionForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'link_dir', 'available_config_options')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', 'available_config_options', *readonly_fields[:-1])
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields)
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [TagInline, ArchiveResultInline]
@@ -93,12 +93,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# self.request = request
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
@admin.action(
description="Imported Timestamp"
)
@admin.display(description="Imported Timestamp")
def imported_timestamp(self, obj):
context = RequestContext(self.request, {
'bookmarked_date': obj.bookmarked,
'bookmarked_date': obj.bookmarked_at,
'timestamp': obj.timestamp,
})
@@ -145,22 +143,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def status_info(self, obj):
return format_html(
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
'''
Archived: {} ({} files {}) &nbsp; &nbsp;
Favicon: <img src="{}" style="height: 20px"/> &nbsp; &nbsp;
Status code: {} &nbsp; &nbsp;<br/>
Server: {} &nbsp; &nbsp;
Content type: {} &nbsp; &nbsp;
Extension: {} &nbsp; &nbsp;
''',
'' if obj.is_archived else '',
obj.num_outputs,
self.size(obj) or '0kb',
f'/archive/{obj.timestamp}/favicon.ico',
obj.status_code or '-',
obj.headers and obj.headers.get('Server') or '-',
obj.headers and obj.headers.get('Content-Type') or '-',
obj.extension or '-',
)
@@ -184,8 +175,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
obj.archive_path,
obj.archive_path,
obj.archive_path,
'fetched' if obj.latest_title or obj.title else 'pending',
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
'fetched' if obj.title else 'pending',
urldecode(htmldecode(obj.title or ''))[:128] or 'Pending...'
) + mark_safe(f' <span class="tags">{tags}</span>')
@admin.display(
@@ -259,14 +250,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
description=" Get Title"
)
def update_titles(self, request, queryset):
from core.models import Snapshot
count = queryset.count()
# Queue snapshots for archiving via the state machine system
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Title and favicon are updating in the background for {count} URLs. {result_url(result)}"),
f"Queued {queued} snapshots for title/favicon update. The orchestrator will process them in the background.",
)
@admin.action(
@@ -275,11 +265,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def update_snapshots(self, request, queryset):
count = queryset.count()
result = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Re-trying any previously failed methods for {count} URLs in the background. {result_url(result)}"),
f"Queued {queued} snapshots for re-archiving. The orchestrator will process them in the background.",
)
@@ -291,11 +281,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
timestamp = timezone.now().isoformat('T', 'seconds')
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
messages.success(
request,
mark_safe(f"Creating new fresh snapshots for {queryset.count()} URLs in the background. {result_url(result)}"),
f"Creating {queryset.count()} new fresh snapshots. The orchestrator will process them in the background.",
)
@admin.action(
@@ -304,11 +294,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def overwrite_snapshots(self, request, queryset):
count = queryset.count()
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Clearing all previous results and re-downloading {count} URLs in the background. {result_url(result)}"),
f"Queued {queued} snapshots for full re-archive (overwriting existing). The orchestrator will process them in the background.",
)
@admin.action(

View File

@@ -1,5 +1,7 @@
__package__ = 'archivebox.core'
import sys
from django.apps import AppConfig
@@ -10,6 +12,41 @@ class CoreConfig(AppConfig):
"""Register the archivebox.core.admin_site as the main django admin site"""
from core.admin_site import register_admin_site
register_admin_site()
# Auto-start the orchestrator when running the web server
self._maybe_start_orchestrator()
def _maybe_start_orchestrator(self):
    """Start the background orchestrator if this process is serving web requests.

    Called from CoreConfig.ready(). Intentionally best-effort: any failure to
    start is logged as a warning rather than crashing Django startup.
    """
    import os
    # Don't start orchestrator during migrations, shell, tests, etc.
    # Only start when running: runserver, daphne, gunicorn, uwsgi
    if not self._is_web_server():
        return
    # Don't start if RUN_ORCHESTRATOR env var is explicitly set to false
    if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'):
        return
    # Don't start in autoreload child process (avoid double-start)
    # RUN_MAIN == 'true' only inside the reloader's child process, so under
    # `runserver` the parent process skips startup here.
    if os.environ.get('RUN_MAIN') != 'true' and 'runserver' in sys.argv:
        return
    try:
        from workers.orchestrator import Orchestrator
        if not Orchestrator.is_running():
            # Start orchestrator as daemon (won't exit on idle when started by server)
            orchestrator = Orchestrator(exit_on_idle=False)
            orchestrator.start()
    except Exception as e:
        # Don't crash the server if orchestrator fails to start
        import logging
        logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}')
def _is_web_server(self) -> bool:
"""Check if we're running a web server command."""
# Check for common web server indicators
server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server')
return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands)

View File

@@ -23,7 +23,11 @@ from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size, atomic_write
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
from archivebox.misc.hashing import get_dir_info
from archivebox.hooks import ARCHIVE_METHODS_INDEXING_PRECEDENCE
from archivebox.hooks import (
ARCHIVE_METHODS_INDEXING_PRECEDENCE,
get_extractors, get_extractor_name, get_extractor_icon,
DEFAULT_EXTRACTOR_ICONS,
)
from archivebox.base_models.models import (
ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
@@ -343,45 +347,37 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def icons(self) -> str:
"""Generate HTML icons showing which extractors have succeeded for this snapshot"""
from django.utils.html import format_html, mark_safe
from collections import defaultdict
cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
def calc_icons():
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
archive_results = [r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output]
archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output}
else:
archive_results = self.archiveresult_set.filter(status="succeeded", output__isnull=False)
archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)}
path = self.archive_path
canon = self.canonical_outputs()
output = ""
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
icons = {
"singlefile": "", "wget": "🆆", "dom": "🅷", "pdf": "📄",
"screenshot": "💻", "media": "📼", "git": "🅶", "archive_org": "🏛",
"readability": "🆁", "mercury": "🅼", "warc": "📦"
}
exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]
extractor_outputs = defaultdict(lambda: None)
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
for result in archive_results:
if result.extractor == extractor:
extractor_outputs[extractor] = result
# Get all extractors from hooks system (sorted by numeric prefix)
all_extractors = [get_extractor_name(e) for e in get_extractors()]
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
if extractor not in exclude:
existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
output += format_html(output_template, path, canon.get(extractor, ''), str(bool(existing)), extractor, icons.get(extractor, "?"))
if extractor == "wget":
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
output += format_html(output_template, path, canon.get("warc", "warc/"), str(bool(exists)), "warc", icons.get("warc", "?"))
if extractor == "archive_org":
exists = extractor in extractor_outputs and extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon.get("archive_org", ""), str(exists), "archive_org", icons.get("archive_org", "?"))
for extractor in all_extractors:
result = archive_results.get(extractor)
existing = result and result.status == 'succeeded' and result.output
icon = get_extractor_icon(extractor)
output += format_html(
output_template,
path,
canon.get(extractor, extractor + '/'),
str(bool(existing)),
extractor,
icon
)
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
cache_result = cache.get(cache_key)
if cache_result:
@@ -767,12 +763,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
FAILED = 'failed', 'Failed'
SKIPPED = 'skipped', 'Skipped'
EXTRACTOR_CHOICES = (
('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'),
('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'),
('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'),
('dom', 'dom'), ('title', 'title'), ('wget', 'wget'),
)
@classmethod
def get_extractor_choices(cls):
    """Get extractor choices from discovered hooks (for forms/admin).

    Returns a tuple of (value, label) pairs, both set to the extractor's
    base name, suitable for Django ChoiceFields and admin filters.
    """
    extractors = [get_extractor_name(e) for e in get_extractors()]
    return tuple((e, e) for e in extractors)
# Keep AutoField for backward compatibility with 0.7.x databases
# UUID field is added separately by migration for new records
@@ -783,7 +778,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
modified_at = models.DateTimeField(auto_now=True)
snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore
extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32, blank=False, null=False, db_index=True)
# No choices= constraint - extractor names come from plugin system and can be any string
extractor = models.CharField(max_length=32, blank=False, null=False, db_index=True)
pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
cmd = models.JSONField(default=None, null=True, blank=True)
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
@@ -835,6 +831,25 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def output_exists(self) -> bool:
return os.path.exists(Path(self.snapshot_dir) / self.extractor)
def embed_path(self) -> Optional[str]:
    """
    Get the relative path to the embeddable output file for this result.

    Returns self.output when it is set (NOTE: existence on disk is not
    checked here — callers should verify), otherwise falls back to the
    extractor's canonical output path, and finally to the '<extractor>/'
    directory.
    """
    if self.output:
        return self.output
    # Try to find output file based on extractor's canonical output path
    canonical = self.snapshot.canonical_outputs()
    extractor_key = f'{self.extractor}_path'
    if extractor_key in canonical:
        return canonical[extractor_key]
    # Fallback to extractor directory
    return f'{self.extractor}/'
def create_output_dir(self):
output_dir = Path(self.snapshot_dir) / self.extractor
output_dir.mkdir(parents=True, exist_ok=True)
@@ -891,6 +906,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
output_dir=extractor_dir,
config_objects=config_objects,
url=self.snapshot.url,
snapshot_id=str(self.snapshot.id),
)
end_ts = timezone.now()
@@ -1000,6 +1016,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
hook,
output_dir=self.output_dir,
config_objects=config_objects,
url=self.snapshot.url,
snapshot_id=str(self.snapshot.id),
extractor=self.extractor,
)

View File

@@ -68,9 +68,6 @@ INSTALLED_APPS = [
# 3rd-party apps from PyPI that need to be loaded last
"admin_data_views", # handles rendering some convenient automatic read-only views of data in Django admin
"django_extensions", # provides Django Debug Toolbar (and other non-debug helpers)
"django_huey", # provides multi-queue support for django huey https://github.com/gaiacoop/django-huey
"bx_django_utils", # needed for huey_monitor https://github.com/boxine/bx_django_utils
"huey_monitor", # adds an admin UI for monitoring background huey tasks https://github.com/boxine/django-huey-monitor
]
@@ -215,70 +212,6 @@ MIGRATION_MODULES = {"signal_webhooks": None}
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
HUEY = {
"huey_class": "huey.SqliteHuey",
"filename": CONSTANTS.QUEUE_DATABASE_FILENAME,
"name": "commands",
"results": True,
"store_none": True,
"immediate": False,
"utc": True,
"consumer": {
"workers": 1,
"worker_type": "thread",
"initial_delay": 0.1, # Smallest polling interval, same as -d.
"backoff": 1.15, # Exponential backoff using this rate, -b.
"max_delay": 10.0, # Max possible polling interval, -m.
"scheduler_interval": 1, # Check schedule every second, -s.
"periodic": True, # Enable crontab feature.
"check_worker_health": True, # Enable worker health checks.
"health_check_interval": 1, # Check worker health every second.
},
}
# https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up
# https://github.com/gaiacoop/django-huey
DJANGO_HUEY = {
"default": "commands",
"queues": {
HUEY["name"]: HUEY.copy(),
# more registered here at plugin import-time by BaseQueue.register()
# Additional huey queues configured via settings
},
}
class HueyDBRouter:
"""
A router to store all the Huey result k:v / Huey Monitor models in the queue.sqlite3 database.
We keep the databases separate because the queue database receives many more reads/writes per second
and we want to avoid single-write lock contention with the main database. Also all the in-progress task
data is ephemeral/not-important-long-term. This makes it easier to for the user to clear non-critical
temp data by just deleting queue.sqlite3 and leaving index.sqlite3.
"""
route_app_labels = {"huey_monitor", "django_huey", "djhuey"}
db_name = "queue"
def db_for_read(self, model, **hints):
if model._meta.app_label in self.route_app_labels:
return self.db_name
return "default"
def db_for_write(self, model, **hints):
if model._meta.app_label in self.route_app_labels:
return self.db_name
return "default"
def allow_relation(self, obj1, obj2, **hints):
if obj1._meta.app_label in self.route_app_labels or obj2._meta.app_label in self.route_app_labels:
return obj1._meta.app_label == obj2._meta.app_label
return None
def allow_migrate(self, db, app_label, model_name=None, **hints):
if app_label in self.route_app_labels:
return db == self.db_name
return db == "default"
# class FilestoreDBRouter:
@@ -311,7 +244,7 @@ class HueyDBRouter:
# return db == self.db_name
# return db == "default"
DATABASE_ROUTERS = ["core.settings.HueyDBRouter"]
DATABASE_ROUTERS = []
CACHES = {
"default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"},

View File

@@ -1,9 +1,13 @@
from django import template
from django.contrib.admin.templatetags.base import InclusionAdminNode
from django.utils.safestring import mark_safe
from typing import Union
from archivebox.hooks import (
get_extractor_icon, get_extractor_template, get_extractor_name,
)
register = template.Library()
@@ -44,3 +48,115 @@ def url_replace(context, **kwargs):
dict_ = context['request'].GET.copy()
dict_.update(**kwargs)
return dict_.urlencode()
@register.simple_tag
def extractor_icon(extractor: str) -> str:
    """
    Render the icon for an extractor.
    Usage: {% extractor_icon "screenshot" %}
    """
    # Icon comes from the hooks registry; mark_safe because it may contain HTML markup.
    return mark_safe(get_extractor_icon(extractor))
def _render_extractor_template(result, template_type: str) -> str:
    """Render an extractor-provided mini-template for an ArchiveResult.

    Shared implementation for the thumbnail/embed/fullscreen template tags
    (previously three duplicated copies of this logic). Looks up the named
    template for the result's extractor via the hooks registry, renders it
    with a small context, and returns the HTML.

    Returns '' when the extractor provides no such template or when rendering
    fails — deliberately best-effort so a broken plugin template can never
    500 an index page.

    Context variables passed to the template:
        - result: ArchiveResult object
        - snapshot: Parent Snapshot object
        - output_path: Path to output relative to snapshot dir (from embed_path())
        - extractor: Extractor base name
    """
    extractor = get_extractor_name(result.extractor)
    template_str = get_extractor_template(extractor, template_type)
    if not template_str:
        return ''
    # Use embed_path() for the display path (includes canonical paths)
    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
    try:
        tpl = template.Template(template_str)
        ctx = template.Context({
            'result': result,
            'snapshot': result.snapshot,
            'output_path': output_path,
            'extractor': extractor,
        })
        return mark_safe(tpl.render(ctx))
    except Exception:
        return ''


@register.simple_tag(takes_context=True)
def extractor_thumbnail(context, result) -> str:
    """
    Render the thumbnail template for an archive result.
    Usage: {% extractor_thumbnail result %}
    """
    return _render_extractor_template(result, 'thumbnail')


@register.simple_tag(takes_context=True)
def extractor_embed(context, result) -> str:
    """
    Render the embed iframe template for an archive result.
    Usage: {% extractor_embed result %}
    """
    return _render_extractor_template(result, 'embed')


@register.simple_tag(takes_context=True)
def extractor_fullscreen(context, result) -> str:
    """
    Render the fullscreen template for an archive result.
    Usage: {% extractor_fullscreen result %}
    """
    return _render_extractor_template(result, 'fullscreen')
@register.filter
def extractor_name(value: str) -> str:
    """
    Get the base name of an extractor (strips numeric prefix).
    Usage: {{ result.extractor|extractor_name }}

    Thin template-filter wrapper around hooks.get_extractor_name().
    """
    return get_extractor_name(value)

View File

@@ -8,7 +8,7 @@ from django.views.generic.base import RedirectView
from archivebox.misc.serve_static import serve_static
from core.admin_site import archivebox_admin
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
from workers.views import JobsDashboardView
@@ -43,8 +43,10 @@ urlpatterns = [
path('accounts/', include('django.contrib.auth.urls')),
path('admin/live-progress/', live_progress_view, name='live_progress'),
path('admin/', archivebox_admin.urls),
path("api/", include('api.urls'), name='api'),
path('health/', HealthCheckView.as_view(), name='healthcheck'),

View File

@@ -34,6 +34,7 @@ from archivebox.search import query_search_index
from core.models import Snapshot
from core.forms import AddLinkForm
from crawls.models import Seed, Crawl
from archivebox.hooks import get_extractors, get_extractor_name
@@ -54,8 +55,10 @@ class SnapshotView(View):
@staticmethod
def render_live_index(request, snapshot):
TITLE_LOADING_MSG = 'Not yet archived...'
HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org')
# Dict of extractor -> ArchiveResult object
archiveresult_objects = {}
# Dict of extractor -> result info dict (for template compatibility)
archiveresults = {}
results = snapshot.archiveresult_set.all()
@@ -65,18 +68,21 @@ class SnapshotView(View):
abs_path = result.snapshot_dir / (embed_path or 'None')
if (result.status == 'succeeded'
and (result.extractor not in HIDDEN_RESULTS)
and embed_path
and os.access(abs_path, os.R_OK)
and abs_path.exists()):
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
continue
# Store the full ArchiveResult object for template tags
archiveresult_objects[result.extractor] = result
result_info = {
'name': result.extractor,
'path': embed_path,
'ts': ts_to_date_str(result.end_ts),
'size': abs_path.stat().st_size or '?',
'result': result, # Include the full object for template tags
}
archiveresults[result.extractor] = result_info
@@ -101,11 +107,11 @@ class SnapshotView(View):
}
# iterate through all the files in the snapshot dir and add the biggest ones to1 the result list
# iterate through all the files in the snapshot dir and add the biggest ones to the result list
snap_dir = Path(snapshot.output_dir)
if not os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK):
return {}
for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
extension = result_file.suffix.lstrip('.').lower()
if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions:
@@ -121,12 +127,16 @@ class SnapshotView(View):
'path': result_file.relative_to(snap_dir),
'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
'size': file_size,
'result': None, # No ArchiveResult object for filesystem-discovered files
}
preferred_types = ('singlefile', 'screenshot', 'wget', 'dom', 'media', 'pdf', 'readability', 'mercury')
# Get available extractors from hooks (sorted by numeric prefix for ordering)
# Convert to base names for display ordering
all_extractors = [get_extractor_name(e) for e in get_extractors()]
preferred_types = tuple(all_extractors)
all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
best_result = {'path': 'None'}
best_result = {'path': 'None', 'result': None}
for result_type in preferred_types:
if result_type in archiveresults:
best_result = archiveresults[result_type]
@@ -157,6 +167,7 @@ class SnapshotView(View):
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
'best_result': best_result,
'snapshot': snapshot, # Pass the snapshot object for template tags
}
return render(template_name='core/snapshot_live.html', request=request, context=context)
@@ -436,7 +447,7 @@ class AddView(UserPassesTestMixin, FormView):
def form_valid(self, form):
urls = form.cleaned_data["url"]
print(f'[+] Adding URL: {urls}')
parser = form.cleaned_data["parser"]
parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser
tag = form.cleaned_data["tag"]
depth = 0 if form.cleaned_data["depth"] == "0" else 1
extractors = ','.join(form.cleaned_data["archive_methods"])
@@ -452,18 +463,19 @@ class AddView(UserPassesTestMixin, FormView):
if extractors:
input_kwargs.update({"extractors": extractors})
from archivebox.config.permissions import HOSTNAME
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
seed = Seed.from_file(
sources_file,
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path}',
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
parser=parser,
tag=tag,
created_by=self.request.user.pk,
@@ -472,7 +484,7 @@ class AddView(UserPassesTestMixin, FormView):
# 'INDEX_ONLY': index_only,
# 'OVERWRITE': False,
'DEPTH': depth,
'EXTRACTORS': parser,
'EXTRACTORS': extractors or '',
# 'DEFAULT_PERSONA': persona or 'Default',
})
# 3. create a new Crawl pointing to the Seed
@@ -490,10 +502,15 @@ class AddView(UserPassesTestMixin, FormView):
self.request,
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
)
# if not bg:
# from workers.orchestrator import Orchestrator
# orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
# orchestrator.start()
# Start orchestrator in background to process the queued crawl
try:
from archivebox.workers.tasks import ensure_orchestrator_running
ensure_orchestrator_running()
except Exception as e:
# Orchestrator may already be running via supervisord, or fail to start
# This is not fatal - the crawl will be processed when orchestrator runs
print(f'[!] Failed to start orchestrator: {e}')
return redirect(crawl.admin_change_url)
@@ -513,6 +530,141 @@ class HealthCheckView(View):
)
import json
from django.http import JsonResponse
def live_progress_view(request):
    """Return a JSON snapshot of live archiving progress for the admin monitor.

    Polled by the admin progress widget. Reports orchestrator status, global
    per-status counts for Crawls/Snapshots/ArchiveResults, and a hierarchical
    view of up to 10 active crawls (each with up to 5 active snapshots, each
    with up to 5 currently-running extractors).

    Returns:
        JsonResponse: HTTP 200 with the stats payload on success, or
        HTTP 500 with ``error``/``traceback`` keys (plus zeroed stats so the
        frontend can render something) if gathering stats fails.
    """
    try:
        # Local imports: keep the view importable even if workers/models move,
        # matching the file's existing function-scope import style.
        from datetime import timedelta

        from django.db.models import Count, Q

        from workers.orchestrator import Orchestrator
        from crawls.models import Crawl
        from core.models import Snapshot, ArchiveResult

        # Orchestrator status (only instantiate to count workers if running)
        orchestrator_running = Orchestrator.is_running()
        total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0

        one_day_ago = timezone.now() - timedelta(days=1)

        # One aggregate query per model instead of one COUNT(*) per status
        # (the original issued 2+2+4 separate queries for these globals).
        crawl_stats = Crawl.objects.aggregate(
            pending=Count('id', filter=Q(status=Crawl.StatusChoices.QUEUED)),
            started=Count('id', filter=Q(status=Crawl.StatusChoices.STARTED)),
            recent=Count('id', filter=Q(created_at__gte=one_day_ago)),   # created in last 24h
        )
        snapshot_stats = Snapshot.objects.aggregate(
            pending=Count('id', filter=Q(status=Snapshot.StatusChoices.QUEUED)),
            started=Count('id', filter=Q(status=Snapshot.StatusChoices.STARTED)),
        )
        result_stats = ArchiveResult.objects.aggregate(
            pending=Count('id', filter=Q(status=ArchiveResult.StatusChoices.QUEUED)),
            started=Count('id', filter=Q(status=ArchiveResult.StatusChoices.STARTED)),
            succeeded=Count('id', filter=Q(status=ArchiveResult.StatusChoices.SUCCEEDED)),
            failed=Count('id', filter=Q(status=ArchiveResult.StatusChoices.FAILED)),
        )

        # Build hierarchical active crawls with nested snapshots and archive results
        active_crawls = []
        for crawl in Crawl.objects.filter(
            status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
        ).order_by('-modified_at')[:10]:
            # One aggregate query per crawl for its snapshot counts
            crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
            snap_counts = crawl_snapshots.aggregate(
                total=Count('id'),
                completed=Count('id', filter=Q(status=Snapshot.StatusChoices.SEALED)),
                pending=Count('id', filter=Q(status=Snapshot.StatusChoices.QUEUED)),
            )
            total_snapshots = snap_counts['total']
            completed_snapshots = snap_counts['completed']
            pending_snapshots = snap_counts['pending']

            # Crawl progress = fraction of its snapshots that are sealed
            crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0

            # Nested: active snapshots for this crawl (most recently modified first)
            active_snapshots_for_crawl = []
            for snapshot in crawl_snapshots.filter(
                status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
            ).order_by('-modified_at')[:5]:
                # One aggregate query per snapshot for its extractor counts
                snapshot_results = ArchiveResult.objects.filter(snapshot=snapshot)
                extractor_counts = snapshot_results.aggregate(
                    total=Count('id'),
                    succeeded=Count('id', filter=Q(status=ArchiveResult.StatusChoices.SUCCEEDED)),
                    failed=Count('id', filter=Q(status=ArchiveResult.StatusChoices.FAILED)),
                    pending=Count('id', filter=Q(status=ArchiveResult.StatusChoices.QUEUED)),
                )
                total_extractors = extractor_counts['total']
                completed_extractors = extractor_counts['succeeded']
                failed_extractors = extractor_counts['failed']
                pending_extractors = extractor_counts['pending']

                # Snapshot progress = fraction of extractors that have finished
                # (succeeded or failed both count as "done")
                snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0

                # Nested: extractors currently running for this snapshot
                active_extractors = [
                    {
                        'id': str(ar.id),
                        'extractor': ar.extractor,
                        'status': ar.status,
                        'started': ar.start_ts.isoformat() if ar.start_ts else None,
                        # TODO: no per-extractor progress is tracked yet, so a
                        # fixed 50% placeholder is reported for running extractors
                        'progress': 50,
                    }
                    for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
                ]

                active_snapshots_for_crawl.append({
                    'id': str(snapshot.id),
                    'url': snapshot.url[:80],
                    'status': snapshot.status,
                    'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None,
                    'progress': snapshot_progress,
                    'total_extractors': total_extractors,
                    'completed_extractors': completed_extractors,
                    'failed_extractors': failed_extractors,
                    'pending_extractors': pending_extractors,
                    'active_extractors': active_extractors,
                })

            active_crawls.append({
                'id': str(crawl.id),
                'label': str(crawl)[:60],
                'status': crawl.status,
                'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
                'progress': crawl_progress,
                'max_depth': crawl.max_depth,
                'total_snapshots': total_snapshots,
                'completed_snapshots': completed_snapshots,
                # TODO: Snapshot has no explicit "failed" status tracked here,
                # so failed_snapshots is always reported as 0 for now
                'failed_snapshots': 0,
                'pending_snapshots': pending_snapshots,
                'active_snapshots': active_snapshots_for_crawl,
            })

        return JsonResponse({
            'orchestrator_running': orchestrator_running,
            'total_workers': total_workers,
            'crawls_pending': crawl_stats['pending'],
            'crawls_started': crawl_stats['started'],
            'crawls_recent': crawl_stats['recent'],
            'snapshots_pending': snapshot_stats['pending'],
            'snapshots_started': snapshot_stats['started'],
            'archiveresults_pending': result_stats['pending'],
            'archiveresults_started': result_stats['started'],
            'archiveresults_succeeded': result_stats['succeeded'],
            'archiveresults_failed': result_stats['failed'],
            'active_crawls': active_crawls,
            'server_time': timezone.now().isoformat(),
        })
    except Exception as e:
        # Return the error + traceback with zeroed stats so the frontend
        # poller can surface the failure instead of silently breaking.
        # NOTE(review): exposing a full traceback assumes this endpoint is
        # admin-only — confirm it is not reachable by unauthenticated users.
        import traceback
        return JsonResponse({
            'error': str(e),
            'traceback': traceback.format_exc(),
            'orchestrator_running': False,
            'total_workers': 0,
            'crawls_pending': 0,
            'crawls_started': 0,
            'crawls_recent': 0,
            'snapshots_pending': 0,
            'snapshots_started': 0,
            'archiveresults_pending': 0,
            'archiveresults_started': 0,
            'archiveresults_succeeded': 0,
            'archiveresults_failed': 0,
            'active_crawls': [],
            'server_time': timezone.now().isoformat(),
        }, status=500)
def find_config_section(key: str) -> str:
CONFIGS = get_all_configs()