This commit is contained in:
Nick Sweeting
2026-01-21 03:19:56 -08:00
parent f3f55d3395
commit ec4b27056e
113 changed files with 6929 additions and 2396 deletions

View File

@@ -1,13 +1,16 @@
__package__ = 'archivebox.core'
import os
import posixpath
from glob import glob, escape
from django.utils import timezone
import inspect
from typing import Callable, get_type_hints
from pathlib import Path
from urllib.parse import urlparse
from django.shortcuts import render, redirect
from django.http import HttpRequest, HttpResponse, Http404
from django.http import HttpRequest, HttpResponse, Http404, HttpResponseForbidden
from django.utils.html import format_html, mark_safe
from django.views import View
from django.views.generic.list import ListView
@@ -31,6 +34,21 @@ from archivebox.misc.logging_util import printable_filesize
from archivebox.search import query_search_index
from archivebox.core.models import Snapshot
from archivebox.core.host_utils import build_snapshot_url
def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
    """Resolve the relative directory to show in a ?files= directory listing.

    An empty or 'index.html' request maps to the snapshot root; a request
    that points at a regular file on disk maps to that file's parent dir.
    """
    rel = archivefile if archivefile else ''
    if rel == 'index.html':
        rel = ''
    if (Path(snapshot.output_dir) / rel).is_file():
        parent = str(Path(rel).parent)
        rel = '' if parent == '.' else parent
    return rel
from archivebox.core.forms import AddLinkForm
from archivebox.crawls.models import Crawl
from archivebox.hooks import get_enabled_plugins, get_plugin_name
@@ -86,13 +104,95 @@ class SnapshotView(View):
def render_live_index(request, snapshot):
TITLE_LOADING_MSG = 'Not yet archived...'
outputs = snapshot.discover_outputs()
hidden_card_plugins = {'archivedotorg', 'favicon', 'title'}
outputs = [
out for out in snapshot.discover_outputs()
if (out.get('size') or 0) > 0 and out.get('name') not in hidden_card_plugins
]
archiveresults = {out['name']: out for out in outputs}
snap_dir = Path(snapshot.output_dir)
# Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
# Convert to base names for display ordering
all_plugins = [get_plugin_name(e) for e in get_enabled_plugins()]
accounted_entries: set[str] = set()
for output in outputs:
output_name = output.get('name') or ''
if output_name:
accounted_entries.add(output_name)
output_path = output.get('path') or ''
if not output_path:
continue
parts = Path(output_path).parts
if parts:
accounted_entries.add(parts[0])
ignore_names = {
'.DS_Store',
'index.html',
'index.json',
'index.jsonl',
'favicon.ico',
}
ignored_suffixes = {'.log', '.pid', '.sh'}
max_loose_scan = 300
def has_meaningful_files(dir_path: Path) -> bool:
    """Return True when *dir_path* contains at least one non-empty file that
    isn't hidden and doesn't carry an ignored suffix.

    Bails out optimistically after scanning `max_loose_scan` entries so a
    huge directory tree doesn't stall the page render.
    """
    for seen, entry in enumerate(dir_path.rglob('*'), start=1):
        if seen > max_loose_scan:
            # Large tree: assume it contains something worth showing.
            return True
        if entry.is_dir() or entry.name.startswith('.'):
            continue
        if entry.suffix.lower() in ignored_suffixes:
            continue
        try:
            size = entry.stat().st_size
        except OSError:
            # File vanished or is unreadable — treat it as not meaningful.
            continue
        if size > 0:
            return True
    return False
unaccounted_entries = []
if snap_dir.exists():
for entry in snap_dir.iterdir():
name = entry.name
if name.startswith('.') or name in ignore_names or name in accounted_entries:
continue
is_dir = entry.is_dir()
is_meaningful = False
size = None
if is_dir:
is_meaningful = has_meaningful_files(entry)
elif entry.is_file():
if entry.suffix.lower() not in ignored_suffixes:
try:
size = entry.stat().st_size
is_meaningful = size > 0
except OSError:
size = None
is_meaningful = False
unaccounted_entries.append({
'name': name,
'path': name,
'is_dir': is_dir,
'size': size,
'is_meaningful': is_meaningful,
})
unaccounted_entries.sort(key=lambda item: item['name'].lower())
loose_items = [item for item in unaccounted_entries if item['is_meaningful']]
failed_exclude_suffixes = {'.json', '.jsonl', '.sh', '.log'}
failed_items = [
item for item in unaccounted_entries
if not item['is_meaningful']
and not (
not item['is_dir']
and Path(item['name']).suffix.lower() in failed_exclude_suffixes
)
]
preview_priority = [
'singlefile',
'screenshot',
@@ -111,12 +211,48 @@ class SnapshotView(View):
break
snapshot_info = snapshot.to_dict(extended=True)
related_snapshots_qs = SnapshotView.find_snapshots_for_url(snapshot.url)
related_snapshots = list(
related_snapshots_qs.exclude(id=snapshot.id).order_by('-bookmarked_at', '-created_at', '-timestamp')[:25]
)
related_years_map: dict[int, list[Snapshot]] = {}
for snap in [snapshot, *related_snapshots]:
snap_dt = snap.bookmarked_at or snap.created_at or snap.downloaded_at
if not snap_dt:
continue
related_years_map.setdefault(snap_dt.year, []).append(snap)
related_years = []
for year, snaps in related_years_map.items():
snaps_sorted = sorted(
snaps,
key=lambda s: (s.bookmarked_at or s.created_at or s.downloaded_at or timezone.now()),
reverse=True,
)
related_years.append({
'year': year,
'latest': snaps_sorted[0],
'snapshots': snaps_sorted,
})
related_years.sort(key=lambda item: item['year'], reverse=True)
try:
warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name
except IndexError:
warc_path = 'warc/'
ordered_outputs = sorted(
archiveresults.values(),
key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size'],
)
non_compact_outputs = [
out for out in ordered_outputs
if not out.get('is_compact') and not out.get('is_metadata')
]
compact_outputs = [
out for out in ordered_outputs
if out.get('is_compact') or out.get('is_metadata')
]
context = {
**snapshot_info,
'title': htmlencode(
@@ -131,9 +267,13 @@ class SnapshotView(View):
'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date),
'warc_path': warc_path,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
'archiveresults': [*non_compact_outputs, *compact_outputs],
'best_result': best_result,
'snapshot': snapshot, # Pass the snapshot object for template tags
'related_snapshots': related_snapshots,
'related_years': related_years,
'loose_items': loose_items,
'failed_items': failed_items,
}
return render(template_name='core/snapshot_live.html', request=request, context=context)
@@ -168,13 +308,20 @@ class SnapshotView(View):
target_path = f'{target_path}?{query}'
return redirect(target_path)
if archivefile == 'index.html':
if request.GET.get('files'):
target_path = _files_index_target(snapshot, archivefile)
response = serve_static_with_byterange_support(
request, target_path, document_root=snapshot.output_dir, show_indexes=True,
)
elif archivefile == 'index.html':
# if they requested snapshot index, serve live rendered template instead of static html
response = self.render_live_index(request, snapshot)
else:
response = serve_static_with_byterange_support(
request, archivefile, document_root=snapshot.output_dir, show_indexes=True,
)
target = build_snapshot_url(str(snapshot.id), archivefile, request=request)
query = request.META.get('QUERY_STRING')
if query:
target = f'{target}?{query}'
return redirect(target)
response["Link"] = f'<{snapshot.url}>; rel="canonical"'
return response
except Snapshot.DoesNotExist:
@@ -328,13 +475,16 @@ class SnapshotView(View):
class SnapshotPathView(View):
"""Serve snapshots by the new URL scheme: /<username>/<YYYYMMDD>/<domain>/<uuid>/..."""
def get(self, request, username: str, date: str, domain: str | None = None, snapshot_id: str | None = None, path: str = "", url: str | None = None):
def get(self, request, username: str, date: str | None = None, domain: str | None = None, snapshot_id: str | None = None, path: str = "", url: str | None = None):
if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
return redirect(f'/admin/login/?next={request.path}')
if username == 'system':
return redirect(request.path.replace('/system/', '/web/', 1))
if date and domain and domain == date:
raise Http404
requested_url = url
if not requested_url and domain and domain.startswith(('http://', 'https://')):
requested_url = domain
@@ -358,19 +508,20 @@ class SnapshotPathView(View):
else:
qs = Snapshot.objects.filter(crawl__created_by__username=username_lookup)
try:
if len(date) == 4:
qs = qs.filter(created_at__year=int(date))
elif len(date) == 6:
qs = qs.filter(created_at__year=int(date[:4]), created_at__month=int(date[4:6]))
elif len(date) == 8:
qs = qs.filter(
created_at__year=int(date[:4]),
created_at__month=int(date[4:6]),
created_at__day=int(date[6:8]),
)
except ValueError:
pass
if date:
try:
if len(date) == 4:
qs = qs.filter(created_at__year=int(date))
elif len(date) == 6:
qs = qs.filter(created_at__year=int(date[:4]), created_at__month=int(date[4:6]))
elif len(date) == 8:
qs = qs.filter(
created_at__year=int(date[:4]),
created_at__month=int(date[4:6]),
created_at__day=int(date[6:8]),
)
except ValueError:
pass
if requested_url:
snapshot = qs.order_by('-created_at', '-bookmarked_at', '-timestamp').first()
@@ -401,7 +552,10 @@ class SnapshotPathView(View):
)
canonical_base = snapshot.url_path
requested_base = f'{username}/{date}/{domain or url or ""}'
if date:
requested_base = f'{username}/{date}/{domain or url or ""}'
else:
requested_base = f'{username}/{domain or url or ""}'
if snapshot_id:
requested_base = f'{requested_base}/{snapshot_id}'
if canonical_base != requested_base:
@@ -412,6 +566,18 @@ class SnapshotPathView(View):
return redirect(target)
archivefile = path or "index.html"
if archivefile != "index.html" and not request.GET.get('files'):
target = build_snapshot_url(str(snapshot.id), archivefile, request=request)
query = request.META.get('QUERY_STRING')
if query:
target = f'{target}?{query}'
return redirect(target)
if request.GET.get('files'):
target_path = _files_index_target(snapshot, archivefile)
return serve_static_with_byterange_support(
request, target_path, document_root=snapshot.output_dir, show_indexes=True,
)
if archivefile == "index.html":
return SnapshotView.render_live_index(request, snapshot)
@@ -421,6 +587,202 @@ class SnapshotPathView(View):
)
def _safe_archive_relpath(path: str) -> str | None:
if not path:
return ""
cleaned = posixpath.normpath(path)
cleaned = cleaned.lstrip("/")
if cleaned.startswith("..") or "/../" in f"/{cleaned}/":
return None
return cleaned
def _latest_response_match(domain: str, rel_path: str) -> tuple[Path, Path] | None:
    """Find the newest archived response file for *domain* + *rel_path*.

    Globs every user's snapshots dir and returns
    (responses_root, path_relative_to_that_root), or None when no file
    on disk matches.
    """
    if not domain or not rel_path:
        return None
    host = domain.split(":", 1)[0].lower()
    # TODO: optimize by querying output_files in DB instead of globbing filesystem
    users_root = DATA_DIR / "users"
    pattern = str(
        users_root / "*" / "snapshots" / "*" / escape(host) / "*" / "responses" / escape(host) / escape(rel_path)
    )
    hits = glob(pattern)
    if not hits:
        return None

    def newest_first_key(candidate: str) -> tuple[str, str]:
        # Rank primarily by the snapshots/<date> path segment (lexical sort
        # works for date-shaped directory names), tie-break on full path.
        segments = Path(candidate).parts
        try:
            stamp = segments[segments.index("snapshots") + 1]
        except Exception:
            stamp = ""
        return (stamp, candidate)

    winner = Path(max(hits, key=newest_first_key))
    segments = winner.parts
    try:
        split_at = segments.index("responses")
    except ValueError:
        return None
    return Path(*segments[: split_at + 1]), Path(*segments[split_at + 1 :])
def _latest_responses_root(domain: str) -> Path | None:
    """Return the responses/ directory of the most recent snapshot that
    archived *domain*, or None when no snapshot exists for it."""
    if not domain:
        return None
    host = domain.split(":", 1)[0].lower()
    users_root = DATA_DIR / "users"
    pattern = str(
        users_root / "*" / "snapshots" / "*" / escape(host) / "*" / "responses" / escape(host)
    )
    hits = glob(pattern)
    if not hits:
        return None

    def newest_first_key(candidate: str) -> tuple[str, str]:
        # Rank primarily by the snapshots/<date> path segment, tie-break on path.
        segments = Path(candidate).parts
        try:
            stamp = segments[segments.index("snapshots") + 1]
        except Exception:
            stamp = ""
        return (stamp, candidate)

    return Path(max(hits, key=newest_first_key))
def _serve_responses_path(request, responses_root: Path, rel_path: str, show_indexes: bool):
    """Serve *rel_path* from under *responses_root*, trying successively
    looser fallbacks: directory index.html for extension-less paths, then
    the path itself, then a directory listing. Returns an HttpResponse or
    None when nothing under the root matches."""
    path = rel_path or ""
    if path.endswith("/"):
        path = path + "index.html"

    attempts: list[str] = []
    if "." not in Path(path).name:
        # Extension-less request: prefer the directory's index.html first.
        attempts.append(path.rstrip("/") + "/index.html")
    attempts.append(path)

    for attempt in attempts:
        try:
            return serve_static_with_byterange_support(
                request,
                attempt,
                document_root=str(responses_root),
                show_indexes=show_indexes,
            )
        except Http404:
            continue

    if path.endswith("index.html"):
        # Last resort: show a directory listing for the containing dir.
        try:
            return serve_static_with_byterange_support(
                request,
                path[: -len("index.html")],
                document_root=str(responses_root),
                show_indexes=True,
            )
        except Http404:
            return None
    return None
class SnapshotHostView(View):
    """Serve snapshot directory contents on <snapshot_id>.<listen_host>/<path>."""

    @staticmethod
    def _resolve_snapshot(snapshot_id: str):
        """Look up a Snapshot by exact pk, then by id prefix; None when absent."""
        if not snapshot_id:
            return None
        # NOTE(review): a malformed (non-UUID) snapshot_id may raise a field
        # validation error from .get(pk=...) rather than DoesNotExist —
        # confirm the pk type and whether that case needs handling.
        try:
            return Snapshot.objects.get(pk=snapshot_id)
        except Snapshot.DoesNotExist:
            pass
        try:
            return Snapshot.objects.get(id__startswith=snapshot_id)
        except Snapshot.DoesNotExist:
            return None
        except Snapshot.MultipleObjectsReturned:
            # Ambiguous prefix: pick an arbitrary (first) match.
            return Snapshot.objects.filter(id__startswith=snapshot_id).first()

    def get(self, request, snapshot_id: str, path: str = ""):
        if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
            return HttpResponseForbidden("Public snapshots are disabled.")

        snapshot = self._resolve_snapshot(snapshot_id)
        if not snapshot:
            raise Http404

        show_indexes = bool(request.GET.get("files"))
        rel_path = path or ""
        if not rel_path or rel_path.endswith("/"):
            # Bare directory request: list it when ?files= is set,
            # otherwise serve its index.html.
            rel_path = rel_path.rstrip("/") if show_indexes else f"{rel_path}index.html"

        rel_path = _safe_archive_relpath(rel_path)
        if rel_path is None:
            raise Http404

        try:
            return serve_static_with_byterange_support(
                request,
                rel_path,
                document_root=snapshot.output_dir,
                show_indexes=show_indexes,
            )
        except Http404:
            pass

        # Fallback to responses/<domain>/<path>
        host = urlparse(snapshot.url).hostname or snapshot.domain
        responses_root = Path(snapshot.output_dir) / "responses" / host
        if responses_root.exists():
            response = _serve_responses_path(request, responses_root, rel_path, show_indexes)
            if response is not None:
                return response
        raise Http404
class OriginalDomainHostView(View):
    """Serve responses from the most recent snapshot when using <domain>.<listen_host>/<path>."""

    def get(self, request, domain: str, path: str = ""):
        if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
            return HttpResponseForbidden("Public snapshots are disabled.")

        requested = path or ""
        if not requested or requested.endswith("/"):
            requested = f"{requested}index.html"
        requested = _safe_archive_relpath(requested)
        if requested is None:
            raise Http404

        domain = domain.lower()
        show_indexes = bool(request.GET.get("files"))

        # Exact path first, then common fallbacks for extension-less URLs.
        match = _latest_response_match(domain, requested)
        if not match and "." not in Path(requested).name:
            for fallback in (f"{requested.rstrip('/')}/index.html", f"{requested}.html"):
                match = _latest_response_match(domain, fallback)
                if match:
                    break
        if match:
            responses_root, rel_to_root = match
            response = _serve_responses_path(request, responses_root, str(rel_to_root), show_indexes)
            if response is not None:
                return response

        # If no direct match, try serving directory index from latest responses root
        responses_root = _latest_responses_root(domain)
        if responses_root:
            response = _serve_responses_path(request, responses_root, requested, show_indexes)
            if response is not None:
                return response
        raise Http404
class PublicIndexView(ListView):
template_name = 'public_index.html'
model = Snapshot
@@ -508,7 +870,7 @@ class AddView(UserPassesTestMixin, FormView):
'available_tags': list(Tag.objects.all().order_by('name').values_list('name', flat=True)),
}
def form_valid(self, form):
def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl:
urls = form.cleaned_data["url"]
print(f'[+] Adding URL: {urls}')
@@ -522,13 +884,21 @@ class AddView(UserPassesTestMixin, FormView):
update = form.cleaned_data.get("update", False)
index_only = form.cleaned_data.get("index_only", False)
notes = form.cleaned_data.get("notes", "")
custom_config = form.cleaned_data.get("config", {})
custom_config = form.cleaned_data.get("config") or {}
from archivebox.config.permissions import HOSTNAME
if created_by_id is None:
if self.request.user.is_authenticated:
created_by_id = self.request.user.pk
else:
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
created_by_name = self.request.user.username if self.request.user.is_authenticated else 'web'
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{created_by_id}.txt'
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Crawl with the URLs from the file
@@ -552,8 +922,8 @@ class AddView(UserPassesTestMixin, FormView):
max_depth=depth,
tags_str=tag,
notes=notes,
label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
created_by_id=self.request.user.pk,
label=f'{created_by_name}@{HOSTNAME}{self.request.path} {timestamp}',
created_by_id=created_by_id,
config=config
)
@@ -566,7 +936,7 @@ class AddView(UserPassesTestMixin, FormView):
is_enabled=True,
label=crawl.label,
notes=f"Auto-created from add page. {notes}".strip(),
created_by_id=self.request.user.pk,
created_by_id=created_by_id,
)
crawl.schedule = crawl_schedule
crawl.save(update_fields=['schedule'])
@@ -576,7 +946,13 @@ class AddView(UserPassesTestMixin, FormView):
# from archivebox.crawls.actors import CrawlActor
# from archivebox.core.actors import SnapshotActor, ArchiveResultActor
return crawl
def form_valid(self, form):
crawl = self._create_crawl_from_form(form)
urls = form.cleaned_data["url"]
schedule = form.cleaned_data.get("schedule", "").strip()
rough_url_count = urls.count('://')
# Build success message with schedule link if created
@@ -593,6 +969,74 @@ class AddView(UserPassesTestMixin, FormView):
return redirect(crawl.admin_change_url)
class WebAddView(AddView):
    """Add-or-redirect view: a request for /<url> jumps to the newest
    existing Snapshot of that URL when one exists, otherwise archives the
    URL on the fly using the AddView form defaults."""

    def _latest_snapshot_for_url(self, requested_url: str):
        """Most recently created Snapshot matching *requested_url*, or None."""
        matches = SnapshotView.find_snapshots_for_url(requested_url)
        return matches.order_by('-created_at', '-bookmarked_at', '-timestamp').first()

    def _normalize_add_url(self, requested_url: str) -> str:
        """Default to https:// when the caller omitted a scheme."""
        has_scheme = requested_url.startswith(('http://', 'https://'))
        return requested_url if has_scheme else f'https://{requested_url}'

    def dispatch(self, request, *args, **kwargs):
        requested_url = urldecode(kwargs.get('url', '') or '')
        if requested_url:
            existing = self._latest_snapshot_for_url(requested_url)
            if existing:
                return redirect(f'/{existing.url_path}')
        if not self.test_func():
            # Unauthorized users get a plain 404 page instead of the add form.
            body = format_html(
                (
                    '<center><br/><br/><br/>'
                    'No Snapshots match the given url: <code>{}</code><br/><br/><br/>'
                    'Return to the <a href="/" target="_top">Main Index</a>'
                    '</center>'
                ),
                requested_url or '',
            )
            return HttpResponse(body, content_type="text/html", status=404)
        return super().dispatch(request, *args, **kwargs)

    def get(self, request, url: str):
        requested_url = urldecode(url)
        if not requested_url:
            raise Http404

        existing = self._latest_snapshot_for_url(requested_url)
        if existing:
            return redirect(f'/{existing.url_path}')

        # No snapshot yet: synthesize an add-form submission from the form's
        # declared defaults and archive the URL immediately.
        add_url = self._normalize_add_url(requested_url)
        defaults = self.form_class()
        form_data = {
            'url': add_url,
            'depth': defaults.fields['depth'].initial or '0',
            'persona': defaults.fields['persona'].initial or 'Default',
            'config': {},
        }
        for checkbox in ('update', 'overwrite', 'index_only'):
            if defaults.fields[checkbox].initial:
                form_data[checkbox] = 'on'

        form = self.form_class(data=form_data)
        if not form.is_valid():
            return self.form_invalid(form)

        crawl = self._create_crawl_from_form(form)
        snapshot = Snapshot.from_json(
            {'url': add_url, 'tags': form.cleaned_data.get('tag', '')},
            overrides={'crawl': crawl},
        )
        return redirect(f'/{snapshot.url_path}')
class HealthCheckView(View):
"""
A Django view that renders plain text "OK" for service discovery tools
@@ -617,11 +1061,19 @@ def live_progress_view(request):
from archivebox.workers.orchestrator import Orchestrator
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.machine.models import Process, Machine
from django.db.models import Case, When, Value, IntegerField
# Get orchestrator status
orchestrator_running = Orchestrator.is_running()
total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0
machine = Machine.current()
orchestrator_proc = Process.objects.filter(
machine=machine,
process_type=Process.TypeChoices.ORCHESTRATOR,
status=Process.StatusChoices.RUNNING,
).order_by('-started_at').first()
orchestrator_pid = orchestrator_proc.pid if orchestrator_proc else None
# Get model counts by status
crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
@@ -653,24 +1105,47 @@ def live_progress_view(request):
ext = embed.lower().split('.')[-1] if '.' in embed else ''
is_embeddable = ext in ('png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico', 'pdf', 'html')
if is_embeddable or ar.plugin in ('screenshot', 'favicon', 'dom'):
archive_path = embed or ''
recent_thumbnails.append({
'id': str(ar.id),
'plugin': ar.plugin,
'snapshot_id': str(ar.snapshot_id),
'snapshot_url': ar.snapshot.url[:60] if ar.snapshot else '',
'embed_path': embed,
'archive_path': f'/{ar.snapshot.archive_path}/{embed}' if ar.snapshot else '',
'archive_path': archive_path,
'archive_url': build_snapshot_url(str(ar.snapshot_id), archive_path, request=request) if archive_path else '',
'end_ts': ar.end_ts.isoformat() if ar.end_ts else None,
})
# Build hierarchical active crawls with nested snapshots and archive results
from django.db.models import Prefetch
running_workers = Process.objects.filter(
machine=machine,
process_type=Process.TypeChoices.WORKER,
status=Process.StatusChoices.RUNNING,
)
crawl_worker_pids: dict[str, int] = {}
snapshot_worker_pids: dict[str, int] = {}
for proc in running_workers:
env = proc.env or {}
if not isinstance(env, dict):
continue
if proc.worker_type == 'crawl':
crawl_id = env.get('CRAWL_ID')
if crawl_id:
crawl_worker_pids[str(crawl_id)] = proc.pid
elif proc.worker_type == 'snapshot':
snapshot_id = env.get('SNAPSHOT_ID')
if snapshot_id:
snapshot_worker_pids[str(snapshot_id)] = proc.pid
active_crawls_qs = Crawl.objects.filter(
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
).prefetch_related(
'snapshot_set',
'snapshot_set__archiveresult_set',
'snapshot_set__archiveresult_set__process',
).distinct().order_by('-modified_at')[:10]
active_crawls = []
@@ -710,8 +1185,9 @@ def live_progress_view(request):
failed_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.FAILED)
pending_plugins = sum(1 for ar in snapshot_results if ar.status == ArchiveResult.StatusChoices.QUEUED)
# Calculate snapshot progress
snapshot_progress = int(((completed_plugins + failed_plugins) / total_plugins) * 100) if total_plugins > 0 else 0
# Calculate snapshot progress using per-plugin progress
now = timezone.now()
plugin_progress_values: list[int] = []
# Get all extractor plugins for this snapshot (already prefetched, sort in Python)
# Order: started first, then queued, then completed
@@ -724,14 +1200,42 @@ def live_progress_view(request):
}
return (status_order.get(ar.status, 4), ar.plugin)
all_plugins = [
{
all_plugins = []
for ar in sorted(snapshot_results, key=plugin_sort_key):
status = ar.status
progress_value = 0
if status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
):
progress_value = 100
elif status == ArchiveResult.StatusChoices.STARTED:
started_at = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
timeout = ar.timeout or 120
if started_at and timeout:
elapsed = max(0.0, (now - started_at).total_seconds())
progress_value = int(min(99, max(1, (elapsed / float(timeout)) * 100)))
else:
progress_value = 1
else:
progress_value = 0
plugin_progress_values.append(progress_value)
plugin_payload = {
'id': str(ar.id),
'plugin': ar.plugin,
'status': ar.status,
'status': status,
}
for ar in sorted(snapshot_results, key=plugin_sort_key)
]
if ar.process_id and ar.process and ar.process.status == Process.StatusChoices.RUNNING:
plugin_payload['pid'] = ar.process.pid
if status == ArchiveResult.StatusChoices.STARTED:
plugin_payload['progress'] = progress_value
plugin_payload['timeout'] = ar.timeout or 120
all_plugins.append(plugin_payload)
snapshot_progress = int(sum(plugin_progress_values) / total_plugins) if total_plugins > 0 else 0
active_snapshots_for_crawl.append({
'id': str(snapshot.id),
@@ -744,6 +1248,7 @@ def live_progress_view(request):
'failed_plugins': failed_plugins,
'pending_plugins': pending_plugins,
'all_plugins': all_plugins,
'worker_pid': snapshot_worker_pids.get(str(snapshot.id)),
})
# Check if crawl can start (for debugging stuck crawls)
@@ -772,10 +1277,12 @@ def live_progress_view(request):
'urls_preview': urls_preview,
'retry_at_future': retry_at_future,
'seconds_until_retry': seconds_until_retry,
'worker_pid': crawl_worker_pids.get(str(crawl.id)),
})
return JsonResponse({
'orchestrator_running': orchestrator_running,
'orchestrator_pid': orchestrator_pid,
'total_workers': total_workers,
'crawls_pending': crawls_pending,
'crawls_started': crawls_started,