# Files
# ArchiveBox/archivebox/core/views.py
#
# 1986 lines
# 84 KiB
# Python

__package__ = "archivebox.core"
import json
import os
import posixpath
from glob import glob, escape
from django.utils import timezone
import inspect
from typing import cast, get_type_hints
from collections.abc import Callable
from pathlib import Path
from urllib.parse import quote, urlparse
from django.shortcuts import render, redirect
from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from django.views import View
from django.views.generic.list import ListView
from django.views.generic import FormView
from django.db.models import Q
from django.contrib import messages
from django.contrib.auth.mixins import UserPassesTestMixin
from django.views.decorators.csrf import csrf_exempt
from django.utils.decorators import method_decorator
from admin_data_views.typing import TableContext, ItemContext, SectionData
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode, without_fragment
from archivebox.misc.serve_static import serve_static_with_byterange_support
from archivebox.misc.logging_util import printable_filesize
from archivebox.search import get_search_mode, prioritize_metadata_matches, query_search_index
from archivebox.core.models import Snapshot
from archivebox.core.host_utils import (
build_admin_url,
build_snapshot_url,
build_web_url,
get_admin_host,
get_snapshot_host,
get_snapshot_lookup_key,
get_web_host,
host_matches,
)
from archivebox.core.forms import AddLinkForm
from archivebox.crawls.models import Crawl
from archivebox.hooks import (
BUILTIN_PLUGINS_DIR,
USER_PLUGINS_DIR,
discover_plugin_configs,
get_enabled_plugins,
get_plugin_name,
iter_plugin_dirs,
)
ABX_PLUGINS_GITHUB_BASE_URL = "https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/"
LIVE_PLUGIN_BASE_URL = "/admin/environment/plugins/"
def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
target = archivefile or ""
if target == "index.html":
target = ""
fullpath = Path(snapshot.output_dir) / target
if fullpath.is_file():
target = str(Path(target).parent)
if target == ".":
target = ""
return target
def _find_snapshot_by_ref(snapshot_ref: str) -> Snapshot | None:
    """Look up a Snapshot from a reference string, or return ``None``.

    The reference is first normalized with ``get_snapshot_lookup_key``. A
    12-character key without dashes is matched against the *end* of the
    snapshot id (newest first). Any other key is tried as an exact primary
    key and then as an id prefix.
    """
    key = get_snapshot_lookup_key(snapshot_ref)
    if not key:
        return None

    is_short_suffix = len(key) == 12 and "-" not in key
    if is_short_suffix:
        suffix_matches = Snapshot.objects.filter(id__endswith=key)
        return suffix_matches.order_by("-created_at", "-downloaded_at").first()

    try:
        return Snapshot.objects.get(pk=key)
    except Snapshot.DoesNotExist:
        # Not an exact primary key; fall through to the prefix lookup below.
        pass

    try:
        return Snapshot.objects.get(id__startswith=key)
    except Snapshot.DoesNotExist:
        return None
    except Snapshot.MultipleObjectsReturned:
        # Ambiguous prefix: settle for the first match the database returns.
        return Snapshot.objects.filter(id__startswith=key).first()
def _admin_login_redirect_or_forbidden(request: HttpRequest):
    """Send an unauthorized visitor to the admin login, or refuse outright.

    When the control plane is enabled the visitor is redirected to
    ``/admin/login/`` with the current path as the ``next`` parameter.
    Otherwise a 403 response is returned.
    """
    if SERVER_CONFIG.CONTROL_PLANE_ENABLED:
        # request.path is already percent-decoded, so it can contain "?", "&"
        # or "#"; encode it so it survives as a single query parameter value.
        next_path = quote(request.path, safe="/")
        return redirect(f"/admin/login/?next={next_path}")
    return HttpResponseForbidden("ArchiveBox is running with the control plane disabled in this security mode.")
class HomepageView(View):
    """Redirect the site root to the admin, the public index, or the login/forbidden response."""

    def get(self, request):
        """Pick the landing page based on authentication and server config."""
        can_use_admin = request.user.is_authenticated and SERVER_CONFIG.CONTROL_PLANE_ENABLED
        if can_use_admin:
            return redirect("/admin/core/snapshot/")

        if not SERVER_CONFIG.PUBLIC_INDEX:
            return _admin_login_redirect_or_forbidden(request)

        return redirect("/public")
class SnapshotView(View):
    """Resolve a snapshot from a timestamp/ID slug or a URL-ish path.

    ``get`` renders the live snapshot detail page, serves a file listing from
    the snapshot's output directory, or redirects to the snapshot's canonical
    location. Unknown or ambiguous references produce an HTML 404 page.
    """

    # render static html index from filesystem archive/<timestamp>/index.html

    @staticmethod
    def find_snapshots_for_url(path: str):
        """Return a queryset of snapshots matching a URL-ish path.

        Lookups are tried from most to least specific, returning the first
        non-empty queryset: exact URL/ID, exact URL/ID ignoring the scheme,
        exact ``base_url``, and finally ``base_url`` as a URL prefix.
        """

        def _fragmentless_url_query(url: str) -> Q:
            # Match the URL itself or the same URL followed by any "#fragment".
            canonical = without_fragment(url)
            return Q(url=canonical) | Q(url__startswith=f"{canonical}#")

        normalized = without_fragment(path)
        if path.startswith(("http://", "https://")):
            # try exact match on full url / ID first
            qs = Snapshot.objects.filter(_fragmentless_url_query(path) | Q(id__icontains=path) | Q(id__icontains=normalized))
            if qs.exists():
                return qs
            normalized = normalized.split("://", 1)[1]

        # try exact match on full url / ID (without scheme)
        qs = Snapshot.objects.filter(
            _fragmentless_url_query("http://" + normalized)
            | _fragmentless_url_query("https://" + normalized)
            | Q(id__icontains=normalized),
        )
        if qs.exists():
            return qs

        # fall back to match on exact base_url
        base = base_url(normalized)
        qs = Snapshot.objects.filter(
            _fragmentless_url_query("http://" + base) | _fragmentless_url_query("https://" + base),
        )
        if qs.exists():
            return qs

        # fall back to matching base_url as prefix
        return Snapshot.objects.filter(Q(url__startswith="http://" + base) | Q(url__startswith="https://" + base))

    @staticmethod
    def render_live_index(request, snapshot):
        """Render the ``core/snapshot.html`` detail page for ``snapshot``.

        The context is built from the snapshot's discovered outputs, related
        snapshots of the same URL grouped by year, and display metadata.
        """
        TITLE_LOADING_MSG = "Not yet archived..."
        from archivebox.core.widgets import TagEditorWidget

        hidden_card_plugins = {"archivedotorg", "favicon", "title"}
        # Only non-empty outputs from plugins that get their own card are shown.
        outputs = [
            out
            for out in snapshot.discover_outputs(include_filesystem_fallback=True)
            if (out.get("size") or 0) > 0 and out.get("name") not in hidden_card_plugins
        ]
        archiveresults = {out["name"]: out for out in outputs}
        hash_index = snapshot.hashes_index

        # Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
        # Convert to base names for display ordering
        all_plugins = [get_plugin_name(e) for e in get_enabled_plugins()]

        loose_items, failed_items = snapshot.get_detail_page_auxiliary_items(outputs, hidden_card_plugins=hidden_card_plugins)

        preview_priority = [
            "singlefile",
            "screenshot",
            "wget",
            "dom",
            "pdf",
            "readability",
        ]
        preferred_types = tuple(preview_priority + [p for p in all_plugins if p not in preview_priority])
        all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)

        # The first available output in preference order becomes the main preview.
        best_result = {"path": "about:blank", "result": None}
        for result_type in preferred_types:
            if result_type in archiveresults:
                best_result = archiveresults[result_type]
                break

        related_snapshots_qs = SnapshotView.find_snapshots_for_url(snapshot.url)
        related_snapshots = list(related_snapshots_qs.exclude(id=snapshot.id).order_by("-bookmarked_at", "-created_at", "-timestamp")[:25])

        # Group this snapshot and its relatives by the year of their best-known date.
        related_years_map: dict[int, list[Snapshot]] = {}
        for snap in [snapshot, *related_snapshots]:
            snap_dt = snap.bookmarked_at or snap.created_at or snap.downloaded_at
            if not snap_dt:
                continue
            related_years_map.setdefault(snap_dt.year, []).append(snap)

        related_years = []
        for year, snaps in related_years_map.items():
            snaps_sorted = sorted(
                snaps,
                key=lambda s: s.bookmarked_at or s.created_at or s.downloaded_at or timezone.now(),
                reverse=True,
            )
            related_years.append(
                {
                    "year": year,
                    "latest": snaps_sorted[0],
                    "snapshots": snaps_sorted,
                },
            )
        related_years.sort(key=lambda item: item["year"], reverse=True)

        # First hashed file under warc/ whose name contains ".warc", else the warc/ directory.
        warc_path = next(
            (rel_path for rel_path in hash_index if rel_path.startswith("warc/") and ".warc" in Path(rel_path).name),
            "warc/",
        )

        ordered_outputs = sorted(
            archiveresults.values(),
            key=lambda r: all_types.index(r["name"]) if r["name"] in all_types else -r["size"],
        )
        non_compact_outputs = [out for out in ordered_outputs if not out.get("is_compact") and not out.get("is_metadata")]
        compact_outputs = [out for out in ordered_outputs if out.get("is_compact") or out.get("is_metadata")]
        tag_widget = TagEditorWidget()
        output_size = sum(int(out.get("size") or 0) for out in ordered_outputs)
        is_archived = bool(ordered_outputs or snapshot.downloaded_at or snapshot.status == Snapshot.StatusChoices.SEALED)

        context = {
            "id": str(snapshot.id),
            "snapshot_id": str(snapshot.id),
            "url": snapshot.url,
            "archive_path": snapshot.archive_path_from_db,
            "title": htmlencode(snapshot.resolved_title or (snapshot.base_url if is_archived else TITLE_LOADING_MSG)),
            "extension": snapshot.extension or "html",
            "tags": snapshot.tags_str() or "untagged",
            "size": printable_filesize(output_size) if output_size else "pending",
            "status": "archived" if is_archived else "not yet archived",
            "status_color": "success" if is_archived else "danger",
            "bookmarked_date": snapshot.bookmarked_date,
            "downloaded_datestr": snapshot.downloaded_datestr,
            "num_outputs": snapshot.num_outputs,
            "num_failures": snapshot.num_failures,
            "oldest_archive_date": ts_to_date_str(snapshot.oldest_archive_date),
            "warc_path": warc_path,
            "PREVIEW_ORIGINALS": SERVER_CONFIG.PREVIEW_ORIGINALS,
            "archiveresults": [*non_compact_outputs, *compact_outputs],
            "best_result": best_result,
            "snapshot": snapshot,  # Pass the snapshot object for template tags
            "related_snapshots": related_snapshots,
            "related_years": related_years,
            "loose_items": loose_items,
            "failed_items": failed_items,
            "title_tags": [{"name": tag.name, "style": tag_widget._tag_style(tag.name)} for tag in snapshot.tags.all().order_by("name")],
        }
        return render(template_name="core/snapshot.html", request=request, context=context)

    def get(self, request, path):
        """Serve or redirect a snapshot addressed by ``<slug>[/<archivefile>]``.

        A purely numeric slug (dots allowed) is treated as a timestamp/ID;
        anything else is treated as a URL and resolved with
        ``find_snapshots_for_url``. Missing or ambiguous matches return an
        HTML page with status 404.
        """
        if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
            return _admin_login_redirect_or_forbidden(request)

        snapshot = None

        try:
            slug, archivefile = path.split("/", 1)
        except (IndexError, ValueError):
            # No "/" in the path: the whole path is the slug.
            slug, archivefile = path.split("/", 1)[0], "index.html"

        # slug is a timestamp
        if slug.replace(".", "").isdigit():
            # missing trailing slash -> redirect to index
            if "/" not in path:
                return redirect(f"{path}/index.html")

            try:
                try:
                    snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
                    canonical_base = snapshot.url_path
                    if canonical_base != snapshot.legacy_archive_path:
                        target_path = f"/{canonical_base}/{archivefile or 'index.html'}"
                        query = request.META.get("QUERY_STRING")
                        if query:
                            target_path = f"{target_path}?{query}"
                        return redirect(target_path)

                    if request.GET.get("files"):
                        target_path = _files_index_target(snapshot, archivefile)
                        response = serve_static_with_byterange_support(
                            request,
                            target_path,
                            document_root=snapshot.output_dir,
                            show_indexes=True,
                            is_archive_replay=True,
                        )
                    elif archivefile == "index.html":
                        # if they requested snapshot index, serve live rendered template instead of static html
                        response = self.render_live_index(request, snapshot)
                    else:
                        target = build_snapshot_url(str(snapshot.id), archivefile, request=request)
                        query = request.META.get("QUERY_STRING")
                        if query:
                            target = f"{target}?{query}"
                        return redirect(target)

                    response["Link"] = f'<{snapshot.url}>; rel="canonical"'
                    return response
                except Snapshot.DoesNotExist:
                    # A timestamp prefix shared by several snapshots is reported as ambiguous.
                    if Snapshot.objects.filter(timestamp__startswith=slug).exists():
                        raise Snapshot.MultipleObjectsReturned
                    else:
                        raise
            except Snapshot.DoesNotExist:
                # Snapshot does not exist
                return HttpResponse(
                    format_html(
                        (
                            "<center><br/><br/><br/>"
                            "No Snapshot directories match the given timestamp/ID: <code>{}</code><br/><br/>"
                            'You can <a href="/add/" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>'
                            "</center>"
                        ),
                        slug,
                    ),
                    content_type="text/html",
                    status=404,
                )
            except Snapshot.MultipleObjectsReturned:
                snapshot_hrefs = mark_safe("<br/>").join(
                    format_html(
                        '{} <a href="/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
                        snap.bookmarked_at.strftime("%Y-%m-%d %H:%M:%S"),
                        snap.archive_path,
                        snap.timestamp,
                        snap.url,
                        snap.title_stripped[:64] or "",
                    )
                    for snap in Snapshot.objects.filter(timestamp__startswith=slug)
                    .only("url", "timestamp", "title", "bookmarked_at")
                    .order_by("-bookmarked_at")
                )
                return HttpResponse(
                    format_html(
                        ("Multiple Snapshots match the given timestamp/ID <code>{}</code><br/><pre>"),
                        slug,
                    )
                    + snapshot_hrefs
                    + format_html('</pre><br/>Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'),
                    content_type="text/html",
                    status=404,
                )
            except Http404:
                # Snapshot.DoesNotExist is already handled above, so a snapshot
                # should have been found; re-raise rather than rely on an assert
                # that is stripped under -O.
                if snapshot is None:
                    raise
                # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
                queued_at = str(snapshot.bookmarked_at).split(".")[0]
                has_archivefile = str(archivefile) != "None"
                # Every dynamic value is passed as a format_html argument so it
                # is HTML-escaped and braces in the URL cannot break .format().
                return HttpResponse(
                    format_html(
                        (
                            "<html><head>"
                            "<title>Snapshot Not Found</title>"
                            #'<script>'
                            #'setTimeout(() => { window.location.reload(); }, 5000);'
                            #'</script>'
                            "</head><body>"
                            "<center><br/><br/><br/>"
                            'Snapshot <a href="/{archive_path}/index.html" target="_top"><b><code>[{timestamp}]</code></b></a>: <a href="{url}" target="_blank" rel="noreferrer">{url}</a><br/>'
                            "was queued on {queued_at}, "
                            'but no files have been saved yet in:<br/><b><a href="/{archive_path}/" target="_top"><code>{timestamp}</code></a><code>/'
                            "{missing_file}"
                            "</code></b><br/><br/>"
                            "It's possible {reason} "
                            "during the last capture on {queued_at},<br/>or that the archiving process has not completed yet.<br/>"
                            "<pre><code># run this cmd to finish/retry archiving this Snapshot</code><br/>"
                            '<code style="user-select: all; color: #333">archivebox update -t timestamp {timestamp}</code></pre><br/><br/>'
                            '<div class="text-align: left; width: 100%; max-width: 400px">'
                            "<i><b>Next steps:</i></b><br/>"
                            '- list all the <a href="/{archive_path}/" target="_top">Snapshot files <code>.*</code></a><br/>'
                            '- view the <a href="/{archive_path}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
                            '- go to the <a href="/admin/core/snapshot/{pk}/change/" target="_top">Snapshot admin</a> to edit<br/>'
                            '- go to the <a href="/admin/core/snapshot/?id__exact={snapshot_id}" target="_top">Snapshot actions</a> to re-archive<br/>'
                            '- or return to <a href="/" target="_top">the main index...</a></div>'
                            "</center>"
                            "</body></html>"
                        ),
                        archive_path=snapshot.archive_path,
                        timestamp=snapshot.timestamp,
                        url=snapshot.url,
                        queued_at=queued_at,
                        missing_file=archivefile if has_archivefile else "",
                        reason=(
                            f"the {archivefile} resource could not be fetched"
                            if has_archivefile
                            else "the original site was not available"
                        ),
                        pk=snapshot.pk,
                        snapshot_id=snapshot.id,
                    ),
                    content_type="text/html",
                    status=404,
                )

        # slug is a URL
        try:
            snapshot = SnapshotView.find_snapshots_for_url(path).get()
        except Snapshot.DoesNotExist:
            add_url = path if "://" in path else f"https://{path}"
            return HttpResponse(
                format_html(
                    (
                        "<center><br/><br/><br/>"
                        "No Snapshots match the given url: <code>{}</code><br/><br/><br/>"
                        'Return to the <a href="/" target="_top">Main Index</a>, or:<br/><br/>'
                        '+ <i><a href="/add/?url={}" target="_top">Add a new Snapshot for <code>{}</code></a><br/><br/></i>'
                        "</center>"
                    ),
                    base_url(path),
                    # Percent-encode so "&", "#" etc. in the URL stay inside the
                    # single ``url`` query parameter of the add page link.
                    quote(add_url, safe=""),
                    path,
                ),
                content_type="text/html",
                status=404,
            )
        except Snapshot.MultipleObjectsReturned:
            snapshots = SnapshotView.find_snapshots_for_url(path)
            snapshot_hrefs = mark_safe("<br/>").join(
                format_html(
                    '{} <code style="font-size: 0.8em">{}</code> <a href="/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
                    snap.bookmarked_at.strftime("%Y-%m-%d %H:%M:%S"),
                    str(snap.id)[:8],
                    snap.archive_path,
                    snap.timestamp,
                    snap.url,
                    snap.title_stripped[:64] or "",
                )
                for snap in snapshots.only("url", "timestamp", "title", "bookmarked_at").order_by("-bookmarked_at")
            )
            return HttpResponse(
                format_html(
                    ("Multiple Snapshots match the given URL <code>{}</code><br/><pre>"),
                    base_url(path),
                )
                + snapshot_hrefs
                + format_html('</pre><br/>Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'),
                content_type="text/html",
                status=404,
            )

        # Exactly one snapshot matched the URL: send the visitor to its index page.
        target_path = f"/{snapshot.archive_path}/index.html"
        query = request.META.get("QUERY_STRING")
        if query:
            target_path = f"{target_path}?{query}"
        return redirect(target_path)
class SnapshotPathView(View):
    """Serve snapshots by the new URL scheme: /<username>/<YYYYMMDD>/<domain>/<uuid>/..."""

    def get(
        self,
        request,
        username: str,
        date: str | None = None,
        domain: str | None = None,
        snapshot_id: str | None = None,
        path: str = "",
        url: str | None = None,
    ):
        """Resolve a snapshot from the URL components and serve or redirect to it.

        With ``snapshot_id`` the snapshot is looked up by primary key, then by
        id prefix. Without it, the most recent snapshot matching the
        user/date/domain-or-url is chosen. Requests whose path differs from the
        snapshot's canonical ``url_path`` are redirected there. Returns an HTML
        404 page when no snapshot matches.
        """
        if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
            return _admin_login_redirect_or_forbidden(request)

        # "/system/..." paths are redirected to the "/web/..." alias.
        if username == "system":
            return redirect(request.path.replace("/system/", "/web/", 1))

        if date and domain and domain == date:
            raise Http404

        # The domain slot may actually carry a full URL.
        requested_url = url
        if not requested_url and domain and domain.startswith(("http://", "https://")):
            requested_url = domain

        snapshot = None
        if snapshot_id:
            try:
                snapshot = Snapshot.objects.get(pk=snapshot_id)
            except Snapshot.DoesNotExist:
                try:
                    snapshot = Snapshot.objects.get(id__startswith=snapshot_id)
                except Snapshot.DoesNotExist:
                    snapshot = None
                except Snapshot.MultipleObjectsReturned:
                    # Ambiguous prefix: take the first match the database returns.
                    snapshot = Snapshot.objects.filter(id__startswith=snapshot_id).first()
        else:
            # fuzzy lookup by date + domain/url (most recent)
            username_lookup = "system" if username == "web" else username
            if requested_url:
                qs = SnapshotView.find_snapshots_for_url(requested_url).filter(crawl__created_by__username=username_lookup)
            else:
                qs = Snapshot.objects.filter(crawl__created_by__username=username_lookup)

            if date:
                # Date may be YYYY, YYYYMM or YYYYMMDD; other lengths apply no date filter.
                try:
                    if len(date) == 4:
                        qs = qs.filter(created_at__year=int(date))
                    elif len(date) == 6:
                        qs = qs.filter(created_at__year=int(date[:4]), created_at__month=int(date[4:6]))
                    elif len(date) == 8:
                        qs = qs.filter(
                            created_at__year=int(date[:4]),
                            created_at__month=int(date[4:6]),
                            created_at__day=int(date[6:8]),
                        )
                except ValueError:
                    # Non-numeric date component: skip date filtering.
                    pass

            if requested_url:
                snapshot = qs.order_by("-created_at", "-bookmarked_at", "-timestamp").first()
            else:
                requested_domain = domain or ""
                if requested_domain.startswith(("http://", "https://")):
                    requested_domain = Snapshot.extract_domain_from_url(requested_domain)
                else:
                    requested_domain = Snapshot.extract_domain_from_url(f"https://{requested_domain}")

                # Prefer exact domain matches
                # NOTE(review): this iterates the whole queryset in Python to compare domains.
                matches = [
                    s for s in qs.order_by("-created_at", "-bookmarked_at") if Snapshot.extract_domain_from_url(s.url) == requested_domain
                ]
                snapshot = matches[0] if matches else qs.order_by("-created_at", "-bookmarked_at", "-timestamp").first()

        if not snapshot:
            return HttpResponse(
                format_html(
                    (
                        "<center><br/><br/><br/>"
                        "No Snapshots match the given id or url: <code>{}</code><br/><br/><br/>"
                        'Return to the <a href="/" target="_top">Main Index</a>'
                        "</center>"
                    ),
                    snapshot_id or requested_url or domain,
                ),
                content_type="text/html",
                status=404,
            )

        # Rebuild the base path as requested and redirect if it is not the canonical one.
        canonical_base = snapshot.url_path
        if date:
            requested_base = f"{username}/{date}/{domain or url or ''}"
        else:
            requested_base = f"{username}/{domain or url or ''}"
        if snapshot_id:
            requested_base = f"{requested_base}/{snapshot_id}"
        if canonical_base != requested_base:
            target = f"/{canonical_base}/{path or 'index.html'}"
            query = request.META.get("QUERY_STRING")
            if query:
                target = f"{target}?{query}"
            return redirect(target)

        archivefile = path or "index.html"
        # Non-index files (outside ?files mode) are redirected to the snapshot URL.
        if archivefile != "index.html" and not request.GET.get("files"):
            target = build_snapshot_url(str(snapshot.id), archivefile, request=request)
            query = request.META.get("QUERY_STRING")
            if query:
                target = f"{target}?{query}"
            return redirect(target)

        if request.GET.get("files"):
            target_path = _files_index_target(snapshot, archivefile)
            return serve_static_with_byterange_support(
                request,
                target_path,
                document_root=snapshot.output_dir,
                show_indexes=True,
                is_archive_replay=True,
            )

        if archivefile == "index.html":
            return SnapshotView.render_live_index(request, snapshot)

        # NOTE(review): this looks unreachable, since every combination of
        # archivefile / ?files has already returned above. Confirm before removing.
        return serve_static_with_byterange_support(
            request,
            archivefile,
            document_root=snapshot.output_dir,
            show_indexes=True,
            is_archive_replay=True,
        )
def _safe_archive_relpath(path: str) -> str | None:
if not path:
return ""
cleaned = posixpath.normpath(path)
cleaned = cleaned.lstrip("/")
if cleaned.startswith("..") or "/../" in f"/{cleaned}/":
return None
return cleaned
def _coerce_sort_timestamp(value: str | float | None) -> float:
if value is None:
return 0.0
try:
return float(value)
except (TypeError, ValueError):
return 0.0
def _snapshot_sort_key(match_path: str, cache: dict[str, float]) -> tuple[float, str]:
parts = Path(match_path).parts
date_str = ""
snapshot_id = ""
try:
idx = parts.index("snapshots")
date_str = parts[idx + 1]
snapshot_id = parts[idx + 3]
except Exception:
return (_coerce_sort_timestamp(date_str), match_path)
if snapshot_id not in cache:
snapshot = Snapshot.objects.filter(id=snapshot_id).only("bookmarked_at", "created_at", "downloaded_at", "timestamp").first()
if snapshot:
snap_dt = snapshot.bookmarked_at or snapshot.created_at or snapshot.downloaded_at
cache[snapshot_id] = snap_dt.timestamp() if snap_dt else _coerce_sort_timestamp(snapshot.timestamp)
else:
cache[snapshot_id] = _coerce_sort_timestamp(date_str)
return (cache[snapshot_id], match_path)
def _latest_response_match(domain: str, rel_path: str) -> tuple[Path, Path] | None:
if not domain or not rel_path:
return None
domain = domain.split(":", 1)[0].lower()
# TODO: optimize by querying output_files in DB instead of globbing filesystem
data_root = DATA_DIR / "users"
escaped_domain = escape(domain)
escaped_path = escape(rel_path)
pattern = str(data_root / "*" / "snapshots" / "*" / escaped_domain / "*" / "responses" / escaped_domain / escaped_path)
matches = glob(pattern)
if not matches:
return None
sort_cache: dict[str, float] = {}
best = max(matches, key=lambda match_path: _snapshot_sort_key(match_path, sort_cache))
best_path = Path(best)
parts = best_path.parts
try:
responses_idx = parts.index("responses")
except ValueError:
return None
responses_root = Path(*parts[: responses_idx + 1])
rel_to_root = Path(*parts[responses_idx + 1 :])
return responses_root, rel_to_root
def _latest_responses_root(domain: str) -> Path | None:
if not domain:
return None
domain = domain.split(":", 1)[0].lower()
data_root = DATA_DIR / "users"
escaped_domain = escape(domain)
pattern = str(data_root / "*" / "snapshots" / "*" / escaped_domain / "*" / "responses" / escaped_domain)
matches = glob(pattern)
if not matches:
return None
sort_cache: dict[str, float] = {}
best = max(matches, key=lambda match_path: _snapshot_sort_key(match_path, sort_cache))
return Path(best)
def _latest_snapshot_for_domain(domain: str) -> Snapshot | None:
    """Return the most recently created snapshot whose URL domain equals ``domain``.

    Any ``:port`` suffix is dropped and the comparison is case-insensitive.
    Returns ``None`` for an empty domain or when no snapshot matches.
    """
    if not domain:
        return None

    wanted = domain.split(":", 1)[0].lower()
    candidates = SnapshotView.find_snapshots_for_url(f"https://{wanted}").order_by(
        "-created_at",
        "-bookmarked_at",
        "-timestamp",
    )
    # The URL lookup can match loosely, so keep only exact domain matches.
    return next(
        (candidate for candidate in candidates if Snapshot.extract_domain_from_url(candidate.url).lower() == wanted),
        None,
    )
def _original_request_url(domain: str, path: str = "", query_string: str = "") -> str:
normalized_domain = (domain or "").split(":", 1)[0].lower()
normalized_path = (path or "").lstrip("/")
if normalized_path in ("", "index.html"):
normalized_path = ""
target = f"https://{normalized_domain}"
if normalized_path:
target = f"{target}/{normalized_path}"
if query_string:
target = f"{target}?{query_string}"
return target
def _serve_responses_path(request, responses_root: Path, rel_path: str, show_indexes: bool):
    """Serve ``rel_path`` from ``responses_root``, trying a few path variants.

    Attempts, in order: ``<rel_path>/index.html`` (only when the last path
    component has no dot), ``rel_path`` itself, and finally, for paths ending
    in ``index.html``, a directory listing of the containing directory.

    Returns the first successful response, or ``None`` if every attempt
    raises ``Http404``.
    """
    requested = rel_path or ""
    if requested.endswith("/"):
        requested = f"{requested}index.html"

    attempts: list[str] = []
    if "." not in Path(requested).name:
        # Extension-less name: it may be a directory holding an index.html.
        attempts.append(f"{requested.rstrip('/')}/index.html")
    attempts.append(requested)

    for attempt in attempts:
        try:
            return serve_static_with_byterange_support(
                request,
                attempt,
                document_root=str(responses_root),
                show_indexes=show_indexes,
                is_archive_replay=True,
            )
        except Http404:
            continue

    if not requested.endswith("index.html"):
        return None

    # Last resort: list the directory that would have contained index.html.
    containing_dir = requested[: -len("index.html")]
    try:
        return serve_static_with_byterange_support(
            request,
            containing_dir,
            document_root=str(responses_root),
            show_indexes=True,
            is_archive_replay=True,
        )
    except Http404:
        return None
def _serve_snapshot_replay(request: HttpRequest, snapshot: Snapshot, path: str = ""):
    """Serve ``path`` from a snapshot's output directory for replay.

    The root/``index.html`` request renders the live detail page unless
    directory indexes are requested. Other paths are served from
    ``snapshot.output_dir``, falling back to ``responses/<host>/`` inside it.

    Raises:
        Http404: If the path escapes the snapshot directory or nothing can be served.
    """
    wants_directory = bool(path) and path.endswith("/")
    show_indexes = bool(request.GET.get("files")) or (SERVER_CONFIG.USES_SUBDOMAIN_ROUTING and wants_directory)

    requested = path or ""
    if requested in ("", "index.html") and not show_indexes:
        return SnapshotView.render_live_index(request, snapshot)

    if requested == "" or requested.endswith("/"):
        # Directory-style request: list it, or serve its index.html.
        requested = requested.rstrip("/") if show_indexes else f"{requested}index.html"

    safe_path = _safe_archive_relpath(requested)
    if safe_path is None:
        raise Http404

    try:
        return serve_static_with_byterange_support(
            request,
            safe_path,
            document_root=snapshot.output_dir,
            show_indexes=show_indexes,
            is_archive_replay=True,
        )
    except Http404:
        # Not directly in the output dir; try the saved responses below.
        pass

    origin_host = urlparse(snapshot.url).hostname or snapshot.domain
    host_responses_dir = Path(snapshot.output_dir) / "responses" / origin_host
    if host_responses_dir.exists():
        fallback = _serve_responses_path(request, host_responses_dir, safe_path, show_indexes)
        if fallback is not None:
            return fallback

    raise Http404
def _serve_original_domain_replay(request: HttpRequest, domain: str, path: str = ""):
    """Serve ``path`` for an original ``domain`` from the most recent saved responses.

    Fallback order:
      1. the newest response file matching the path (also tried as
         ``<path>/index.html`` and ``<path>.html`` for extension-less names),
      2. the newest ``responses/<domain>`` directory,
      3. the live detail page of the latest snapshot for the domain (root/index
         requests only, when directory indexes are not requested),
      4. a redirect to the ``/web/<url>`` page when the add view is public or
         the user is authenticated.

    Raises:
        Http404: If the path is unsafe or none of the fallbacks apply.
    """
    # Computed from the raw path, before it is rewritten to ".../index.html" below.
    requested_root_index = path in ("", "index.html") or path.endswith("/")
    rel_path = path or ""
    if not rel_path or rel_path.endswith("/"):
        rel_path = f"{rel_path}index.html"
    rel_path = _safe_archive_relpath(rel_path)
    if rel_path is None:
        raise Http404

    domain = domain.lower()
    match = _latest_response_match(domain, rel_path)
    # Extension-less names may have been saved as a directory index or an .html file.
    if not match and "." not in Path(rel_path).name:
        index_path = f"{rel_path.rstrip('/')}/index.html"
        match = _latest_response_match(domain, index_path)
    if not match and "." not in Path(rel_path).name:
        html_path = f"{rel_path}.html"
        match = _latest_response_match(domain, html_path)

    show_indexes = bool(request.GET.get("files"))
    if match:
        responses_root, rel_to_root = match
        response = _serve_responses_path(request, responses_root, str(rel_to_root), show_indexes)
        if response is not None:
            return response

    # No exact file match: try the newest responses directory for the domain.
    responses_root = _latest_responses_root(domain)
    if responses_root:
        response = _serve_responses_path(request, responses_root, rel_path, show_indexes)
        if response is not None:
            return response

    if requested_root_index and not show_indexes:
        snapshot = _latest_snapshot_for_domain(domain)
        if snapshot:
            return SnapshotView.render_live_index(request, snapshot)

    # Nothing archived for this request: send allowed users to the /web/<url> page.
    if SERVER_CONFIG.PUBLIC_ADD_VIEW or request.user.is_authenticated:
        target_url = _original_request_url(domain, path, request.META.get("QUERY_STRING", ""))
        return redirect(build_web_url(f"/web/{quote(target_url, safe=':/')}"))

    raise Http404
class SnapshotHostView(View):
    """Serve snapshot directory contents on <snapshot-subdomain>.<listen_host>/<path>."""

    def get(self, request, snapshot_id: str, path: str = ""):
        """Serve the snapshot's files, redirecting first if the request host is not the snapshot's own host."""
        if not (request.user.is_authenticated or SERVER_CONFIG.PUBLIC_SNAPSHOTS):
            return _admin_login_redirect_or_forbidden(request)

        snapshot = _find_snapshot_by_ref(snapshot_id)
        if not snapshot:
            raise Http404

        expected_host = get_snapshot_host(str(snapshot.id))
        if host_matches(request.get_host(), expected_host):
            return _serve_snapshot_replay(request, snapshot, path)

        # Wrong host for this snapshot: bounce to its canonical URL, keeping the query string.
        redirect_url = build_snapshot_url(str(snapshot.id), path, request=request)
        query_string = request.META.get("QUERY_STRING")
        if query_string:
            redirect_url = f"{redirect_url}?{query_string}"
        return redirect(redirect_url)
class SnapshotReplayView(View):
    """Serve snapshot directory contents on a one-domain replay path."""

    def get(self, request, snapshot_id: str, path: str = ""):
        """Look up the snapshot by reference and replay ``path`` from it, or raise ``Http404``."""
        if not (request.user.is_authenticated or SERVER_CONFIG.PUBLIC_SNAPSHOTS):
            return _admin_login_redirect_or_forbidden(request)

        snapshot = _find_snapshot_by_ref(snapshot_id)
        if snapshot:
            return _serve_snapshot_replay(request, snapshot, path)
        raise Http404
class OriginalDomainHostView(View):
    """Serve responses from the most recent snapshot when using <domain>.<listen_host>/<path>."""

    def get(self, request, domain: str, path: str = ""):
        """Check snapshot access, then delegate to the original-domain replay helper."""
        if not (request.user.is_authenticated or SERVER_CONFIG.PUBLIC_SNAPSHOTS):
            return _admin_login_redirect_or_forbidden(request)

        return _serve_original_domain_replay(request, domain, path)
class OriginalDomainReplayView(View):
    """Serve original-domain replay content on a one-domain replay path."""

    def get(self, request, domain: str, path: str = ""):
        """Check snapshot access, then delegate to the original-domain replay helper."""
        if not (request.user.is_authenticated or SERVER_CONFIG.PUBLIC_SNAPSHOTS):
            return _admin_login_redirect_or_forbidden(request)

        return _serve_original_domain_replay(request, domain, path)
class PublicIndexView(ListView):
    """Paginated snapshot list rendered with ``public_index.html``.

    Supports searching through the ``q``, ``query_type`` and ``search_mode``
    GET parameters.
    """

    template_name = "public_index.html"
    model = Snapshot
    paginate_by = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
    ordering = ["-bookmarked_at", "-created_at"]

    def get_context_data(self, **kwargs):
        """Extend the list context with version, footer and search-mode info."""
        context = dict(super().get_context_data(**kwargs))
        context.update(
            {
                "VERSION": VERSION,
                "COMMIT_HASH": SHELL_CONFIG.COMMIT_HASH,
                "FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO,
                "search_mode": get_search_mode(self.request.GET.get("search_mode")),
            },
        )
        return context

    def get_queryset(self, **kwargs):
        """Return the snapshot queryset, narrowed by the search parameters when ``q`` is given."""
        queryset = super().get_queryset(**kwargs)
        params = self.request.GET

        search_text = params.get("q", default="").strip()
        if not search_text:
            return queryset.distinct()

        query_type = params.get("query_type")
        search_mode = get_search_mode(params.get("search_mode"))

        # Metadata search covers title, URL, timestamp and tag names.
        metadata_filter = (
            Q(title__icontains=search_text)
            | Q(url__icontains=search_text)
            | Q(timestamp__icontains=search_text)
            | Q(tags__name__icontains=search_text)
        )
        # query_type values that search exactly one field.
        single_field_lookups = {
            "url": "url__icontains",
            "title": "title__icontains",
            "timestamp": "timestamp__icontains",
            "tags": "tags__name__icontains",
        }

        if not query_type or query_type == "all":
            metadata_qs = queryset.filter(metadata_filter)
            if search_mode == "meta":
                queryset = metadata_qs
            else:
                try:
                    queryset = prioritize_metadata_matches(
                        queryset,
                        metadata_qs,
                        query_search_index(search_text, search_mode=search_mode),
                        ordering=self.ordering,
                    )
                except Exception as err:
                    # Search backend failure: degrade to metadata-only results.
                    print(f"[!] Error while using search backend: {err.__class__.__name__} {err}")
                    queryset = metadata_qs
        elif query_type == "fulltext":
            if search_mode == "meta":
                queryset = queryset.none()
            else:
                try:
                    queryset = query_search_index(search_text, search_mode=search_mode).filter(pk__in=queryset.values("pk"))
                except Exception as err:
                    print(f"[!] Error while using search backend: {err.__class__.__name__} {err}")
                    queryset = queryset.none()
        elif query_type == "meta":
            queryset = queryset.filter(metadata_filter)
        elif query_type in single_field_lookups:
            queryset = queryset.filter(Q(**{single_field_lookups[query_type]: search_text}))
        else:
            print(f'[!] Unknown value for query_type: "{query_type}"')

        return queryset.distinct()

    def get(self, *args, **kwargs):
        """Redirect logged-in users to the admin; otherwise show the index if it is public."""
        if self.request.user.is_authenticated:
            return redirect("/admin/core/snapshot/")

        if not SERVER_CONFIG.PUBLIC_INDEX:
            return _admin_login_redirect_or_forbidden(self.request)

        return super().get(*args, **kwargs)
@method_decorator(csrf_exempt, name="dispatch")
class AddView(UserPassesTestMixin, FormView):
template_name = "add.html"
form_class = AddLinkForm
def get_initial(self):
    """Prefill the AddLinkForm with the 'url' GET parameter"""
    request = self.request
    prefill_url = request.GET.get("url", None) if request.method == "GET" else None
    if not prefill_url:
        return super().get_initial()

    # Scheme-less values are assumed to be https URLs.
    if "://" not in prefill_url:
        prefill_url = f"https://{prefill_url}"
    return {"url": prefill_url}
def test_func(self):
    """Allow access when the add view is public or the user is logged in."""
    public_add_view = SERVER_CONFIG.PUBLIC_ADD_VIEW
    if public_add_view:
        return public_add_view
    return self.request.user.is_authenticated
def _can_override_crawl_config(self) -> bool:
user = self.request.user
return bool(user.is_authenticated and (getattr(user, "is_superuser", False) or getattr(user, "is_staff", False)))
def _get_custom_config_overrides(self, form: AddLinkForm) -> dict:
custom_config = form.cleaned_data.get("config") or {}
if not isinstance(custom_config, dict):
return {}
if not self._can_override_crawl_config():
return {}
return custom_config
def get_context_data(self, **kwargs):
    """Build the add-page context, including the plugin dependency map as JSON."""
    required_search_plugin = f"search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}".strip()

    # Map each plugin to the non-blank names in its "required_plugins" list;
    # plugins without a non-empty list are left out.
    plugin_dependency_map = {}
    for plugin_name, schema in discover_plugin_configs().items():
        declared = schema.get("required_plugins")
        if not (isinstance(declared, list) and declared):
            continue
        stripped_names = (str(required_plugin).strip() for required_plugin in declared)
        plugin_dependency_map[plugin_name] = [name for name in stripped_names if name]

    context = dict(super().get_context_data(**kwargs))
    context.update(
        {
            "title": "Create Crawl",
            # We can't just call request.build_absolute_uri in the template, because it would include query parameters
            "absolute_add_path": self.request.build_absolute_uri(self.request.path),
            "VERSION": VERSION,
            "FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO,
            "required_search_plugin": required_search_plugin,
            "plugin_dependency_map_json": json.dumps(plugin_dependency_map, sort_keys=True),
            "stdout": "",
        },
    )
    return context
def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl:
urls = form.cleaned_data["url"]
print(f"[+] Adding URL: {urls}")
# Extract all form fields
tag = form.cleaned_data["tag"]
depth = int(form.cleaned_data["depth"])
max_urls = int(form.cleaned_data.get("max_urls") or 0)
max_size = int(form.cleaned_data.get("max_size") or 0)
plugins = ",".join(form.cleaned_data.get("plugins", []))
schedule = form.cleaned_data.get("schedule", "").strip()
persona = form.cleaned_data.get("persona")
index_only = form.cleaned_data.get("index_only", False)
notes = form.cleaned_data.get("notes", "")
url_filters = form.cleaned_data.get("url_filters") or {}
custom_config = self._get_custom_config_overrides(form)
from archivebox.config.permissions import HOSTNAME
if created_by_id is None:
if self.request.user.is_authenticated:
created_by_id = self.request.user.pk
else:
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
created_by_name = getattr(self.request.user, "username", "web") if self.request.user.is_authenticated else "web"
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
sources_file = CONSTANTS.SOURCES_DIR / f"{timezone.now().strftime('%Y-%m-%d__%H-%M-%S')}__web_ui_add_by_user_{created_by_id}.txt"
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text(urls if isinstance(urls, str) else "\n".join(urls))
# 2. create a new Crawl with the URLs from the file
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
urls_content = sources_file.read_text()
# Build complete config
config = {
"INDEX_ONLY": index_only,
"DEPTH": depth,
"PLUGINS": plugins or "",
"DEFAULT_PERSONA": (persona.name if persona else "Default"),
}
# Merge custom config overrides
config.update(custom_config)
if url_filters.get("allowlist"):
config["URL_ALLOWLIST"] = url_filters["allowlist"]
if url_filters.get("denylist"):
config["URL_DENYLIST"] = url_filters["denylist"]
crawl = Crawl.objects.create(
urls=urls_content,
max_depth=depth,
max_urls=max_urls,
max_size=max_size,
tags_str=tag,
notes=notes,
label=f"{created_by_name}@{HOSTNAME}{self.request.path} {timestamp}",
created_by_id=created_by_id,
config=config,
)
# 3. create a CrawlSchedule if schedule is provided
if schedule:
from archivebox.crawls.models import CrawlSchedule
crawl_schedule = CrawlSchedule.objects.create(
template=crawl,
schedule=schedule,
is_enabled=True,
label=crawl.label,
notes=f"Auto-created from add page. {notes}".strip(),
created_by_id=created_by_id,
)
crawl.schedule = crawl_schedule
crawl.save(update_fields=["schedule"])
crawl.create_snapshots_from_urls()
from archivebox.services.runner import ensure_background_runner
ensure_background_runner()
# 4. start the Orchestrator & wait until it completes
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
# from archivebox.crawls.actors import CrawlActor
# from archivebox.core.actors import SnapshotActor, ArchiveResultActor
return crawl
def form_valid(self, form):
crawl = self._create_crawl_from_form(form)
urls = form.cleaned_data["url"]
schedule = form.cleaned_data.get("schedule", "").strip()
rough_url_count = len([url for url in urls.splitlines() if url.strip()])
# Build success message with schedule link if created
schedule_msg = ""
if schedule:
schedule_msg = f" and <a href='{crawl.schedule.admin_change_url}'>scheduled to repeat {schedule}</a>"
messages.success(
self.request,
mark_safe(
f"Created crawl with {rough_url_count} starting URL(s){schedule_msg}. Snapshots will be created and archived in the background. <a href='{crawl.admin_change_url}'>View Crawl →</a>",
),
)
# Orchestrator (managed by supervisord) will pick up the queued crawl
return redirect(crawl.admin_change_url)
class WebAddView(AddView):
    """AddView variant driven by a URL taken from the request's ``url`` argument.

    If a snapshot already exists for the URL the request is redirected to it;
    otherwise a GET creates a crawl for the URL using the form's default values.
    """

    def _latest_snapshot_for_url(self, requested_url: str):
        """Return the first snapshot for ``requested_url`` when ordered newest-first."""
        candidates = SnapshotView.find_snapshots_for_url(requested_url)
        newest_first = candidates.order_by("-created_at", "-bookmarked_at", "-timestamp")
        return newest_first.first()

    def _normalize_add_url(self, requested_url: str) -> str:
        """Prefix ``https://`` unless the URL already starts with http:// or https://."""
        has_scheme = requested_url.startswith(("http://", "https://"))
        return requested_url if has_scheme else f"https://{requested_url}"

    def dispatch(self, request, *args, **kwargs):
        """Redirect to an existing snapshot, or gate access before the normal dispatch."""
        requested_url = urldecode(kwargs.get("url", "") or "")
        existing = self._latest_snapshot_for_url(requested_url) if requested_url else None
        if existing:
            return redirect(f"/{existing.url_path}")

        # Permitted users continue through the regular FormView dispatch
        if self.test_func():
            return super().dispatch(request, *args, **kwargs)

        host = (request.get_host() or "").lower()
        if host_matches(host, get_web_host()):
            # On the web host: send the same path to the admin host instead
            return redirect(build_admin_url(request.get_full_path(), request=request))
        if host_matches(host, get_admin_host()):
            # On the admin host: send the user to the login page, returning here afterwards
            next_url = quote(request.get_full_path(), safe="/:?=&")
            login_url = build_admin_url("/admin/login/", request=request)
            return redirect(f"{login_url}?next={next_url}")

        # Any other host: report that nothing matched (the URL is escaped by format_html)
        not_found_body = format_html(
            (
                "<center><br/><br/><br/>"
                "No Snapshots match the given url: <code>{}</code><br/><br/><br/>"
                'Return to the <a href="/" target="_top">Main Index</a>'
                "</center>"
            ),
            requested_url or "",
        )
        return HttpResponse(not_found_body, content_type="text/html", status=404)

    def get(self, request: HttpRequest, *args: object, **kwargs: object):
        """Redirect to the newest snapshot for the URL, creating a crawl and snapshot first if none exists.

        Raises:
            Http404: when no URL was supplied via the ``url`` kwarg or first positional arg.
        """
        raw_url = kwargs.get("url") or (args[0] if args else "")
        requested_url = urldecode(str(raw_url))
        if not requested_url:
            raise Http404

        existing = self._latest_snapshot_for_url(requested_url)
        if existing:
            return redirect(f"/{existing.url_path}")

        add_url = self._normalize_add_url(requested_url)

        assert self.form_class is not None
        # An unbound form is instantiated only to read each field's configured initial value
        default_fields = self.form_class().fields
        form_data = {
            "url": add_url,
            "depth": default_fields["depth"].initial or "0",
            "max_urls": default_fields["max_urls"].initial or 0,
            "max_size": default_fields["max_size"].initial or "0",
            "persona": default_fields["persona"].initial or "Default",
            "config": {},
        }
        if default_fields["index_only"].initial:
            form_data["index_only"] = "on"

        bound_form = self.form_class(data=form_data)
        if not bound_form.is_valid():
            return self.form_invalid(bound_form)

        crawl = self._create_crawl_from_form(bound_form)
        snapshot_record = {"url": add_url, "tags": bound_form.cleaned_data.get("tag", "")}
        new_snapshot = Snapshot.from_json(snapshot_record, overrides={"crawl": crawl})
        assert new_snapshot is not None
        return redirect(f"/{new_snapshot.url_path}")
class HealthCheckView(View):
    """
    Liveness endpoint for service discovery tools: always answers with plain text "OK"
    """

    def get(self, request):
        """
        Respond to GET with a 200 text/plain body of "OK"
        """
        ok_response = HttpResponse("OK", content_type="text/plain", status=200)
        return ok_response
def live_progress_view(request: HttpRequest) -> JsonResponse:
    """Simple JSON endpoint for live progress status - used by admin progress monitor.

    The response reports:
      * whether a running orchestrator Process exists on the current Machine (and its pid),
      * global counts of Crawl / Snapshot / ArchiveResult rows by status,
      * up to 20 recently succeeded ArchiveResults that have an embeddable output,
      * a nested crawl -> snapshot -> plugin tree for up to 10 queued/started crawls.

    Any exception raised while gathering the data is caught and returned as a JSON
    body with HTTP status 500 containing the error message, the traceback, and
    zeroed counters.
    """
    try:
        from archivebox.crawls.models import Crawl
        from archivebox.core.models import Snapshot, ArchiveResult
        from archivebox.machine.models import Process, Machine

        def is_current_run_timestamp(event_ts, run_started_at) -> bool:
            """Return True when event_ts is at or after run_started_at.

            A missing run start accepts every event; a missing event timestamp is rejected.
            """
            if run_started_at is None:
                return True
            if event_ts is None:
                return False
            return event_ts >= run_started_at

        def archiveresult_matches_current_run(ar, run_started_at) -> bool:
            """Decide whether an ArchiveResult should be shown for the run that began at run_started_at.

            Queued/started/backoff results always match; other statuses are compared using the
            first available of end_ts, start_ts, modified_at, created_at.
            """
            if run_started_at is None:
                return True
            if ar.status in (
                ArchiveResult.StatusChoices.QUEUED,
                ArchiveResult.StatusChoices.STARTED,
                ArchiveResult.StatusChoices.BACKOFF,
            ):
                return True
            event_ts = ar.end_ts or ar.start_ts or ar.modified_at or ar.created_at
            return is_current_run_timestamp(event_ts, run_started_at)

        def hook_details(hook_name: str, plugin: str = "setup") -> tuple[str, str, str, str]:
            """Return (plugin, label, phase, hook filename) derived from a hook name or path.

            The phase comes from the filename prefix (on_Install__, on_CrawlSetup__,
            on_Snapshot__, on_BinaryRequest__), otherwise "unknown". The label is the part
            after the first "__" with the extension and a leading "NN_" prefix removed and
            underscores turned into spaces, e.g. "on_Snapshot__50_screenshot.py" -> "screenshot".
            """
            normalized_hook_name = Path(hook_name).name if hook_name else ""
            if not normalized_hook_name:
                return (plugin, plugin, "unknown", "")
            phase = "unknown"
            if normalized_hook_name.startswith("on_Install__"):
                phase = "install"
            elif normalized_hook_name.startswith("on_CrawlSetup__"):
                phase = "crawl"
            elif normalized_hook_name.startswith("on_Snapshot__"):
                phase = "snapshot"
            elif normalized_hook_name.startswith("on_BinaryRequest__"):
                phase = "binary"
            label = normalized_hook_name
            if "__" in normalized_hook_name:
                label = normalized_hook_name.split("__", 1)[1]
            # Drop the file extension
            label = label.rsplit(".", 1)[0]
            # Drop a two-digit ordering prefix such as "50_"
            if len(label) > 3 and label[:2].isdigit() and label[2] == "_":
                label = label[3:]
            label = label.replace("_", " ").strip() or plugin
            return (plugin, label, phase, normalized_hook_name)

        def process_label(cmd: list[str] | None) -> tuple[str, str, str, str]:
            """Return hook_details() for cmd[0], using the hook file's parent directory name as the plugin.

            When cmd is not a non-empty list starting with a non-empty string, returns
            ("", "setup", "unknown", "").
            """
            hook_path = ""
            if isinstance(cmd, list) and cmd:
                first = cmd[0]
                if isinstance(first, str):
                    hook_path = first
            if not hook_path:
                return ("", "setup", "unknown", "")
            return hook_details(Path(hook_path).name, plugin=Path(hook_path).parent.name or "setup")

        machine = Machine.current()
        # Housekeeping on Process rows before reading them
        # NOTE(review): presumably these correct stale/orphaned process records so the status below is accurate - confirm in Process
        Process.cleanup_stale_running(machine=machine)
        Process.cleanup_orphaned_workers()
        # Most recently started orchestrator process still marked RUNNING on this machine
        orchestrator_proc = (
            Process.objects.filter(
                machine=machine,
                process_type=Process.TypeChoices.ORCHESTRATOR,
                status=Process.StatusChoices.RUNNING,
            )
            .order_by("-started_at")
            .first()
        )
        orchestrator_running = orchestrator_proc is not None
        orchestrator_pid = orchestrator_proc.pid if orchestrator_proc else None

        # Get model counts by status
        crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
        crawls_started = Crawl.objects.filter(status=Crawl.StatusChoices.STARTED).count()

        # Get recent crawls (last 24 hours)
        from datetime import timedelta

        one_day_ago = timezone.now() - timedelta(days=1)
        crawls_recent = Crawl.objects.filter(created_at__gte=one_day_ago).count()

        snapshots_pending = Snapshot.objects.filter(status=Snapshot.StatusChoices.QUEUED).count()
        snapshots_started = Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED).count()

        archiveresults_pending = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
        archiveresults_started = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.STARTED).count()
        archiveresults_succeeded = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
        archiveresults_failed = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.FAILED).count()

        # Get recently completed ArchiveResults with thumbnails (last 20 succeeded results)
        recent_thumbnails = []
        recent_results = (
            ArchiveResult.objects.filter(
                status=ArchiveResult.StatusChoices.SUCCEEDED,
            )
            .select_related("snapshot")
            .order_by("-end_ts")[:20]
        )
        for ar in recent_results:
            embed = ar.embed_path()
            if embed:
                # Only include results with embeddable image/media files
                ext = embed.lower().split(".")[-1] if "." in embed else ""
                is_embeddable = ext in ("png", "jpg", "jpeg", "gif", "webp", "svg", "ico", "pdf", "html")
                # screenshot/favicon/dom results are included regardless of file extension
                if is_embeddable or ar.plugin in ("screenshot", "favicon", "dom"):
                    archive_path = embed or ""
                    recent_thumbnails.append(
                        {
                            "id": str(ar.id),
                            "plugin": ar.plugin,
                            "snapshot_id": str(ar.snapshot_id),
                            "snapshot_url": ar.snapshot.url[:60] if ar.snapshot else "",
                            "embed_path": embed,
                            "archive_path": archive_path,
                            "archive_url": build_snapshot_url(str(ar.snapshot_id), archive_path, request=request) if archive_path else "",
                            "end_ts": ar.end_ts.isoformat() if ar.end_ts else None,
                        },
                    )

        # Build hierarchical active crawls with nested snapshots and archive results
        # Hook/binary processes currently RUNNING on this machine (used for pid lookups)
        running_processes = Process.objects.filter(
            machine=machine,
            status=Process.StatusChoices.RUNNING,
            process_type__in=[
                Process.TypeChoices.HOOK,
                Process.TypeChoices.BINARY,
            ],
        )
        # Hook/binary processes in any status modified within the last 10 minutes, newest first
        recent_processes = Process.objects.filter(
            machine=machine,
            process_type__in=[
                Process.TypeChoices.HOOK,
                Process.TypeChoices.BINARY,
            ],
            modified_at__gte=timezone.now() - timedelta(minutes=10),
        ).order_by("-modified_at")
        # crawl id / snapshot id -> pid of the first running process seen for it
        crawl_process_pids: dict[str, int] = {}
        snapshot_process_pids: dict[str, int] = {}
        # crawl id / snapshot id -> list of (payload dict, process start timestamp)
        process_records_by_crawl: dict[str, list[tuple[dict[str, object], object | None]]] = {}
        process_records_by_snapshot: dict[str, list[tuple[dict[str, object], object | None]]] = {}
        # Dedup keys so each (scope, plugin, label, status, exit code) combination is reported once
        seen_process_records: set[str] = set()

        for proc in running_processes:
            env = proc.env or {}
            if not isinstance(env, dict):
                env = {}
            # Processes are linked to crawls/snapshots through their CRAWL_ID / SNAPSHOT_ID env values
            crawl_id = env.get("CRAWL_ID")
            snapshot_id = env.get("SNAPSHOT_ID")
            _plugin, _label, phase, _hook_name = process_label(proc.cmd)
            if crawl_id and proc.pid:
                crawl_process_pids.setdefault(str(crawl_id), proc.pid)
            if phase == "snapshot" and snapshot_id and proc.pid:
                snapshot_process_pids.setdefault(str(snapshot_id), proc.pid)

        for proc in recent_processes:
            env = proc.env or {}
            if not isinstance(env, dict):
                env = {}
            crawl_id = env.get("CRAWL_ID")
            snapshot_id = env.get("SNAPSHOT_ID")
            # Processes not tied to any crawl or snapshot are ignored
            if not crawl_id and not snapshot_id:
                continue
            plugin, label, phase, hook_name = process_label(proc.cmd)
            # Snapshot-phase hooks are scoped to their snapshot, everything else to the crawl
            record_scope = str(snapshot_id) if phase == "snapshot" and snapshot_id else str(crawl_id)
            proc_key = f"{record_scope}:{plugin}:{label}:{proc.status}:{proc.exit_code}"
            if proc_key in seen_process_records:
                continue
            seen_process_records.add(proc_key)
            # RUNNING -> "started"; otherwise a non-zero exit code means "failed", else "succeeded"
            status = (
                "started"
                if proc.status == Process.StatusChoices.RUNNING
                else ("failed" if proc.exit_code not in (None, 0) else "succeeded")
            )
            payload: dict[str, object] = {
                "id": str(proc.id),
                "plugin": plugin,
                "label": label,
                "hook_name": hook_name,
                "status": status,
                "phase": phase,
                "source": "process",
                "process_id": str(proc.id),
            }
            if status == "started" and proc.pid:
                payload["pid"] = proc.pid
            proc_started_at = proc.started_at or proc.modified_at
            if phase == "snapshot" and snapshot_id:
                process_records_by_snapshot.setdefault(str(snapshot_id), []).append((payload, proc_started_at))
            elif crawl_id:
                process_records_by_crawl.setdefault(str(crawl_id), []).append((payload, proc_started_at))

        # Up to 10 queued/started crawls, most recently modified first, with related rows prefetched
        active_crawls_qs = (
            Crawl.objects.filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED])
            .prefetch_related(
                "snapshot_set",
                "snapshot_set__archiveresult_set",
                "snapshot_set__archiveresult_set__process",
            )
            .distinct()
            .order_by("-modified_at")[:10]
        )

        active_crawls = []
        # Running total of process records with status "started" found while walking the crawls
        total_workers = 0
        for crawl in active_crawls_qs:
            # Get ALL snapshots for this crawl to count status (already prefetched)
            all_crawl_snapshots = list(crawl.snapshot_set.all())

            # Count snapshots by status from ALL snapshots
            total_snapshots = len(all_crawl_snapshots)
            completed_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED)
            started_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED)
            pending_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED)

            # Get only ACTIVE snapshots to display (limited to the first 5 in prefetch order)
            active_crawl_snapshots = [
                s for s in all_crawl_snapshots if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
            ][:5]

            # Count URLs in the crawl (for when snapshots haven't been created yet)
            urls_count = 0
            if crawl.urls:
                urls_count = len([u for u in crawl.urls.split("\n") if u.strip() and not u.startswith("#")])

            # Calculate crawl progress (percentage of this crawl's snapshots that are sealed)
            crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
            crawl_run_started_at = crawl.created_at
            # Crawl-level process records that started at or after the crawl was created
            crawl_setup_plugins = [
                payload
                for payload, proc_started_at in process_records_by_crawl.get(str(crawl.id), [])
                if is_current_run_timestamp(proc_started_at, crawl_run_started_at)
            ]
            total_workers += sum(1 for item in crawl_setup_plugins if item.get("source") == "process" and item.get("status") == "started")
            crawl_setup_total = len(crawl_setup_plugins)
            crawl_setup_completed = sum(1 for item in crawl_setup_plugins if item.get("status") == "succeeded")
            crawl_setup_failed = sum(1 for item in crawl_setup_plugins if item.get("status") == "failed")
            crawl_setup_pending = sum(1 for item in crawl_setup_plugins if item.get("status") == "queued")

            # Get active snapshots for this crawl (already prefetched)
            active_snapshots_for_crawl = []
            for snapshot in active_crawl_snapshots:
                snapshot_run_started_at = snapshot.downloaded_at or snapshot.created_at
                # Get archive results for this snapshot (already prefetched)
                snapshot_results = [
                    ar for ar in snapshot.archiveresult_set.all() if archiveresult_matches_current_run(ar, snapshot_run_started_at)
                ]

                now = timezone.now()
                # One 0-100 progress value per plugin entry; averaged into the snapshot progress below
                plugin_progress_values: list[int] = []
                all_plugins: list[dict[str, object]] = []
                # Keys of entries already listed, used to skip process records that duplicate an ArchiveResult
                seen_plugin_keys: set[str] = set()

                def plugin_sort_key(ar):
                    """Order results started, queued, succeeded, noresults, failed, then others; ties by plugin and hook name."""
                    status_order = {
                        ArchiveResult.StatusChoices.STARTED: 0,
                        ArchiveResult.StatusChoices.QUEUED: 1,
                        ArchiveResult.StatusChoices.SUCCEEDED: 2,
                        ArchiveResult.StatusChoices.NORESULTS: 3,
                        ArchiveResult.StatusChoices.FAILED: 4,
                    }
                    return (status_order.get(ar.status, 5), ar.plugin, ar.hook_name or "")

                for ar in sorted(snapshot_results, key=plugin_sort_key):
                    status = ar.status
                    progress_value = 0
                    if status in (
                        ArchiveResult.StatusChoices.SUCCEEDED,
                        ArchiveResult.StatusChoices.FAILED,
                        ArchiveResult.StatusChoices.SKIPPED,
                        ArchiveResult.StatusChoices.NORESULTS,
                    ):
                        # Finished results count as fully done regardless of outcome
                        progress_value = 100
                    elif status == ArchiveResult.StatusChoices.STARTED:
                        # Estimate progress as elapsed time over the timeout, clamped to 1-99
                        started_at = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
                        timeout = ar.timeout or 120
                        if started_at and timeout:
                            elapsed = max(0.0, (now - started_at).total_seconds())
                            progress_value = int(min(99, max(1, (elapsed / float(timeout)) * 100)))
                        else:
                            progress_value = 1
                    else:
                        progress_value = 0
                    plugin_progress_values.append(progress_value)
                    plugin, label, phase, hook_name = hook_details(ar.hook_name or ar.plugin, plugin=ar.plugin)

                    plugin_payload = {
                        "id": str(ar.id),
                        "plugin": ar.plugin,
                        "label": label,
                        "hook_name": hook_name,
                        "phase": phase,
                        "status": status,
                        "process_id": str(ar.process_id) if ar.process_id else None,
                    }
                    if status == ArchiveResult.StatusChoices.STARTED and ar.process_id and ar.process:
                        plugin_payload["pid"] = ar.process.pid
                    if status == ArchiveResult.StatusChoices.STARTED:
                        plugin_payload["progress"] = progress_value
                        plugin_payload["timeout"] = ar.timeout or 120
                    plugin_payload["source"] = "archiveresult"
                    all_plugins.append(plugin_payload)
                    seen_plugin_keys.add(str(ar.process_id) if ar.process_id else f"{ar.plugin}:{hook_name}")

                # Add snapshot-level process records that were not already covered by an ArchiveResult
                for proc_payload, proc_started_at in process_records_by_snapshot.get(str(snapshot.id), []):
                    if not is_current_run_timestamp(proc_started_at, snapshot_run_started_at):
                        continue
                    proc_key = str(proc_payload.get("process_id") or f"{proc_payload.get('plugin')}:{proc_payload.get('hook_name')}")
                    if proc_key in seen_plugin_keys:
                        continue
                    seen_plugin_keys.add(proc_key)
                    all_plugins.append(proc_payload)

                    proc_status = proc_payload.get("status")
                    if proc_status in ("succeeded", "failed", "skipped"):
                        plugin_progress_values.append(100)
                    elif proc_status == "started":
                        plugin_progress_values.append(1)
                        total_workers += 1
                    else:
                        plugin_progress_values.append(0)

                total_plugins = len(all_plugins)
                completed_plugins = sum(1 for item in all_plugins if item.get("status") == "succeeded")
                failed_plugins = sum(1 for item in all_plugins if item.get("status") == "failed")
                pending_plugins = sum(1 for item in all_plugins if item.get("status") == "queued")

                # Snapshot progress is the integer mean of its plugins' progress values
                snapshot_progress = int(sum(plugin_progress_values) / len(plugin_progress_values)) if plugin_progress_values else 0

                active_snapshots_for_crawl.append(
                    {
                        "id": str(snapshot.id),
                        "url": snapshot.url[:80],
                        "status": snapshot.status,
                        "started": (snapshot.downloaded_at or snapshot.created_at).isoformat()
                        if (snapshot.downloaded_at or snapshot.created_at)
                        else None,
                        "progress": snapshot_progress,
                        "total_plugins": total_plugins,
                        "completed_plugins": completed_plugins,
                        "failed_plugins": failed_plugins,
                        "pending_plugins": pending_plugins,
                        "all_plugins": all_plugins,
                        "worker_pid": snapshot_process_pids.get(str(snapshot.id)),
                    },
                )

            # Check if crawl can start (for debugging stuck crawls)
            can_start = bool(crawl.urls)
            urls_preview = crawl.urls[:60] if crawl.urls else None

            # Check if retry_at is in the future (would prevent worker from claiming)
            retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
            seconds_until_retry = int((crawl.retry_at - timezone.now()).total_seconds()) if crawl.retry_at and retry_at_future else 0

            active_crawls.append(
                {
                    "id": str(crawl.id),
                    "label": str(crawl)[:60],
                    "status": crawl.status,
                    "started": crawl.created_at.isoformat() if crawl.created_at else None,
                    "progress": crawl_progress,
                    "max_depth": crawl.max_depth,
                    "urls_count": urls_count,
                    "total_snapshots": total_snapshots,
                    "completed_snapshots": completed_snapshots,
                    "started_snapshots": started_snapshots,
                    # NOTE(review): always reported as 0; no failed-snapshot count is computed here
                    "failed_snapshots": 0,
                    "pending_snapshots": pending_snapshots,
                    "setup_plugins": crawl_setup_plugins,
                    "setup_total_plugins": crawl_setup_total,
                    "setup_completed_plugins": crawl_setup_completed,
                    "setup_failed_plugins": crawl_setup_failed,
                    "setup_pending_plugins": crawl_setup_pending,
                    "active_snapshots": active_snapshots_for_crawl,
                    "can_start": can_start,
                    "urls_preview": urls_preview,
                    "retry_at_future": retry_at_future,
                    "seconds_until_retry": seconds_until_retry,
                    "worker_pid": crawl_process_pids.get(str(crawl.id)),
                },
            )

        return JsonResponse(
            {
                "orchestrator_running": orchestrator_running,
                "orchestrator_pid": orchestrator_pid,
                "total_workers": total_workers,
                "crawls_pending": crawls_pending,
                "crawls_started": crawls_started,
                "crawls_recent": crawls_recent,
                "snapshots_pending": snapshots_pending,
                "snapshots_started": snapshots_started,
                "archiveresults_pending": archiveresults_pending,
                "archiveresults_started": archiveresults_started,
                "archiveresults_succeeded": archiveresults_succeeded,
                "archiveresults_failed": archiveresults_failed,
                "active_crawls": active_crawls,
                "recent_thumbnails": recent_thumbnails,
                "server_time": timezone.now().isoformat(),
            },
        )
    except Exception as e:
        import traceback

        # Error payload mirrors the success payload's counters so the client can still render;
        # NOTE(review): it has no "orchestrator_pid" key and exposes the full traceback to the client
        return JsonResponse(
            {
                "error": str(e),
                "traceback": traceback.format_exc(),
                "orchestrator_running": False,
                "total_workers": 0,
                "crawls_pending": 0,
                "crawls_started": 0,
                "crawls_recent": 0,
                "snapshots_pending": 0,
                "snapshots_started": 0,
                "archiveresults_pending": 0,
                "archiveresults_started": 0,
                "archiveresults_succeeded": 0,
                "archiveresults_failed": 0,
                "active_crawls": [],
                "recent_thumbnails": [],
                "server_time": timezone.now().isoformat(),
            },
            status=500,
        )
def find_config_section(key: str) -> str:
    """Name the config section that defines ``key``.

    Returns "CONSTANT" for keys in CONSTANTS_CONFIG, otherwise the id of the first
    section from get_all_configs() containing the key, or "DYNAMIC" if none does.
    """
    all_sections = get_all_configs()
    if key in CONSTANTS_CONFIG:
        return "CONSTANT"

    for section_id, section in all_sections.items():
        if key in dict(section):
            return section_id
    return "DYNAMIC"
def find_config_default(key: str) -> str:
    """Return a display string for the default value of config option ``key``.

    Constants are returned as ``str(CONSTANTS_CONFIG[key])``. For other keys the first
    config section containing the key is consulted: its ``model_fields`` entry's
    ``default`` when available, otherwise the section's own value for the key.
    Callable defaults are rendered as the source text following ``lambda ...:``;
    when no source can be retrieved, ``str()`` of the callable is returned instead.
    Keys found nowhere yield the string "None".
    """
    CONFIGS = get_all_configs()

    if key in CONSTANTS_CONFIG:
        return str(CONSTANTS_CONFIG[key])

    default_val = None
    for config in CONFIGS.values():
        if key in dict(config):
            default_field = getattr(config, "model_fields", dict(config))[key]
            default_val = default_field.default if hasattr(default_field, "default") else default_field
            break

    if not isinstance(default_val, Callable):
        return str(default_val)

    try:
        source = inspect.getsource(default_val)
    except (OSError, TypeError):
        # Builtins, C-implemented callables and dynamically created functions have no
        # retrievable source; show their repr instead of failing the whole config page.
        return str(default_val)

    # Keep only the expression after the lambda's colon, flattened onto one line
    default_val = source.split("lambda", 1)[-1].split(":", 1)[-1].replace("\n", " ").strip()
    # The source line usually carries the closing paren of the enclosing call; trim it
    if default_val.count(")") > default_val.count("("):
        default_val = default_val[:-1]
    return default_val
def find_config_type(key: str) -> str:
    """Return a display name for the declared type of config option ``key``.

    For each config section that has the attribute, the ``model_fields`` annotation
    is preferred; otherwise ``get_type_hints`` is tried. Falls back to "str" when
    no section yields a type.
    """
    import typing
    from typing import ClassVar

    def type_label(annotation) -> str:
        # Plain classes expose __name__; typing constructs may not, so use their str() form
        try:
            return str(annotation.__name__)
        except AttributeError:
            return str(annotation)

    for config in get_all_configs().values():
        if not hasattr(config, key):
            continue

        # Try to get from pydantic model_fields first (more reliable)
        if hasattr(config, "model_fields") and key in config.model_fields:
            field = config.model_fields[key]
            if getattr(field, "annotation", None) is not None:
                return type_label(field.annotation)

        # Fallback to get_type_hints with proper namespace
        try:
            namespace = {
                "ClassVar": ClassVar,
                "Optional": typing.Optional,
                "Union": typing.Union,
                "List": list,
                "Dict": dict,
                "Path": Path,
            }
            resolved_hints = get_type_hints(config, globalns=namespace, localns=namespace)
            return type_label(resolved_hints[key])
        except Exception:
            # Best effort: move on to the next section, ultimately defaulting to "str"
            continue

    return "str"
def key_is_safe(key: str) -> bool:
    """Return False when the key name contains "key", "password", "secret" or "token" (case-insensitive)."""
    lowered = key.lower()
    sensitive_terms = ("key", "password", "secret", "token")
    return not any(term in lowered for term in sensitive_terms)
def find_config_source(key: str, merged_config: dict) -> str:
    """Report which layer supplies ``key``: "Environment", "Machine", "Config File" or "Default".

    Layers are checked in that order and the first hit wins. ``merged_config`` is
    accepted but not consulted.
    """
    from archivebox.machine.models import Machine

    # Environment variables override all persistent config sources.
    if key in os.environ:
        return "Environment"

    # Machine.config overrides ArchiveBox.conf; lookup failures are ignored on purpose.
    try:
        current_machine = Machine.current()
        if current_machine.config and key in current_machine.config:
            return "Machine"
    except Exception:
        pass

    # Next, the values loaded from the config file; anything else is a default.
    from archivebox.config.configset import BaseConfigSet

    persisted_values = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
    return "Config File" if key in persisted_values else "Default"
def find_plugin_for_config_key(key: str) -> str | None:
    """Return the name of the first discovered plugin whose schema lists ``key`` under "properties", else None."""
    owners = (
        name
        for name, plugin_schema in discover_plugin_configs().items()
        if key in (plugin_schema.get("properties") or {})
    )
    return next(owners, None)
def get_config_definition_link(key: str) -> tuple[str, str]:
    """Return a ``(url, label)`` pair pointing at where config option ``key`` is defined.

    Keys not owned by any plugin link to a GitHub code search. Plugin keys link to
    the plugin's config.json on GitHub (plugin directory under BUILTIN_PLUGINS_DIR),
    to the live "user." plugin page (directory under USER_PLUGINS_DIR), or otherwise
    to the live "builtin." plugin page.
    """
    plugin_name = find_plugin_for_config_key(key)
    if not plugin_name:
        search_url = f"https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{quote(key)}&type=code"
        return (search_url, "archivebox/config")

    # Resolve the first plugin directory whose name matches the owning plugin
    plugin_dir = None
    for candidate in iter_plugin_dirs():
        if candidate.name == plugin_name:
            plugin_dir = candidate.resolve()
            break

    if plugin_dir:
        builtin_root = BUILTIN_PLUGINS_DIR.resolve()
        if plugin_dir.is_relative_to(builtin_root):
            github_url = f"{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/config.json"
            return (github_url, f"abx_plugins/plugins/{plugin_name}/config.json")

        user_root = USER_PLUGINS_DIR.resolve()
        if plugin_dir.is_relative_to(user_root):
            user_url = f"{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/"
            return (user_url, f"data/custom_plugins/{plugin_name}/config.json")

    fallback_url = f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/"
    return (fallback_url, f"abx_plugins/plugins/{plugin_name}/config.json")
@render_with_table_view
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
    """Build the admin table listing every config option with its type, value, source and default.

    One row is emitted per key of every section from get_all_configs() (sections in
    reverse order), followed by one row per CONSTANTS_CONFIG key. Values of keys that
    key_is_safe() rejects are shown redacted. All interpolated values are HTML-escaped
    via format_html.

    Raises:
        AssertionError: when the requesting user is not a superuser.
    """
    CONFIGS = get_all_configs()

    # NOTE(review): assert statements are stripped under `python -O`, which would disable this
    # permission check; kept as-is so the raised exception type does not change for callers.
    assert getattr(request.user, "is_superuser", False), "Must be a superuser to view configuration settings."

    # Get merged config that includes Machine.config overrides
    try:
        from archivebox.machine.models import Machine

        Machine.current()
        merged_config = get_config()
    except Exception:
        # Fallback if Machine model not available
        merged_config = get_config()

    def default_cell(config_key: str):
        # Link to a GitHub code search for the key, labelled with its default value
        return format_html(
            '<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{}&type=code"><code style="text-decoration: underline">{}</code></a>',
            config_key,
            find_config_default(config_key) or "See here...",
        )

    source_colors = {"Machine": "purple", "Environment": "blue", "Config File": "green", "Default": "gray"}

    rows = {
        "Section": [],
        "Key": [],
        "Type": [],
        "Value": [],
        "Source": [],
        "Default": [],
        # "Documentation": [],
        # "Aliases": [],
    }

    for section_id, section in reversed(list(CONFIGS.items())):
        for key in dict(section).keys():
            rows["Section"].append(section_id)  # section.replace('_', ' ').title().replace(' Config', '')
            rows["Key"].append(ItemLink(key, key=key))
            rows["Type"].append(format_html("<code>{}</code>", find_config_type(key)))

            # Use merged config value (includes machine overrides)
            actual_value = merged_config.get(key, getattr(section, key, None))
            # format_html escapes the value, so markup inside config values cannot reach the page
            rows["Value"].append(format_html("<code>{}</code>", actual_value) if key_is_safe(key) else "******** (redacted)")

            # Show where the value comes from
            source = find_config_source(key, merged_config)
            rows["Source"].append(format_html('<code style="color: {}">{}</code>', source_colors.get(source, "gray"), source))

            rows["Default"].append(default_cell(key))
            # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
            # rows['Aliases'].append(', '.join(find_config_aliases(key)))

    section = "CONSTANT"
    for key in CONSTANTS_CONFIG.keys():
        rows["Section"].append(section)  # section.replace('_', ' ').title().replace(' Config', '')
        rows["Key"].append(ItemLink(key, key=key))
        rows["Type"].append(format_html("<code>{}</code>", getattr(type(CONSTANTS_CONFIG[key]), "__name__", str(CONSTANTS_CONFIG[key]))))
        rows["Value"].append(format_html("<code>{}</code>", CONSTANTS_CONFIG[key]) if key_is_safe(key) else "******** (redacted)")
        rows["Source"].append(mark_safe('<code style="color: gray">Constant</code>'))
        rows["Default"].append(default_cell(key))
        # rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
        # rows['Aliases'].append('')

    return TableContext(
        title="Computed Configuration Values",
        table=rows,
    )
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
    """Render the admin detail page for a single configuration key.

    Collects every place the key is currently defined (process environment,
    the current Machine's config, the config file, and the code default),
    the final merged value, and help text explaining how to change it.
    Values of keys for which ``key_is_safe(key)`` is false are replaced by a
    ``********`` placeholder wherever a live value would be displayed.

    Args:
        request: The incoming request; ``request.user`` must be a superuser.
        key: The configuration key to describe (taken from the URL, so it is
            treated as untrusted text and HTML-escaped before rendering).
        **kwargs: Extra arguments passed through by the view decorator (unused).

    Returns:
        An ``ItemContext`` with one section describing the key.

    Raises:
        AssertionError: If the requesting user is not a superuser.
    """
    from django.utils.html import format_html_join

    from archivebox.machine.models import Machine
    from archivebox.config.configset import BaseConfigSet

    # Explicit raise instead of `assert`: assert statements are stripped when
    # Python runs with -O, which would silently disable this access check.
    if not getattr(request.user, "is_superuser", False):
        raise AssertionError("Must be a superuser to view configuration settings.")

    CONFIGS = get_all_configs()
    FLAT_CONFIG = get_flat_config()
    merged_config = get_config()

    redacted_placeholder = "********"
    is_safe = key_is_safe(key)

    def redact(value):
        """Return value unchanged for safe keys, otherwise the placeholder."""
        return value if is_safe else redacted_placeholder

    # Collect (source name, displayed value, color) for each place the key is set.
    sources_info = []

    # Environment variable
    if key in os.environ:
        sources_info.append(("Environment", redact(os.environ[key]), "blue"))

    # Machine config
    machine_admin_url = None
    try:
        machine = Machine.current()
        machine_admin_url = f"/admin/machine/machine/{machine.id}/change/"
        if machine.config and key in machine.config:
            sources_info.append(("Machine", redact(machine.config[key]), "purple"))
    except Exception:
        # Deliberately best-effort: the page is still rendered (without the
        # Machine source / edit links) when the Machine lookup fails.
        pass

    # Config file value (redacted like every other live source so that
    # secrets stored in the config file are not shown)
    if CONSTANTS.CONFIG_FILE.exists():
        file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
        if key in file_config:
            sources_info.append(("Config File", redact(file_config[key]), "green"))

    # Default value
    default_val = find_config_default(key)
    if default_val:
        sources_info.append(("Default", default_val, "gray"))

    # Final computed value
    final_value = redact(merged_config.get(key, FLAT_CONFIG.get(key, CONFIGS.get(key, None))))
    current_source = find_config_source(key, merged_config)

    # Build sources display; format_html_join escapes every interpolated value.
    sources_html = format_html_join(
        mark_safe("<br/>"),
        '<b style="color: {}">{}:</b> <code>{}</code>',
        ((color, source, value) for source, value, color in sources_info),
    )

    # aliases = USER_CONFIG.get(key, {}).get("aliases", [])
    aliases: list[str] = []

    if key in CONSTANTS_CONFIG:
        section_header = format_html(
            '[CONSTANTS] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, hardcoded by ArchiveBox)</small>',
            key=key,
        )
    elif key in FLAT_CONFIG:
        section_header = format_html(
            'data / ArchiveBox.conf &nbsp; [{section}] &nbsp; <b><code style="color: lightgray">{key}</code></b>',
            section=find_config_section(key),
            key=key,
        )
    else:
        section_header = format_html(
            '[DYNAMIC CONFIG] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, calculated at runtime)</small>',
            key=key,
        )

    definition_url, definition_label = get_config_definition_link(key)

    key_help = format_html(
        '<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{anchor}">Documentation</a> &nbsp; '
        '<span style="display: {display}">Aliases: {aliases}</span>',
        anchor=key.lower(),
        display="inline" if aliases else "none",
        aliases=", ".join(aliases),
    )

    type_help = format_html(
        '<a href="{url}" target="_blank" rel="noopener noreferrer">See full definition in <code>{label}</code>...</a>',
        url=definition_url,
        label=definition_label,
    )

    redaction_notice = (
        ""
        if is_safe
        else mark_safe(
            '<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>',
        )
    )
    # Value suggested in the example `archivebox config --set` command.
    # .get() is used because this is computed even for keys that are absent
    # from FLAT_CONFIG (the paragraph showing it is merely hidden for them).
    if default_val:
        suggested_value = str(default_val).strip("'")
    else:
        suggested_value = str(redact(FLAT_CONFIG.get(key, ""))).strip("'")
    is_editable = key in FLAT_CONFIG and key not in CONSTANTS_CONFIG

    value_help = format_html(
        "{notice}"
        "<br/><hr/><br/>"
        "<b>Configuration Sources (highest priority first):</b><br/><br/>"
        "{sources}"
        "<br/><br/>"
        '<p style="display: {display}">'
        "<i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>"
        "<br/><br/>"
        '<code>archivebox config --set {key}="{value}"</code>'
        "</p>",
        notice=redaction_notice,
        sources=sources_html,
        display="block" if is_editable else "none",
        key=key,
        value=suggested_value,
    )

    machine_edit_link = ""
    machine_tip = ""
    if machine_admin_url:
        machine_edit_link = format_html(
            '<br/><a href="{url}">→ Edit <code>{key}</code> in Machine.config for this server</a>',
            url=machine_admin_url,
            key=key,
        )
        if key not in CONSTANTS_CONFIG:
            # Doubled braces are literal braces in the format string, so the
            # example renders as {"KEY": "your_value_here"}.
            machine_tip = format_html(
                "<br/><b>Tip:</b> To override <code>{key}</code> on this machine, "
                '<a href="{url}">edit the Machine.config field</a> and add:<br/>'
                '<code>{{"{key}": "your_value_here"}}</code>',
                url=machine_admin_url,
                key=key,
            )

    source_help = format_html(
        'The value shown in the "Value" field comes from the <b>{source}</b> source.'
        "<br/><br/>"
        "Priority order (highest to lowest):"
        "<ol>"
        '<li><b style="color: blue">Environment</b> - Environment variables</li>'
        '<li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)'
        "{machine_edit_link}"
        "</li>"
        '<li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>'
        '<li><b style="color: gray">Default</b> - Default value from code</li>'
        "</ol>"
        "{machine_tip}",
        source=current_source,
        machine_edit_link=machine_edit_link,
        machine_tip=machine_tip,
    )

    section_data = cast(
        SectionData,
        {
            "name": section_header,
            "description": None,
            "fields": {
                "Key": key,
                "Type": find_config_type(key),
                "Value": final_value,
                "Currently read from": current_source,
            },
            "help_texts": {
                "Key": key_help,
                "Type": type_help,
                "Value": value_help,
                "Currently read from": source_help,
            },
        },
    )

    return ItemContext(
        slug=key,
        title=key,
        data=[section_data],
    )