mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
1986 lines
84 KiB
Python
1986 lines
84 KiB
Python
__package__ = "archivebox.core"
|
|
|
|
import json
|
|
import os
|
|
import posixpath
|
|
from glob import glob, escape
|
|
from django.utils import timezone
|
|
import inspect
|
|
from typing import cast, get_type_hints
|
|
from collections.abc import Callable
|
|
from pathlib import Path
|
|
from urllib.parse import quote, urlparse
|
|
|
|
from django.shortcuts import render, redirect
|
|
from django.http import JsonResponse, HttpRequest, HttpResponse, Http404, HttpResponseForbidden
|
|
from django.utils.html import format_html
|
|
from django.utils.safestring import mark_safe
|
|
from django.views import View
|
|
from django.views.generic.list import ListView
|
|
from django.views.generic import FormView
|
|
from django.db.models import Q
|
|
from django.contrib import messages
|
|
from django.contrib.auth.mixins import UserPassesTestMixin
|
|
from django.views.decorators.csrf import csrf_exempt
|
|
from django.utils.decorators import method_decorator
|
|
|
|
from admin_data_views.typing import TableContext, ItemContext, SectionData
|
|
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
|
|
|
|
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
|
|
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG
|
|
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
|
|
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str, urldecode, without_fragment
|
|
from archivebox.misc.serve_static import serve_static_with_byterange_support
|
|
from archivebox.misc.logging_util import printable_filesize
|
|
from archivebox.search import get_search_mode, prioritize_metadata_matches, query_search_index
|
|
|
|
from archivebox.core.models import Snapshot
|
|
from archivebox.core.host_utils import (
|
|
build_admin_url,
|
|
build_snapshot_url,
|
|
build_web_url,
|
|
get_admin_host,
|
|
get_snapshot_host,
|
|
get_snapshot_lookup_key,
|
|
get_web_host,
|
|
host_matches,
|
|
)
|
|
from archivebox.core.forms import AddLinkForm
|
|
from archivebox.crawls.models import Crawl
|
|
from archivebox.hooks import (
|
|
BUILTIN_PLUGINS_DIR,
|
|
USER_PLUGINS_DIR,
|
|
discover_plugin_configs,
|
|
get_enabled_plugins,
|
|
get_plugin_name,
|
|
iter_plugin_dirs,
|
|
)
|
|
|
|
|
|
# Base URL of the plugins directory in the ArchiveBox/abx-plugins GitHub repo (main branch).
ABX_PLUGINS_GITHUB_BASE_URL = "https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/"
# Admin URL prefix for plugin pages.
# NOTE(review): not referenced in the code visible here; presumably used to build per-plugin links -- verify against callers.
LIVE_PLUGIN_BASE_URL = "/admin/environment/plugins/"
|
|
|
|
|
|
def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
    """Pick the directory (relative to the snapshot output dir) to list for a ``?files`` request.

    An empty/``None`` path or ``index.html`` maps to the snapshot root (``""``).
    If the requested path is an existing file, its parent directory is returned
    instead (with ``"."`` normalised to ``""``); any other path is returned as-is.
    """
    requested = "" if archivefile in (None, "", "index.html") else archivefile

    # Only existing files are swapped for their containing directory.
    if not (Path(snapshot.output_dir) / requested).is_file():
        return requested

    parent = str(Path(requested).parent)
    return "" if parent == "." else parent
|
|
|
|
|
|
def _find_snapshot_by_ref(snapshot_ref: str) -> Snapshot | None:
    """Resolve a snapshot reference string to a Snapshot, or ``None`` if nothing matches.

    The reference is first normalised with ``get_snapshot_lookup_key``. A
    12-character key without dashes is matched against the *end* of the
    snapshot id (newest first); anything else is tried as an exact primary key
    and then as an id prefix.
    """
    key = get_snapshot_lookup_key(snapshot_ref)
    if not key:
        return None

    is_short_suffix = len(key) == 12 and "-" not in key
    if is_short_suffix:
        newest_first = Snapshot.objects.filter(id__endswith=key).order_by("-created_at", "-downloaded_at")
        return newest_first.first()

    try:
        return Snapshot.objects.get(pk=key)
    except Snapshot.DoesNotExist:
        pass

    # Exact pk lookup failed: fall back to treating the key as an id prefix.
    try:
        return Snapshot.objects.get(id__startswith=key)
    except Snapshot.DoesNotExist:
        return None
    except Snapshot.MultipleObjectsReturned:
        return Snapshot.objects.filter(id__startswith=key).first()
|
|
|
|
|
|
def _admin_login_redirect_or_forbidden(request: HttpRequest):
    """Send an unauthorised visitor to the admin login, or refuse outright.

    When ``SERVER_CONFIG.CONTROL_PLANE_ENABLED`` is set, redirect to
    ``/admin/login/`` with the current path as the ``next`` parameter;
    otherwise return a 403 because there is no login page to send them to.
    """
    if SERVER_CONFIG.CONTROL_PLANE_ENABLED:
        # Percent-encode the path so characters such as "&", "#" or "?" inside it
        # cannot truncate or inject extra parameters into the login URL.
        return redirect(f"/admin/login/?next={quote(request.path)}")
    return HttpResponseForbidden("ArchiveBox is running with the control plane disabled in this security mode.")
|
|
|
|
|
|
class HomepageView(View):
    """Landing page: route the visitor to the admin, the public index, or the login/403 response."""

    def get(self, request):
        """Redirect based on authentication state and server configuration."""
        can_use_admin = request.user.is_authenticated and SERVER_CONFIG.CONTROL_PLANE_ENABLED
        if can_use_admin:
            destination = "/admin/core/snapshot/"
        elif SERVER_CONFIG.PUBLIC_INDEX:
            destination = "/public"
        else:
            return _admin_login_redirect_or_forbidden(request)
        return redirect(destination)
|
|
|
|
|
|
class SnapshotView(View):
    """Resolve ``<slug>/<file>`` style paths to a Snapshot and serve or redirect.

    ``get`` treats a first path segment made only of digits (dots allowed) as a
    timestamp/ID slug; any other path is looked up as a URL.
    """

    # render static html index from filesystem archive/<timestamp>/index.html

    @staticmethod
    def find_snapshots_for_url(path: str):
        """Return a queryset of snapshots matching a URL-ish path.

        Lookups are attempted from most to least specific and the first
        non-empty queryset wins:

        1. the full URL (ignoring any ``#fragment``) or an id containing it,
        2. the same with ``http://`` / ``https://`` prepended to the scheme-less form,
        3. an exact match on ``base_url(...)`` under either scheme,
        4. a prefix match on that base URL (returned even when empty).
        """

        def _fragmentless_url_query(url: str) -> Q:
            canonical = without_fragment(url)
            return Q(url=canonical) | Q(url__startswith=f"{canonical}#")

        normalized = without_fragment(path)
        if path.startswith(("http://", "https://")):
            # try exact match on full url / ID first
            qs = Snapshot.objects.filter(_fragmentless_url_query(path) | Q(id__icontains=path) | Q(id__icontains=normalized))
            if qs.exists():
                return qs
            normalized = normalized.split("://", 1)[1]

        # try exact match on full url / ID (without scheme)
        qs = Snapshot.objects.filter(
            _fragmentless_url_query("http://" + normalized)
            | _fragmentless_url_query("https://" + normalized)
            | Q(id__icontains=normalized),
        )
        if qs.exists():
            return qs

        # fall back to match on exact base_url
        base = base_url(normalized)
        qs = Snapshot.objects.filter(
            _fragmentless_url_query("http://" + base) | _fragmentless_url_query("https://" + base),
        )
        if qs.exists():
            return qs

        # fall back to matching base_url as prefix
        return Snapshot.objects.filter(Q(url__startswith="http://" + base) | Q(url__startswith="https://" + base))

    @staticmethod
    def render_live_index(request, snapshot):
        """Render the ``core/snapshot.html`` detail page for ``snapshot``.

        Builds the template context from the snapshot's discovered outputs
        (non-empty ones, minus a few plugins hidden from the card list), picks a
        preferred output to preview, and groups related snapshots of the same
        URL by year.
        """
        TITLE_LOADING_MSG = "Not yet archived..."
        from archivebox.core.widgets import TagEditorWidget

        hidden_card_plugins = {"archivedotorg", "favicon", "title"}
        outputs = [
            out
            for out in snapshot.discover_outputs(include_filesystem_fallback=True)
            if (out.get("size") or 0) > 0 and out.get("name") not in hidden_card_plugins
        ]
        archiveresults = {out["name"]: out for out in outputs}
        hash_index = snapshot.hashes_index
        # Get available extractor plugins from hooks (sorted by numeric prefix for ordering)
        # Convert to base names for display ordering
        all_plugins = [get_plugin_name(e) for e in get_enabled_plugins()]

        loose_items, failed_items = snapshot.get_detail_page_auxiliary_items(outputs, hidden_card_plugins=hidden_card_plugins)
        preview_priority = [
            "singlefile",
            "screenshot",
            "wget",
            "dom",
            "pdf",
            "readability",
        ]
        preferred_types = tuple(preview_priority + [p for p in all_plugins if p not in preview_priority])
        all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)

        # The first preferred type that actually produced output becomes the preview.
        best_result = {"path": "about:blank", "result": None}
        for result_type in preferred_types:
            if result_type in archiveresults:
                best_result = archiveresults[result_type]
                break

        related_snapshots_qs = SnapshotView.find_snapshots_for_url(snapshot.url)
        related_snapshots = list(related_snapshots_qs.exclude(id=snapshot.id).order_by("-bookmarked_at", "-created_at", "-timestamp")[:25])
        related_years_map: dict[int, list[Snapshot]] = {}
        for snap in [snapshot, *related_snapshots]:
            snap_dt = snap.bookmarked_at or snap.created_at or snap.downloaded_at
            if not snap_dt:
                continue
            related_years_map.setdefault(snap_dt.year, []).append(snap)
        related_years = []
        for year, snaps in related_years_map.items():
            snaps_sorted = sorted(
                snaps,
                key=lambda s: s.bookmarked_at or s.created_at or s.downloaded_at or timezone.now(),
                reverse=True,
            )
            related_years.append(
                {
                    "year": year,
                    "latest": snaps_sorted[0],
                    "snapshots": snaps_sorted,
                },
            )
        related_years.sort(key=lambda item: item["year"], reverse=True)

        # First hashed file under warc/ whose name contains ".warc", else the warc/ dir itself.
        warc_path = next(
            (rel_path for rel_path in hash_index if rel_path.startswith("warc/") and ".warc" in Path(rel_path).name),
            "warc/",
        )

        ordered_outputs = sorted(
            archiveresults.values(),
            key=lambda r: all_types.index(r["name"]) if r["name"] in all_types else -r["size"],
        )
        non_compact_outputs = [out for out in ordered_outputs if not out.get("is_compact") and not out.get("is_metadata")]
        compact_outputs = [out for out in ordered_outputs if out.get("is_compact") or out.get("is_metadata")]
        tag_widget = TagEditorWidget()
        output_size = sum(int(out.get("size") or 0) for out in ordered_outputs)
        is_archived = bool(ordered_outputs or snapshot.downloaded_at or snapshot.status == Snapshot.StatusChoices.SEALED)

        context = {
            "id": str(snapshot.id),
            "snapshot_id": str(snapshot.id),
            "url": snapshot.url,
            "archive_path": snapshot.archive_path_from_db,
            "title": htmlencode(snapshot.resolved_title or (snapshot.base_url if is_archived else TITLE_LOADING_MSG)),
            "extension": snapshot.extension or "html",
            "tags": snapshot.tags_str() or "untagged",
            "size": printable_filesize(output_size) if output_size else "pending",
            "status": "archived" if is_archived else "not yet archived",
            "status_color": "success" if is_archived else "danger",
            "bookmarked_date": snapshot.bookmarked_date,
            "downloaded_datestr": snapshot.downloaded_datestr,
            "num_outputs": snapshot.num_outputs,
            "num_failures": snapshot.num_failures,
            "oldest_archive_date": ts_to_date_str(snapshot.oldest_archive_date),
            "warc_path": warc_path,
            "PREVIEW_ORIGINALS": SERVER_CONFIG.PREVIEW_ORIGINALS,
            "archiveresults": [*non_compact_outputs, *compact_outputs],
            "best_result": best_result,
            "snapshot": snapshot,  # Pass the snapshot object for template tags
            "related_snapshots": related_snapshots,
            "related_years": related_years,
            "loose_items": loose_items,
            "failed_items": failed_items,
            "title_tags": [{"name": tag.name, "style": tag_widget._tag_style(tag.name)} for tag in snapshot.tags.all().order_by("name")],
        }
        return render(template_name="core/snapshot.html", request=request, context=context)

    def get(self, request, path):
        """Serve, redirect to, or report on the snapshot addressed by ``path``.

        ``path`` is split at the first ``/`` into a slug and a file path
        (defaulting to ``index.html``). A numeric slug is looked up by
        timestamp / id prefix; anything else is treated as a URL and redirected
        to the matching snapshot's index. Lookup failures and ambiguous matches
        produce small HTML 404 pages.
        """
        if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
            return _admin_login_redirect_or_forbidden(request)

        snapshot = None

        slug, separator, remainder = path.partition("/")
        archivefile = remainder if separator else "index.html"

        # slug is a timestamp
        if slug.replace(".", "").isdigit():
            # missing trailing slash -> redirect to index
            if "/" not in path:
                return redirect(f"{path}/index.html")

            try:
                try:
                    snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
                    canonical_base = snapshot.url_path
                    if canonical_base != snapshot.legacy_archive_path:
                        target_path = f"/{canonical_base}/{archivefile or 'index.html'}"
                        query = request.META.get("QUERY_STRING")
                        if query:
                            target_path = f"{target_path}?{query}"
                        return redirect(target_path)

                    if request.GET.get("files"):
                        target_path = _files_index_target(snapshot, archivefile)
                        response = serve_static_with_byterange_support(
                            request,
                            target_path,
                            document_root=snapshot.output_dir,
                            show_indexes=True,
                            is_archive_replay=True,
                        )
                    elif archivefile == "index.html":
                        # if they requested snapshot index, serve live rendered template instead of static html
                        response = self.render_live_index(request, snapshot)
                    else:
                        target = build_snapshot_url(str(snapshot.id), archivefile, request=request)
                        query = request.META.get("QUERY_STRING")
                        if query:
                            target = f"{target}?{query}"
                        return redirect(target)
                    response["Link"] = f'<{snapshot.url}>; rel="canonical"'
                    return response
                except Snapshot.DoesNotExist:
                    # A timestamp prefix that matches existing snapshots is reported
                    # as ambiguous rather than missing.
                    if Snapshot.objects.filter(timestamp__startswith=slug).exists():
                        raise Snapshot.MultipleObjectsReturned
                    else:
                        raise
            except Snapshot.DoesNotExist:
                # Snapshot does not exist
                return HttpResponse(
                    format_html(
                        (
                            "<center><br/><br/><br/>"
                            "No Snapshot directories match the given timestamp/ID: <code>{}</code><br/><br/>"
                            'You can <a href="/add/" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>'
                            "</center>"
                        ),
                        slug,
                    ),
                    content_type="text/html",
                    status=404,
                )
            except Snapshot.MultipleObjectsReturned:
                snapshot_hrefs = mark_safe("<br/>").join(
                    format_html(
                        '{} <a href="/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
                        snap.bookmarked_at.strftime("%Y-%m-%d %H:%M:%S"),
                        snap.archive_path,
                        snap.timestamp,
                        snap.url,
                        snap.title_stripped[:64] or "",
                    )
                    for snap in Snapshot.objects.filter(timestamp__startswith=slug)
                    .only("url", "timestamp", "title", "bookmarked_at")
                    .order_by("-bookmarked_at")
                )
                return HttpResponse(
                    format_html(
                        ("Multiple Snapshots match the given timestamp/ID <code>{}</code><br/><pre>"),
                        slug,
                    )
                    + snapshot_hrefs
                    # Static markup: mark_safe instead of format_html, which must not be
                    # called without arguments on current Django versions.
                    + mark_safe('</pre><br/>Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'),
                    content_type="text/html",
                    status=404,
                )
            except Http404:
                if snapshot is None:
                    # Snapshot.DoesNotExist is handled above, so a 404 without a
                    # resolved snapshot is unexpected; let it propagate unchanged.
                    raise

                # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
                # Every dynamic value is passed as a format_html argument so it is
                # HTML-escaped and cannot be mistaken for a "{}" placeholder.
                queued_at = str(snapshot.bookmarked_at).split(".")[0]
                has_archivefile = str(archivefile) != "None"
                not_saved_template = (
                    "<html><head>"
                    "<title>Snapshot Not Found</title>"
                    "</head><body>"
                    "<center><br/><br/><br/>"
                    'Snapshot <a href="/{archive_path}/index.html" target="_top"><b><code>[{timestamp}]</code></b></a>: <a href="{url}" target="_blank" rel="noreferrer">{url}</a><br/>'
                    "was queued on {queued_at}, "
                    'but no files have been saved yet in:<br/><b><a href="/{archive_path}/" target="_top"><code>{timestamp}</code></a><code>/'
                    "{archivefile}"
                    "</code></b><br/><br/>"
                    "It's possible {reason} "
                    "during the last capture on {queued_at},<br/>or that the archiving process has not completed yet.<br/>"
                    "<pre><code># run this cmd to finish/retry archiving this Snapshot</code><br/>"
                    '<code style="user-select: all; color: #333">archivebox update -t timestamp {timestamp}</code></pre><br/><br/>'
                    '<div class="text-align: left; width: 100%; max-width: 400px">'
                    "<i><b>Next steps:</i></b><br/>"
                    '- list all the <a href="/{archive_path}/" target="_top">Snapshot files <code>.*</code></a><br/>'
                    '- view the <a href="/{archive_path}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
                    '- go to the <a href="/admin/core/snapshot/{pk}/change/" target="_top">Snapshot admin</a> to edit<br/>'
                    '- go to the <a href="/admin/core/snapshot/?id__exact={snapshot_id}" target="_top">Snapshot actions</a> to re-archive<br/>'
                    '- or return to <a href="/" target="_top">the main index...</a></div>'
                    "</center>"
                    "</body></html>"
                )
                return HttpResponse(
                    format_html(
                        not_saved_template,
                        archive_path=snapshot.archive_path,
                        timestamp=snapshot.timestamp,
                        url=snapshot.url,
                        queued_at=queued_at,
                        archivefile=archivefile if has_archivefile else "",
                        reason=(
                            f"the {archivefile} resource could not be fetched"
                            if has_archivefile
                            else "the original site was not available"
                        ),
                        pk=snapshot.pk,
                        snapshot_id=snapshot.id,
                    ),
                    content_type="text/html",
                    status=404,
                )

        # slug is a URL
        try:
            snapshot = SnapshotView.find_snapshots_for_url(path).get()
        except Snapshot.DoesNotExist:
            return HttpResponse(
                format_html(
                    (
                        "<center><br/><br/><br/>"
                        "No Snapshots match the given url: <code>{}</code><br/><br/><br/>"
                        'Return to the <a href="/" target="_top">Main Index</a>, or:<br/><br/>'
                        '+ <i><a href="/add/?url={}" target="_top">Add a new Snapshot for <code>{}</code></a><br/><br/></i>'
                        "</center>"
                    ),
                    base_url(path),
                    path if "://" in path else f"https://{path}",
                    path,
                ),
                content_type="text/html",
                status=404,
            )
        except Snapshot.MultipleObjectsReturned:
            snapshots = SnapshotView.find_snapshots_for_url(path)
            snapshot_hrefs = mark_safe("<br/>").join(
                format_html(
                    '{} <code style="font-size: 0.8em">{}</code> <a href="/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
                    snap.bookmarked_at.strftime("%Y-%m-%d %H:%M:%S"),
                    str(snap.id)[:8],
                    snap.archive_path,
                    snap.timestamp,
                    snap.url,
                    snap.title_stripped[:64] or "",
                )
                for snap in snapshots.only("url", "timestamp", "title", "bookmarked_at").order_by("-bookmarked_at")
            )
            return HttpResponse(
                format_html(
                    ("Multiple Snapshots match the given URL <code>{}</code><br/><pre>"),
                    base_url(path),
                )
                + snapshot_hrefs
                # Static markup: see the note on the timestamp branch above.
                + mark_safe('</pre><br/>Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'),
                content_type="text/html",
                status=404,
            )

        target_path = f"/{snapshot.archive_path}/index.html"
        query = request.META.get("QUERY_STRING")
        if query:
            target_path = f"{target_path}?{query}"
        return redirect(target_path)
|
|
|
|
|
|
class SnapshotPathView(View):
    """Serve snapshots by the new URL scheme: /<username>/<YYYYMMDD>/<domain>/<uuid>/..."""

    @staticmethod
    def _with_query_string(request, target: str) -> str:
        """Return ``target`` with the request's raw query string appended, if there is one."""
        query = request.META.get("QUERY_STRING")
        return f"{target}?{query}" if query else target

    def get(
        self,
        request,
        username: str,
        date: str | None = None,
        domain: str | None = None,
        snapshot_id: str | None = None,
        path: str = "",
        url: str | None = None,
    ):
        """Locate the snapshot addressed by the URL parts and serve or redirect to it.

        With ``snapshot_id`` the snapshot is looked up by primary key, then by id
        prefix. Without it, the newest snapshot of the given user is chosen,
        optionally narrowed by ``date`` (``YYYY``, ``YYYYMM`` or ``YYYYMMDD``)
        and by URL or domain. Requests whose path is not the snapshot's
        canonical ``url_path`` are redirected there; ``?files=...`` requests get
        a static directory/file listing; ``index.html`` renders the live detail
        page; any other file is redirected to ``build_snapshot_url(...)``.
        """
        if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
            return _admin_login_redirect_or_forbidden(request)

        if username == "system":
            # Keep the query string (e.g. ?files=1) across the rename, like every
            # other redirect issued by this view.
            return redirect(self._with_query_string(request, request.path.replace("/system/", "/web/", 1)))

        if date and domain and domain == date:
            raise Http404

        requested_url = url
        if not requested_url and domain and domain.startswith(("http://", "https://")):
            requested_url = domain

        snapshot = None
        if snapshot_id:
            try:
                snapshot = Snapshot.objects.get(pk=snapshot_id)
            except Snapshot.DoesNotExist:
                try:
                    snapshot = Snapshot.objects.get(id__startswith=snapshot_id)
                except Snapshot.DoesNotExist:
                    snapshot = None
                except Snapshot.MultipleObjectsReturned:
                    snapshot = Snapshot.objects.filter(id__startswith=snapshot_id).first()
        else:
            # fuzzy lookup by date + domain/url (most recent)
            username_lookup = "system" if username == "web" else username
            if requested_url:
                qs = SnapshotView.find_snapshots_for_url(requested_url).filter(crawl__created_by__username=username_lookup)
            else:
                qs = Snapshot.objects.filter(crawl__created_by__username=username_lookup)

            if date:
                try:
                    if len(date) == 4:
                        qs = qs.filter(created_at__year=int(date))
                    elif len(date) == 6:
                        qs = qs.filter(created_at__year=int(date[:4]), created_at__month=int(date[4:6]))
                    elif len(date) == 8:
                        qs = qs.filter(
                            created_at__year=int(date[:4]),
                            created_at__month=int(date[4:6]),
                            created_at__day=int(date[6:8]),
                        )
                except ValueError:
                    # Deliberately best-effort: a non-numeric date segment simply
                    # leaves the queryset unfiltered by date.
                    pass

            if requested_url:
                snapshot = qs.order_by("-created_at", "-bookmarked_at", "-timestamp").first()
            else:
                requested_domain = domain or ""
                if requested_domain.startswith(("http://", "https://")):
                    requested_domain = Snapshot.extract_domain_from_url(requested_domain)
                else:
                    requested_domain = Snapshot.extract_domain_from_url(f"https://{requested_domain}")

                # Prefer exact domain matches; stop at the first (newest) one instead
                # of materialising every snapshot of the user.
                snapshot = next(
                    (
                        candidate
                        for candidate in qs.order_by("-created_at", "-bookmarked_at").iterator()
                        if Snapshot.extract_domain_from_url(candidate.url) == requested_domain
                    ),
                    None,
                )
                if snapshot is None:
                    snapshot = qs.order_by("-created_at", "-bookmarked_at", "-timestamp").first()

        if not snapshot:
            return HttpResponse(
                format_html(
                    (
                        "<center><br/><br/><br/>"
                        "No Snapshots match the given id or url: <code>{}</code><br/><br/><br/>"
                        'Return to the <a href="/" target="_top">Main Index</a>'
                        "</center>"
                    ),
                    snapshot_id or requested_url or domain,
                ),
                content_type="text/html",
                status=404,
            )

        # Redirect any non-canonical spelling of the snapshot path to the canonical one.
        canonical_base = snapshot.url_path
        if date:
            requested_base = f"{username}/{date}/{domain or url or ''}"
        else:
            requested_base = f"{username}/{domain or url or ''}"
        if snapshot_id:
            requested_base = f"{requested_base}/{snapshot_id}"
        if canonical_base != requested_base:
            return redirect(self._with_query_string(request, f"/{canonical_base}/{path or 'index.html'}"))

        archivefile = path or "index.html"

        if request.GET.get("files"):
            target_path = _files_index_target(snapshot, archivefile)
            return serve_static_with_byterange_support(
                request,
                target_path,
                document_root=snapshot.output_dir,
                show_indexes=True,
                is_archive_replay=True,
            )

        if archivefile != "index.html":
            # Individual files are served from the snapshot URL built by build_snapshot_url.
            target = build_snapshot_url(str(snapshot.id), archivefile, request=request)
            return redirect(self._with_query_string(request, target))

        return SnapshotView.render_live_index(request, snapshot)
|
|
|
|
|
|
def _safe_archive_relpath(path: str) -> str | None:
|
|
if not path:
|
|
return ""
|
|
cleaned = posixpath.normpath(path)
|
|
cleaned = cleaned.lstrip("/")
|
|
if cleaned.startswith("..") or "/../" in f"/{cleaned}/":
|
|
return None
|
|
return cleaned
|
|
|
|
|
|
def _coerce_sort_timestamp(value: str | float | None) -> float:
|
|
if value is None:
|
|
return 0.0
|
|
try:
|
|
return float(value)
|
|
except (TypeError, ValueError):
|
|
return 0.0
|
|
|
|
|
|
def _snapshot_sort_key(match_path: str, cache: dict[str, float]) -> tuple[float, str]:
|
|
parts = Path(match_path).parts
|
|
date_str = ""
|
|
snapshot_id = ""
|
|
try:
|
|
idx = parts.index("snapshots")
|
|
date_str = parts[idx + 1]
|
|
snapshot_id = parts[idx + 3]
|
|
except Exception:
|
|
return (_coerce_sort_timestamp(date_str), match_path)
|
|
|
|
if snapshot_id not in cache:
|
|
snapshot = Snapshot.objects.filter(id=snapshot_id).only("bookmarked_at", "created_at", "downloaded_at", "timestamp").first()
|
|
if snapshot:
|
|
snap_dt = snapshot.bookmarked_at or snapshot.created_at or snapshot.downloaded_at
|
|
cache[snapshot_id] = snap_dt.timestamp() if snap_dt else _coerce_sort_timestamp(snapshot.timestamp)
|
|
else:
|
|
cache[snapshot_id] = _coerce_sort_timestamp(date_str)
|
|
|
|
return (cache[snapshot_id], match_path)
|
|
|
|
|
|
def _latest_response_match(domain: str, rel_path: str) -> tuple[Path, Path] | None:
|
|
if not domain or not rel_path:
|
|
return None
|
|
domain = domain.split(":", 1)[0].lower()
|
|
# TODO: optimize by querying output_files in DB instead of globbing filesystem
|
|
data_root = DATA_DIR / "users"
|
|
escaped_domain = escape(domain)
|
|
escaped_path = escape(rel_path)
|
|
pattern = str(data_root / "*" / "snapshots" / "*" / escaped_domain / "*" / "responses" / escaped_domain / escaped_path)
|
|
matches = glob(pattern)
|
|
if not matches:
|
|
return None
|
|
|
|
sort_cache: dict[str, float] = {}
|
|
best = max(matches, key=lambda match_path: _snapshot_sort_key(match_path, sort_cache))
|
|
best_path = Path(best)
|
|
parts = best_path.parts
|
|
try:
|
|
responses_idx = parts.index("responses")
|
|
except ValueError:
|
|
return None
|
|
responses_root = Path(*parts[: responses_idx + 1])
|
|
rel_to_root = Path(*parts[responses_idx + 1 :])
|
|
return responses_root, rel_to_root
|
|
|
|
|
|
def _latest_responses_root(domain: str) -> Path | None:
|
|
if not domain:
|
|
return None
|
|
domain = domain.split(":", 1)[0].lower()
|
|
data_root = DATA_DIR / "users"
|
|
escaped_domain = escape(domain)
|
|
pattern = str(data_root / "*" / "snapshots" / "*" / escaped_domain / "*" / "responses" / escaped_domain)
|
|
matches = glob(pattern)
|
|
if not matches:
|
|
return None
|
|
|
|
sort_cache: dict[str, float] = {}
|
|
best = max(matches, key=lambda match_path: _snapshot_sort_key(match_path, sort_cache))
|
|
return Path(best)
|
|
|
|
|
|
def _latest_snapshot_for_domain(domain: str) -> Snapshot | None:
    """Return the newest snapshot whose URL's domain equals ``domain``, or ``None``.

    The port suffix is dropped and the domain lower-cased before candidates are
    fetched via ``SnapshotView.find_snapshots_for_url`` (newest first) and
    compared on their extracted domain.
    """
    if not domain:
        return None

    wanted = domain.split(":", 1)[0].lower()
    newest_first = SnapshotView.find_snapshots_for_url(f"https://{wanted}").order_by("-created_at", "-bookmarked_at", "-timestamp")
    return next(
        (candidate for candidate in newest_first if Snapshot.extract_domain_from_url(candidate.url).lower() == wanted),
        None,
    )
|
|
|
|
|
|
def _original_request_url(domain: str, path: str = "", query_string: str = "") -> str:
|
|
normalized_domain = (domain or "").split(":", 1)[0].lower()
|
|
normalized_path = (path or "").lstrip("/")
|
|
if normalized_path in ("", "index.html"):
|
|
normalized_path = ""
|
|
target = f"https://{normalized_domain}"
|
|
if normalized_path:
|
|
target = f"{target}/{normalized_path}"
|
|
if query_string:
|
|
target = f"{target}?{query_string}"
|
|
return target
|
|
|
|
|
|
def _serve_responses_path(request, responses_root: Path, rel_path: str, show_indexes: bool):
    """Try to serve ``rel_path`` from a saved-responses directory.

    Candidates are tried in order and the first one that does not raise
    ``Http404`` is returned:

    1. ``<rel_path>/index.html`` when the last path segment has no dot,
    2. ``rel_path`` itself (with ``index.html`` appended if it ended in ``/``),
    3. if that path ends in ``index.html``, a directory listing of what
       precedes it (always with ``show_indexes=True``).

    Returns ``None`` when every attempt 404s.
    """
    wanted = rel_path or ""
    if wanted.endswith("/"):
        wanted = f"{wanted}index.html"

    attempts: list[str] = [wanted]
    if "." not in Path(wanted).name:
        # Extension-less names may be directories: try their index page first.
        attempts.insert(0, f"{wanted.rstrip('/')}/index.html")

    root = str(responses_root)
    for attempt in attempts:
        try:
            return serve_static_with_byterange_support(
                request,
                attempt,
                document_root=root,
                show_indexes=show_indexes,
                is_archive_replay=True,
            )
        except Http404:
            continue

    if not wanted.endswith("index.html"):
        return None

    # Last resort for index requests: list the containing directory.
    containing_dir = wanted[: -len("index.html")]
    try:
        return serve_static_with_byterange_support(
            request,
            containing_dir,
            document_root=root,
            show_indexes=True,
            is_archive_replay=True,
        )
    except Http404:
        return None
|
|
|
|
|
|
def _serve_snapshot_replay(request: HttpRequest, snapshot: Snapshot, path: str = ""):
    """Serve ``path`` from a snapshot's output directory for replay.

    An index request (empty path or ``index.html``) without directory listing
    renders the live detail page. Otherwise the sanitised path is served from
    ``snapshot.output_dir`` and, if that 404s, from the snapshot's
    ``responses/<host>`` directory. Raises ``Http404`` when the path is unsafe
    or nothing can be served.
    """
    requested = path or ""
    wants_directory = bool(path) and path.endswith("/")
    show_indexes = bool(request.GET.get("files")) or (SERVER_CONFIG.USES_SUBDOMAIN_ROUTING and wants_directory)

    if requested in ("", "index.html") and not show_indexes:
        return SnapshotView.render_live_index(request, snapshot)

    if requested == "" or requested.endswith("/"):
        # Directory-style request: list it when allowed, else fall back to its index page.
        requested = requested.rstrip("/") if show_indexes else f"{requested}index.html"

    safe_path = _safe_archive_relpath(requested)
    if safe_path is None:
        raise Http404

    try:
        return serve_static_with_byterange_support(
            request,
            safe_path,
            document_root=snapshot.output_dir,
            show_indexes=show_indexes,
            is_archive_replay=True,
        )
    except Http404:
        pass

    # Not a direct output file: look in the saved responses for the snapshot's own host.
    host = urlparse(snapshot.url).hostname or snapshot.domain
    responses_root = Path(snapshot.output_dir) / "responses" / host
    if responses_root.exists():
        fallback = _serve_responses_path(request, responses_root, safe_path, show_indexes)
        if fallback is not None:
            return fallback

    raise Http404
|
|
|
|
|
|
def _serve_original_domain_replay(request: HttpRequest, domain: str, path: str = ""):
    """Serve ``path`` for an original ``domain`` from the newest saved responses.

    Resolution order:

    1. the newest response file matching the path (also trying
       ``<path>/index.html`` and ``<path>.html`` for extension-less names),
    2. the newest ``responses/<domain>`` directory as a whole,
    3. for root/index/directory requests without ``?files``, the live detail
       page of the newest snapshot of that domain,
    4. a redirect to ``/web/<original url>`` when the add view is public or the
       user is authenticated.

    Raises ``Http404`` if the path is unsafe or none of the above applies.
    """
    is_index_like = path in ("", "index.html") or path.endswith("/")

    wanted = path or ""
    if wanted == "" or wanted.endswith("/"):
        wanted = f"{wanted}index.html"
    wanted = _safe_archive_relpath(wanted)
    if wanted is None:
        raise Http404

    domain = domain.lower()
    match = _latest_response_match(domain, wanted)
    if "." not in Path(wanted).name:
        # Extension-less path: it may be a directory or an .html page.
        for alternative in (f"{wanted.rstrip('/')}/index.html", f"{wanted}.html"):
            if match:
                break
            match = _latest_response_match(domain, alternative)

    show_indexes = bool(request.GET.get("files"))
    if match:
        matched_root, matched_rel = match
        served = _serve_responses_path(request, matched_root, str(matched_rel), show_indexes)
        if served is not None:
            return served

    newest_root = _latest_responses_root(domain)
    if newest_root:
        served = _serve_responses_path(request, newest_root, wanted, show_indexes)
        if served is not None:
            return served

    if is_index_like and not show_indexes:
        snapshot = _latest_snapshot_for_domain(domain)
        if snapshot:
            return SnapshotView.render_live_index(request, snapshot)

    if SERVER_CONFIG.PUBLIC_ADD_VIEW or request.user.is_authenticated:
        original_url = _original_request_url(domain, path, request.META.get("QUERY_STRING", ""))
        return redirect(build_web_url(f"/web/{quote(original_url, safe=':/')}"))

    raise Http404
|
|
|
|
|
|
class SnapshotHostView(View):
    """Serve snapshot directory contents on <snapshot-subdomain>.<listen_host>/<path>."""

    def get(self, request, snapshot_id: str, path: str = ""):
        """Serve ``path`` from the referenced snapshot, redirecting to its canonical host first if needed."""
        may_view = request.user.is_authenticated or SERVER_CONFIG.PUBLIC_SNAPSHOTS
        if not may_view:
            return _admin_login_redirect_or_forbidden(request)

        snapshot = _find_snapshot_by_ref(snapshot_id)
        if not snapshot:
            raise Http404

        canonical_host = get_snapshot_host(str(snapshot.id))
        if host_matches(request.get_host(), canonical_host):
            return _serve_snapshot_replay(request, snapshot, path)

        # Wrong host for this snapshot: bounce to the canonical URL, keeping the query string.
        destination = build_snapshot_url(str(snapshot.id), path, request=request)
        raw_query = request.META.get("QUERY_STRING")
        if raw_query:
            destination = f"{destination}?{raw_query}"
        return redirect(destination)
|
|
|
|
|
|
class SnapshotReplayView(View):
    """Serve snapshot directory contents on a one-domain replay path."""

    def get(self, request, snapshot_id: str, path: str = ""):
        """Serve ``path`` from the referenced snapshot; 404 if the reference matches nothing."""
        may_view = request.user.is_authenticated or SERVER_CONFIG.PUBLIC_SNAPSHOTS
        if not may_view:
            return _admin_login_redirect_or_forbidden(request)

        snapshot = _find_snapshot_by_ref(snapshot_id)
        if snapshot:
            return _serve_snapshot_replay(request, snapshot, path)
        raise Http404
|
|
|
|
|
|
class OriginalDomainHostView(View):
    """Serve responses from the most recent snapshot when using <domain>.<listen_host>/<path>."""

    def get(self, request, domain: str, path: str = ""):
        """Replay ``path`` for ``domain`` if the visitor may view snapshots."""
        if request.user.is_authenticated or SERVER_CONFIG.PUBLIC_SNAPSHOTS:
            return _serve_original_domain_replay(request, domain, path)
        return _admin_login_redirect_or_forbidden(request)
|
|
|
|
|
|
class OriginalDomainReplayView(View):
    """Serve original-domain replay content on a one-domain replay path."""

    def get(self, request, domain: str, path: str = ""):
        """Replay ``path`` for ``domain`` if the visitor may view snapshots."""
        if request.user.is_authenticated or SERVER_CONFIG.PUBLIC_SNAPSHOTS:
            return _serve_original_domain_replay(request, domain, path)
        return _admin_login_redirect_or_forbidden(request)
|
|
|
|
|
|
class PublicIndexView(ListView):
    """Paginated, searchable snapshot listing shown to anonymous visitors."""

    template_name = "public_index.html"
    model = Snapshot
    paginate_by = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
    ordering = ["-bookmarked_at", "-created_at"]

    def get_context_data(self, **kwargs):
        """Extend the list context with version/footer info and the active search mode."""
        context = super().get_context_data(**kwargs)
        extra = {
            "VERSION": VERSION,
            "COMMIT_HASH": SHELL_CONFIG.COMMIT_HASH,
            "FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO,
            "search_mode": get_search_mode(self.request.GET.get("search_mode")),
        }
        return {**context, **extra}

    def get_queryset(self, **kwargs):
        """Filter snapshots by the ``q`` / ``query_type`` / ``search_mode`` GET parameters.

        With no query the full (distinct) queryset is returned. An unknown
        ``query_type`` is logged and leaves the queryset unfiltered. Search
        backend failures are logged and degrade to metadata-only matches
        ("all") or to an empty result ("fulltext").
        """
        base_qs = super().get_queryset(**kwargs)
        params = self.request.GET
        term = params.get("q", default="").strip()
        if not term:
            return base_qs.distinct()

        kind = params.get("query_type")
        mode = get_search_mode(params.get("search_mode"))

        # Match against every metadata column at once (used by "all" and "meta").
        metadata_filter = (
            Q(title__icontains=term) | Q(url__icontains=term) | Q(timestamp__icontains=term) | Q(tags__name__icontains=term)
        )
        # query_type values that map onto exactly one column lookup.
        single_field_lookups = {
            "url": "url__icontains",
            "title": "title__icontains",
            "timestamp": "timestamp__icontains",
            "tags": "tags__name__icontains",
        }

        if not kind or kind == "all":
            metadata_matches = base_qs.filter(metadata_filter)
            results = metadata_matches
            if mode != "meta":
                try:
                    results = prioritize_metadata_matches(
                        base_qs,
                        metadata_matches,
                        query_search_index(term, search_mode=mode),
                        ordering=self.ordering,
                    )
                except Exception as err:
                    print(f"[!] Error while using search backend: {err.__class__.__name__} {err}")
                    results = metadata_matches
        elif kind == "fulltext":
            results = base_qs.none()
            if mode != "meta":
                try:
                    results = query_search_index(term, search_mode=mode).filter(pk__in=base_qs.values("pk"))
                except Exception as err:
                    print(f"[!] Error while using search backend: {err.__class__.__name__} {err}")
                    results = base_qs.none()
        elif kind == "meta":
            results = base_qs.filter(metadata_filter)
        elif kind in single_field_lookups:
            results = base_qs.filter(Q(**{single_field_lookups[kind]: term}))
        else:
            print(f'[!] Unknown value for query_type: "{kind}"')
            results = base_qs

        return results.distinct()

    def get(self, *args, **kwargs):
        """Send logged-in users to the admin list; show the index only if it is public."""
        if self.request.user.is_authenticated:
            return redirect("/admin/core/snapshot/")
        if not SERVER_CONFIG.PUBLIC_INDEX:
            return _admin_login_redirect_or_forbidden(self.request)
        return super().get(*args, **kwargs)
|
|
|
|
|
|
@method_decorator(csrf_exempt, name="dispatch")
class AddView(UserPassesTestMixin, FormView):
    """Form view that turns submitted URLs into a new Crawl (plus an optional CrawlSchedule)."""

    # NOTE(review): dispatch is CSRF-exempt for this view; presumably so external
    # clients can POST here — confirm this is intended.
    template_name = "add.html"
    form_class = AddLinkForm

    def get_initial(self):
        """Prefill the AddLinkForm with the 'url' GET parameter"""
        if self.request.method == "GET":
            url = self.request.GET.get("url", None)
            if url:
                # Bare hosts/paths without any scheme default to https.
                return {"url": url if "://" in url else f"https://{url}"}

        return super().get_initial()

    def test_func(self):
        """Allow access when the add view is public or the user is logged in."""
        return SERVER_CONFIG.PUBLIC_ADD_VIEW or self.request.user.is_authenticated

    def _can_override_crawl_config(self) -> bool:
        """Return True only for authenticated staff or superusers."""
        user = self.request.user
        return bool(user.is_authenticated and (getattr(user, "is_superuser", False) or getattr(user, "is_staff", False)))

    def _get_custom_config_overrides(self, form: AddLinkForm) -> dict:
        """Return the form's custom ``config`` dict, or ``{}`` if it is not a dict or the user may not override config."""
        custom_config = form.cleaned_data.get("config") or {}

        if not isinstance(custom_config, dict):
            return {}

        if not self._can_override_crawl_config():
            return {}

        return custom_config

    def get_context_data(self, **kwargs):
        """Add page title, version/footer info and the plugin dependency map to the template context."""
        required_search_plugin = f"search_backend_{SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}".strip()
        plugin_configs = discover_plugin_configs()
        # plugin name -> non-empty, stripped names of the plugins it declares as required
        plugin_dependency_map = {
            plugin_name: [
                str(required_plugin).strip() for required_plugin in (schema.get("required_plugins") or []) if str(required_plugin).strip()
            ]
            for plugin_name, schema in plugin_configs.items()
            if isinstance(schema.get("required_plugins"), list) and schema.get("required_plugins")
        }
        return {
            **super().get_context_data(**kwargs),
            "title": "Create Crawl",
            # We can't just call request.build_absolute_uri in the template, because it would include query parameters
            "absolute_add_path": self.request.build_absolute_uri(self.request.path),
            "VERSION": VERSION,
            "FOOTER_INFO": SERVER_CONFIG.FOOTER_INFO,
            "required_search_plugin": required_search_plugin,
            "plugin_dependency_map_json": json.dumps(plugin_dependency_map, sort_keys=True),
            "stdout": "",
        }

    def _create_crawl_from_form(self, form, *, created_by_id=None) -> Crawl:
        """Persist the submitted URLs and create the Crawl (and schedule) they describe.

        Args:
            form: a validated AddLinkForm.
            created_by_id: owner of the new records; defaults to the logged-in
                user, or to the system user for anonymous requests.

        Returns:
            The newly created Crawl, after its snapshots were created from the
            URLs and the background runner was asked to start.
        """
        urls = form.cleaned_data["url"]
        print(f"[+] Adding URL: {urls}")

        # Extract all form fields
        tag = form.cleaned_data["tag"]
        depth = int(form.cleaned_data["depth"])
        max_urls = int(form.cleaned_data.get("max_urls") or 0)
        max_size = int(form.cleaned_data.get("max_size") or 0)
        # `or` guards keep a cleaned value of None from crashing join()/strip().
        plugins = ",".join(form.cleaned_data.get("plugins") or [])
        schedule = (form.cleaned_data.get("schedule") or "").strip()
        persona = form.cleaned_data.get("persona")
        index_only = form.cleaned_data.get("index_only", False)
        notes = form.cleaned_data.get("notes", "")
        url_filters = form.cleaned_data.get("url_filters") or {}
        custom_config = self._get_custom_config_overrides(form)

        from archivebox.config.permissions import HOSTNAME

        if created_by_id is None:
            if self.request.user.is_authenticated:
                created_by_id = self.request.user.pk
            else:
                from archivebox.base_models.models import get_or_create_system_user_pk

                created_by_id = get_or_create_system_user_pk()

        created_by_name = getattr(self.request.user, "username", "web") if self.request.user.is_authenticated else "web"

        # One timestamp for both the sources filename and the crawl label so they always agree.
        timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")

        # 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
        sources_file = CONSTANTS.SOURCES_DIR / f"{timestamp}__web_ui_add_by_user_{created_by_id}.txt"
        sources_file.parent.mkdir(parents=True, exist_ok=True)
        sources_file.write_text(urls if isinstance(urls, str) else "\n".join(urls))

        # 2. create a new Crawl with the URLs from the file
        # (read back from disk so the Crawl stores exactly what the sources file contains)
        urls_content = sources_file.read_text()
        # Build complete config
        config = {
            "INDEX_ONLY": index_only,
            "DEPTH": depth,
            "PLUGINS": plugins or "",
            "DEFAULT_PERSONA": (persona.name if persona else "Default"),
        }

        # Merge custom config overrides
        config.update(custom_config)
        # URL filters from the form are applied last, so they win over custom overrides.
        if url_filters.get("allowlist"):
            config["URL_ALLOWLIST"] = url_filters["allowlist"]
        if url_filters.get("denylist"):
            config["URL_DENYLIST"] = url_filters["denylist"]

        crawl = Crawl.objects.create(
            urls=urls_content,
            max_depth=depth,
            max_urls=max_urls,
            max_size=max_size,
            tags_str=tag,
            notes=notes,
            label=f"{created_by_name}@{HOSTNAME}{self.request.path} {timestamp}",
            created_by_id=created_by_id,
            config=config,
        )

        # 3. create a CrawlSchedule if schedule is provided
        if schedule:
            from archivebox.crawls.models import CrawlSchedule

            crawl_schedule = CrawlSchedule.objects.create(
                template=crawl,
                schedule=schedule,
                is_enabled=True,
                label=crawl.label,
                notes=f"Auto-created from add page. {notes}".strip(),
                created_by_id=created_by_id,
            )
            crawl.schedule = crawl_schedule
            crawl.save(update_fields=["schedule"])

        crawl.create_snapshots_from_urls()
        from archivebox.services.runner import ensure_background_runner

        ensure_background_runner()

        # 4. start the Orchestrator & wait until it completes
        # ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
        # from archivebox.crawls.actors import CrawlActor
        # from archivebox.core.actors import SnapshotActor, ArchiveResultActor

        return crawl

    def form_valid(self, form):
        """Create the crawl, flash a success message, and redirect to the crawl's admin page."""
        crawl = self._create_crawl_from_form(form)

        urls = form.cleaned_data["url"]
        # Accept both a newline-separated string and a list, like _create_crawl_from_form does.
        urls_text = urls if isinstance(urls, str) else "\n".join(urls)
        schedule = (form.cleaned_data.get("schedule") or "").strip()
        rough_url_count = sum(1 for line in urls_text.splitlines() if line.strip())

        # Build success message with schedule link if created.
        # `schedule` is user-submitted text, so it must be HTML-escaped: format_html
        # escapes every argument, whereas mark_safe on an f-string escaped nothing.
        schedule_msg = ""
        if schedule:
            schedule_msg = format_html(
                " and <a href='{}'>scheduled to repeat {}</a>",
                crawl.schedule.admin_change_url,
                schedule,
            )

        messages.success(
            self.request,
            format_html(
                "Created crawl with {} starting URL(s){}. Snapshots will be created and archived in the background. <a href='{}'>View Crawl →</a>",
                rough_url_count,
                schedule_msg,
                crawl.admin_change_url,
            ),
        )

        # Orchestrator (managed by supervisord) will pick up the queued crawl
        return redirect(crawl.admin_change_url)
|
|
|
|
|
|
class WebAddView(AddView):
    """URL-path variant of the add view: redirect to an existing snapshot or create one."""

    def _latest_snapshot_for_url(self, requested_url: str):
        """Return the newest snapshot matching ``requested_url``, or ``None``."""
        candidates = SnapshotView.find_snapshots_for_url(requested_url)
        newest_first = candidates.order_by("-created_at", "-bookmarked_at", "-timestamp")
        return newest_first.first()

    def _normalize_add_url(self, requested_url: str) -> str:
        """Prefix ``https://`` unless the URL already starts with an http(s) scheme."""
        has_http_scheme = requested_url.startswith(("http://", "https://"))
        return requested_url if has_http_scheme else f"https://{requested_url}"

    def dispatch(self, request, *args, **kwargs):
        """Redirect to an existing snapshot if there is one, else enforce add permissions.

        Users who fail ``test_func`` are bounced to the admin host / admin login
        depending on which host they came in on; on any other host they get a
        404 page naming the requested URL.
        """
        requested_url = urldecode(kwargs.get("url", "") or "")
        existing = self._latest_snapshot_for_url(requested_url) if requested_url else None
        if existing:
            return redirect(f"/{existing.url_path}")

        if self.test_func():
            return super().dispatch(request, *args, **kwargs)

        host = (request.get_host() or "").lower()
        if host_matches(host, get_web_host()):
            return redirect(build_admin_url(request.get_full_path(), request=request))
        if host_matches(host, get_admin_host()):
            next_url = quote(request.get_full_path(), safe="/:?=&")
            login_url = build_admin_url("/admin/login/", request=request)
            return redirect(f"{login_url}?next={next_url}")

        not_found_html = format_html(
            (
                "<center><br/><br/><br/>"
                "No Snapshots match the given url: <code>{}</code><br/><br/><br/>"
                'Return to the <a href="/" target="_top">Main Index</a>'
                "</center>"
            ),
            requested_url or "",
        )
        return HttpResponse(not_found_html, content_type="text/html", status=404)

    def get(self, request: HttpRequest, *args: object, **kwargs: object):
        """Redirect to the latest snapshot of the URL, creating a crawl for it first if needed.

        Raises ``Http404`` when no URL was supplied. The crawl is created from
        the form's default field values plus the normalized URL.
        """
        raw_url = kwargs.get("url") or (args[0] if args else "")
        requested_url = urldecode(str(raw_url))
        if not requested_url:
            raise Http404

        existing = self._latest_snapshot_for_url(requested_url)
        if existing:
            return redirect(f"/{existing.url_path}")

        add_url = self._normalize_add_url(requested_url)
        assert self.form_class is not None
        # An unbound form is only used to read each field's configured initial value.
        default_fields = self.form_class().fields
        form_data = {
            "url": add_url,
            "depth": default_fields["depth"].initial or "0",
            "max_urls": default_fields["max_urls"].initial or 0,
            "max_size": default_fields["max_size"].initial or "0",
            "persona": default_fields["persona"].initial or "Default",
            "config": {},
        }
        if default_fields["index_only"].initial:
            form_data["index_only"] = "on"

        bound_form = self.form_class(data=form_data)
        if not bound_form.is_valid():
            return self.form_invalid(bound_form)

        crawl = self._create_crawl_from_form(bound_form)
        new_snapshot = Snapshot.from_json(
            {"url": add_url, "tags": bound_form.cleaned_data.get("tag", "")},
            overrides={"crawl": crawl},
        )
        assert new_snapshot is not None
        return redirect(f"/{new_snapshot.url_path}")
|
|
|
|
|
|
class HealthCheckView(View):
    """
    Minimal liveness endpoint: answers with plain-text "OK" for service discovery tools
    """

    def get(self, request):
        """
        Respond to a GET request with a 200 "OK" text/plain body
        """
        ok_response = HttpResponse("OK", content_type="text/plain", status=200)
        return ok_response
|
|
|
|
|
|
def live_progress_view(request: HttpRequest) -> JsonResponse:
    """Simple JSON endpoint for live progress status - used by admin progress monitor.

    Returns a JSON document with the orchestrator state, global status counts
    for crawls/snapshots/archive results, recent thumbnails, and a nested
    crawl -> snapshot -> plugin tree for active crawls. Any exception is caught
    and reported as a 500 JSON payload with the same keys zeroed out, plus the
    error message and traceback.
    """
    try:
        from archivebox.crawls.models import Crawl
        from archivebox.core.models import Snapshot, ArchiveResult
        from archivebox.machine.models import Process, Machine

        def is_current_run_timestamp(event_ts, run_started_at) -> bool:
            # No known run start means nothing can be excluded as "old".
            if run_started_at is None:
                return True
            if event_ts is None:
                return False
            return event_ts >= run_started_at

        def archiveresult_matches_current_run(ar, run_started_at) -> bool:
            if run_started_at is None:
                return True
            # Unfinished results always count towards the current run.
            if ar.status in (
                ArchiveResult.StatusChoices.QUEUED,
                ArchiveResult.StatusChoices.STARTED,
                ArchiveResult.StatusChoices.BACKOFF,
            ):
                return True
            # Otherwise use the most specific timestamp available.
            event_ts = ar.end_ts or ar.start_ts or ar.modified_at or ar.created_at
            return is_current_run_timestamp(event_ts, run_started_at)

        def hook_details(hook_name: str, plugin: str = "setup") -> tuple[str, str, str, str]:
            # Returns (plugin, label, phase, normalized_hook_name).
            normalized_hook_name = Path(hook_name).name if hook_name else ""
            if not normalized_hook_name:
                return (plugin, plugin, "unknown", "")

            # The phase is derived from the hook filename prefix.
            phase = "unknown"
            if normalized_hook_name.startswith("on_Install__"):
                phase = "install"
            elif normalized_hook_name.startswith("on_CrawlSetup__"):
                phase = "crawl"
            elif normalized_hook_name.startswith("on_Snapshot__"):
                phase = "snapshot"
            elif normalized_hook_name.startswith("on_BinaryRequest__"):
                phase = "binary"

            # Label: drop the "on_X__" prefix, the file extension and a leading "NN_" prefix.
            label = normalized_hook_name
            if "__" in normalized_hook_name:
                label = normalized_hook_name.split("__", 1)[1]
            label = label.rsplit(".", 1)[0]
            if len(label) > 3 and label[:2].isdigit() and label[2] == "_":
                label = label[3:]
            label = label.replace("_", " ").strip() or plugin

            return (plugin, label, phase, normalized_hook_name)

        def process_label(cmd: list[str] | None) -> tuple[str, str, str, str]:
            # The first element of the command is treated as the hook script path.
            hook_path = ""
            if isinstance(cmd, list) and cmd:
                first = cmd[0]
                if isinstance(first, str):
                    hook_path = first

            if not hook_path:
                return ("", "setup", "unknown", "")

            # The plugin name is the hook file's parent directory name.
            return hook_details(Path(hook_path).name, plugin=Path(hook_path).parent.name or "setup")

        machine = Machine.current()
        # NOTE: these cleanup calls run on every request to this endpoint.
        Process.cleanup_stale_running(machine=machine)
        Process.cleanup_orphaned_workers()
        orchestrator_proc = (
            Process.objects.filter(
                machine=machine,
                process_type=Process.TypeChoices.ORCHESTRATOR,
                status=Process.StatusChoices.RUNNING,
            )
            .order_by("-started_at")
            .first()
        )
        orchestrator_running = orchestrator_proc is not None
        orchestrator_pid = orchestrator_proc.pid if orchestrator_proc else None
        # Get model counts by status
        crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
        crawls_started = Crawl.objects.filter(status=Crawl.StatusChoices.STARTED).count()

        # Get recent crawls (last 24 hours)
        from datetime import timedelta

        one_day_ago = timezone.now() - timedelta(days=1)
        crawls_recent = Crawl.objects.filter(created_at__gte=one_day_ago).count()

        snapshots_pending = Snapshot.objects.filter(status=Snapshot.StatusChoices.QUEUED).count()
        snapshots_started = Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED).count()

        archiveresults_pending = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
        archiveresults_started = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.STARTED).count()
        archiveresults_succeeded = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
        archiveresults_failed = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.FAILED).count()

        # Get recently completed ArchiveResults with thumbnails (last 20 succeeded results)
        recent_thumbnails = []
        recent_results = (
            ArchiveResult.objects.filter(
                status=ArchiveResult.StatusChoices.SUCCEEDED,
            )
            .select_related("snapshot")
            .order_by("-end_ts")[:20]
        )

        for ar in recent_results:
            embed = ar.embed_path()
            if embed:
                # Only include results with embeddable image/media files
                ext = embed.lower().split(".")[-1] if "." in embed else ""
                is_embeddable = ext in ("png", "jpg", "jpeg", "gif", "webp", "svg", "ico", "pdf", "html")
                if is_embeddable or ar.plugin in ("screenshot", "favicon", "dom"):
                    archive_path = embed or ""
                    recent_thumbnails.append(
                        {
                            "id": str(ar.id),
                            "plugin": ar.plugin,
                            "snapshot_id": str(ar.snapshot_id),
                            "snapshot_url": ar.snapshot.url[:60] if ar.snapshot else "",
                            "embed_path": embed,
                            "archive_path": archive_path,
                            "archive_url": build_snapshot_url(str(ar.snapshot_id), archive_path, request=request) if archive_path else "",
                            "end_ts": ar.end_ts.isoformat() if ar.end_ts else None,
                        },
                    )

        # Build hierarchical active crawls with nested snapshots and archive results

        running_processes = Process.objects.filter(
            machine=machine,
            status=Process.StatusChoices.RUNNING,
            process_type__in=[
                Process.TypeChoices.HOOK,
                Process.TypeChoices.BINARY,
            ],
        )
        # Hook/binary processes touched in the last 10 minutes, in any status.
        recent_processes = Process.objects.filter(
            machine=machine,
            process_type__in=[
                Process.TypeChoices.HOOK,
                Process.TypeChoices.BINARY,
            ],
            modified_at__gte=timezone.now() - timedelta(minutes=10),
        ).order_by("-modified_at")
        crawl_process_pids: dict[str, int] = {}
        snapshot_process_pids: dict[str, int] = {}
        # scope id -> list of (payload dict, process start timestamp)
        process_records_by_crawl: dict[str, list[tuple[dict[str, object], object | None]]] = {}
        process_records_by_snapshot: dict[str, list[tuple[dict[str, object], object | None]]] = {}
        seen_process_records: set[str] = set()
        # Pass 1: remember one pid per crawl / snapshot (setdefault keeps the first one seen).
        for proc in running_processes:
            env = proc.env or {}
            if not isinstance(env, dict):
                env = {}

            crawl_id = env.get("CRAWL_ID")
            snapshot_id = env.get("SNAPSHOT_ID")
            _plugin, _label, phase, _hook_name = process_label(proc.cmd)
            if crawl_id and proc.pid:
                crawl_process_pids.setdefault(str(crawl_id), proc.pid)
            if phase == "snapshot" and snapshot_id and proc.pid:
                snapshot_process_pids.setdefault(str(snapshot_id), proc.pid)

        # Pass 2: turn recent processes into display payloads, grouped by crawl or snapshot.
        for proc in recent_processes:
            env = proc.env or {}
            if not isinstance(env, dict):
                env = {}

            crawl_id = env.get("CRAWL_ID")
            snapshot_id = env.get("SNAPSHOT_ID")
            if not crawl_id and not snapshot_id:
                continue

            plugin, label, phase, hook_name = process_label(proc.cmd)

            # De-duplicate records sharing scope, plugin, label, status and exit code;
            # the queryset is newest-first, so the most recently modified one is kept.
            record_scope = str(snapshot_id) if phase == "snapshot" and snapshot_id else str(crawl_id)
            proc_key = f"{record_scope}:{plugin}:{label}:{proc.status}:{proc.exit_code}"
            if proc_key in seen_process_records:
                continue
            seen_process_records.add(proc_key)

            # Only three statuses are ever produced here: started / failed / succeeded.
            status = (
                "started"
                if proc.status == Process.StatusChoices.RUNNING
                else ("failed" if proc.exit_code not in (None, 0) else "succeeded")
            )
            payload: dict[str, object] = {
                "id": str(proc.id),
                "plugin": plugin,
                "label": label,
                "hook_name": hook_name,
                "status": status,
                "phase": phase,
                "source": "process",
                "process_id": str(proc.id),
            }
            if status == "started" and proc.pid:
                payload["pid"] = proc.pid
            proc_started_at = proc.started_at or proc.modified_at
            if phase == "snapshot" and snapshot_id:
                process_records_by_snapshot.setdefault(str(snapshot_id), []).append((payload, proc_started_at))
            elif crawl_id:
                process_records_by_crawl.setdefault(str(crawl_id), []).append((payload, proc_started_at))

        # At most 10 queued/started crawls, most recently modified first.
        active_crawls_qs = (
            Crawl.objects.filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED])
            .prefetch_related(
                "snapshot_set",
                "snapshot_set__archiveresult_set",
                "snapshot_set__archiveresult_set__process",
            )
            .distinct()
            .order_by("-modified_at")[:10]
        )

        active_crawls = []
        total_workers = 0
        for crawl in active_crawls_qs:
            # Get ALL snapshots for this crawl to count status (already prefetched)
            all_crawl_snapshots = list(crawl.snapshot_set.all())

            # Count snapshots by status from ALL snapshots
            total_snapshots = len(all_crawl_snapshots)
            completed_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED)
            started_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED)
            pending_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED)

            # Get only ACTIVE snapshots to display (limit to 5 most recent)
            # NOTE(review): this takes the first 5 in prefetch order; no explicit ordering is applied here — confirm "most recent".
            active_crawl_snapshots = [
                s for s in all_crawl_snapshots if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
            ][:5]

            # Count URLs in the crawl (for when snapshots haven't been created yet)
            urls_count = 0
            if crawl.urls:
                urls_count = len([u for u in crawl.urls.split("\n") if u.strip() and not u.startswith("#")])

            # Calculate crawl progress
            crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
            crawl_run_started_at = crawl.created_at
            # Crawl-level process records that started at or after the crawl was created.
            crawl_setup_plugins = [
                payload
                for payload, proc_started_at in process_records_by_crawl.get(str(crawl.id), [])
                if is_current_run_timestamp(proc_started_at, crawl_run_started_at)
            ]
            total_workers += sum(1 for item in crawl_setup_plugins if item.get("source") == "process" and item.get("status") == "started")
            crawl_setup_total = len(crawl_setup_plugins)
            crawl_setup_completed = sum(1 for item in crawl_setup_plugins if item.get("status") == "succeeded")
            crawl_setup_failed = sum(1 for item in crawl_setup_plugins if item.get("status") == "failed")
            # Process payloads built above never carry "queued", so this count stays 0 for them.
            crawl_setup_pending = sum(1 for item in crawl_setup_plugins if item.get("status") == "queued")

            # Get active snapshots for this crawl (already prefetched)
            active_snapshots_for_crawl = []
            for snapshot in active_crawl_snapshots:
                snapshot_run_started_at = snapshot.downloaded_at or snapshot.created_at
                # Get archive results for this snapshot (already prefetched)
                snapshot_results = [
                    ar for ar in snapshot.archiveresult_set.all() if archiveresult_matches_current_run(ar, snapshot_run_started_at)
                ]

                now = timezone.now()
                plugin_progress_values: list[int] = []
                all_plugins: list[dict[str, object]] = []
                seen_plugin_keys: set[str] = set()

                def plugin_sort_key(ar):
                    # Started first, then queued, succeeded, no-results, failed; anything else last.
                    status_order = {
                        ArchiveResult.StatusChoices.STARTED: 0,
                        ArchiveResult.StatusChoices.QUEUED: 1,
                        ArchiveResult.StatusChoices.SUCCEEDED: 2,
                        ArchiveResult.StatusChoices.NORESULTS: 3,
                        ArchiveResult.StatusChoices.FAILED: 4,
                    }
                    return (status_order.get(ar.status, 5), ar.plugin, ar.hook_name or "")

                for ar in sorted(snapshot_results, key=plugin_sort_key):
                    status = ar.status
                    progress_value = 0
                    if status in (
                        ArchiveResult.StatusChoices.SUCCEEDED,
                        ArchiveResult.StatusChoices.FAILED,
                        ArchiveResult.StatusChoices.SKIPPED,
                        ArchiveResult.StatusChoices.NORESULTS,
                    ):
                        progress_value = 100
                    elif status == ArchiveResult.StatusChoices.STARTED:
                        # Estimate progress as elapsed time over the timeout, clamped to 1..99.
                        started_at = ar.start_ts or (ar.process.started_at if ar.process_id and ar.process else None)
                        timeout = ar.timeout or 120
                        if started_at and timeout:
                            elapsed = max(0.0, (now - started_at).total_seconds())
                            progress_value = int(min(99, max(1, (elapsed / float(timeout)) * 100)))
                        else:
                            progress_value = 1
                    else:
                        progress_value = 0

                    plugin_progress_values.append(progress_value)
                    plugin, label, phase, hook_name = hook_details(ar.hook_name or ar.plugin, plugin=ar.plugin)

                    plugin_payload = {
                        "id": str(ar.id),
                        "plugin": ar.plugin,
                        "label": label,
                        "hook_name": hook_name,
                        "phase": phase,
                        "status": status,
                        "process_id": str(ar.process_id) if ar.process_id else None,
                    }
                    if status == ArchiveResult.StatusChoices.STARTED and ar.process_id and ar.process:
                        plugin_payload["pid"] = ar.process.pid
                    if status == ArchiveResult.StatusChoices.STARTED:
                        plugin_payload["progress"] = progress_value
                        plugin_payload["timeout"] = ar.timeout or 120
                    plugin_payload["source"] = "archiveresult"
                    all_plugins.append(plugin_payload)
                    # Remember the key so the same process is not listed again from the process records below.
                    seen_plugin_keys.add(str(ar.process_id) if ar.process_id else f"{ar.plugin}:{hook_name}")

                # Add snapshot-phase processes that have no matching ArchiveResult entry yet.
                for proc_payload, proc_started_at in process_records_by_snapshot.get(str(snapshot.id), []):
                    if not is_current_run_timestamp(proc_started_at, snapshot_run_started_at):
                        continue
                    proc_key = str(proc_payload.get("process_id") or f"{proc_payload.get('plugin')}:{proc_payload.get('hook_name')}")
                    if proc_key in seen_plugin_keys:
                        continue
                    seen_plugin_keys.add(proc_key)
                    all_plugins.append(proc_payload)

                    proc_status = proc_payload.get("status")
                    if proc_status in ("succeeded", "failed", "skipped"):
                        plugin_progress_values.append(100)
                    elif proc_status == "started":
                        plugin_progress_values.append(1)
                        total_workers += 1
                    else:
                        plugin_progress_values.append(0)

                total_plugins = len(all_plugins)
                completed_plugins = sum(1 for item in all_plugins if item.get("status") == "succeeded")
                failed_plugins = sum(1 for item in all_plugins if item.get("status") == "failed")
                pending_plugins = sum(1 for item in all_plugins if item.get("status") == "queued")

                # Snapshot progress is the plain average of its plugins' progress values.
                snapshot_progress = int(sum(plugin_progress_values) / len(plugin_progress_values)) if plugin_progress_values else 0

                active_snapshots_for_crawl.append(
                    {
                        "id": str(snapshot.id),
                        "url": snapshot.url[:80],
                        "status": snapshot.status,
                        "started": (snapshot.downloaded_at or snapshot.created_at).isoformat()
                        if (snapshot.downloaded_at or snapshot.created_at)
                        else None,
                        "progress": snapshot_progress,
                        "total_plugins": total_plugins,
                        "completed_plugins": completed_plugins,
                        "failed_plugins": failed_plugins,
                        "pending_plugins": pending_plugins,
                        "all_plugins": all_plugins,
                        "worker_pid": snapshot_process_pids.get(str(snapshot.id)),
                    },
                )

            # Check if crawl can start (for debugging stuck crawls)
            can_start = bool(crawl.urls)
            urls_preview = crawl.urls[:60] if crawl.urls else None

            # Check if retry_at is in the future (would prevent worker from claiming)
            retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
            seconds_until_retry = int((crawl.retry_at - timezone.now()).total_seconds()) if crawl.retry_at and retry_at_future else 0

            active_crawls.append(
                {
                    "id": str(crawl.id),
                    "label": str(crawl)[:60],
                    "status": crawl.status,
                    "started": crawl.created_at.isoformat() if crawl.created_at else None,
                    "progress": crawl_progress,
                    "max_depth": crawl.max_depth,
                    "urls_count": urls_count,
                    "total_snapshots": total_snapshots,
                    "completed_snapshots": completed_snapshots,
                    "started_snapshots": started_snapshots,
                    # Always reported as 0: failed snapshots are not counted in this view.
                    "failed_snapshots": 0,
                    "pending_snapshots": pending_snapshots,
                    "setup_plugins": crawl_setup_plugins,
                    "setup_total_plugins": crawl_setup_total,
                    "setup_completed_plugins": crawl_setup_completed,
                    "setup_failed_plugins": crawl_setup_failed,
                    "setup_pending_plugins": crawl_setup_pending,
                    "active_snapshots": active_snapshots_for_crawl,
                    "can_start": can_start,
                    "urls_preview": urls_preview,
                    "retry_at_future": retry_at_future,
                    "seconds_until_retry": seconds_until_retry,
                    "worker_pid": crawl_process_pids.get(str(crawl.id)),
                },
            )

        return JsonResponse(
            {
                "orchestrator_running": orchestrator_running,
                "orchestrator_pid": orchestrator_pid,
                "total_workers": total_workers,
                "crawls_pending": crawls_pending,
                "crawls_started": crawls_started,
                "crawls_recent": crawls_recent,
                "snapshots_pending": snapshots_pending,
                "snapshots_started": snapshots_started,
                "archiveresults_pending": archiveresults_pending,
                "archiveresults_started": archiveresults_started,
                "archiveresults_succeeded": archiveresults_succeeded,
                "archiveresults_failed": archiveresults_failed,
                "active_crawls": active_crawls,
                "recent_thumbnails": recent_thumbnails,
                "server_time": timezone.now().isoformat(),
            },
        )
    except Exception as e:
        import traceback

        # Keep the response shape stable so the client can still render zeroed counters.
        # NOTE(review): the full traceback is sent to the client — confirm this endpoint is admin-only.
        return JsonResponse(
            {
                "error": str(e),
                "traceback": traceback.format_exc(),
                "orchestrator_running": False,
                "total_workers": 0,
                "crawls_pending": 0,
                "crawls_started": 0,
                "crawls_recent": 0,
                "snapshots_pending": 0,
                "snapshots_started": 0,
                "archiveresults_pending": 0,
                "archiveresults_started": 0,
                "archiveresults_succeeded": 0,
                "archiveresults_failed": 0,
                "active_crawls": [],
                "recent_thumbnails": [],
                "server_time": timezone.now().isoformat(),
            },
            status=500,
        )
|
|
|
|
|
|
def find_config_section(key: str) -> str:
    """Return the id of the config section defining ``key``.

    Keys found in ``CONSTANTS_CONFIG`` yield "CONSTANT"; keys found in no
    section yield "DYNAMIC". When several sections define the key, the first
    one in iteration order wins.
    """
    all_configs = get_all_configs()

    if key in CONSTANTS_CONFIG:
        return "CONSTANT"

    owning_sections = [name for name, values in all_configs.items() if key in dict(values)]
    if not owning_sections:
        return "DYNAMIC"
    return owning_sections[0]
|
|
|
|
|
|
def find_config_default(key: str) -> str:
    """Return a display string for the default value of config ``key``.

    Constants are rendered from ``CONSTANTS_CONFIG``. For other keys the first
    config section containing the key is used: the declared field default when
    one is available, otherwise the section's current value. Callable defaults
    are rendered as the source text of their lambda body when the source can
    be retrieved, and via ``str()`` otherwise. Unknown keys yield "None".
    """
    CONFIGS = get_all_configs()

    if key in CONSTANTS_CONFIG:
        return str(CONSTANTS_CONFIG[key])

    default_val = None

    for config in CONFIGS.values():
        current_values = dict(config)
        if key not in current_values:
            continue
        declared_fields = getattr(config, "model_fields", None) or {}
        # A key can be present in the section without being a declared field;
        # fall back to its current value instead of raising KeyError.
        default_field = declared_fields[key] if key in declared_fields else current_values[key]
        default_val = default_field.default if hasattr(default_field, "default") else default_field
        break

    if not isinstance(default_val, Callable):
        return str(default_val)

    try:
        source = inspect.getsource(default_val)
    except (OSError, TypeError):
        # Source is unavailable for builtins, C-implemented callables, or code
        # not loaded from a file; show the plain string form rather than crash.
        return str(default_val)

    # Keep only the expression after "lambda ...:" and flatten it to one line.
    rendered = source.split("lambda", 1)[-1].split(":", 1)[-1].replace("\n", " ").strip()
    # getsource returns the whole statement, so a closing paren of the
    # enclosing call may trail the lambda body; drop one unbalanced char.
    if rendered.count(")") > rendered.count("("):
        rendered = rendered[:-1]
    return rendered
|
|
|
|
|
|
def find_config_type(key: str) -> str:
    """Return a display name for the declared type of config ``key`` ("str" if unknown).

    Each config section that has the attribute is tried in turn: first its
    ``model_fields`` annotation, then ``get_type_hints``. Failures of the
    type-hint fallback are ignored and the next section is tried.
    """
    from typing import ClassVar

    def _display_name(annotation) -> str:
        # Plain classes have __name__; other annotation objects may only have a str() form.
        try:
            return str(annotation.__name__)
        except AttributeError:
            return str(annotation)

    for config in get_all_configs().values():
        if not hasattr(config, key):
            continue

        # Try to get from pydantic model_fields first (more reliable)
        if hasattr(config, "model_fields") and key in config.model_fields:
            field = config.model_fields[key]
            if hasattr(field, "annotation") and field.annotation is not None:
                return _display_name(field.annotation)

        # Fallback to get_type_hints with proper namespace
        try:
            import typing

            hint_namespace = {
                "ClassVar": ClassVar,
                "Optional": typing.Optional,
                "Union": typing.Union,
                "List": list,
                "Dict": dict,
                "Path": Path,
            }
            hints = get_type_hints(config, globalns=hint_namespace, localns=hint_namespace)
            return _display_name(hints[key])
        except Exception:
            # If all else fails, return str
            continue

    return "str"
|
|
|
|
|
|
def key_is_safe(key: str) -> bool:
    """Return False when ``key`` contains a sensitive-looking term (case-insensitive)."""
    lowered = key.lower()
    sensitive_terms = ("key", "password", "secret", "token")
    return not any(term in lowered for term in sensitive_terms)
|
|
|
|
|
|
def find_config_source(key: str, merged_config: dict) -> str:
    """Report which layer supplies ``key``: "Environment", "Machine", "Config File" or "Default".

    Layers are checked in that order and the first hit wins. ``merged_config``
    is accepted but not consulted here.
    """
    from archivebox.machine.models import Machine

    # Environment variables override all persistent config sources.
    if key in os.environ:
        return "Environment"

    # Machine.config overrides ArchiveBox.conf.
    try:
        current_machine = Machine.current()
        if current_machine.config and key in current_machine.config:
            return "Machine"
    except Exception:
        # Best-effort lookup: any failure just means this layer is skipped.
        pass

    # Check if it's from archivebox.config.file
    from archivebox.config.configset import BaseConfigSet

    file_values = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)

    # Otherwise it's using the default
    return "Config File" if key in file_values else "Default"
|
|
|
|
|
|
def find_plugin_for_config_key(key: str) -> str | None:
    """Return the name of the first plugin whose config schema declares ``key``, else ``None``."""
    schemas = discover_plugin_configs()
    return next(
        (name for name, schema in schemas.items() if key in (schema.get("properties") or {})),
        None,
    )
|
|
|
|
|
|
def get_config_definition_link(key: str) -> tuple[str, str]:
    """Return a ``(url, label)`` pair pointing at where *key* is defined.

    Keys not declared by any plugin link to a GitHub code search over
    ``archivebox/config``. Plugin keys link to the plugin's ``config.json``:
    on GitHub when the plugin directory lives under the built-in plugins
    root, on the live user-plugin page when it lives under the user plugins
    root, and on the live built-in plugin page otherwise.
    """
    plugin_name = find_plugin_for_config_key(key)
    if not plugin_name:
        search_url = f"https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{quote(key)}&type=code"
        return (search_url, "archivebox/config")

    # Resolve the first plugin directory whose folder name matches.
    plugin_dir = None
    for candidate in iter_plugin_dirs():
        if candidate.name == plugin_name:
            plugin_dir = candidate.resolve()
            break

    builtin_label = f"abx_plugins/plugins/{plugin_name}/config.json"

    if plugin_dir and plugin_dir.is_relative_to(BUILTIN_PLUGINS_DIR.resolve()):
        return (f"{ABX_PLUGINS_GITHUB_BASE_URL}{quote(plugin_name)}/config.json", builtin_label)

    if plugin_dir and plugin_dir.is_relative_to(USER_PLUGINS_DIR.resolve()):
        return (
            f"{LIVE_PLUGIN_BASE_URL}user.{quote(plugin_name)}/",
            f"data/custom_plugins/{plugin_name}/config.json",
        )

    # Directory not found, or found outside both known roots.
    return (f"{LIVE_PLUGIN_BASE_URL}builtin.{quote(plugin_name)}/", builtin_label)
|
|
|
|
|
|
@render_with_table_view
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
    """Build the admin table listing every config option and constant.

    One row is emitted per key of each section returned by
    ``get_all_configs()`` (sections in reverse order), followed by one row per
    key in ``CONSTANTS_CONFIG``. Values of keys that ``key_is_safe`` rejects
    are replaced with a redaction marker.

    Raises:
        AssertionError: if ``request.user`` is not a superuser.
    """
    # Explicit raise instead of `assert`: assert statements are removed when
    # Python runs with -O, which would silently disable this access check.
    # The exception type and message are unchanged.
    if not getattr(request.user, "is_superuser", False):
        raise AssertionError("Must be a superuser to view configuration settings.")

    CONFIGS = get_all_configs()

    # Best-effort call kept from the original flow before reading the merged
    # config (presumably ensures the current Machine record exists so its
    # overrides are included -- TODO confirm). Failure is non-fatal.
    try:
        from archivebox.machine.models import Machine

        Machine.current()
    except Exception:
        pass
    merged_config = get_config()

    redacted = "******** (redacted)"
    source_colors = {"Machine": "purple", "Environment": "blue", "Config File": "green", "Default": "gray"}
    default_link_template = (
        '<a href="https://github.com/search?q=repo%3AArchiveBox%2FArchiveBox+path%3Aconfig+{}&type=code">'
        '<code style="text-decoration: underline">{}</code></a>'
    )

    def default_cell(config_key: str):
        # format_html escapes both the key and the default value, so config
        # content containing <, > or & cannot inject markup into the page.
        return format_html(default_link_template, quote(config_key), find_config_default(config_key) or "See here...")

    rows = {
        "Section": [],
        "Key": [],
        "Type": [],
        "Value": [],
        "Source": [],
        "Default": [],
    }

    for section_id, section in reversed(list(CONFIGS.items())):
        for key in dict(section).keys():
            rows["Section"].append(section_id)
            rows["Key"].append(ItemLink(key, key=key))
            rows["Type"].append(format_html("<code>{}</code>", find_config_type(key)))

            # Use merged config value (includes machine overrides)
            actual_value = merged_config.get(key, getattr(section, key, None))
            rows["Value"].append(format_html("<code>{}</code>", actual_value) if key_is_safe(key) else redacted)

            # Show where the value comes from
            source = find_config_source(key, merged_config)
            rows["Source"].append(
                format_html('<code style="color: {}">{}</code>', source_colors.get(source, "gray"), source),
            )

            rows["Default"].append(default_cell(key))

    constants_section = "CONSTANT"
    for key in CONSTANTS_CONFIG.keys():
        constant_value = CONSTANTS_CONFIG[key]
        rows["Section"].append(constants_section)
        rows["Key"].append(ItemLink(key, key=key))
        rows["Type"].append(format_html("<code>{}</code>", type(constant_value).__name__))
        rows["Value"].append(format_html("<code>{}</code>", constant_value) if key_is_safe(key) else redacted)
        rows["Source"].append(mark_safe('<code style="color: gray">Constant</code>'))
        rows["Default"].append(default_cell(key))

    return TableContext(
        title="Computed Configuration Values",
        table=rows,
    )
|
|
|
|
|
|
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
    """Build the admin detail page for a single config key.

    Collects the value of *key* from each layer that defines it (environment,
    current Machine's config, config file, code default), the final merged
    value, and help text explaining precedence and how to override it.
    For keys rejected by ``key_is_safe`` the environment, machine, config-file
    and final values are all replaced with a redaction marker.

    Raises:
        AssertionError: if ``request.user`` is not a superuser.
    """
    from archivebox.machine.models import Machine
    from archivebox.config.configset import BaseConfigSet

    # Explicit raise instead of `assert`: assert statements are removed when
    # Python runs with -O, which would silently disable this access check.
    # The exception type and message are unchanged.
    if not getattr(request.user, "is_superuser", False):
        raise AssertionError("Must be a superuser to view configuration settings.")

    CONFIGS = get_all_configs()
    FLAT_CONFIG = get_flat_config()
    merged_config = get_config()

    is_safe = key_is_safe(key)
    redacted = "********"

    def displayable(value):
        # Single redaction point so every user-supplied source is treated alike.
        return value if is_safe else redacted

    # (label, value, color) for each layer that defines the key, highest priority first.
    sources_info = []

    if key in os.environ:
        sources_info.append(("Environment", displayable(os.environ[key]), "blue"))

    # Machine lookup is best-effort; on failure the Machine-specific links are omitted.
    machine_admin_url = None
    try:
        machine = Machine.current()
        machine_admin_url = f"/admin/machine/machine/{machine.id}/change/"
        if machine.config and key in machine.config:
            sources_info.append(("Machine", displayable(machine.config[key]), "purple"))
    except Exception:
        pass

    if CONSTANTS.CONFIG_FILE.exists():
        file_config = BaseConfigSet.load_from_file(CONSTANTS.CONFIG_FILE)
        if key in file_config:
            # Redacted like the other sources: secrets stored in the config
            # file must not be shown either.
            sources_info.append(("Config File", displayable(file_config[key]), "green"))

    default_val = find_config_default(key)
    if default_val:
        sources_info.append(("Default", default_val, "gray"))

    final_value = displayable(merged_config.get(key, FLAT_CONFIG.get(key, CONFIGS.get(key, None))))

    # Each fragment is escaped by format_html; joining safe fragments with a
    # literal separator keeps the result safe.
    sources_html = mark_safe(
        "<br/>".join(
            format_html('<b style="color: {}">{}:</b> <code>{}</code>', color, source, value)
            for source, value, color in sources_info
        ),
    )

    # aliases = USER_CONFIG.get(key, {}).get("aliases", [])
    aliases = []

    # `key` comes from the URL, so it is always passed through format_html
    # rather than interpolated into a mark_safe string.
    if key in CONSTANTS_CONFIG:
        section_header = format_html(
            '[CONSTANTS] &nbsp; <b><code style="color: lightgray">{}</code></b> &nbsp; <small>(read-only, hardcoded by ArchiveBox)</small>',
            key,
        )
    elif key in FLAT_CONFIG:
        section_header = format_html(
            'data / ArchiveBox.conf &nbsp; [{}] &nbsp; <b><code style="color: lightgray">{}</code></b>',
            find_config_section(key),
            key,
        )
    else:
        section_header = format_html(
            '[DYNAMIC CONFIG] &nbsp; <b><code style="color: lightgray">{}</code></b> &nbsp; <small>(read-only, calculated at runtime)</small>',
            key,
        )

    definition_url, definition_label = get_config_definition_link(key)
    current_source = find_config_source(key, merged_config)

    key_help = format_html(
        """
        <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{}">Documentation</a> &nbsp;
        <span style="display: {}">
            Aliases: {}
        </span>
        """,
        key.lower(),
        "inline" if aliases else "none",
        ", ".join(aliases),
    )

    type_help = format_html(
        """
        <a href="{}" target="_blank" rel="noopener noreferrer">
            See full definition in <code>{}</code>...
        </a>
        """,
        definition_url,
        definition_label,
    )

    redaction_notice = (
        ""
        if is_safe
        else mark_safe(
            '<b style="color: red">Value is redacted for your security. (Passwords, secrets, API tokens, etc. cannot be viewed in the Web UI)</b><br/><br/>',
        )
    )
    is_editable = key in FLAT_CONFIG and key not in CONSTANTS_CONFIG
    # .get() instead of indexing: this is computed even when the snippet is
    # hidden, and keys outside FLAT_CONFIG must not raise KeyError.
    example_value = default_val.strip("'") if default_val else str(displayable(FLAT_CONFIG.get(key, ""))).strip("'")
    value_help = format_html(
        """
        {}
        <br/><hr/><br/>
        <b>Configuration Sources (highest priority first):</b><br/><br/>
        {}
        <br/><br/>
        <p style="display: {}">
            <i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
            <br/><br/>
            <code>archivebox config --set {}="{}"</code>
        </p>
        """,
        redaction_notice,
        sources_html,
        "block" if is_editable else "none",
        key,
        example_value,
    )

    machine_edit_link = ""
    override_tip = ""
    if machine_admin_url:
        machine_edit_link = format_html(
            '<br/><a href="{}">→ Edit <code>{}</code> in Machine.config for this server</a>',
            machine_admin_url,
            key,
        )
        if key not in CONSTANTS_CONFIG:
            # Renders {"KEY": "your_value_here"}; the previous template
            # emitted stray backslashes around the key name.
            override_tip = format_html(
                '<br/><b>Tip:</b> To override <code>{}</code> on this machine, <a href="{}">edit the Machine.config field</a> and add:<br/><code>{{"{}": "your_value_here"}}</code>',
                key,
                machine_admin_url,
                key,
            )

    source_help = format_html(
        """
        The value shown in the "Value" field comes from the <b>{}</b> source.
        <br/><br/>
        Priority order (highest to lowest):
        <ol>
            <li><b style="color: blue">Environment</b> - Environment variables</li>
            <li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)
                {}
            </li>
            <li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>
            <li><b style="color: gray">Default</b> - Default value from code</li>
        </ol>
        {}
        """,
        current_source,
        machine_edit_link,
        override_tip,
    )

    section_data = cast(
        SectionData,
        {
            "name": section_header,
            "description": None,
            "fields": {
                "Key": key,
                "Type": find_config_type(key),
                "Value": final_value,
                "Currently read from": current_source,
            },
            "help_texts": {
                "Key": key_help,
                "Type": type_help,
                "Value": value_help,
                "Currently read from": source_help,
            },
        },
    )

    return ItemContext(
        slug=key,
        title=key,
        data=[section_data],
    )
|