This commit is contained in:
Nick Sweeting
2026-01-21 03:19:56 -08:00
parent f3f55d3395
commit ec4b27056e
113 changed files with 6929 additions and 2396 deletions

View File

@@ -1,3 +1,6 @@
import html
import json
import re
import os
import stat
import posixpath
@@ -10,6 +13,267 @@ from django.http import StreamingHttpResponse, Http404, HttpResponse, HttpRespon
from django.utils._os import safe_join
from django.utils.http import http_date
from django.utils.translation import gettext as _
from archivebox.config.common import SERVER_CONFIG
_HASHES_CACHE: dict[Path, tuple[float, dict[str, str]]] = {}
def _load_hash_map(snapshot_dir: Path) -> dict[str, str] | None:
hashes_path = snapshot_dir / 'hashes' / 'hashes.json'
if not hashes_path.exists():
return None
try:
mtime = hashes_path.stat().st_mtime
except OSError:
return None
cached = _HASHES_CACHE.get(hashes_path)
if cached and cached[0] == mtime:
return cached[1]
try:
data = json.loads(hashes_path.read_text(encoding='utf-8'))
except Exception:
return None
file_map = {str(entry.get('path')): entry.get('hash') for entry in data.get('files', []) if entry.get('path')}
_HASHES_CACHE[hashes_path] = (mtime, file_map)
return file_map
def _hash_for_path(document_root: Path, rel_path: str) -> str | None:
    """Return the recorded content hash for rel_path under document_root, if any.

    Returns None when the snapshot has no hash map or the path is not listed.
    """
    hash_map = _load_hash_map(document_root)
    if not hash_map:
        return None
    return hash_map.get(rel_path)
def _cache_policy() -> str:
    """Cache-Control visibility for archived files.

    'public' when snapshots are configured as world-readable, otherwise
    'private' so shared caches never retain authenticated responses.
    """
    if SERVER_CONFIG.PUBLIC_SNAPSHOTS:
        return 'public'
    return 'private'
# Ensure common web types are mapped consistently across platforms:
# the OS-level mime database varies (especially on macOS/Windows), so pin
# the extensions ArchiveBox serves most often to known-good content types.
for _extension, _mime_type in {
    ".html": "text/html",
    ".htm": "text/html",
    ".css": "text/css",
    ".js": "application/javascript",
    ".json": "application/json",
    ".jsonl": "application/x-ndjson",
    ".md": "text/markdown",
    ".yml": "text/yaml",
    ".yaml": "text/yaml",
    ".csv": "text/csv",
    ".tsv": "text/tab-separated-values",
    ".xml": "application/xml",
    ".svg": "image/svg+xml",
}.items():
    mimetypes.add_type(_mime_type, _extension)
# Optional dependency: use the `markdown` package for high-fidelity rendering
# when available; _markdown stays None otherwise and the hand-rolled
# fallback renderer is used instead.
_markdown = None
try:
    import markdown as _markdown
except Exception:
    pass
# [text](url) inline links; the url group tolerates one level of nested parens.
MARKDOWN_INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)')
# ![alt](src) inline images (alt text may be empty).
MARKDOWN_INLINE_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
# **bold** spans.
MARKDOWN_BOLD_RE = re.compile(r'\*\*([^*]+)\*\*')
# *italic* spans; the lookarounds keep ** bold markers from matching.
MARKDOWN_ITALIC_RE = re.compile(r'(?<!\*)\*([^*]+)\*(?!\*)')
# Any opening HTML tag -- used to detect content that is already HTML.
HTML_TAG_RE = re.compile(r'<[A-Za-z][^>]*>')
# Captures the inner content of an HTML <body> element (greedy, multiline).
HTML_BODY_RE = re.compile(r'<body[^>]*>(.*)</body>', flags=re.IGNORECASE | re.DOTALL)
def _extract_markdown_candidate(text: str) -> str:
    """Strip an HTML <body> wrapper (and one outer <p>...</p>) to expose raw markdown.

    Archived extractor outputs sometimes wrap markdown source in minimal HTML;
    this peels that wrapper off so the markdown heuristics can inspect it.
    """
    body_match = re.search(r'<body[^>]*>(.*)</body>', text, flags=re.IGNORECASE | re.DOTALL)
    inner = body_match.group(1) if body_match else text
    inner = re.sub(r'^\s*<p[^>]*>', '', inner, flags=re.IGNORECASE)
    inner = re.sub(r'</p>\s*$', '', inner, flags=re.IGNORECASE)
    return inner.strip()
def _looks_like_markdown(text: str) -> bool:
    """Heuristic: does this text look like markdown source?

    Counts common markdown constructs and returns True at six or more;
    full HTML documents are never reinterpreted as markdown.
    """
    lowered = text.lower()
    if "<html" in lowered and "<head" in lowered and "</body>" in lowered:
        return False
    marker_counts = (
        len(re.findall(r'^\s{0,3}#{1,6}\s+\S', text, flags=re.MULTILINE)),          # headings
        len(re.findall(r'^\s*[-*+]\s+\S', text, flags=re.MULTILINE)),               # bullet list items
        len(re.findall(r'^\s*\d+\.\s+\S', text, flags=re.MULTILINE)),               # numbered list items
        text.count('[TOC]'),                                                         # TOC markers
        len(re.findall(r'\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)', text)),   # inline links
        text.count('\n---') + text.count('\n***'),                                   # horizontal rules
    )
    return sum(marker_counts) >= 6
def _render_markdown_fallback(text: str) -> str:
    """Render markdown text to an HTML fragment.

    Prefers the `markdown` package (when installed and the text contains no
    raw HTML tags); otherwise uses a minimal line-oriented renderer that
    supports headings, bullet/numbered lists, blockquotes, fenced code blocks,
    horizontal rules, [TOC] expansion, and inline links/images/bold/italics.
    Raw HTML lines in the fallback path are passed through unescaped.
    """
    if _markdown is not None and not HTML_TAG_RE.search(text):
        try:
            return _markdown.markdown(
                text,
                extensions=["extra", "toc", "sane_lists"],
                output_format="html5",
            )
        except Exception:
            # Library failed on this input -- fall through to the hand-rolled renderer.
            pass
    lines = text.splitlines()
    headings = []
    def slugify(value: str) -> str:
        # Collapse runs of non-alphanumerics to '-' so the result is a valid anchor id.
        slug = re.sub(r'[^A-Za-z0-9]+', '-', value).strip('-')
        return slug or "section"
    # First pass: collect (level, title, slug) for every heading so a later
    # [TOC] marker can be expanded into a table of contents.
    for raw_line in lines:
        heading_match = re.match(r'^\s{0,3}(#{1,6})\s+(.*)$', raw_line)
        if heading_match:
            level = len(heading_match.group(1))
            content = heading_match.group(2).strip()
            headings.append((level, content, slugify(content)))
    html_lines = []
    # State flags for the block elements currently open in the output.
    in_code = False
    in_ul = False
    in_ol = False
    in_blockquote = False
    def render_inline(markup: str) -> str:
        # Images before links: image syntax is link syntax prefixed with '!'.
        content = MARKDOWN_INLINE_IMAGE_RE.sub(r'<img alt="\1" src="\2">', markup)
        content = MARKDOWN_INLINE_LINK_RE.sub(r'<a href="\2">\1</a>', content)
        content = MARKDOWN_BOLD_RE.sub(r'<strong>\1</strong>', content)
        content = MARKDOWN_ITALIC_RE.sub(r'<em>\1</em>', content)
        return content
    def close_lists():
        # Close any open <ul>/<ol> before emitting a non-list block element.
        nonlocal in_ul, in_ol
        if in_ul:
            html_lines.append("</ul>")
            in_ul = False
        if in_ol:
            html_lines.append("</ol>")
            in_ol = False
    # Second pass: translate line by line, tracking open block elements.
    for raw_line in lines:
        line = raw_line.rstrip("\n")
        stripped = line.strip()
        # Fenced code blocks: ``` toggles code mode; contents are escaped verbatim.
        if stripped.startswith("```"):
            if in_code:
                html_lines.append("</code></pre>")
                in_code = False
            else:
                close_lists()
                if in_blockquote:
                    html_lines.append("</blockquote>")
                    in_blockquote = False
                html_lines.append("<pre><code>")
                in_code = True
            continue
        if in_code:
            # Inside a fence: no markdown processing, just HTML-escape the line.
            html_lines.append(html.escape(line))
            continue
        # Blank line: terminates any open list/blockquote and becomes a <br/>.
        if not stripped:
            close_lists()
            if in_blockquote:
                html_lines.append("</blockquote>")
                in_blockquote = False
            html_lines.append("<br/>")
            continue
        # Heading, optionally preceded by inline HTML tags (which are re-emitted as-is).
        heading_match = re.match(r'^\s*((?:<[^>]+>\s*)*)(#{1,6})\s+(.*)$', line)
        if heading_match:
            close_lists()
            if in_blockquote:
                html_lines.append("</blockquote>")
                in_blockquote = False
            leading_tags = heading_match.group(1).strip()
            level = len(heading_match.group(2))
            content = heading_match.group(3).strip()
            if leading_tags:
                html_lines.append(leading_tags)
            # id uses the same slugify as the TOC pass so anchors line up.
            html_lines.append(f"<h{level} id=\"{slugify(content)}\">{render_inline(content)}</h{level}>")
            continue
        # Horizontal rule.
        if stripped in ("---", "***"):
            close_lists()
            html_lines.append("<hr/>")
            continue
        # Blockquote line; consecutive '> ' lines share one <blockquote>.
        if stripped.startswith("> "):
            if not in_blockquote:
                close_lists()
                html_lines.append("<blockquote>")
                in_blockquote = True
            content = stripped[2:]
            html_lines.append(render_inline(content))
            continue
        else:
            # Any non-quote line ends an open blockquote.
            if in_blockquote:
                html_lines.append("</blockquote>")
                in_blockquote = False
        # Bullet list item; switches from an ordered list if one was open.
        ul_match = re.match(r'^\s*[-*+]\s+(.*)$', line)
        if ul_match:
            if in_ol:
                html_lines.append("</ol>")
                in_ol = False
            if not in_ul:
                html_lines.append("<ul>")
                in_ul = True
            html_lines.append(f"<li>{render_inline(ul_match.group(1))}</li>")
            continue
        # Numbered list item; switches from a bullet list if one was open.
        ol_match = re.match(r'^\s*\d+\.\s+(.*)$', line)
        if ol_match:
            if in_ul:
                html_lines.append("</ul>")
                in_ul = False
            if not in_ol:
                html_lines.append("<ol>")
                in_ol = True
            html_lines.append(f"<li>{render_inline(ol_match.group(1))}</li>")
            continue
        close_lists()
        # Inline conversions (leave raw HTML intact)
        if stripped == "[TOC]":
            # Expand to a nav built from the headings collected in the first pass.
            toc_items = []
            for level, title, slug in headings:
                toc_items.append(
                    f'<li class="toc-level-{level}"><a href="#{slug}">{title}</a></li>'
                )
            html_lines.append(
                '<nav class="toc"><ul>' + "".join(toc_items) + '</ul></nav>'
            )
            continue
        # Default: ordinary paragraph line.
        html_lines.append(f"<p>{render_inline(line)}</p>")
    # End of input: close anything still open.
    close_lists()
    if in_blockquote:
        html_lines.append("</blockquote>")
    if in_code:
        html_lines.append("</code></pre>")
    return "\n".join(html_lines)
def _render_markdown_document(markdown_text: str) -> str:
    """Render markdown_text into a complete, self-contained HTML page.

    The rendered fragment is wrapped with a minimal head (charset, viewport,
    inline readability styles) so it can be served directly to a browser.
    """
    rendered_body = _render_markdown_fallback(markdown_text)
    page_head = (
        "<!doctype html><html><head><meta charset=\"utf-8\">"
        "<meta name=\"viewport\" content=\"width=device-width,initial-scale=1\">"
        "<style>body{max-width:900px;margin:24px auto;padding:0 16px;"
        "font-family:system-ui,-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif;"
        "line-height:1.55;} img{max-width:100%;} pre{background:#f6f6f6;padding:12px;overflow:auto;}"
        ".toc ul{list-style:none;padding-left:0;} .toc li{margin:4px 0;}</style>"
        "</head><body>"
    )
    return page_head + rendered_body + "</body></html>"
def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False):
@@ -28,18 +292,101 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
if not os.access(fullpath, os.R_OK):
raise Http404(_("%(path)s” does not exist") % {"path": fullpath})
# Respect the If-Modified-Since header.
statobj = fullpath.stat()
if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime):
return HttpResponseNotModified()
document_root = Path(document_root) if document_root else None
rel_path = path
etag = None
if document_root:
file_hash = _hash_for_path(document_root, rel_path)
if file_hash:
etag = f'"{file_hash}"'
if etag:
inm = request.META.get("HTTP_IF_NONE_MATCH")
if inm:
inm_list = [item.strip() for item in inm.split(",")]
if etag in inm_list or etag.strip('"') in [i.strip('"') for i in inm_list]:
not_modified = HttpResponseNotModified()
not_modified.headers["ETag"] = etag
not_modified.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
not_modified.headers["Last-Modified"] = http_date(statobj.st_mtime)
return not_modified
content_type, encoding = mimetypes.guess_type(str(fullpath))
content_type = content_type or "application/octet-stream"
# Add charset for text-like types (best guess), but don't override the type.
is_text_like = (
content_type.startswith("text/")
or content_type in {
"application/json",
"application/javascript",
"application/xml",
"application/x-ndjson",
"image/svg+xml",
}
)
if is_text_like and "charset=" not in content_type:
content_type = f"{content_type}; charset=utf-8"
# Respect the If-Modified-Since header for non-markdown responses.
if not (content_type.startswith("text/plain") or content_type.startswith("text/html")):
if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime):
return HttpResponseNotModified()
# Heuristic fix: some archived HTML outputs (e.g. mercury content.html)
# are stored with HTML-escaped markup or markdown sources. If so, render sensibly.
if content_type.startswith("text/plain") or content_type.startswith("text/html"):
try:
max_unescape_size = 10 * 1024 * 1024 # 10MB cap to avoid heavy memory use
if statobj.st_size <= max_unescape_size:
raw = fullpath.read_bytes()
decoded = raw.decode("utf-8", errors="replace")
escaped_count = decoded.count("&lt;") + decoded.count("&gt;")
tag_count = decoded.count("<")
if escaped_count and escaped_count > tag_count * 2:
decoded = html.unescape(decoded)
markdown_candidate = _extract_markdown_candidate(decoded)
if _looks_like_markdown(markdown_candidate):
wrapped = _render_markdown_document(markdown_candidate)
response = HttpResponse(wrapped, content_type="text/html; charset=utf-8")
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
if etag:
response.headers["ETag"] = etag
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
else:
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return response
if escaped_count and escaped_count > tag_count * 2:
response = HttpResponse(decoded, content_type=content_type)
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
if etag:
response.headers["ETag"] = etag
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
else:
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return response
except Exception:
pass
# setup resposne object
ranged_file = RangedFileReader(open(fullpath, "rb"))
response = StreamingHttpResponse(ranged_file, content_type=content_type)
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
if etag:
response.headers["ETag"] = etag
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
else:
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
if is_text_like:
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if content_type.startswith("image/"):
response.headers["Cache-Control"] = "public, max-age=604800, immutable"
# handle byte-range requests by serving chunk of file
if stat.S_ISREG(statobj.st_mode):