This commit is contained in:
Nick Sweeting
2026-01-21 03:19:56 -08:00
parent f3f55d3395
commit ec4b27056e
113 changed files with 6929 additions and 2396 deletions

View File

@@ -1,3 +1,6 @@
import html
import json
import re
import os
import stat
import posixpath
@@ -10,6 +13,267 @@ from django.http import StreamingHttpResponse, Http404, HttpResponse, HttpRespon
from django.utils._os import safe_join
from django.utils.http import http_date
from django.utils.translation import gettext as _
from archivebox.config.common import SERVER_CONFIG
_HASHES_CACHE: dict[Path, tuple[float, dict[str, str]]] = {}
def _load_hash_map(snapshot_dir: Path) -> dict[str, str] | None:
hashes_path = snapshot_dir / 'hashes' / 'hashes.json'
if not hashes_path.exists():
return None
try:
mtime = hashes_path.stat().st_mtime
except OSError:
return None
cached = _HASHES_CACHE.get(hashes_path)
if cached and cached[0] == mtime:
return cached[1]
try:
data = json.loads(hashes_path.read_text(encoding='utf-8'))
except Exception:
return None
file_map = {str(entry.get('path')): entry.get('hash') for entry in data.get('files', []) if entry.get('path')}
_HASHES_CACHE[hashes_path] = (mtime, file_map)
return file_map
def _hash_for_path(document_root: Path, rel_path: str) -> str | None:
    """Return the recorded content hash for rel_path under document_root, if any.

    Returns None when the snapshot has no hash map or the path is not listed.
    """
    hash_map = _load_hash_map(document_root)
    if not hash_map:
        return None
    return hash_map.get(rel_path)
def _cache_policy() -> str:
    """Cache-Control visibility for archived files.

    'public' when snapshots are configured as world-readable, otherwise
    'private' so shared caches never retain authenticated responses.
    """
    if SERVER_CONFIG.PUBLIC_SNAPSHOTS:
        return 'public'
    return 'private'
# Ensure common web types are mapped consistently across platforms:
# the OS-level mime database varies (especially on macOS/Windows), so pin
# the extensions ArchiveBox serves most often to known-good content types.
for _extension, _mime_type in {
    ".html": "text/html",
    ".htm": "text/html",
    ".css": "text/css",
    ".js": "application/javascript",
    ".json": "application/json",
    ".jsonl": "application/x-ndjson",
    ".md": "text/markdown",
    ".yml": "text/yaml",
    ".yaml": "text/yaml",
    ".csv": "text/csv",
    ".tsv": "text/tab-separated-values",
    ".xml": "application/xml",
    ".svg": "image/svg+xml",
}.items():
    mimetypes.add_type(_mime_type, _extension)
# Optional dependency: use the `markdown` package for high-fidelity rendering
# when available; _markdown stays None otherwise and the hand-rolled
# fallback renderer is used instead.
_markdown = None
try:
    import markdown as _markdown
except Exception:
    pass
# [text](url) inline links; the url group tolerates one level of nested parens.
MARKDOWN_INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)')
# ![alt](src) inline images (alt text may be empty).
MARKDOWN_INLINE_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
# **bold** spans.
MARKDOWN_BOLD_RE = re.compile(r'\*\*([^*]+)\*\*')
# *italic* spans; the lookarounds keep ** bold markers from matching.
MARKDOWN_ITALIC_RE = re.compile(r'(?<!\*)\*([^*]+)\*(?!\*)')
# Any opening HTML tag -- used to detect content that is already HTML.
HTML_TAG_RE = re.compile(r'<[A-Za-z][^>]*>')
# Captures the inner content of an HTML <body> element (greedy, multiline).
HTML_BODY_RE = re.compile(r'<body[^>]*>(.*)</body>', flags=re.IGNORECASE | re.DOTALL)
def _extract_markdown_candidate(text: str) -> str:
    """Strip an HTML <body> wrapper (and one outer <p>...</p>) to expose raw markdown.

    Archived extractor outputs sometimes wrap markdown source in minimal HTML;
    this peels that wrapper off so the markdown heuristics can inspect it.
    """
    body_match = re.search(r'<body[^>]*>(.*)</body>', text, flags=re.IGNORECASE | re.DOTALL)
    inner = body_match.group(1) if body_match else text
    inner = re.sub(r'^\s*<p[^>]*>', '', inner, flags=re.IGNORECASE)
    inner = re.sub(r'</p>\s*$', '', inner, flags=re.IGNORECASE)
    return inner.strip()
def _looks_like_markdown(text: str) -> bool:
    """Heuristic: does this text look like markdown source?

    Counts common markdown constructs and returns True at six or more;
    full HTML documents are never reinterpreted as markdown.
    """
    lowered = text.lower()
    if "<html" in lowered and "<head" in lowered and "</body>" in lowered:
        return False
    marker_counts = (
        len(re.findall(r'^\s{0,3}#{1,6}\s+\S', text, flags=re.MULTILINE)),          # headings
        len(re.findall(r'^\s*[-*+]\s+\S', text, flags=re.MULTILINE)),               # bullet list items
        len(re.findall(r'^\s*\d+\.\s+\S', text, flags=re.MULTILINE)),               # numbered list items
        text.count('[TOC]'),                                                         # TOC markers
        len(re.findall(r'\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)', text)),   # inline links
        text.count('\n---') + text.count('\n***'),                                   # horizontal rules
    )
    return sum(marker_counts) >= 6
def _render_markdown_fallback(text: str) -> str:
    """Render markdown text to an HTML fragment.

    Prefers the `markdown` package (when installed and the text contains no
    raw HTML tags); otherwise uses a minimal line-oriented renderer that
    supports headings, bullet/numbered lists, blockquotes, fenced code blocks,
    horizontal rules, [TOC] expansion, and inline links/images/bold/italics.
    Raw HTML lines in the fallback path are passed through unescaped.
    """
    if _markdown is not None and not HTML_TAG_RE.search(text):
        try:
            return _markdown.markdown(
                text,
                extensions=["extra", "toc", "sane_lists"],
                output_format="html5",
            )
        except Exception:
            # Library failed on this input -- fall through to the hand-rolled renderer.
            pass
    lines = text.splitlines()
    headings = []
    def slugify(value: str) -> str:
        # Collapse runs of non-alphanumerics to '-' so the result is a valid anchor id.
        slug = re.sub(r'[^A-Za-z0-9]+', '-', value).strip('-')
        return slug or "section"
    # First pass: collect (level, title, slug) for every heading so a later
    # [TOC] marker can be expanded into a table of contents.
    for raw_line in lines:
        heading_match = re.match(r'^\s{0,3}(#{1,6})\s+(.*)$', raw_line)
        if heading_match:
            level = len(heading_match.group(1))
            content = heading_match.group(2).strip()
            headings.append((level, content, slugify(content)))
    html_lines = []
    # State flags for the block elements currently open in the output.
    in_code = False
    in_ul = False
    in_ol = False
    in_blockquote = False
    def render_inline(markup: str) -> str:
        # Images before links: image syntax is link syntax prefixed with '!'.
        content = MARKDOWN_INLINE_IMAGE_RE.sub(r'<img alt="\1" src="\2">', markup)
        content = MARKDOWN_INLINE_LINK_RE.sub(r'<a href="\2">\1</a>', content)
        content = MARKDOWN_BOLD_RE.sub(r'<strong>\1</strong>', content)
        content = MARKDOWN_ITALIC_RE.sub(r'<em>\1</em>', content)
        return content
    def close_lists():
        # Close any open <ul>/<ol> before emitting a non-list block element.
        nonlocal in_ul, in_ol
        if in_ul:
            html_lines.append("</ul>")
            in_ul = False
        if in_ol:
            html_lines.append("</ol>")
            in_ol = False
    # Second pass: translate line by line, tracking open block elements.
    for raw_line in lines:
        line = raw_line.rstrip("\n")
        stripped = line.strip()
        # Fenced code blocks: ``` toggles code mode; contents are escaped verbatim.
        if stripped.startswith("```"):
            if in_code:
                html_lines.append("</code></pre>")
                in_code = False
            else:
                close_lists()
                if in_blockquote:
                    html_lines.append("</blockquote>")
                    in_blockquote = False
                html_lines.append("<pre><code>")
                in_code = True
            continue
        if in_code:
            # Inside a fence: no markdown processing, just HTML-escape the line.
            html_lines.append(html.escape(line))
            continue
        # Blank line: terminates any open list/blockquote and becomes a <br/>.
        if not stripped:
            close_lists()
            if in_blockquote:
                html_lines.append("</blockquote>")
                in_blockquote = False
            html_lines.append("<br/>")
            continue
        # Heading, optionally preceded by inline HTML tags (which are re-emitted as-is).
        heading_match = re.match(r'^\s*((?:<[^>]+>\s*)*)(#{1,6})\s+(.*)$', line)
        if heading_match:
            close_lists()
            if in_blockquote:
                html_lines.append("</blockquote>")
                in_blockquote = False
            leading_tags = heading_match.group(1).strip()
            level = len(heading_match.group(2))
            content = heading_match.group(3).strip()
            if leading_tags:
                html_lines.append(leading_tags)
            # id uses the same slugify as the TOC pass so anchors line up.
            html_lines.append(f"<h{level} id=\"{slugify(content)}\">{render_inline(content)}</h{level}>")
            continue
        # Horizontal rule.
        if stripped in ("---", "***"):
            close_lists()
            html_lines.append("<hr/>")
            continue
        # Blockquote line; consecutive '> ' lines share one <blockquote>.
        if stripped.startswith("> "):
            if not in_blockquote:
                close_lists()
                html_lines.append("<blockquote>")
                in_blockquote = True
            content = stripped[2:]
            html_lines.append(render_inline(content))
            continue
        else:
            # Any non-quote line ends an open blockquote.
            if in_blockquote:
                html_lines.append("</blockquote>")
                in_blockquote = False
        # Bullet list item; switches from an ordered list if one was open.
        ul_match = re.match(r'^\s*[-*+]\s+(.*)$', line)
        if ul_match:
            if in_ol:
                html_lines.append("</ol>")
                in_ol = False
            if not in_ul:
                html_lines.append("<ul>")
                in_ul = True
            html_lines.append(f"<li>{render_inline(ul_match.group(1))}</li>")
            continue
        # Numbered list item; switches from a bullet list if one was open.
        ol_match = re.match(r'^\s*\d+\.\s+(.*)$', line)
        if ol_match:
            if in_ul:
                html_lines.append("</ul>")
                in_ul = False
            if not in_ol:
                html_lines.append("<ol>")
                in_ol = True
            html_lines.append(f"<li>{render_inline(ol_match.group(1))}</li>")
            continue
        close_lists()
        # Inline conversions (leave raw HTML intact)
        if stripped == "[TOC]":
            # Expand to a nav built from the headings collected in the first pass.
            toc_items = []
            for level, title, slug in headings:
                toc_items.append(
                    f'<li class="toc-level-{level}"><a href="#{slug}">{title}</a></li>'
                )
            html_lines.append(
                '<nav class="toc"><ul>' + "".join(toc_items) + '</ul></nav>'
            )
            continue
        # Default: ordinary paragraph line.
        html_lines.append(f"<p>{render_inline(line)}</p>")
    # End of input: close anything still open.
    close_lists()
    if in_blockquote:
        html_lines.append("</blockquote>")
    if in_code:
        html_lines.append("</code></pre>")
    return "\n".join(html_lines)
def _render_markdown_document(markdown_text: str) -> str:
    """Render markdown_text into a complete, self-contained HTML page.

    The rendered fragment is wrapped with a minimal head (charset, viewport,
    inline readability styles) so it can be served directly to a browser.
    """
    rendered_body = _render_markdown_fallback(markdown_text)
    page_head = (
        "<!doctype html><html><head><meta charset=\"utf-8\">"
        "<meta name=\"viewport\" content=\"width=device-width,initial-scale=1\">"
        "<style>body{max-width:900px;margin:24px auto;padding:0 16px;"
        "font-family:system-ui,-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif;"
        "line-height:1.55;} img{max-width:100%;} pre{background:#f6f6f6;padding:12px;overflow:auto;}"
        ".toc ul{list-style:none;padding-left:0;} .toc li{margin:4px 0;}</style>"
        "</head><body>"
    )
    return page_head + rendered_body + "</body></html>"
def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False):
@@ -28,18 +292,101 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
if not os.access(fullpath, os.R_OK):
raise Http404(_("%(path)s” does not exist") % {"path": fullpath})
# Respect the If-Modified-Since header.
statobj = fullpath.stat()
if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime):
return HttpResponseNotModified()
document_root = Path(document_root) if document_root else None
rel_path = path
etag = None
if document_root:
file_hash = _hash_for_path(document_root, rel_path)
if file_hash:
etag = f'"{file_hash}"'
if etag:
inm = request.META.get("HTTP_IF_NONE_MATCH")
if inm:
inm_list = [item.strip() for item in inm.split(",")]
if etag in inm_list or etag.strip('"') in [i.strip('"') for i in inm_list]:
not_modified = HttpResponseNotModified()
not_modified.headers["ETag"] = etag
not_modified.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
not_modified.headers["Last-Modified"] = http_date(statobj.st_mtime)
return not_modified
content_type, encoding = mimetypes.guess_type(str(fullpath))
content_type = content_type or "application/octet-stream"
# Add charset for text-like types (best guess), but don't override the type.
is_text_like = (
content_type.startswith("text/")
or content_type in {
"application/json",
"application/javascript",
"application/xml",
"application/x-ndjson",
"image/svg+xml",
}
)
if is_text_like and "charset=" not in content_type:
content_type = f"{content_type}; charset=utf-8"
# Respect the If-Modified-Since header for non-markdown responses.
if not (content_type.startswith("text/plain") or content_type.startswith("text/html")):
if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime):
return HttpResponseNotModified()
# Heuristic fix: some archived HTML outputs (e.g. mercury content.html)
# are stored with HTML-escaped markup or markdown sources. If so, render sensibly.
if content_type.startswith("text/plain") or content_type.startswith("text/html"):
try:
max_unescape_size = 10 * 1024 * 1024 # 10MB cap to avoid heavy memory use
if statobj.st_size <= max_unescape_size:
raw = fullpath.read_bytes()
decoded = raw.decode("utf-8", errors="replace")
escaped_count = decoded.count("&lt;") + decoded.count("&gt;")
tag_count = decoded.count("<")
if escaped_count and escaped_count > tag_count * 2:
decoded = html.unescape(decoded)
markdown_candidate = _extract_markdown_candidate(decoded)
if _looks_like_markdown(markdown_candidate):
wrapped = _render_markdown_document(markdown_candidate)
response = HttpResponse(wrapped, content_type="text/html; charset=utf-8")
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
if etag:
response.headers["ETag"] = etag
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
else:
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return response
if escaped_count and escaped_count > tag_count * 2:
response = HttpResponse(decoded, content_type=content_type)
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
if etag:
response.headers["ETag"] = etag
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
else:
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return response
except Exception:
pass
# setup resposne object
ranged_file = RangedFileReader(open(fullpath, "rb"))
response = StreamingHttpResponse(ranged_file, content_type=content_type)
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
if etag:
response.headers["ETag"] = etag
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
else:
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
if is_text_like:
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if content_type.startswith("image/"):
response.headers["Cache-Control"] = "public, max-age=604800, immutable"
# handle byte-range requests by serving chunk of file
if stat.S_ISREG(statobj.st_mode):