| | @@ -1,135 +1,171 @@ |
| 1 | 1 | """HTML sanitization for user-generated content. |
| 2 | 2 | |
| 3 | | -Strips dangerous tags (<script>, <style>, <iframe>, etc.), event handlers (on*), |
| 4 | | -and dangerous URL protocols (javascript:, data:, vbscript:) while preserving |
| 5 | | -safe formatting tags used by Fossil wiki, Markdown, and Pikchr diagrams. |
| 6 | | -""" |
| 7 | | - |
| 8 | | -import re |
| 9 | | - |
| 10 | | -# Tags that are safe to render -- covers Markdown/wiki formatting and Pikchr SVG |
| 11 | | -ALLOWED_TAGS = { |
| 12 | | - "a", |
| 13 | | - "abbr", |
| 14 | | - "acronym", |
| 15 | | - "b", |
| 16 | | - "blockquote", |
| 17 | | - "br", |
| 18 | | - "code", |
| 19 | | - "dd", |
| 20 | | - "del", |
| 21 | | - "details", |
| 22 | | - "div", |
| 23 | | - "dl", |
| 24 | | - "dt", |
| 25 | | - "em", |
| 26 | | - "h1", |
| 27 | | - "h2", |
| 28 | | - "h3", |
| 29 | | - "h4", |
| 30 | | - "h5", |
| 31 | | - "h6", |
| 32 | | - "hr", |
| 33 | | - "i", |
| 34 | | - "img", |
| 35 | | - "ins", |
| 36 | | - "kbd", |
| 37 | | - "li", |
| 38 | | - "mark", |
| 39 | | - "ol", |
| 40 | | - "p", |
| 41 | | - "pre", |
| 42 | | - "q", |
| 43 | | - "s", |
| 44 | | - "samp", |
| 45 | | - "small", |
| 46 | | - "span", |
| 47 | | - "strong", |
| 48 | | - "sub", |
| 49 | | - "summary", |
| 50 | | - "sup", |
| 51 | | - "table", |
| 52 | | - "tbody", |
| 53 | | - "td", |
| 54 | | - "tfoot", |
| 55 | | - "th", |
| 56 | | - "thead", |
| 57 | | - "tr", |
| 58 | | - "tt", |
| 59 | | - "u", |
| 60 | | - "ul", |
| 61 | | - "var", |
| 62 | | - # SVG elements for Pikchr diagrams |
| 63 | | - "svg", |
| 64 | | - "path", |
| 65 | | - "circle", |
| 66 | | - "rect", |
| 67 | | - "line", |
| 68 | | - "polyline", |
| 69 | | - "polygon", |
| 70 | | - "g", |
| 71 | | - "text", |
| 72 | | - "defs", |
| 73 | | - "use", |
| 74 | | - "symbol", |
| 75 | | -} |
| 76 | | - |
| 77 | | -# Tags whose entire content (not just the tag) must be removed |
| 78 | | -_DANGEROUS_CONTENT_TAGS = re.compile( |
| 79 | | - r"<\s*(script|style|iframe|object|embed|form|base|meta|link)\b[^>]*>.*?</\s*\1\s*>", |
| 80 | | - re.IGNORECASE | re.DOTALL, |
| 81 | | -) |
| 82 | | - |
| 83 | | -# Self-closing / unclosed dangerous tags |
| 84 | | -_DANGEROUS_SELF_CLOSING = re.compile( |
| 85 | | - r"<\s*/?\s*(script|style|iframe|object|embed|form|base|meta|link)\b[^>]*/?\s*>", |
| 86 | | - re.IGNORECASE, |
| 87 | | -) |
| 88 | | - |
| 89 | | -# Event handler attributes (onclick, onload, onerror, etc.) |
| 90 | | -_EVENT_HANDLERS = re.compile( |
| 91 | | - r"""\s+on\w+\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+)""", |
| 92 | | - re.IGNORECASE, |
| 93 | | -) |
| 94 | | - |
| 95 | | -# Dangerous protocols in href/src values |
| 96 | | -_DANGEROUS_PROTOCOL = re.compile(r"^\s*(?:javascript|vbscript|data):", re.IGNORECASE) |
| 97 | | - |
| 98 | | -# href="..." and src="..." attribute pattern |
| 99 | | -_URL_ATTR = re.compile(r"""(href|src)\s*=\s*(["']?)([^"'>\s]+)\2""", re.IGNORECASE) |
| 100 | | - |
| 101 | | - |
| 102 | | -def _clean_url_attr(match: re.Match) -> str: |
| 103 | | - """Replace dangerous protocol URLs with a safe '#' anchor.""" |
| 104 | | - attr_name = match.group(1) |
| 105 | | - quote = match.group(2) or "" |
| 106 | | - url = match.group(3) |
| 107 | | - if _DANGEROUS_PROTOCOL.match(url): |
| 108 | | - return f"{attr_name}={quote}#{quote}" |
| 109 | | - return match.group(0) |
| 110 | | - |
| 111 | | - |
| 112 | | -def sanitize_html(html: str) -> str: |
| 113 | | - """Remove dangerous HTML tags and attributes while preserving safe formatting. |
| 114 | | - |
| 115 | | - Strips <script>, <style>, <iframe>, <object>, <embed>, <form>, <base>, |
| 116 | | - <meta>, <link> tags and their content. Removes event handler attributes |
| 117 | | - (on*) and replaces dangerous URL protocols (javascript:, data:, vbscript:) |
| 118 | | - in href/src with '#'. |
| 119 | | - """ |
| 120 | | - if not html: |
| 121 | | - return html |
| 122 | | - |
| 123 | | - # 1. Remove dangerous tags WITH their content (e.g. <script>...</script>) |
| 124 | | - html = _DANGEROUS_CONTENT_TAGS.sub("", html) |
| 125 | | - |
| 126 | | - # 2. Remove any remaining self-closing or orphaned dangerous tags |
| 127 | | - html = _DANGEROUS_SELF_CLOSING.sub("", html) |
| 128 | | - |
| 129 | | - # 3. Remove event handler attributes (onclick, onload, onerror, etc.) |
| 130 | | - html = _EVENT_HANDLERS.sub("", html) |
| 131 | | - |
| 132 | | - # 4. Neutralize dangerous URL protocols in href and src attributes |
| 133 | | - html = _URL_ATTR.sub(_clean_url_attr, html) |
| 134 | | - |
| 135 | | - return html |
| 3 | +Uses Python's html.parser to properly parse HTML and enforce an allowlist |
| 4 | +of tags and attributes. Strips everything not explicitly allowed. |
| 5 | +""" |
| 6 | + |
| 7 | +import html |
| 8 | +import re |
| 9 | +from html.parser import HTMLParser |
| 10 | +from io import StringIO |
| 11 | + |
# Tag allowlist: formatting produced by Markdown / Fossil wiki markup plus the
# SVG elements emitted for Pikchr diagrams.
ALLOWED_TAGS = frozenset(
    (
        # Inline text
        "a", "abbr", "acronym", "b", "code", "del", "em", "i", "ins", "kbd",
        "mark", "q", "s", "samp", "small", "span", "strong", "sub", "sup",
        "tt", "u", "var",
        # Block-level and media
        "blockquote", "br", "details", "div", "h1", "h2", "h3", "h4", "h5",
        "h6", "hr", "img", "p", "pre", "summary",
        # Lists
        "dd", "dl", "dt", "li", "ol", "ul",
        # Tables
        "table", "tbody", "td", "tfoot", "th", "thead", "tr",
        # SVG elements for Pikchr diagrams
        "svg", "path", "circle", "rect", "line", "polyline", "polygon",
        "g", "text", "defs", "use", "symbol",
    )
)

# Per-tag attribute allowlist. A tag with no entry here keeps no attributes
# of its own. Every key gets its own set object.
ALLOWED_ATTRS = {
    "a": {"href", "title", "class", "id", "name"},
    "img": {"src", "alt", "title", "width", "height", "class"},
    **{box: {"class", "id"} for box in ("div", "span")},
    **{cell: {"class", "colspan", "rowspan"} for cell in ("td", "th")},
    "table": {"class"},
    "code": {"class"},
    "pre": {"class"},
    "ol": {"class", "start", "type"},
    "ul": {"class"},
    "li": {"class", "value"},
    "details": {"open", "class"},
    "summary": {"class"},
    **{f"h{level}": {"id", "class"} for level in range(1, 7)},
    # SVG attributes
    "svg": {"viewbox", "width", "height", "class", "xmlns", "fill", "stroke"},
    "path": {
        "d",
        "fill",
        "stroke",
        "stroke-width",
        "stroke-linecap",
        "stroke-linejoin",
        "class",
    },
    "circle": {"cx", "cy", "r", "fill", "stroke", "class"},
    "rect": {"x", "y", "width", "height", "fill", "stroke", "rx", "ry", "class"},
    "line": {"x1", "y1", "x2", "y2", "stroke", "stroke-width", "class"},
    "text": {"x", "y", "font-size", "text-anchor", "fill", "class"},
    "g": {"transform", "class"},
    **{
        shape: {"points", "fill", "stroke", "class"}
        for shape in ("polyline", "polygon")
    },
}

# Attributes accepted on every allowed tag in addition to its ALLOWED_ATTRS
# entry (currently none).
GLOBAL_ATTRS = frozenset()
| 56 | + |
# Schemes accepted in href/src values; _is_safe_url() rejects a URL that
# carries any other scheme. The "#" and "" entries can never equal a scheme
# captured by _PROTOCOL_RE (which must start with a letter); they are kept so
# the value of this public constant is unchanged.
ALLOWED_PROTOCOLS = frozenset({"http", "https", "mailto", "ftp", "#", ""})

# Regex to detect protocol in a URL (after HTML entity decoding)
_PROTOCOL_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+\-.]*):.*", re.DOTALL)

# Characters a browser's URL parser discards before it looks for a scheme:
# C0 controls and space at either end of the URL ...
_URL_EDGE_CHARS = "".join(map(chr, range(0x21)))
# ... and ASCII tab / LF / CR anywhere inside it.
_URL_IGNORED_CHARS = dict.fromkeys(map(ord, "\t\n\r"))


def _is_safe_url(url: str) -> bool:
    """Return True if *url* is relative or uses a scheme in ALLOWED_PROTOCOLS.

    The value is HTML-entity-decoded and then normalized the way a browser
    normalizes a URL before parsing it, so that obfuscated schemes such as
    ``java&#9;script:`` or ``"\\x01javascript:"`` are recognized and rejected
    instead of being mistaken for relative URLs.

    Args:
        url: Raw attribute value (may still contain character references).

    Returns:
        False when a scheme is present and not allowed, True otherwise.
    """
    decoded = html.unescape(url)
    # str.strip() keeps the original Unicode-whitespace trimming; the second
    # strip additionally removes leading/trailing C0 control characters.
    decoded = decoded.strip().strip(_URL_EDGE_CHARS)
    # Tabs and newlines are dropped by browsers even in the middle of the
    # scheme, so they must not be allowed to hide it from the regex.
    decoded = decoded.translate(_URL_IGNORED_CHARS)

    m = _PROTOCOL_RE.match(decoded)
    if m:
        return m.group(1).lower() in ALLOWED_PROTOCOLS
    # Relative URLs (no protocol) are safe
    return True
| 72 | + |
| 73 | + |
class _SanitizingParser(HTMLParser):
    """HTML parser that re-emits only allowed tags and attributes.

    Output is accumulated in ``self.out``. Tags outside ALLOWED_TAGS are
    dropped while their text is kept; dangerous container tags are dropped
    together with everything inside them; comments are dropped.
    """

    # Elements that are never emitted.
    _DANGEROUS_TAGS = frozenset(
        {"script", "style", "iframe", "object", "embed", "form", "base", "meta", "link"}
    )
    # Dangerous elements that are void in HTML: they have no content and no
    # end tag, so they must not open a "skip content" region. Otherwise a
    # lone <meta> or <link> would swallow the rest of the document.
    _DANGEROUS_VOID_TAGS = frozenset({"base", "embed", "link", "meta"})

    def __init__(self):
        # convert_charrefs=False routes character/entity references to
        # handle_charref/handle_entityref so they are passed through verbatim.
        super().__init__(convert_charrefs=False)
        self.out = StringIO()
        self._skip_depth = 0  # Nesting depth inside dangerous container tags

    def handle_starttag(self, tag, attrs):
        """Emit *tag* with its filtered attributes if the tag is allowed."""
        tag_lower = tag.lower()

        if tag_lower in self._DANGEROUS_VOID_TAGS:
            return  # Drop the tag itself; there is no content to skip

        # Dangerous content tags: skip tag AND all content inside
        if tag_lower in self._DANGEROUS_TAGS:
            self._skip_depth += 1
            return

        if self._skip_depth > 0:
            return

        if tag_lower not in ALLOWED_TAGS:
            return  # Strip unknown tag (but keep its text content)

        allowed = ALLOWED_ATTRS.get(tag_lower, set()) | GLOBAL_ATTRS
        pieces = []
        for name, value in attrs:
            name_lower = name.lower()
            # Event handlers are rejected even if an allowlist ever names one.
            if name_lower.startswith("on") or name_lower not in allowed:
                continue
            # Neutralize URLs with a disallowed scheme
            if name_lower in ("href", "src") and value and not _is_safe_url(value):
                value = "#"
            if value is None:
                pieces.append(f" {name}")  # Valueless attribute, e.g. "open"
            else:
                # The parser hands over the decoded value; re-escape it so it
                # cannot terminate the quoted attribute or form a reference.
                escaped = value.replace("&", "&amp;").replace('"', "&quot;")
                pieces.append(f' {name}="{escaped}"')

        self.out.write(f"<{tag}{''.join(pieces)}>")

    def handle_endtag(self, tag):
        """Emit the end tag if allowed; close a skip region if dangerous."""
        tag_lower = tag.lower()
        if tag_lower in self._DANGEROUS_VOID_TAGS:
            return  # Never opened a skip region, so must not close one
        if tag_lower in self._DANGEROUS_TAGS:
            self._skip_depth = max(0, self._skip_depth - 1)
            return
        if self._skip_depth > 0:
            return
        if tag_lower in ALLOWED_TAGS:
            self.out.write(f"</{tag}>")

    def handle_data(self, data):
        """Emit text content, with "<" escaped, unless inside a skipped tag."""
        if self._skip_depth > 0:
            return  # Inside a dangerous tag: skip content
        # Text must never be able to open a tag in the output. Without this,
        # removing an unknown tag can splice a live one together, e.g.
        # "<<x>img src=x onerror=...>" would come out as "<img src=x onerror=...>".
        self.out.write(data.replace("<", "&lt;"))

    def handle_entityref(self, name):
        """Pass a named character reference through unchanged."""
        if self._skip_depth > 0:
            return
        self.out.write(f"&{name};")

    def handle_charref(self, name):
        """Pass a numeric character reference through unchanged."""
        if self._skip_depth > 0:
            return
        self.out.write(f"&#{name};")

    def handle_comment(self, data):
        """Drop all HTML comments."""

    def handle_startendtag(self, tag, attrs):
        """Treat self-closing tags like <br/>, <img/> as plain start tags."""
        self.handle_starttag(tag, attrs)
| 154 | + |
def sanitize_html(html_content: str) -> str:
    """Return *html_content* reduced to the allowlisted subset of HTML.

    The input is fed through ``_SanitizingParser`` and the text that parser
    re-emits is returned:

    - tags outside ALLOWED_TAGS are removed, their text is kept
    - attributes outside the tag's ALLOWED_ATTRS entry are removed
    - event-handler attributes (on*) are always removed
    - href/src values are checked after HTML entity decoding, and values such
      as javascript:, data: or vbscript: URLs (also when entity-encoded) are
      neutralized
    - <script>, <style>, <iframe> and similar tags are removed with their
      content
    - HTML comments are removed

    Falsy input (``""`` or ``None``) is returned unchanged.
    """
    if html_content:
        sanitizer = _SanitizingParser()
        # Only feed() is called; input the parser is still buffering at the
        # end (presumably e.g. an unterminated trailing tag) is not emitted.
        sanitizer.feed(html_content)
        return sanitizer.out.getvalue()
    return html_content
| 136 | 172 | |
| 137 | 173 | ADDED core/url_validation.py |