FossilRepo

fossilrepo / core / sanitize.py
Source Blame History 250 lines
"""HTML sanitization for user-generated content.

Uses Python's html.parser to properly parse HTML and enforce an allowlist
of tags and attributes. Strips everything not explicitly allowed.
"""
fcd8df3… ragelink 6
fcd8df3… ragelink 7 import html
fcd8df3… ragelink 8 import re
fcd8df3… ragelink 9 from html.parser import HTMLParser
fcd8df3… ragelink 10 from io import StringIO
fcd8df3… ragelink 11
# Tags that are safe to render — covers Markdown/wiki formatting and Pikchr SVG.
# Any tag not listed here is dropped by the sanitizer (its text content is kept).
ALLOWED_TAGS = frozenset(
    {
        "a",
        "abbr",
        "acronym",
        "b",
        "blockquote",
        "br",
        "code",
        "dd",
        "del",
        "details",
        "div",
        "dl",
        "dt",
        "em",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "hr",
        "i",
        "img",
        "ins",
        "kbd",
        "li",
        "mark",
        "ol",
        "p",
        "pre",
        "q",
        "s",
        "samp",
        "small",
        "span",
        "strong",
        "sub",
        "summary",
        "sup",
        "table",
        "tbody",
        "td",
        "tfoot",
        "th",
        "thead",
        "tr",
        "tt",
        "u",
        "ul",
        "var",
        # SVG elements for Pikchr diagrams
        "svg",
        "path",
        "circle",
        "rect",
        "line",
        "polyline",
        "polygon",
        "g",
        "text",
        "defs",
        "use",
        "symbol",
    }
)
fcd8df3… ragelink 80
# Attributes allowed per tag (all others stripped).
# Keys and attribute names are lowercase because the sanitizer lowercases the
# tag and attribute name before looking them up (hence "viewbox", not "viewBox").
ALLOWED_ATTRS = {
    "a": {"href", "title", "class", "id", "name"},
    "img": {"src", "alt", "title", "width", "height", "class"},
    "div": {"class", "id"},
    "span": {"class", "id"},
    "td": {"class", "colspan", "rowspan"},
    "th": {"class", "colspan", "rowspan"},
    "table": {"class"},
    "code": {"class"},
    "pre": {"class"},
    "ol": {"class", "start", "type"},
    "ul": {"class"},
    "li": {"class", "value"},
    "details": {"open", "class"},
    "summary": {"class"},
    "h1": {"id", "class"},
    "h2": {"id", "class"},
    "h3": {"id", "class"},
    "h4": {"id", "class"},
    "h5": {"id", "class"},
    "h6": {"id", "class"},
    # SVG attributes
    "svg": {"viewbox", "width", "height", "class", "xmlns", "fill", "stroke"},
    "path": {"d", "fill", "stroke", "stroke-width", "stroke-linecap", "stroke-linejoin", "class"},
    "circle": {"cx", "cy", "r", "fill", "stroke", "class"},
    "rect": {"x", "y", "width", "height", "fill", "stroke", "rx", "ry", "class"},
    "line": {"x1", "y1", "x2", "y2", "stroke", "stroke-width", "class"},
    "text": {"x", "y", "font-size", "text-anchor", "fill", "class"},
    "g": {"transform", "class"},
    "polyline": {"points", "fill", "stroke", "class"},
    "polygon": {"points", "fill", "stroke", "class"},
}

# Global attributes allowed on any tag (currently none)
GLOBAL_ATTRS = frozenset()
fcd8df3… ragelink 117
fcd8df3… ragelink 118 # Protocols allowed in href/src — everything else is stripped
fcd8df3… ragelink 119 ALLOWED_PROTOCOLS = frozenset({"http", "https", "mailto", "ftp", "#", ""})
fcd8df3… ragelink 120
fcd8df3… ragelink 121 # Regex to detect protocol in a URL (after HTML entity decoding)
fcd8df3… ragelink 122 _PROTOCOL_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+\-.]*):.*", re.DOTALL)
fcd8df3… ragelink 123
fcd8df3… ragelink 124
fcd8df3… ragelink 125 def _is_safe_url(url: str) -> bool:
7e1aaf6… ragelink 126 """Check if a URL uses a safe protocol.
7e1aaf6… ragelink 127
7e1aaf6… ragelink 128 Decodes HTML entities, then strips ASCII control characters (tabs, CRs, NULs,
7e1aaf6… ragelink 129 etc.) that browsers silently ignore but can be used to bypass protocol checks
7e1aaf6… ragelink 130 (e.g. ``jav	ascript:`` or ``java
script:``).
7e1aaf6… ragelink 131 """
7e1aaf6… ragelink 132 decoded = html.unescape(url)
7e1aaf6… ragelink 133 # Strip all ASCII control characters (0x00-0x1F, 0x7F) — browsers ignore them
7e1aaf6… ragelink 134 # in URL scheme parsing, so "jav\tascript:" is treated as "javascript:"
7e1aaf6… ragelink 135 cleaned = re.sub(r"[\x00-\x1f\x7f]", "", decoded).strip()
7e1aaf6… ragelink 136 m = _PROTOCOL_RE.match(cleaned)
fcd8df3… ragelink 137 if m:
fcd8df3… ragelink 138 return m.group(1).lower() in ALLOWED_PROTOCOLS
fcd8df3… ragelink 139 return True
fcd8df3… ragelink 140
fcd8df3… ragelink 141
fcd8df3… ragelink 142 class _SanitizingParser(HTMLParser):
fcd8df3… ragelink 143 """HTML parser that only emits allowed tags/attributes."""
fcd8df3… ragelink 144
fcd8df3… ragelink 145 def __init__(self):
fcd8df3… ragelink 146 super().__init__(convert_charrefs=False)
fcd8df3… ragelink 147 self.out = StringIO()
fcd8df3… ragelink 148 self._skip_depth = 0 # Track depth inside dangerous tags to skip content
fcd8df3… ragelink 149
254b467… ragelink 150 # Void elements that are dangerous but never have content/closing tags
254b467… ragelink 151 _DANGEROUS_VOID = frozenset({"base", "meta", "link"})
254b467… ragelink 152 # Dangerous container tags — skip both the tag and all content inside
254b467… ragelink 153 _DANGEROUS_CONTAINER = frozenset({"script", "style", "iframe", "object", "embed", "form"})
254b467… ragelink 154
fcd8df3… ragelink 155 def handle_starttag(self, tag, attrs):
fcd8df3… ragelink 156 tag_lower = tag.lower()
fcd8df3… ragelink 157
254b467… ragelink 158 # Dangerous void tags — just drop the tag (no content to skip)
254b467… ragelink 159 if tag_lower in self._DANGEROUS_VOID:
254b467… ragelink 160 return
254b467… ragelink 161
fcd8df3… ragelink 162 # Dangerous content tags — skip tag AND all content inside
254b467… ragelink 163 if tag_lower in self._DANGEROUS_CONTAINER:
fcd8df3… ragelink 164 self._skip_depth += 1
fcd8df3… ragelink 165 return
fcd8df3… ragelink 166
fcd8df3… ragelink 167 if self._skip_depth > 0:
fcd8df3… ragelink 168 return
fcd8df3… ragelink 169
fcd8df3… ragelink 170 if tag_lower not in ALLOWED_TAGS:
fcd8df3… ragelink 171 return # Strip unknown tag (but keep its text content)
fcd8df3… ragelink 172
fcd8df3… ragelink 173 # Filter attributes
fcd8df3… ragelink 174 allowed = ALLOWED_ATTRS.get(tag_lower, set()) | GLOBAL_ATTRS
fcd8df3… ragelink 175 safe_attrs = []
fcd8df3… ragelink 176 for name, value in attrs:
fcd8df3… ragelink 177 name_lower = name.lower()
fcd8df3… ragelink 178 # Block event handlers
fcd8df3… ragelink 179 if name_lower.startswith("on"):
fcd8df3… ragelink 180 continue
fcd8df3… ragelink 181 if name_lower not in allowed:
fcd8df3… ragelink 182 continue
fcd8df3… ragelink 183 # Sanitize URLs in href/src
fcd8df3… ragelink 184 if name_lower in ("href", "src") and value and not _is_safe_url(value):
fcd8df3… ragelink 185 value = "#"
fcd8df3… ragelink 186 safe_attrs.append((name, value))
fcd8df3… ragelink 187
fcd8df3… ragelink 188 # Build the tag
fcd8df3… ragelink 189 attr_str = ""
fcd8df3… ragelink 190 for name, value in safe_attrs:
fcd8df3… ragelink 191 if value is None:
fcd8df3… ragelink 192 attr_str += f" {name}"
fcd8df3… ragelink 193 else:
fcd8df3… ragelink 194 escaped = value.replace("&", "&").replace('"', """)
fcd8df3… ragelink 195 attr_str += f' {name}="{escaped}"'
fcd8df3… ragelink 196
fcd8df3… ragelink 197 self.out.write(f"<{tag}{attr_str}>")
fcd8df3… ragelink 198
fcd8df3… ragelink 199 def handle_endtag(self, tag):
fcd8df3… ragelink 200 tag_lower = tag.lower()
254b467… ragelink 201 if tag_lower in self._DANGEROUS_VOID:
254b467… ragelink 202 return
254b467… ragelink 203 if tag_lower in self._DANGEROUS_CONTAINER:
fcd8df3… ragelink 204 self._skip_depth = max(0, self._skip_depth - 1)
fcd8df3… ragelink 205 return
fcd8df3… ragelink 206 if self._skip_depth > 0:
fcd8df3… ragelink 207 return
fcd8df3… ragelink 208 if tag_lower in ALLOWED_TAGS:
fcd8df3… ragelink 209 self.out.write(f"</{tag}>")
fcd8df3… ragelink 210
fcd8df3… ragelink 211 def handle_data(self, data):
fcd8df3… ragelink 212 if self._skip_depth > 0:
fcd8df3… ragelink 213 return # Inside a dangerous tag — skip content
fcd8df3… ragelink 214 self.out.write(data)
fcd8df3… ragelink 215
fcd8df3… ragelink 216 def handle_entityref(self, name):
fcd8df3… ragelink 217 if self._skip_depth > 0:
fcd8df3… ragelink 218 return
fcd8df3… ragelink 219 self.out.write(f"&{name};")
fcd8df3… ragelink 220
fcd8df3… ragelink 221 def handle_charref(self, name):
fcd8df3… ragelink 222 if self._skip_depth > 0:
fcd8df3… ragelink 223 return
fcd8df3… ragelink 224 self.out.write(f"&#{name};")
fcd8df3… ragelink 225
fcd8df3… ragelink 226 def handle_comment(self, data):
fcd8df3… ragelink 227 pass # Strip all HTML comments
fcd8df3… ragelink 228
fcd8df3… ragelink 229 def handle_startendtag(self, tag, attrs):
fcd8df3… ragelink 230 # Self-closing tags like <br/>, <img/>
fcd8df3… ragelink 231 self.handle_starttag(tag, attrs)
fcd8df3… ragelink 232
fcd8df3… ragelink 233
def sanitize_html(html_content: str) -> str:
    """Sanitize HTML using a proper parser with tag/attribute allowlists.

    - Only tags in ALLOWED_TAGS are kept (all others stripped, text preserved)
    - Only attributes in ALLOWED_ATTRS per tag are kept
    - Event handlers (on*) are always stripped
    - URLs in href/src are checked after HTML entity decoding — javascript:,
      data:, vbscript: (including entity-encoded variants) are neutralized
    - Content inside <script>, <style>, <iframe>, etc. is completely removed
    - HTML comments are stripped

    Args:
        html_content: Untrusted HTML markup.

    Returns:
        The sanitized markup. Falsy input (``""`` or ``None``) is returned
        unchanged without being parsed.
    """
    if not html_content:
        return html_content

    parser = _SanitizingParser()
    parser.feed(html_content)
    # NOTE(review): close() is intentionally not called. An unterminated
    # construct at the very end of the input (e.g. "<img src=x") stays in the
    # parser's buffer and is therefore left out of the result. close() would
    # flush that remainder through handle_data as text, which is only safe if
    # handle_data HTML-escapes what it writes — confirm before adding it.
    return parser.out.getvalue()

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button