|
1
|
"""HTML sanitization for user-generated content. |
|
2
|
|
|
3
|
Uses Python's html.parser to properly parse HTML and enforce an allowlist |
|
4
|
of tags and attributes. Strips everything not explicitly allowed. |
|
5
|
""" |
|
6
|
|
|
7
|
import html |
|
8
|
import re |
|
9
|
from html.parser import HTMLParser |
|
10
|
from io import StringIO |
|
11
|
|
|
12
|
# Element names that may appear in sanitized output. Covers the HTML emitted
# by Markdown/wiki formatting plus the SVG elements used by Pikchr diagrams.
# Any tag not listed here is removed by the sanitizer.
ALLOWED_TAGS = frozenset(
    (
        # HTML elements
        "a", "abbr", "acronym",
        "b", "blockquote", "br",
        "code",
        "dd", "del", "details", "div", "dl", "dt",
        "em",
        "h1", "h2", "h3", "h4", "h5", "h6", "hr",
        "i", "img", "ins",
        "kbd",
        "li",
        "mark",
        "ol",
        "p", "pre",
        "q",
        "s", "samp", "small", "span", "strong", "sub", "summary", "sup",
        "table", "tbody", "td", "tfoot", "th", "thead", "tr", "tt",
        "u", "ul",
        "var",
        # SVG elements for Pikchr diagrams
        "svg", "path", "circle", "rect", "line", "polyline", "polygon",
        "g", "text", "defs", "use", "symbol",
    )
)
|
80
|
|
|
81
|
# Per-tag attribute allowlist: maps a lowercase tag name to the set of
# lowercase attribute names kept on that tag. Attributes not listed for a tag
# are dropped; a tag with no entry here keeps no attributes of its own.
# Entries that share the same attribute names are generated together, but
# every key still receives its own independent set object.
ALLOWED_ATTRS = {
    "a": {"href", "title", "class", "id", "name"},
    "img": {"src", "alt", "title", "width", "height", "class"},
    **{tag: {"class", "id"} for tag in ("div", "span")},
    **{tag: {"class", "colspan", "rowspan"} for tag in ("td", "th")},
    "table": {"class"},
    "code": {"class"},
    "pre": {"class"},
    "ol": {"class", "start", "type"},
    "ul": {"class"},
    "li": {"class", "value"},
    "details": {"open", "class"},
    "summary": {"class"},
    **{f"h{level}": {"id", "class"} for level in range(1, 7)},
    # SVG attributes
    "svg": {"viewbox", "width", "height", "class", "xmlns", "fill", "stroke"},
    "path": {
        "d",
        "fill",
        "stroke",
        "stroke-width",
        "stroke-linecap",
        "stroke-linejoin",
        "class",
    },
    "circle": {"cx", "cy", "r", "fill", "stroke", "class"},
    "rect": {"x", "y", "width", "height", "fill", "stroke", "rx", "ry", "class"},
    "line": {"x1", "y1", "x2", "y2", "stroke", "stroke-width", "class"},
    "text": {"x", "y", "font-size", "text-anchor", "fill", "class"},
    "g": {"transform", "class"},
    **{
        tag: {"points", "fill", "stroke", "class"}
        for tag in ("polyline", "polygon")
    },
}
|
114
|
|
|
115
|
# Attribute names accepted on every allowed tag, on top of the per-tag sets.
# Currently empty.
GLOBAL_ATTRS = frozenset()

# URL schemes accepted in href/src values. A value whose scheme is not in
# this set is rejected by _is_safe_url.
ALLOWED_PROTOCOLS = frozenset(("http", "https", "mailto", "ftp", "#", ""))

# Captures the leading "scheme" of a URL (the part before the first colon).
# Applied to the URL after HTML entity decoding.
_PROTOCOL_RE = re.compile(
    r"^([a-zA-Z][a-zA-Z0-9+\-.]*):.*",
    flags=re.DOTALL,
)
|
123
|
|
|
124
|
|
|
125
|
def _is_safe_url(url: str) -> bool:
    """Return True when *url* is relative or uses an allowlisted scheme.

    The value is HTML-entity-decoded first. Every ASCII control character
    (0x00-0x1F and 0x7F) is then removed and surrounding whitespace is
    trimmed, because browsers silently ignore such characters while parsing
    a URL scheme. Without this step a forbidden scheme could be hidden behind
    an encoded tab or newline (e.g. ``jav&#9;ascript:`` or
    ``java&#10;script:``).

    A value in which no ``scheme:`` prefix is found is treated as a relative
    reference and accepted.
    """
    normalized = re.sub(r"[\x00-\x1f\x7f]", "", html.unescape(url)).strip()
    scheme_match = _PROTOCOL_RE.match(normalized)
    if scheme_match is None:
        # No scheme present: relative URL, fragment, or empty string.
        return True
    scheme = scheme_match.group(1).lower()
    return scheme in ALLOWED_PROTOCOLS
|
140
|
|
|
141
|
|
|
142
|
class _SanitizingParser(HTMLParser):
    """HTML parser that re-emits only allowlisted tags and attributes.

    Sanitized markup accumulates in ``self.out`` (a ``StringIO``). The parser
    runs with ``convert_charrefs=False`` so entity and character references
    arrive through ``handle_entityref`` / ``handle_charref`` and are written
    back in their encoded form instead of being decoded into text.

    ``self._skip_depth`` counts how many dangerous container elements are
    currently open; while it is positive nothing is written to the output.
    """

    # Void elements that are dangerous but never have content/closing tags.
    # ``embed`` is a void element in HTML, so it is dropped here rather than
    # treated as a container: it has no end tag that could end a skip.
    _DANGEROUS_VOID = frozenset({"base", "meta", "link", "embed"})
    # Dangerous container tags — skip both the tag and all content inside
    _DANGEROUS_CONTAINER = frozenset({"script", "style", "iframe", "object", "form"})

    def __init__(self) -> None:
        super().__init__(convert_charrefs=False)
        self.out = StringIO()
        self._skip_depth = 0  # Depth inside dangerous containers being skipped

    @staticmethod
    def _escape_text(text: str) -> str:
        """Escape angle brackets so text can never be re-read as markup."""
        return text.replace("<", "&lt;").replace(">", "&gt;")

    @staticmethod
    def _escape_attr(value: str) -> str:
        """Escape a decoded attribute value for a double-quoted attribute."""
        # "&" must be replaced first so the entities added below stay intact.
        return (
            value.replace("&", "&amp;")
            .replace('"', "&quot;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        )

    def handle_starttag(self, tag, attrs):
        """Emit *tag* with its permitted attributes, or drop it.

        *attrs* is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``; ``value`` is ``None`` for a valueless attribute.
        """
        tag_lower = tag.lower()

        # Dangerous void tags — just drop the tag (no content to skip)
        if tag_lower in self._DANGEROUS_VOID:
            return

        # Dangerous content tags — skip tag AND all content inside
        if tag_lower in self._DANGEROUS_CONTAINER:
            self._skip_depth += 1
            return

        if self._skip_depth > 0:
            return

        if tag_lower not in ALLOWED_TAGS:
            return  # Strip unknown tag (but keep its text content)

        allowed = ALLOWED_ATTRS.get(tag_lower, set()) | GLOBAL_ATTRS
        rendered = []
        for name, value in attrs:
            name_lower = name.lower()
            # Event handlers are rejected even if an allowlist names them.
            if name_lower.startswith("on") or name_lower not in allowed:
                continue
            if value is None:
                rendered.append(f" {name}")
                continue
            # Neutralize URLs whose scheme is not allowlisted.
            if name_lower in ("href", "src") and value and not _is_safe_url(value):
                value = "#"
            rendered.append(f' {name}="{self._escape_attr(value)}"')

        attr_str = "".join(rendered)
        self.out.write(f"<{tag}{attr_str}>")

    def handle_endtag(self, tag):
        """Emit the closing tag for allowed elements; end a skip if needed."""
        tag_lower = tag.lower()
        if tag_lower in self._DANGEROUS_VOID:
            return
        if tag_lower in self._DANGEROUS_CONTAINER:
            # Clamp at zero so a stray closing tag cannot go negative.
            self._skip_depth = max(0, self._skip_depth - 1)
            return
        if self._skip_depth > 0:
            return
        if tag_lower in ALLOWED_TAGS:
            self.out.write(f"</{tag}>")

    def handle_data(self, data):
        """Write text content, escaping ``<`` and ``>``.

        Text must be escaped because stripped tags leave their neighbours
        adjacent in the output: ``<<x>script>`` is parsed as the text ``<``,
        a stripped ``<x>`` tag and the text ``script>``, which would otherwise
        be written out as a live ``<script>`` tag.
        """
        if self._skip_depth > 0:
            return  # Inside a dangerous tag — skip content
        self.out.write(self._escape_text(data))

    def handle_entityref(self, name):
        """Re-emit a named entity reference unchanged."""
        if self._skip_depth > 0:
            return
        self.out.write(f"&{name};")

    def handle_charref(self, name):
        """Re-emit a numeric character reference unchanged."""
        if self._skip_depth > 0:
            return
        self.out.write(f"&#{name};")

    def handle_comment(self, data):
        """Drop all HTML comments."""

    def handle_startendtag(self, tag, attrs):
        """Treat self-closing tags such as ``<br/>`` like ordinary start tags."""
        self.handle_starttag(tag, attrs)
|
232
|
|
|
233
|
|
|
234
|
def sanitize_html(html_content: str) -> str:
    """Return *html_content* with everything outside the allowlists removed.

    The input is run through ``_SanitizingParser``, which applies these rules:

    - Tags not in ALLOWED_TAGS are stripped; their text content is kept.
    - Attributes not listed for the tag in ALLOWED_ATTRS are stripped.
    - Event-handler attributes (``on*``) are always stripped.
    - href/src values are checked after HTML entity decoding, so
      ``javascript:``, ``data:`` and ``vbscript:`` URLs (including
      entity-encoded variants) are neutralized.
    - Content inside <script>, <style>, <iframe>, etc. is removed entirely.
    - HTML comments are stripped.

    A falsy input (empty string or ``None``) is returned unchanged.
    """
    if html_content:
        sanitizer = _SanitizingParser()
        # NOTE(review): feed() is not followed by close(), so anything the
        # parser is still buffering at end of input (e.g. an unterminated
        # tag) never reaches the output — confirm this is intended.
        sanitizer.feed(html_content)
        return sanitizer.out.getvalue()
    return html_content
|
251
|
|