|
c588255…
|
ragelink
|
1 |
"""HTML sanitization for user-generated content. |
|
c588255…
|
ragelink
|
2 |
|
|
fcd8df3…
|
ragelink
|
3 |
Uses Python's html.parser to properly parse HTML and enforce an allowlist |
|
fcd8df3…
|
ragelink
|
4 |
of tags and attributes. Strips everything not explicitly allowed. |
|
fcd8df3…
|
ragelink
|
5 |
""" |
|
fcd8df3…
|
ragelink
|
6 |
|
|
fcd8df3…
|
ragelink
|
7 |
import html |
|
fcd8df3…
|
ragelink
|
8 |
import re |
|
fcd8df3…
|
ragelink
|
9 |
from html.parser import HTMLParser |
|
fcd8df3…
|
ragelink
|
10 |
from io import StringIO |
|
fcd8df3…
|
ragelink
|
11 |
|
|
fcd8df3…
|
ragelink
|
12 |
# Element allowlist: the HTML tags needed for Markdown/wiki formatting plus the
# SVG tags needed for Pikchr diagrams.  Membership is tested against the
# lowercased tag name.
ALLOWED_TAGS = frozenset(
    [
        # Links, images and inline text markup
        "a", "abbr", "acronym", "b", "br", "code", "del", "em", "i", "img",
        "ins", "kbd", "mark", "q", "s", "samp", "small", "span", "strong",
        "sub", "sup", "tt", "u", "var",
        # Block-level structure
        "blockquote", "details", "div", "h1", "h2", "h3", "h4", "h5", "h6",
        "hr", "p", "pre", "summary",
        # Lists
        "dd", "dl", "dt", "li", "ol", "ul",
        # Tables
        "table", "tbody", "td", "tfoot", "th", "thead", "tr",
        # SVG elements for Pikchr diagrams
        "circle", "defs", "g", "line", "path", "polygon", "polyline", "rect",
        "svg", "symbol", "text", "use",
    ]
)
|
fcd8df3…
|
ragelink
|
80 |
|
|
fcd8df3…
|
ragelink
|
81 |
# Per-tag attribute allowlist, keyed by lowercase tag name.  An attribute is
# kept only when its lowercased name appears in the set for the tag it is on;
# a tag without an entry here has no tag-specific attributes.
ALLOWED_ATTRS = {
    # Links and images
    "a": {"href", "title", "class", "id", "name"},
    "img": {"src", "alt", "title", "width", "height", "class"},
    # Generic containers
    "div": {"class", "id"},
    "span": {"class", "id"},
    # Tables
    "td": {"class", "colspan", "rowspan"},
    "th": {"class", "colspan", "rowspan"},
    "table": {"class"},
    # Code blocks
    "code": {"class"},
    "pre": {"class"},
    # Lists
    "ol": {"class", "start", "type"},
    "ul": {"class"},
    "li": {"class", "value"},
    # Disclosure widgets
    "details": {"open", "class"},
    "summary": {"class"},
    # Headings h1..h6 accept the same pair; each tag gets its own set object
    **{f"h{level}": {"id", "class"} for level in range(1, 7)},
    # SVG attributes
    "svg": {"viewbox", "width", "height", "class", "xmlns", "fill", "stroke"},
    "path": {
        "d",
        "fill",
        "stroke",
        "stroke-width",
        "stroke-linecap",
        "stroke-linejoin",
        "class",
    },
    "circle": {"cx", "cy", "r", "fill", "stroke", "class"},
    "rect": {"x", "y", "width", "height", "fill", "stroke", "rx", "ry", "class"},
    "line": {"x1", "y1", "x2", "y2", "stroke", "stroke-width", "class"},
    "text": {"x", "y", "font-size", "text-anchor", "fill", "class"},
    "g": {"transform", "class"},
    "polyline": {"points", "fill", "stroke", "class"},
    "polygon": {"points", "fill", "stroke", "class"},
}
|
fcd8df3…
|
ragelink
|
114 |
|
|
fcd8df3…
|
ragelink
|
115 |
# Attributes accepted on every allowed tag, on top of the per-tag sets.
# Currently empty.
GLOBAL_ATTRS = frozenset(())

# URL schemes accepted in href/src values.  A value whose scheme is not listed
# here is replaced with "#" by the parser rather than removed.
ALLOWED_PROTOCOLS = frozenset(("http", "https", "mailto", "ftp", "#", ""))

# Captures a leading "scheme:" prefix (a letter followed by letters, digits,
# "+", "-" or ".").  Applied after HTML entity decoding.
_PROTOCOL_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+\-.]*):.*", flags=re.DOTALL)
|
fcd8df3…
|
ragelink
|
123 |
|
|
fcd8df3…
|
ragelink
|
124 |
|
|
fcd8df3…
|
ragelink
|
125 |
def _is_safe_url(url: str) -> bool:
    """Return True when *url* has no scheme or an allowlisted one.

    The value is HTML-entity-decoded and every ASCII control character
    (0x00-0x1F and 0x7F) is removed before the scheme is examined, so that
    obfuscated spellings such as ``jav&#9;ascript:`` or ``java&#10;script:``
    are judged exactly like ``javascript:``.
    """
    text = html.unescape(url)
    # Browsers ignore control characters while parsing the URL scheme, so
    # "jav\tascript:" must be treated as "javascript:"; drop them first and
    # then trim surrounding whitespace.
    text = re.sub(r"[\x00-\x1f\x7f]", "", text).strip()
    scheme_match = _PROTOCOL_RE.match(text)
    if scheme_match is None:
        # No "scheme:" prefix was found, so the URL is accepted.
        return True
    scheme = scheme_match.group(1).lower()
    return scheme in ALLOWED_PROTOCOLS
|
fcd8df3…
|
ragelink
|
140 |
|
|
fcd8df3…
|
ragelink
|
141 |
|
|
fcd8df3…
|
ragelink
|
142 |
class _SanitizingParser(HTMLParser): |
|
fcd8df3…
|
ragelink
|
143 |
"""HTML parser that only emits allowed tags/attributes.""" |
|
fcd8df3…
|
ragelink
|
144 |
|
|
fcd8df3…
|
ragelink
|
145 |
def __init__(self): |
|
fcd8df3…
|
ragelink
|
146 |
super().__init__(convert_charrefs=False) |
|
fcd8df3…
|
ragelink
|
147 |
self.out = StringIO() |
|
fcd8df3…
|
ragelink
|
148 |
self._skip_depth = 0 # Track depth inside dangerous tags to skip content |
|
fcd8df3…
|
ragelink
|
149 |
|
|
254b467…
|
ragelink
|
150 |
# Void elements that are dangerous but never have content/closing tags |
|
254b467…
|
ragelink
|
151 |
_DANGEROUS_VOID = frozenset({"base", "meta", "link"}) |
|
254b467…
|
ragelink
|
152 |
# Dangerous container tags — skip both the tag and all content inside |
|
254b467…
|
ragelink
|
153 |
_DANGEROUS_CONTAINER = frozenset({"script", "style", "iframe", "object", "embed", "form"}) |
|
254b467…
|
ragelink
|
154 |
|
|
fcd8df3…
|
ragelink
|
155 |
def handle_starttag(self, tag, attrs): |
|
fcd8df3…
|
ragelink
|
156 |
tag_lower = tag.lower() |
|
fcd8df3…
|
ragelink
|
157 |
|
|
254b467…
|
ragelink
|
158 |
# Dangerous void tags — just drop the tag (no content to skip) |
|
254b467…
|
ragelink
|
159 |
if tag_lower in self._DANGEROUS_VOID: |
|
254b467…
|
ragelink
|
160 |
return |
|
254b467…
|
ragelink
|
161 |
|
|
fcd8df3…
|
ragelink
|
162 |
# Dangerous content tags — skip tag AND all content inside |
|
254b467…
|
ragelink
|
163 |
if tag_lower in self._DANGEROUS_CONTAINER: |
|
fcd8df3…
|
ragelink
|
164 |
self._skip_depth += 1 |
|
fcd8df3…
|
ragelink
|
165 |
return |
|
fcd8df3…
|
ragelink
|
166 |
|
|
fcd8df3…
|
ragelink
|
167 |
if self._skip_depth > 0: |
|
fcd8df3…
|
ragelink
|
168 |
return |
|
fcd8df3…
|
ragelink
|
169 |
|
|
fcd8df3…
|
ragelink
|
170 |
if tag_lower not in ALLOWED_TAGS: |
|
fcd8df3…
|
ragelink
|
171 |
return # Strip unknown tag (but keep its text content) |
|
fcd8df3…
|
ragelink
|
172 |
|
|
fcd8df3…
|
ragelink
|
173 |
# Filter attributes |
|
fcd8df3…
|
ragelink
|
174 |
allowed = ALLOWED_ATTRS.get(tag_lower, set()) | GLOBAL_ATTRS |
|
fcd8df3…
|
ragelink
|
175 |
safe_attrs = [] |
|
fcd8df3…
|
ragelink
|
176 |
for name, value in attrs: |
|
fcd8df3…
|
ragelink
|
177 |
name_lower = name.lower() |
|
fcd8df3…
|
ragelink
|
178 |
# Block event handlers |
|
fcd8df3…
|
ragelink
|
179 |
if name_lower.startswith("on"): |
|
fcd8df3…
|
ragelink
|
180 |
continue |
|
fcd8df3…
|
ragelink
|
181 |
if name_lower not in allowed: |
|
fcd8df3…
|
ragelink
|
182 |
continue |
|
fcd8df3…
|
ragelink
|
183 |
# Sanitize URLs in href/src |
|
fcd8df3…
|
ragelink
|
184 |
if name_lower in ("href", "src") and value and not _is_safe_url(value): |
|
fcd8df3…
|
ragelink
|
185 |
value = "#" |
|
fcd8df3…
|
ragelink
|
186 |
safe_attrs.append((name, value)) |
|
fcd8df3…
|
ragelink
|
187 |
|
|
fcd8df3…
|
ragelink
|
188 |
# Build the tag |
|
fcd8df3…
|
ragelink
|
189 |
attr_str = "" |
|
fcd8df3…
|
ragelink
|
190 |
for name, value in safe_attrs: |
|
fcd8df3…
|
ragelink
|
191 |
if value is None: |
|
fcd8df3…
|
ragelink
|
192 |
attr_str += f" {name}" |
|
fcd8df3…
|
ragelink
|
193 |
else: |
|
fcd8df3…
|
ragelink
|
194 |
escaped = value.replace("&", "&").replace('"', """) |
|
fcd8df3…
|
ragelink
|
195 |
attr_str += f' {name}="{escaped}"' |
|
fcd8df3…
|
ragelink
|
196 |
|
|
fcd8df3…
|
ragelink
|
197 |
self.out.write(f"<{tag}{attr_str}>") |
|
fcd8df3…
|
ragelink
|
198 |
|
|
fcd8df3…
|
ragelink
|
199 |
def handle_endtag(self, tag): |
|
fcd8df3…
|
ragelink
|
200 |
tag_lower = tag.lower() |
|
254b467…
|
ragelink
|
201 |
if tag_lower in self._DANGEROUS_VOID: |
|
254b467…
|
ragelink
|
202 |
return |
|
254b467…
|
ragelink
|
203 |
if tag_lower in self._DANGEROUS_CONTAINER: |
|
fcd8df3…
|
ragelink
|
204 |
self._skip_depth = max(0, self._skip_depth - 1) |
|
fcd8df3…
|
ragelink
|
205 |
return |
|
fcd8df3…
|
ragelink
|
206 |
if self._skip_depth > 0: |
|
fcd8df3…
|
ragelink
|
207 |
return |
|
fcd8df3…
|
ragelink
|
208 |
if tag_lower in ALLOWED_TAGS: |
|
fcd8df3…
|
ragelink
|
209 |
self.out.write(f"</{tag}>") |
|
fcd8df3…
|
ragelink
|
210 |
|
|
fcd8df3…
|
ragelink
|
211 |
def handle_data(self, data): |
|
fcd8df3…
|
ragelink
|
212 |
if self._skip_depth > 0: |
|
fcd8df3…
|
ragelink
|
213 |
return # Inside a dangerous tag — skip content |
|
fcd8df3…
|
ragelink
|
214 |
self.out.write(data) |
|
fcd8df3…
|
ragelink
|
215 |
|
|
fcd8df3…
|
ragelink
|
216 |
def handle_entityref(self, name): |
|
fcd8df3…
|
ragelink
|
217 |
if self._skip_depth > 0: |
|
fcd8df3…
|
ragelink
|
218 |
return |
|
fcd8df3…
|
ragelink
|
219 |
self.out.write(f"&{name};") |
|
fcd8df3…
|
ragelink
|
220 |
|
|
fcd8df3…
|
ragelink
|
221 |
def handle_charref(self, name): |
|
fcd8df3…
|
ragelink
|
222 |
if self._skip_depth > 0: |
|
fcd8df3…
|
ragelink
|
223 |
return |
|
fcd8df3…
|
ragelink
|
224 |
self.out.write(f"&#{name};") |
|
fcd8df3…
|
ragelink
|
225 |
|
|
fcd8df3…
|
ragelink
|
226 |
def handle_comment(self, data): |
|
fcd8df3…
|
ragelink
|
227 |
pass # Strip all HTML comments |
|
fcd8df3…
|
ragelink
|
228 |
|
|
fcd8df3…
|
ragelink
|
229 |
def handle_startendtag(self, tag, attrs): |
|
fcd8df3…
|
ragelink
|
230 |
# Self-closing tags like <br/>, <img/> |
|
fcd8df3…
|
ragelink
|
231 |
self.handle_starttag(tag, attrs) |
|
fcd8df3…
|
ragelink
|
232 |
|
|
fcd8df3…
|
ragelink
|
233 |
|
|
fcd8df3…
|
ragelink
|
234 |
def sanitize_html(html_content: str) -> str:
    """Return *html_content* reduced to the allowlisted HTML subset.

    The input is run through ``_SanitizingParser``, which applies these rules:

    - Tags outside ALLOWED_TAGS are removed; their text content is kept
    - Attributes not listed for the tag in ALLOWED_ATTRS are removed
    - Event handler attributes (on*) are always removed
    - href/src values are checked after HTML entity decoding; javascript:,
      data:, vbscript: and other non-allowlisted schemes (including
      entity-encoded variants) are neutralized
    - <script>, <style>, <iframe> and similar elements are removed together
      with their content
    - HTML comments are removed

    A falsy argument (empty string or None) is returned unchanged.
    """
    if html_content:
        sanitizer = _SanitizingParser()
        # NOTE(review): only feed() is called, never close(); presumably any
        # input the parser is still buffering at end of input is not emitted —
        # confirm against the html.parser documentation.
        sanitizer.feed(html_content)
        return sanitizer.out.getvalue()
    return html_content