FossilRepo

fossilrepo / core / sanitize.py
Blame History Raw 251 lines
1
"""HTML sanitization for user-generated content.

Uses Python's html.parser to properly parse HTML and enforce an allowlist
of tags and attributes. Strips everything not explicitly allowed.
"""
6
7
import html
8
import re
9
from html.parser import HTMLParser
10
from io import StringIO
11
12
# Allowlist of element names that may be re-emitted. Grouped by purpose:
# Markdown/wiki formatting first, then the SVG elements used by Pikchr diagrams.
ALLOWED_TAGS = frozenset(
    (
        # Text-level and block-level HTML
        "a abbr acronym b blockquote br code dd del details div dl dt em "
        "h1 h2 h3 h4 h5 h6 hr i img ins kbd li mark ol p pre q s samp small "
        "span strong sub summary sup table tbody td tfoot th thead tr tt u ul var "
        # SVG elements for Pikchr diagrams
        "svg path circle rect line polyline polygon g text defs use symbol"
    ).split()
)
80
81
# Per-tag attribute allowlist (all other attributes are stripped). Keys and
# attribute names are lowercase because lookups use the lowercased names.
# Tags sharing an allowlist are generated together; each key still gets its
# own set object.
ALLOWED_ATTRS = {
    "a": {"href", "title", "class", "id", "name"},
    "img": {"src", "alt", "title", "width", "height", "class"},
    **{box: {"class", "id"} for box in ("div", "span")},
    **{cell: {"class", "colspan", "rowspan"} for cell in ("td", "th")},
    **{plain: {"class"} for plain in ("table", "code", "pre")},
    "ol": {"class", "start", "type"},
    "ul": {"class"},
    "li": {"class", "value"},
    "details": {"open", "class"},
    "summary": {"class"},
    **{f"h{level}": {"id", "class"} for level in range(1, 7)},
    # SVG attributes
    "svg": {"viewbox", "width", "height", "class", "xmlns", "fill", "stroke"},
    "path": {"d", "fill", "stroke", "stroke-width", "stroke-linecap", "stroke-linejoin", "class"},
    "circle": {"cx", "cy", "r", "fill", "stroke", "class"},
    "rect": {"x", "y", "width", "height", "fill", "stroke", "rx", "ry", "class"},
    "line": {"x1", "y1", "x2", "y2", "stroke", "stroke-width", "class"},
    "text": {"x", "y", "font-size", "text-anchor", "fill", "class"},
    "g": {"transform", "class"},
    **{shape: {"points", "fill", "stroke", "class"} for shape in ("polyline", "polygon")},
}
114
115
# Attributes accepted on every allowed tag in addition to its ALLOWED_ATTRS
# entry (currently none)
GLOBAL_ATTRS = frozenset(())

# URL schemes accepted in href/src — a value with any other scheme is rejected.
# NOTE(review): "#" and "" cannot be captured by _PROTOCOL_RE below (its group
# must start with a letter), so those two entries look unreachable — confirm.
ALLOWED_PROTOCOLS = frozenset(("http", "https", "mailto", "ftp", "#", ""))

# Captures the leading scheme of a URL (applied after HTML entity decoding)
_PROTOCOL_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+\-.]*):.*", flags=re.DOTALL)
123
124
125
def _is_safe_url(url: str) -> bool:
    """Return True when *url* has no scheme or its scheme is allowlisted.

    The value is HTML-entity-decoded first, then every ASCII control character
    (tabs, CRs, NULs, etc.) is removed. Browsers silently ignore those
    characters, so they could otherwise be used to slip past the scheme check
    (e.g. ``jav&#9;ascript:`` or ``java&#10;script:``).

    A value in which no scheme can be found is treated as relative and
    accepted.
    """
    # Drop 0x00-0x1F and 0x7F — browsers ignore them in URL scheme parsing,
    # so "jav\tascript:" is treated as "javascript:"
    normalized = re.sub(r"[\x00-\x1f\x7f]", "", html.unescape(url)).strip()
    scheme = _PROTOCOL_RE.match(normalized)
    if scheme is None:
        return True
    return scheme.group(1).lower() in ALLOWED_PROTOCOLS
140
141
142
class _SanitizingParser(HTMLParser):
    """HTML parser that re-serializes only allowlisted tags and attributes.

    Sanitized markup accumulates in ``self.out``:

    * tags in ``ALLOWED_TAGS`` are written back with their attributes filtered
      through ``ALLOWED_ATTRS``/``GLOBAL_ATTRS`` and their href/src values
      checked by ``_is_safe_url``;
    * tags in ``_DANGEROUS_VOID`` are dropped; tags in ``_DANGEROUS_CONTAINER``
      are dropped together with everything up to their end tag;
    * any other tag is dropped while the text inside it is kept;
    * comments are discarded, entity/character references pass through
      unchanged, and ``<``/``>`` occurring in text are written as entities.
    """

    # Dangerous elements that never have content or a closing tag. ``embed`` is
    # a void element in HTML, so it belongs here: treating it as a container
    # would discard everything that follows it.
    _DANGEROUS_VOID = frozenset({"base", "meta", "link", "embed"})
    # Dangerous container tags — skip both the tag and all content inside
    _DANGEROUS_CONTAINER = frozenset({"script", "style", "iframe", "object", "form"})
    # Allowed HTML void elements; a self-closing slash on these is redundant,
    # so they are always written in their bare ``<br>`` form.
    _VOID_TAGS = frozenset({"br", "hr", "img"})

    def __init__(self):
        # convert_charrefs=False routes references to handle_entityref /
        # handle_charref so they can be re-emitted without being decoded.
        super().__init__(convert_charrefs=False)
        self.out = StringIO()
        self._skip_depth = 0  # Nesting depth inside dangerous containers

    def handle_starttag(self, tag, attrs):
        """Process an ordinary start tag such as ``<a href="...">``."""
        self._open_tag(tag, attrs, self_closing=False)

    def handle_startendtag(self, tag, attrs):
        """Process a self-closing tag such as ``<br/>`` or ``<path ... />``."""
        self._open_tag(tag, attrs, self_closing=True)

    def _open_tag(self, tag, attrs, self_closing):
        """Filter one start tag and write it to ``self.out`` if it is allowed.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser; ``value`` is None for an attribute given without a value.
        ``self_closing`` is True when the source tag ended in ``/>``.
        """
        tag_lower = tag.lower()

        # Dangerous void tags — just drop the tag (no content to skip)
        if tag_lower in self._DANGEROUS_VOID:
            return

        # Dangerous container tags — skip the tag AND all content inside. A
        # self-closing slash does not close these, so the skip depth is bumped
        # for ``<script/>`` exactly as for ``<script>``.
        if tag_lower in self._DANGEROUS_CONTAINER:
            self._skip_depth += 1
            return

        if self._skip_depth > 0:
            return

        if tag_lower not in ALLOWED_TAGS:
            return  # Strip unknown tag (but keep its text content)

        allowed = ALLOWED_ATTRS.get(tag_lower, set()) | GLOBAL_ATTRS
        pieces = [f"<{tag}"]
        for name, value in attrs:
            name_lower = name.lower()
            # Event handlers are rejected outright, before the allowlist check
            if name_lower.startswith("on") or name_lower not in allowed:
                continue
            if value is None:
                pieces.append(f" {name}")
                continue
            # An href/src with a disallowed scheme is replaced by an inert "#"
            if name_lower in ("href", "src") and value and not _is_safe_url(value):
                value = "#"
            escaped = value.replace("&", "&amp;").replace('"', "&quot;")
            pieces.append(f' {name}="{escaped}"')

        # Keep the self-closing slash on non-void elements. Inside <svg> an
        # element written without it stays open, so self-closed shapes such as
        # <path ... /> would otherwise end up nested inside one another.
        if self_closing and tag_lower not in self._VOID_TAGS:
            pieces.append(" /")
        pieces.append(">")
        self.out.write("".join(pieces))

    def handle_endtag(self, tag):
        """Write the end tag of an allowed element; track container depth."""
        tag_lower = tag.lower()
        if tag_lower in self._DANGEROUS_VOID:
            return
        if tag_lower in self._DANGEROUS_CONTAINER:
            # Clamp at zero so a stray closing tag cannot drive the depth negative
            self._skip_depth = max(0, self._skip_depth - 1)
            return
        if self._skip_depth > 0:
            return
        if tag_lower in ALLOWED_TAGS:
            self.out.write(f"</{tag}>")

    def handle_data(self, data):
        """Write text content, neutralizing any angle brackets it contains."""
        if self._skip_depth > 0:
            return  # Inside a dangerous tag — skip content
        # The parser reports a stray "<" as text. Written verbatim it could
        # join the text that follows a stripped comment or tag and form live
        # markup ("<<!-- -->img onerror=...>" would come out as
        # "<img onerror=...>"), so angle brackets always leave as entities.
        self.out.write(data.replace("<", "&lt;").replace(">", "&gt;"))

    def handle_entityref(self, name):
        """Pass a named character reference through as ``&name;``."""
        if self._skip_depth > 0:
            return
        self.out.write(f"&{name};")

    def handle_charref(self, name):
        """Pass a numeric character reference through as ``&#name;``."""
        if self._skip_depth > 0:
            return
        self.out.write(f"&#{name};")

    def handle_comment(self, data):
        """Discard HTML comments entirely."""
        pass  # Strip all HTML comments
233
234
def sanitize_html(html_content: str) -> str:
    """Return *html_content* reduced to allowlisted tags and attributes.

    The markup is run through ``_SanitizingParser`` and the parser's
    accumulated output is returned:

    - tags outside ALLOWED_TAGS are removed while their text is preserved
    - attributes outside the tag's ALLOWED_ATTRS entry are removed
    - event handler attributes (on*) are always removed
    - href/src URLs are checked after HTML entity decoding, so javascript:,
      data: and vbscript: (including entity-encoded variants) are neutralized
    - everything inside <script>, <style>, <iframe>, etc. is removed
    - HTML comments are removed

    A falsy argument (empty string or None) is returned unchanged.

    NOTE(review): the parser is fed but never close()d; presumably an
    unterminated construct at the very end of the input stays buffered and is
    left out of the result — confirm this is intended.
    """
    if html_content:
        sanitizer = _SanitizingParser()
        sanitizer.feed(html_content)
        return sanitizer.out.getvalue()
    return html_content
251

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button