FossilRepo

fossilrepo / core / sanitize.py

Blame History Raw 251 lines

1	`"""HTML sanitization for user-generated content.`
2
3	`Uses Python's html.parser to properly parse HTML and enforce an allowlist`
4	`of tags and attributes. Strips everything not explicitly allowed.`
5	`"""`
6
7	`import html`
8	`import re`
9	`from html.parser import HTMLParser`
10	`from io import StringIO`
11
# Allowlist of element names that may appear in sanitized output. Anything not
# listed here has its tag removed by the parser. The first group covers the
# Markdown/wiki formatting vocabulary; the second covers the SVG elements
# emitted for Pikchr diagrams.
ALLOWED_TAGS = frozenset(
    {
        # Inline and block text formatting
        "a", "abbr", "acronym", "b", "blockquote", "br", "code",
        "dd", "del", "details", "div", "dl", "dt", "em",
        "h1", "h2", "h3", "h4", "h5", "h6", "hr",
        "i", "img", "ins", "kbd", "li", "mark", "ol",
        "p", "pre", "q", "s", "samp", "small", "span",
        "strong", "sub", "summary", "sup",
        # Tables
        "table", "tbody", "td", "tfoot", "th", "thead", "tr",
        # Remaining inline/list elements
        "tt", "u", "ul", "var",
    }
) | frozenset(
    {
        # SVG elements for Pikchr diagrams
        "svg", "path", "circle", "rect", "line", "polyline",
        "polygon", "g", "text", "defs", "use", "symbol",
    }
)
80
# Per-tag attribute allowlist. An attribute survives sanitization only when it
# is named in the set for its tag (or in GLOBAL_ATTRS); tags without an entry
# here keep no attributes at all. Names are lowercase because the parser
# lowercases attribute names before they are compared.
ALLOWED_ATTRS = {
    # Links and images
    "a": {"class", "href", "id", "name", "title"},
    "img": {"alt", "class", "height", "src", "title", "width"},
    # Generic containers
    "div": {"class", "id"},
    "span": {"class", "id"},
    # Tables
    "td": {"class", "colspan", "rowspan"},
    "th": {"class", "colspan", "rowspan"},
    "table": {"class"},
    # Code blocks
    "code": {"class"},
    "pre": {"class"},
    # Lists
    "ol": {"class", "start", "type"},
    "ul": {"class"},
    "li": {"class", "value"},
    # Disclosure widgets
    "details": {"class", "open"},
    "summary": {"class"},
    # Headings h1..h6 all accept the same pair; each gets its own set object
    **{f"h{level}": {"id", "class"} for level in range(1, 7)},
    # SVG attributes
    "svg": {"class", "fill", "height", "stroke", "viewbox", "width", "xmlns"},
    "path": {"class", "d", "fill", "stroke", "stroke-linecap", "stroke-linejoin", "stroke-width"},
    "circle": {"class", "cx", "cy", "fill", "r", "stroke"},
    "rect": {"class", "fill", "height", "rx", "ry", "stroke", "width", "x", "y"},
    "line": {"class", "stroke", "stroke-width", "x1", "x2", "y1", "y2"},
    "text": {"class", "fill", "font-size", "text-anchor", "x", "y"},
    "g": {"class", "transform"},
    "polyline": {"class", "fill", "points", "stroke"},
    "polygon": {"class", "fill", "points", "stroke"},
}

# Attributes permitted on every allowed tag (currently none).
GLOBAL_ATTRS = frozenset()
117
118	`# Protocols allowed in href/src — everything else is stripped`
119	`ALLOWED_PROTOCOLS = frozenset({"http", "https", "mailto", "ftp", "#", ""})`
120
121	`# Regex to detect protocol in a URL (after HTML entity decoding)`
122	`_PROTOCOL_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+\-.]):.", re.DOTALL)`
123
124
125	`def _is_safe_url(url: str) -> bool:`
126	`"""Check if a URL uses a safe protocol.`
127
128	`Decodes HTML entities, then strips ASCII control characters (tabs, CRs, NULs,`
129	`etc.) that browsers silently ignore but can be used to bypass protocol checks`
130	(e.g. ``jav ascript:`` or ``java script:``).
131	`"""`
132	`decoded = html.unescape(url)`
133	`# Strip all ASCII control characters (0x00-0x1F, 0x7F) — browsers ignore them`
134	`# in URL scheme parsing, so "jav\tascript:" is treated as "javascript:"`
135	`cleaned = re.sub(r"[\x00-\x1f\x7f]", "", decoded).strip()`
136	`m = _PROTOCOL_RE.match(cleaned)`
137	`if m:`
138	`return m.group(1).lower() in ALLOWED_PROTOCOLS`
139	`return True`
140
141
142	`class _SanitizingParser(HTMLParser):`
143	`"""HTML parser that only emits allowed tags/attributes."""`
144
145	`def __init__(self):`
146	`super().__init__(convert_charrefs=False)`
147	`self.out = StringIO()`
148	`self._skip_depth = 0 # Track depth inside dangerous tags to skip content`
149
150	`# Void elements that are dangerous but never have content/closing tags`
151	`_DANGEROUS_VOID = frozenset({"base", "meta", "link"})`
152	`# Dangerous container tags — skip both the tag and all content inside`
153	`_DANGEROUS_CONTAINER = frozenset({"script", "style", "iframe", "object", "embed", "form"})`
154
155	`def handle_starttag(self, tag, attrs):`
156	`tag_lower = tag.lower()`
157
158	`# Dangerous void tags — just drop the tag (no content to skip)`
159	`if tag_lower in self._DANGEROUS_VOID:`
160	`return`
161
162	`# Dangerous content tags — skip tag AND all content inside`
163	`if tag_lower in self._DANGEROUS_CONTAINER:`
164	`self._skip_depth += 1`
165	`return`
166
167	`if self._skip_depth > 0:`
168	`return`
169
170	`if tag_lower not in ALLOWED_TAGS:`
171	`return # Strip unknown tag (but keep its text content)`
172
173	`# Filter attributes`
174	`allowed = ALLOWED_ATTRS.get(tag_lower, set()) \| GLOBAL_ATTRS`
175	`safe_attrs = []`
176	`for name, value in attrs:`
177	`name_lower = name.lower()`
178	`# Block event handlers`
179	`if name_lower.startswith("on"):`
180	`continue`
181	`if name_lower not in allowed:`
182	`continue`
183	`# Sanitize URLs in href/src`
184	`if name_lower in ("href", "src") and value and not _is_safe_url(value):`
185	`value = "#"`
186	`safe_attrs.append((name, value))`
187
188	`# Build the tag`
189	`attr_str = ""`
190	`for name, value in safe_attrs:`
191	`if value is None:`
192	`attr_str += f" {name}"`
193	`else:`
194	`escaped = value.replace("&", "&").replace('"', """)`
195	`attr_str += f' {name}="{escaped}"'`
196
197	`self.out.write(f"<{tag}{attr_str}>")`
198
199	`def handle_endtag(self, tag):`
200	`tag_lower = tag.lower()`
201	`if tag_lower in self._DANGEROUS_VOID:`
202	`return`
203	`if tag_lower in self._DANGEROUS_CONTAINER:`
204	`self._skip_depth = max(0, self._skip_depth - 1)`
205	`return`
206	`if self._skip_depth > 0:`
207	`return`
208	`if tag_lower in ALLOWED_TAGS:`
209	`self.out.write(f"</{tag}>")`
210
211	`def handle_data(self, data):`
212	`if self._skip_depth > 0:`
213	`return # Inside a dangerous tag — skip content`
214	`self.out.write(data)`
215
216	`def handle_entityref(self, name):`
217	`if self._skip_depth > 0:`
218	`return`
219	`self.out.write(f"&{name};")`
220
221	`def handle_charref(self, name):`
222	`if self._skip_depth > 0:`
223	`return`
224	`self.out.write(f"&#{name};")`
225
226	`def handle_comment(self, data):`
227	`pass # Strip all HTML comments`
228
229	`def handle_startendtag(self, tag, attrs):`
230	`# Self-closing tags like <br/>, <img/>`
231	`self.handle_starttag(tag, attrs)`
232
233
def sanitize_html(html_content: str) -> str:
    """Return *html_content* reduced to the allowlisted tags and attributes.

    The markup is run through ``_SanitizingParser``, which applies these rules:

    - Tags outside ALLOWED_TAGS are removed; the text inside them is kept
    - Per tag, only the attributes listed in ALLOWED_ATTRS survive
    - Event handler attributes (on*) are never emitted
    - href/src values are checked after HTML entity decoding, so javascript:,
      data:, vbscript: (including entity-encoded variants) are neutralized
    - <script>, <style>, <iframe>, etc. are dropped together with their content
    - HTML comments are removed

    Falsy input (``None`` or an empty string) is returned unchanged. The
    parser is fed once and not closed, so — per html.parser's buffering
    behaviour — an unterminated construct at the very end of the input is
    not emitted.
    """
    if html_content:
        scrubber = _SanitizingParser()
        scrubber.feed(html_content)
        return scrubber.out.getvalue()

    # Nothing to sanitize: hand the caller's value straight back.
    return html_content
251

FossilRepo

Keyboard Shortcuts