FossilRepo

Fix: HTML sanitizer rewrite, XSS in xfer, webhook SSRF, health leak

ragelink 2026-04-07 20:05 trunk

Commit 595dfca1fb6aa4b3b754dd4601df5d455fca1cff3a88960b84299db602f5c2ee

Parent 8f52aba83e8e2e4…

4 files changed +2 -2 +169 -133 +59 +43 -27

~ config/urls.py ~ core/sanitize.py + core/url_validation.py ~ fossil/views.py

M config/urls.py

+2 -2

		--- config/urls.py
		+++ config/urls.py
		@@ -83,19 +83,19 @@
83	83
84	84	try:
85	85	with connection.cursor() as cursor:
86	86	cursor.execute("SELECT 1")
87	87	db_ok = True
88		- except Exception as e:
	88	+ except Exception:
89	89	return JsonResponse(
90	90	{
91	91	"service": "fossilrepo-django-htmx",
92	92	"version": settings.VERSION,
93	93	"status": "error",
94	94	"uptime": _uptime_str(),
95	95	"timestamp": datetime.now(UTC).isoformat(),
96		- "checks": {"database": "error", "detail": str(e)},
	96	+ "checks": {"database": "error"},
97	97	},
98	98	status=503,
99	99	)
100	100
101	101	return JsonResponse(
102	102

	--- config/urls.py
	+++ config/urls.py
	@@ -83,19 +83,19 @@
83
84	try:
85	with connection.cursor() as cursor:
86	cursor.execute("SELECT 1")
87	db_ok = True
88	except Exception as e:
89	return JsonResponse(
90	{
91	"service": "fossilrepo-django-htmx",
92	"version": settings.VERSION,
93	"status": "error",
94	"uptime": _uptime_str(),
95	"timestamp": datetime.now(UTC).isoformat(),
96	"checks": {"database": "error", "detail": str(e)},
97	},
98	status=503,
99	)
100
101	return JsonResponse(
102

	--- config/urls.py
	+++ config/urls.py
	@@ -83,19 +83,19 @@
83
84	try:
85	with connection.cursor() as cursor:
86	cursor.execute("SELECT 1")
87	db_ok = True
88	except Exception:
89	return JsonResponse(
90	{
91	"service": "fossilrepo-django-htmx",
92	"version": settings.VERSION,
93	"status": "error",
94	"uptime": _uptime_str(),
95	"timestamp": datetime.now(UTC).isoformat(),
96	"checks": {"database": "error"},
97	},
98	status=503,
99	)
100
101	return JsonResponse(
102

M core/sanitize.py

+169 -133

		--- core/sanitize.py
		+++ core/sanitize.py
		@@ -1,135 +1,171 @@
1	1	"""HTML sanitization for user-generated content.
2	2
3		-Strips dangerous tags (<script>, <style>, <iframe>, etc.), event handlers (on*),
4		-and dangerous URL protocols (javascript:, data:, vbscript:) while preserving
5		-safe formatting tags used by Fossil wiki, Markdown, and Pikchr diagrams.
6		-"""
7		-
8		-import re
9		-
10		-# Tags that are safe to render -- covers Markdown/wiki formatting and Pikchr SVG
11		-ALLOWED_TAGS = {
12		- "a",
13		- "abbr",
14		- "acronym",
15		- "b",
16		- "blockquote",
17		- "br",
18		- "code",
19		- "dd",
20		- "del",
21		- "details",
22		- "div",
23		- "dl",
24		- "dt",
25		- "em",
26		- "h1",
27		- "h2",
28		- "h3",
29		- "h4",
30		- "h5",
31		- "h6",
32		- "hr",
33		- "i",
34		- "img",
35		- "ins",
36		- "kbd",
37		- "li",
38		- "mark",
39		- "ol",
40		- "p",
41		- "pre",
42		- "q",
43		- "s",
44		- "samp",
45		- "small",
46		- "span",
47		- "strong",
48		- "sub",
49		- "summary",
50		- "sup",
51		- "table",
52		- "tbody",
53		- "td",
54		- "tfoot",
55		- "th",
56		- "thead",
57		- "tr",
58		- "tt",
59		- "u",
60		- "ul",
61		- "var",
62		- # SVG elements for Pikchr diagrams
63		- "svg",
64		- "path",
65		- "circle",
66		- "rect",
67		- "line",
68		- "polyline",
69		- "polygon",
70		- "g",
71		- "text",
72		- "defs",
73		- "use",
74		- "symbol",
75		-}
76		-
77		-# Tags whose entire content (not just the tag) must be removed
78		-_DANGEROUS_CONTENT_TAGS = re.compile(
79		- r"<\s*(script|style|iframe|object|embed|form|base|meta|link)\b[^>]*>.*?</\s*\1\s*>",
80		- re.IGNORECASE | re.DOTALL,
81		-)
82		-
83		-# Self-closing / unclosed dangerous tags
84		-_DANGEROUS_SELF_CLOSING = re.compile(
85		- r"<\s*/?\s*(script|style|iframe|object|embed|form|base|meta|link)\b[^>]*/?\s*>",
86		- re.IGNORECASE,
87		-)
88		-
89		-# Event handler attributes (onclick, onload, onerror, etc.)
90		-_EVENT_HANDLERS = re.compile(
91		- r"""\s+on\w+\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+)""",
92		- re.IGNORECASE,
93		-)
94		-
95		-# Dangerous protocols in href/src values
96		-_DANGEROUS_PROTOCOL = re.compile(r"^\s*(?:javascript|vbscript|data):", re.IGNORECASE)
97		-
98		-# href="..." and src="..." attribute pattern
99		-_URL_ATTR = re.compile(r"""(href|src)\s*=\s*(["']?)([^"'>\s]+)\2""", re.IGNORECASE)
100		-
101		-
102		-def _clean_url_attr(match: re.Match) -> str:
103		- """Replace dangerous protocol URLs with a safe '#' anchor."""
104		- attr_name = match.group(1)
105		- quote = match.group(2) or ""
106		- url = match.group(3)
107		- if _DANGEROUS_PROTOCOL.match(url):
108		- return f"{attr_name}={quote}#{quote}"
109		- return match.group(0)
110		-
111		-
112		-def sanitize_html(html: str) -> str:
113		- """Remove dangerous HTML tags and attributes while preserving safe formatting.
114		-
115		- Strips <script>, <style>, <iframe>, <object>, <embed>, <form>, <base>,
116		- <meta>, <link> tags and their content. Removes event handler attributes
117		- (on*) and replaces dangerous URL protocols (javascript:, data:, vbscript:)
118		- in href/src with '#'.
119		- """
120		- if not html:
121		- return html
122		-
123		- # 1. Remove dangerous tags WITH their content (e.g. <script>...</script>)
124		- html = _DANGEROUS_CONTENT_TAGS.sub("", html)
125		-
126		- # 2. Remove any remaining self-closing or orphaned dangerous tags
127		- html = _DANGEROUS_SELF_CLOSING.sub("", html)
128		-
129		- # 3. Remove event handler attributes (onclick, onload, onerror, etc.)
130		- html = _EVENT_HANDLERS.sub("", html)
131		-
132		- # 4. Neutralize dangerous URL protocols in href and src attributes
133		- html = _URL_ATTR.sub(_clean_url_attr, html)
134		-
135		- return html
	3	+Uses Python's html.parser to properly parse HTML and enforce an allowlist
	4	+of tags and attributes. Strips everything not explicitly allowed.
	5	+"""
	6	+
	7	+import html
	8	+import re
	9	+from html.parser import HTMLParser
	10	+from io import StringIO
	11	+
	12	+# Tags that are safe to render — covers Markdown/wiki formatting and Pikchr SVG
	13	+ALLOWED_TAGS = frozenset({
	14	+ "a", "abbr", "acronym", "b", "blockquote", "br", "code", "dd", "del",
	15	+ "details", "div", "dl", "dt", "em", "h1", "h2", "h3", "h4", "h5", "h6",
	16	+ "hr", "i", "img", "ins", "kbd", "li", "mark", "ol", "p", "pre", "q",
	17	+ "s", "samp", "small", "span", "strong", "sub", "summary", "sup",
	18	+ "table", "tbody", "td", "tfoot", "th", "thead", "tr", "tt", "u", "ul", "var",
	19	+ # SVG elements for Pikchr diagrams
	20	+ "svg", "path", "circle", "rect", "line", "polyline", "polygon",
	21	+ "g", "text", "defs", "use", "symbol",
	22	+})
	23	+
	24	+# Attributes allowed per tag (all others stripped)
	25	+ALLOWED_ATTRS = {
	26	+ "a": {"href", "title", "class", "id", "name"},
	27	+ "img": {"src", "alt", "title", "width", "height", "class"},
	28	+ "div": {"class", "id"},
	29	+ "span": {"class", "id"},
	30	+ "td": {"class", "colspan", "rowspan"},
	31	+ "th": {"class", "colspan", "rowspan"},
	32	+ "table": {"class"},
	33	+ "code": {"class"},
	34	+ "pre": {"class"},
	35	+ "ol": {"class", "start", "type"},
	36	+ "ul": {"class"},
	37	+ "li": {"class", "value"},
	38	+ "details": {"open", "class"},
	39	+ "summary": {"class"},
	40	+ "h1": {"id", "class"}, "h2": {"id", "class"}, "h3": {"id", "class"},
	41	+ "h4": {"id", "class"}, "h5": {"id", "class"}, "h6": {"id", "class"},
	42	+ # SVG attributes
	43	+ "svg": {"viewbox", "width", "height", "class", "xmlns", "fill", "stroke"},
	44	+ "path": {"d", "fill", "stroke", "stroke-width", "stroke-linecap", "stroke-linejoin", "class"},
	45	+ "circle": {"cx", "cy", "r", "fill", "stroke", "class"},
	46	+ "rect": {"x", "y", "width", "height", "fill", "stroke", "rx", "ry", "class"},
	47	+ "line": {"x1", "y1", "x2", "y2", "stroke", "stroke-width", "class"},
	48	+ "text": {"x", "y", "font-size", "text-anchor", "fill", "class"},
	49	+ "g": {"transform", "class"},
	50	+ "polyline": {"points", "fill", "stroke", "class"},
	51	+ "polygon": {"points", "fill", "stroke", "class"},
	52	+}
	53	+
	54	+# Global attributes allowed on any tag
	55	+GLOBAL_ATTRS = frozenset()
	56	+
	57	+# Protocols allowed in href/src — everything else is stripped
	58	+ALLOWED_PROTOCOLS = frozenset({"http", "https", "mailto", "ftp", "#", ""})
	59	+
	60	+# Regex to detect protocol in a URL (after HTML entity decoding)
	61	+_PROTOCOL_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+\-.]*):.*", re.DOTALL)
	62	+
	63	+
	64	+def _is_safe_url(url: str) -> bool:
	65	+ """Check if a URL uses a safe protocol. Decodes HTML entities first."""
	66	+ decoded = html.unescape(url).strip()
	67	+ m = _PROTOCOL_RE.match(decoded)
	68	+ if m:
	69	+ return m.group(1).lower() in ALLOWED_PROTOCOLS
	70	+ # Relative URLs (no protocol) are safe
	71	+ return True
	72	+
	73	+
	74	+class _SanitizingParser(HTMLParser):
	75	+ """HTML parser that only emits allowed tags/attributes."""
	76	+
	77	+ def __init__(self):
	78	+ super().__init__(convert_charrefs=False)
	79	+ self.out = StringIO()
	80	+ self._skip_depth = 0 # Track depth inside dangerous tags to skip content
	81	+
	82	+ def handle_starttag(self, tag, attrs):
	83	+ tag_lower = tag.lower()
	84	+
	85	+ # Dangerous content tags — skip tag AND all content inside
	86	+ if tag_lower in ("script", "style", "iframe", "object", "embed", "form", "base", "meta", "link"):
	87	+ self._skip_depth += 1
	88	+ return
	89	+
	90	+ if self._skip_depth > 0:
	91	+ return
	92	+
	93	+ if tag_lower not in ALLOWED_TAGS:
	94	+ return # Strip unknown tag (but keep its text content)
	95	+
	96	+ # Filter attributes
	97	+ allowed = ALLOWED_ATTRS.get(tag_lower, set()) | GLOBAL_ATTRS
	98	+ safe_attrs = []
	99	+ for name, value in attrs:
	100	+ name_lower = name.lower()
	101	+ # Block event handlers
	102	+ if name_lower.startswith("on"):
	103	+ continue
	104	+ if name_lower not in allowed:
	105	+ continue
	106	+ # Sanitize URLs in href/src
	107	+ if name_lower in ("href", "src") and value and not _is_safe_url(value):
	108	+ value = "#"
	109	+ safe_attrs.append((name, value))
	110	+
	111	+ # Build the tag
	112	+ attr_str = ""
	113	+ for name, value in safe_attrs:
	114	+ if value is None:
	115	+ attr_str += f" {name}"
	116	+ else:
	117	+ escaped = value.replace("&", "&amp;").replace('"', "&quot;")
	118	+ attr_str += f' {name}="{escaped}"'
	119	+
	120	+ self.out.write(f"<{tag}{attr_str}>")
	121	+
	122	+ def handle_endtag(self, tag):
	123	+ tag_lower = tag.lower()
	124	+ if tag_lower in ("script", "style", "iframe", "object", "embed", "form", "base", "meta", "link"):
	125	+ self._skip_depth = max(0, self._skip_depth - 1)
	126	+ return
	127	+ if self._skip_depth > 0:
	128	+ return
	129	+ if tag_lower in ALLOWED_TAGS:
	130	+ self.out.write(f"</{tag}>")
	131	+
	132	+ def handle_data(self, data):
	133	+ if self._skip_depth > 0:
	134	+ return # Inside a dangerous tag — skip content
	135	+ self.out.write(data)
	136	+
	137	+ def handle_entityref(self, name):
	138	+ if self._skip_depth > 0:
	139	+ return
	140	+ self.out.write(f"&{name};")
	141	+
	142	+ def handle_charref(self, name):
	143	+ if self._skip_depth > 0:
	144	+ return
	145	+ self.out.write(f"&#{name};")
	146	+
	147	+ def handle_comment(self, data):
	148	+ pass # Strip all HTML comments
	149	+
	150	+ def handle_startendtag(self, tag, attrs):
	151	+ # Self-closing tags like <br/>, <img/>
	152	+ self.handle_starttag(tag, attrs)
	153	+
	154	+
	155	+def sanitize_html(html_content: str) -> str:
	156	+ """Sanitize HTML using a proper parser with tag/attribute allowlists.
	157	+
	158	+ - Only tags in ALLOWED_TAGS are kept (all others stripped, text preserved)
	159	+ - Only attributes in ALLOWED_ATTRS per tag are kept
	160	+ - Event handlers (on*) are always stripped
	161	+ - URLs in href/src are checked after HTML entity decoding — javascript:,
	162	+ data:, vbscript: (including entity-encoded variants) are neutralized
	163	+ - Content inside <script>, <style>, <iframe>, etc. is completely removed
	164	+ - HTML comments are stripped
	165	+ """
	166	+ if not html_content:
	167	+ return html_content
	168	+
	169	+ parser = _SanitizingParser()
	170	+ parser.feed(html_content)
	171	+ return parser.out.getvalue()
136	172
137	173	ADDED core/url_validation.py

	--- core/sanitize.py
	+++ core/sanitize.py
	@@ -1,135 +1,171 @@
1	"""HTML sanitization for user-generated content.
2
3	Strips dangerous tags (<script>, <style>, <iframe>, etc.), event handlers (on*),
4	and dangerous URL protocols (javascript:, data:, vbscript:) while preserving
5	safe formatting tags used by Fossil wiki, Markdown, and Pikchr diagrams.
6	"""
7
8	import re
9
10	# Tags that are safe to render -- covers Markdown/wiki formatting and Pikchr SVG
11	ALLOWED_TAGS = {
12	"a",
13	"abbr",
14	"acronym",
15	"b",
16	"blockquote",
17	"br",
18	"code",
19	"dd",
20	"del",
21	"details",
22	"div",
23	"dl",
24	"dt",
25	"em",
26	"h1",
27	"h2",
28	"h3",
29	"h4",
30	"h5",
31	"h6",
32	"hr",
33	"i",
34	"img",
35	"ins",
36	"kbd",
37	"li",
38	"mark",
39	"ol",
40	"p",
41	"pre",
42	"q",
43	"s",
44	"samp",
45	"small",
46	"span",
47	"strong",
48	"sub",
49	"summary",
50	"sup",
51	"table",
52	"tbody",
53	"td",
54	"tfoot",
55	"th",
56	"thead",
57	"tr",
58	"tt",
59	"u",
60	"ul",
61	"var",
62	# SVG elements for Pikchr diagrams
63	"svg",
64	"path",
65	"circle",
66	"rect",
67	"line",
68	"polyline",
69	"polygon",
70	"g",
71	"text",
72	"defs",
73	"use",
74	"symbol",
75	}
76
77	# Tags whose entire content (not just the tag) must be removed
78	_DANGEROUS_CONTENT_TAGS = re.compile(
79	r"<\s*(script|style|iframe|object|embed|form|base|meta|link)\b[^>]*>.*?</\s*\1\s*>",
80	re.IGNORECASE | re.DOTALL,
81	)
82
83	# Self-closing / unclosed dangerous tags
84	_DANGEROUS_SELF_CLOSING = re.compile(
85	r"<\s*/?\s*(script|style|iframe|object|embed|form|base|meta|link)\b[^>]*/?\s*>",
86	re.IGNORECASE,
87	)
88
89	# Event handler attributes (onclick, onload, onerror, etc.)
90	_EVENT_HANDLERS = re.compile(
91	r"""\s+on\w+\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]+)""",
92	re.IGNORECASE,
93	)
94
95	# Dangerous protocols in href/src values
96	_DANGEROUS_PROTOCOL = re.compile(r"^\s*(?:javascript|vbscript|data):", re.IGNORECASE)
97
98	# href="..." and src="..." attribute pattern
99	_URL_ATTR = re.compile(r"""(href|src)\s*=\s*(["']?)([^"'>\s]+)\2""", re.IGNORECASE)
100
101
102	def _clean_url_attr(match: re.Match) -> str:
103	"""Replace dangerous protocol URLs with a safe '#' anchor."""
104	attr_name = match.group(1)
105	quote = match.group(2) or ""
106	url = match.group(3)
107	if _DANGEROUS_PROTOCOL.match(url):
108	return f"{attr_name}={quote}#{quote}"
109	return match.group(0)
110
111
112	def sanitize_html(html: str) -> str:
113	"""Remove dangerous HTML tags and attributes while preserving safe formatting.
114
115	Strips <script>, <style>, <iframe>, <object>, <embed>, <form>, <base>,
116	<meta>, <link> tags and their content. Removes event handler attributes
117	(on*) and replaces dangerous URL protocols (javascript:, data:, vbscript:)
118	in href/src with '#'.
119	"""
120	if not html:
121	return html
122
123	# 1. Remove dangerous tags WITH their content (e.g. <script>...</script>)
124	html = _DANGEROUS_CONTENT_TAGS.sub("", html)
125
126	# 2. Remove any remaining self-closing or orphaned dangerous tags
127	html = _DANGEROUS_SELF_CLOSING.sub("", html)
128
129	# 3. Remove event handler attributes (onclick, onload, onerror, etc.)
130	html = _EVENT_HANDLERS.sub("", html)
131
132	# 4. Neutralize dangerous URL protocols in href and src attributes
133	html = _URL_ATTR.sub(_clean_url_attr, html)
134
135	return html




































136
137	ADDED core/url_validation.py

	--- core/sanitize.py
	+++ core/sanitize.py
	@@ -1,135 +1,171 @@
1	"""HTML sanitization for user-generated content.
2
3	Uses Python's html.parser to properly parse HTML and enforce an allowlist
4	of tags and attributes. Strips everything not explicitly allowed.
5	"""
6
7	import html
8	import re
9	from html.parser import HTMLParser
10	from io import StringIO
11
12	# Tags that are safe to render — covers Markdown/wiki formatting and Pikchr SVG
13	ALLOWED_TAGS = frozenset({
14	"a", "abbr", "acronym", "b", "blockquote", "br", "code", "dd", "del",
15	"details", "div", "dl", "dt", "em", "h1", "h2", "h3", "h4", "h5", "h6",
16	"hr", "i", "img", "ins", "kbd", "li", "mark", "ol", "p", "pre", "q",
17	"s", "samp", "small", "span", "strong", "sub", "summary", "sup",
18	"table", "tbody", "td", "tfoot", "th", "thead", "tr", "tt", "u", "ul", "var",
19	# SVG elements for Pikchr diagrams
20	"svg", "path", "circle", "rect", "line", "polyline", "polygon",
21	"g", "text", "defs", "use", "symbol",
22	})
23
24	# Attributes allowed per tag (all others stripped)
25	ALLOWED_ATTRS = {
26	"a": {"href", "title", "class", "id", "name"},
27	"img": {"src", "alt", "title", "width", "height", "class"},
28	"div": {"class", "id"},
29	"span": {"class", "id"},
30	"td": {"class", "colspan", "rowspan"},
31	"th": {"class", "colspan", "rowspan"},
32	"table": {"class"},
33	"code": {"class"},
34	"pre": {"class"},
35	"ol": {"class", "start", "type"},
36	"ul": {"class"},
37	"li": {"class", "value"},
38	"details": {"open", "class"},
39	"summary": {"class"},
40	"h1": {"id", "class"}, "h2": {"id", "class"}, "h3": {"id", "class"},
41	"h4": {"id", "class"}, "h5": {"id", "class"}, "h6": {"id", "class"},
42	# SVG attributes
43	"svg": {"viewbox", "width", "height", "class", "xmlns", "fill", "stroke"},
44	"path": {"d", "fill", "stroke", "stroke-width", "stroke-linecap", "stroke-linejoin", "class"},
45	"circle": {"cx", "cy", "r", "fill", "stroke", "class"},
46	"rect": {"x", "y", "width", "height", "fill", "stroke", "rx", "ry", "class"},
47	"line": {"x1", "y1", "x2", "y2", "stroke", "stroke-width", "class"},
48	"text": {"x", "y", "font-size", "text-anchor", "fill", "class"},
49	"g": {"transform", "class"},
50	"polyline": {"points", "fill", "stroke", "class"},
51	"polygon": {"points", "fill", "stroke", "class"},
52	}
53
54	# Global attributes allowed on any tag
55	GLOBAL_ATTRS = frozenset()
56
57	# Protocols allowed in href/src — everything else is stripped
58	ALLOWED_PROTOCOLS = frozenset({"http", "https", "mailto", "ftp", "#", ""})
59
60	# Regex to detect protocol in a URL (after HTML entity decoding)
61	_PROTOCOL_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9+\-.]*):.*", re.DOTALL)
62
63
64	def _is_safe_url(url: str) -> bool:
65	"""Check if a URL uses a safe protocol. Decodes HTML entities first."""
66	decoded = html.unescape(url).strip()
67	m = _PROTOCOL_RE.match(decoded)
68	if m:
69	return m.group(1).lower() in ALLOWED_PROTOCOLS
70	# Relative URLs (no protocol) are safe
71	return True
72
73
74	class _SanitizingParser(HTMLParser):
75	"""HTML parser that only emits allowed tags/attributes."""
76
77	def __init__(self):
78	super().__init__(convert_charrefs=False)
79	self.out = StringIO()
80	self._skip_depth = 0 # Track depth inside dangerous tags to skip content
81
82	def handle_starttag(self, tag, attrs):
83	tag_lower = tag.lower()
84
85	# Dangerous content tags — skip tag AND all content inside
86	if tag_lower in ("script", "style", "iframe", "object", "embed", "form", "base", "meta", "link"):
87	self._skip_depth += 1
88	return
89
90	if self._skip_depth > 0:
91	return
92
93	if tag_lower not in ALLOWED_TAGS:
94	return # Strip unknown tag (but keep its text content)
95
96	# Filter attributes
97	allowed = ALLOWED_ATTRS.get(tag_lower, set()) | GLOBAL_ATTRS
98	safe_attrs = []
99	for name, value in attrs:
100	name_lower = name.lower()
101	# Block event handlers
102	if name_lower.startswith("on"):
103	continue
104	if name_lower not in allowed:
105	continue
106	# Sanitize URLs in href/src
107	if name_lower in ("href", "src") and value and not _is_safe_url(value):
108	value = "#"
109	safe_attrs.append((name, value))
110
111	# Build the tag
112	attr_str = ""
113	for name, value in safe_attrs:
114	if value is None:
115	attr_str += f" {name}"
116	else:
117	escaped = value.replace("&", "&amp;").replace('"', "&quot;")
118	attr_str += f' {name}="{escaped}"'
119
120	self.out.write(f"<{tag}{attr_str}>")
121
122	def handle_endtag(self, tag):
123	tag_lower = tag.lower()
124	if tag_lower in ("script", "style", "iframe", "object", "embed", "form", "base", "meta", "link"):
125	self._skip_depth = max(0, self._skip_depth - 1)
126	return
127	if self._skip_depth > 0:
128	return
129	if tag_lower in ALLOWED_TAGS:
130	self.out.write(f"</{tag}>")
131
132	def handle_data(self, data):
133	if self._skip_depth > 0:
134	return # Inside a dangerous tag — skip content
135	self.out.write(data)
136
137	def handle_entityref(self, name):
138	if self._skip_depth > 0:
139	return
140	self.out.write(f"&{name};")
141
142	def handle_charref(self, name):
143	if self._skip_depth > 0:
144	return
145	self.out.write(f"&#{name};")
146
147	def handle_comment(self, data):
148	pass # Strip all HTML comments
149
150	def handle_startendtag(self, tag, attrs):
151	# Self-closing tags like <br/>, <img/>
152	self.handle_starttag(tag, attrs)
153
154
155	def sanitize_html(html_content: str) -> str:
156	"""Sanitize HTML using a proper parser with tag/attribute allowlists.
157
158	- Only tags in ALLOWED_TAGS are kept (all others stripped, text preserved)
159	- Only attributes in ALLOWED_ATTRS per tag are kept
160	- Event handlers (on*) are always stripped
161	- URLs in href/src are checked after HTML entity decoding — javascript:,
162	data:, vbscript: (including entity-encoded variants) are neutralized
163	- Content inside <script>, <style>, <iframe>, etc. is completely removed
164	- HTML comments are stripped
165	"""
166	if not html_content:
167	return html_content
168
169	parser = _SanitizingParser()
170	parser.feed(html_content)
171	return parser.out.getvalue()
172
173	ADDED core/url_validation.py

A core/url_validation.py

+59

		--- a/core/url_validation.py
		+++ b/core/url_validation.py
		@@ -0,0 +1,59 @@
	1	+"""URL validation for outbound requests (webhooks, etc.)."""
	2	+
	3	+import ipaddress
	4	+import socket
	5	+from urllib.parse import urlparse
	6	+
	7	+
	8	+def is_safe_webhook_url(url: str) -> tuple[bool, str]:
	9	+ """Validate a webhook URL is safe for server-side requests.
	10	+
	11	+ Blocks:
	12	+ - Non-HTTP(S) protocols
	13	+ - Localhost and loopback addresses
	14	+ - Private/internal IP ranges (10.x, 172.16-31.x, 192.168.x, etc.)
	15	+ - Link-local addresses
	16	+ - AWS metadata endpoint (169.254.169.254)
	17	+
	18	+ Returns (is_safe, error_message).
	19	+ """
	20	+ if not url:
	21	+ return False, "URL is required."
	22	+
	23	+ parsed = urlparse(url)
	24	+
	25	+ if parsed.scheme not in ("http", "https"):
	26	+ return False, "Only http:// and https:// URLs are allowed."
	27	+
	28	+ hostname = parsed.hostname
	29	+ if not hostname:
	30	+ return False, "URL must include a hostname."
	31	+
	32	+ # Block obvious localhost variants
	33	+ if hostname in ("localhost", "127.0.0.1", "::1", "0.0.0.0"):
	34	+ return False, "Localhost URLs are not allowed."
	35	+
	36	+ # Resolve hostname and check the IP
	37	+ try:
	38	+ addr_info = socket.getaddrinfo(hostname, None, socket.AF_UNSPEC, socket.SOCK_STREAM)
	39	+ except socket.gaierror:
	40	+ return False, f"Could not resolve hostname: {hostname}"
	41	+
	42	+ for _family, _type, _proto, _canonname, sockaddr in addr_info:
	43	+ ip_str = sockaddr[0]
	44	+ try:
	45	+ ip = ipaddress.ip_address(ip_str)
	46	+ except ValueError:
	47	+ continue
	48	+
	49	+ if ip.is_loopback:
	50	+ return False, "Loopback addresses are not allowed."
	51	+ if ip.is_private:
	52	+ return False, "Private/internal IP addresses are not allowed."
	53	+ if ip.is_link_local:
	54	+ return False, "Link-local addresses are not allowed."
	55	+ if ip.is_reserved:
	56	+ return False, "Reserved IP addresses are not allowed."
	57	+ # AWS metadata endpoint
	58	+ if ip_str == "169.254.169.254":
	59	+ return False, "Cloud metadata endpoints are not allowed."

	--- a/core/url_validation.py
	+++ b/core/url_validation.py
	@@ -0,0 +1,59 @@

	--- a/core/url_validation.py
	+++ b/core/url_validation.py
	@@ -0,0 +1,59 @@
1	"""URL validation for outbound requests (webhooks, etc.)."""
2
3	import ipaddress
4	import socket
5	from urllib.parse import urlparse
6
7
8	def is_safe_webhook_url(url: str) -> tuple[bool, str]:
9	"""Validate a webhook URL is safe for server-side requests.
10
11	Blocks:
12	- Non-HTTP(S) protocols
13	- Localhost and loopback addresses
14	- Private/internal IP ranges (10.x, 172.16-31.x, 192.168.x, etc.)
15	- Link-local addresses
16	- AWS metadata endpoint (169.254.169.254)
17
18	Returns (is_safe, error_message).
19	"""
20	if not url:
21	return False, "URL is required."
22
23	parsed = urlparse(url)
24
25	if parsed.scheme not in ("http", "https"):
26	return False, "Only http:// and https:// URLs are allowed."
27
28	hostname = parsed.hostname
29	if not hostname:
30	return False, "URL must include a hostname."
31
32	# Block obvious localhost variants
33	if hostname in ("localhost", "127.0.0.1", "::1", "0.0.0.0"):
34	return False, "Localhost URLs are not allowed."
35
36	# Resolve hostname and check the IP
37	try:
38	addr_info = socket.getaddrinfo(hostname, None, socket.AF_UNSPEC, socket.SOCK_STREAM)
39	except socket.gaierror:
40	return False, f"Could not resolve hostname: {hostname}"
41
42	for _family, _type, _proto, _canonname, sockaddr in addr_info:
43	ip_str = sockaddr[0]
44	try:
45	ip = ipaddress.ip_address(ip_str)
46	except ValueError:
47	continue
48
49	if ip.is_loopback:
50	return False, "Loopback addresses are not allowed."
51	if ip.is_private:
52	return False, "Private/internal IP addresses are not allowed."
53	if ip.is_link_local:
54	return False, "Link-local addresses are not allowed."
55	if ip.is_reserved:
56	return False, "Reserved IP addresses are not allowed."
57	# AWS metadata endpoint
58	if ip_str == "169.254.169.254":
59	return False, "Cloud metadata endpoints are not allowed."

M fossil/views.py

+43 -27

		--- fossil/views.py
		+++ fossil/views.py
		@@ -1098,21 +1098,27 @@
1098	1098	secret = request.POST.get("secret", "").strip()
1099	1099	events = request.POST.getlist("events")
1100	1100	is_active = request.POST.get("is_active") == "on"
1101	1101
1102	1102	if url:
1103		- events_str = ",".join(events) if events else "all"
1104		- Webhook.objects.create(
1105		- repository=fossil_repo,
1106		- url=url,
1107		- secret=secret,
1108		- events=events_str,
1109		- is_active=is_active,
1110		- created_by=request.user,
1111		- )
1112		- messages.success(request, f"Webhook for {url} created.")
1113		- return redirect("fossil:webhooks", slug=slug)
	1103	+ from core.url_validation import is_safe_webhook_url
	1104	+
	1105	+ is_safe, url_error = is_safe_webhook_url(url)
	1106	+ if not is_safe:
	1107	+ messages.error(request, f"Invalid webhook URL: {url_error}")
	1108	+ else:
	1109	+ events_str = ",".join(events) if events else "all"
	1110	+ Webhook.objects.create(
	1111	+ repository=fossil_repo,
	1112	+ url=url,
	1113	+ secret=secret,
	1114	+ events=events_str,
	1115	+ is_active=is_active,
	1116	+ created_by=request.user,
	1117	+ )
	1118	+ messages.success(request, "Webhook created.")
	1119	+ return redirect("fossil:webhooks", slug=slug)
1114	1120
1115	1121	return render(
1116	1122	request,
1117	1123	"fossil/webhook_form.html",
1118	1124	{
		@@ -1142,20 +1148,25 @@
1142	1148	secret = request.POST.get("secret", "").strip()
1143	1149	events = request.POST.getlist("events")
1144	1150	is_active = request.POST.get("is_active") == "on"
1145	1151
1146	1152	if url:
1147		- webhook.url = url
1148		- # Only update secret if a new one was provided (don't blank it on edit)
1149		- if secret:
1150		- webhook.secret = secret
1151		- webhook.events = ",".join(events) if events else "all"
1152		- webhook.is_active = is_active
1153		- webhook.updated_by = request.user
1154		- webhook.save()
1155		- messages.success(request, f"Webhook for {webhook.url} updated.")
1156		- return redirect("fossil:webhooks", slug=slug)
	1153	+ from core.url_validation import is_safe_webhook_url
	1154	+
	1155	+ is_safe, url_error = is_safe_webhook_url(url)
	1156	+ if not is_safe:
	1157	+ messages.error(request, f"Invalid webhook URL: {url_error}")
	1158	+ else:
	1159	+ webhook.url = url
	1160	+ if secret:
	1161	+ webhook.secret = secret
	1162	+ webhook.events = ",".join(events) if events else "all"
	1163	+ webhook.is_active = is_active
	1164	+ webhook.updated_by = request.user
	1165	+ webhook.save()
	1166	+ messages.success(request, "Webhook updated.")
	1167	+ return redirect("fossil:webhooks", slug=slug)
1157	1168
1158	1169	return render(
1159	1170	request,
1160	1171	"fossil/webhook_form.html",
1161	1172	{
		@@ -1832,24 +1843,29 @@
1832	1843	if request.method == "GET":
1833	1844	if not can_read_project(request.user, project):
1834	1845	from django.core.exceptions import PermissionDenied
1835	1846
1836	1847	raise PermissionDenied
	1848	+ import html as html_mod
	1849	+
1837	1850	clone_url = request.build_absolute_uri()
1838	1851	is_public = project.visibility == "public"
1839	1852	auth_note = "" if is_public else "<p>Authentication is required.</p>"
1840		- html = (
1841		- f"<html><head><title>{project.name} — Fossil Sync</title></head>"
	1853	+ safe_name = html_mod.escape(project.name)
	1854	+ safe_slug = html_mod.escape(project.slug)
	1855	+ safe_url = html_mod.escape(clone_url)
	1856	+ response_html = (
	1857	+ f"<html><head><title>{safe_name} — Fossil Sync</title></head>"
1842	1858	f"<body>"
1843		- f"<h1>{project.name}</h1>"
1844		- f"<p>This is the Fossil sync endpoint for <strong>{project.name}</strong>.</p>"
	1859	+ f"<h1>{safe_name}</h1>"
	1860	+ f"<p>This is the Fossil sync endpoint for <strong>{safe_name}</strong>.</p>"
1845	1861	f"<p>Clone with:</p>"
1846		- f"<pre>fossil clone {clone_url} {project.slug}.fossil</pre>"
	1862	+ f"<pre>fossil clone {safe_url} {safe_slug}.fossil</pre>"
1847	1863	f"{auth_note}"
1848	1864	f"</body></html>"
1849	1865	)
1850		- return HttpResponse(html)
	1866	+ return HttpResponse(response_html)
1851	1867
1852	1868	if request.method == "POST":
1853	1869	if not fossil_repo.exists_on_disk:
1854	1870	raise Http404("Repository file not found on disk.")
1855	1871
1856	1872

	--- fossil/views.py
	+++ fossil/views.py
	@@ -1098,21 +1098,27 @@
1098	secret = request.POST.get("secret", "").strip()
1099	events = request.POST.getlist("events")
1100	is_active = request.POST.get("is_active") == "on"
1101
1102	if url:
1103	events_str = ",".join(events) if events else "all"
1104	Webhook.objects.create(
1105	repository=fossil_repo,
1106	url=url,
1107	secret=secret,
1108	events=events_str,
1109	is_active=is_active,
1110	created_by=request.user,
1111	)
1112	messages.success(request, f"Webhook for {url} created.")
1113	return redirect("fossil:webhooks", slug=slug)






1114
1115	return render(
1116	request,
1117	"fossil/webhook_form.html",
1118	{
	@@ -1142,20 +1148,25 @@
1142	secret = request.POST.get("secret", "").strip()
1143	events = request.POST.getlist("events")
1144	is_active = request.POST.get("is_active") == "on"
1145
1146	if url:
1147	webhook.url = url
1148	# Only update secret if a new one was provided (don't blank it on edit)
1149	if secret:
1150	webhook.secret = secret
1151	webhook.events = ",".join(events) if events else "all"
1152	webhook.is_active = is_active
1153	webhook.updated_by = request.user
1154	webhook.save()
1155	messages.success(request, f"Webhook for {webhook.url} updated.")
1156	return redirect("fossil:webhooks", slug=slug)





1157
1158	return render(
1159	request,
1160	"fossil/webhook_form.html",
1161	{
	@@ -1832,24 +1843,29 @@
1832	if request.method == "GET":
1833	if not can_read_project(request.user, project):
1834	from django.core.exceptions import PermissionDenied
1835
1836	raise PermissionDenied


1837	clone_url = request.build_absolute_uri()
1838	is_public = project.visibility == "public"
1839	auth_note = "" if is_public else "<p>Authentication is required.</p>"
1840	html = (
1841	f"<html><head><title>{project.name} — Fossil Sync</title></head>"



1842	f"<body>"
1843	f"<h1>{project.name}</h1>"
1844	f"<p>This is the Fossil sync endpoint for <strong>{project.name}</strong>.</p>"
1845	f"<p>Clone with:</p>"
1846	f"<pre>fossil clone {clone_url} {project.slug}.fossil</pre>"
1847	f"{auth_note}"
1848	f"</body></html>"
1849	)
1850	return HttpResponse(html)
1851
1852	if request.method == "POST":
1853	if not fossil_repo.exists_on_disk:
1854	raise Http404("Repository file not found on disk.")
1855
1856

	--- fossil/views.py
	+++ fossil/views.py
	@@ -1098,21 +1098,27 @@
1098	secret = request.POST.get("secret", "").strip()
1099	events = request.POST.getlist("events")
1100	is_active = request.POST.get("is_active") == "on"
1101
1102	if url:
1103	from core.url_validation import is_safe_webhook_url
1104
1105	is_safe, url_error = is_safe_webhook_url(url)
1106	if not is_safe:
1107	messages.error(request, f"Invalid webhook URL: {url_error}")
1108	else:
1109	events_str = ",".join(events) if events else "all"
1110	Webhook.objects.create(
1111	repository=fossil_repo,
1112	url=url,
1113	secret=secret,
1114	events=events_str,
1115	is_active=is_active,
1116	created_by=request.user,
1117	)
1118	messages.success(request, "Webhook created.")
1119	return redirect("fossil:webhooks", slug=slug)
1120
1121	return render(
1122	request,
1123	"fossil/webhook_form.html",
1124	{
	@@ -1142,20 +1148,25 @@
1148	secret = request.POST.get("secret", "").strip()
1149	events = request.POST.getlist("events")
1150	is_active = request.POST.get("is_active") == "on"
1151
1152	if url:
1153	from core.url_validation import is_safe_webhook_url
1154
1155	is_safe, url_error = is_safe_webhook_url(url)
1156	if not is_safe:
1157	messages.error(request, f"Invalid webhook URL: {url_error}")
1158	else:
1159	webhook.url = url
1160	if secret:
1161	webhook.secret = secret
1162	webhook.events = ",".join(events) if events else "all"
1163	webhook.is_active = is_active
1164	webhook.updated_by = request.user
1165	webhook.save()
1166	messages.success(request, "Webhook updated.")
1167	return redirect("fossil:webhooks", slug=slug)
1168
1169	return render(
1170	request,
1171	"fossil/webhook_form.html",
1172	{
	@@ -1832,24 +1843,29 @@
1843	if request.method == "GET":
1844	if not can_read_project(request.user, project):
1845	from django.core.exceptions import PermissionDenied
1846
1847	raise PermissionDenied
1848	import html as html_mod
1849
1850	clone_url = request.build_absolute_uri()
1851	is_public = project.visibility == "public"
1852	auth_note = "" if is_public else "<p>Authentication is required.</p>"
1853	safe_name = html_mod.escape(project.name)
1854	safe_slug = html_mod.escape(project.slug)
1855	safe_url = html_mod.escape(clone_url)
1856	response_html = (
1857	f"<html><head><title>{safe_name} — Fossil Sync</title></head>"
1858	f"<body>"
1859	f"<h1>{safe_name}</h1>"
1860	f"<p>This is the Fossil sync endpoint for <strong>{safe_name}</strong>.</p>"
1861	f"<p>Clone with:</p>"
1862	f"<pre>fossil clone {safe_url} {safe_slug}.fossil</pre>"
1863	f"{auth_note}"
1864	f"</body></html>"
1865	)
1866	return HttpResponse(response_html)
1867
1868	if request.method == "POST":
1869	if not fossil_repo.exists_on_disk:
1870	raise Http404("Repository file not found on disk.")
1871
1872

FossilRepo

Keyboard Shortcuts