Fossil SCM

Fixed an edge case in invalid_utf8() where a valid three-byte sequence could be misidentified as invalid.

scott 2016-06-09 23:59 trunk

Commit 314cdab0d49d74237ddca987c3e0faa0870373cd

Parent c7e9625d4238f32…

1 file changed +10

M src/lookslike.c

+10

		--- src/lookslike.c
		+++ src/lookslike.c
		@@ -161,11 +161,21 @@
161	161	if( c2>=0x80 ){
162	162	if( ((c2<0xc2) || (c2>=0xf4) || ((c&0xc0)!=0x80)) &&
163	163	(((c2!=0xf4) \|\| (c>=0x90)) && ((c2!=0xc0) \|\| (c!=0x80))) ){
164	164	return LOOK_INVALID; /* Invalid UTF-8 */
165	165	}
	166	+ /* the first byte of the sequence is okay
	167	+ ** but we need to check the rest
	168	+ ** convert next byte to a prefix byte of the next shorter sequence
	169	+ ** or a simple space character if the two byte seq was valid
	170	+ */
166	171	c = (c2 >= 0xe0) ? (c2<<1)+1 : ' ';
	172	+ /* edge case: if three byte sequence started with 0xe0
	173	+ ** it becomes 0xc1, which is a too short two byte sequence
	174	+ ** so fix it up to be the start of a valid two byte sequence
	175	+ */
	176	+ if (c == 0xc1) c = 0xc2;
167	177	}
168	178	}
169	179	return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
170	180	}
171	181
172	182

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -161,11 +161,21 @@
161	if( c2>=0x80 ){
162	if( ((c2<0xc2) || (c2>=0xf4) || ((c&0xc0)!=0x80)) &&
163	(((c2!=0xf4) \|\| (c>=0x90)) && ((c2!=0xc0) \|\| (c!=0x80))) ){
164	return LOOK_INVALID; /* Invalid UTF-8 */
165	}





166	c = (c2 >= 0xe0) ? (c2<<1)+1 : ' ';





167	}
168	}
169	return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
170	}
171
172

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -161,11 +161,21 @@
161	if( c2>=0x80 ){
162	if( ((c2<0xc2) || (c2>=0xf4) || ((c&0xc0)!=0x80)) &&
163	(((c2!=0xf4) \|\| (c>=0x90)) && ((c2!=0xc0) \|\| (c!=0x80))) ){
164	return LOOK_INVALID; /* Invalid UTF-8 */
165	}
166	/* the first byte of the sequence is okay
167	** but we need to check the rest
168	** convert next byte to a prefix byte of the next shorter sequence
169	** or a simple space character if the two byte seq was valid
170	*/
171	c = (c2 >= 0xe0) ? (c2<<1)+1 : ' ';
172	/* edge case: if three byte sequence started with 0xe0
173	** it becomes 0xc1, which is a too short two byte sequence
174	** so fix it up to be the start of a valid two byte sequence
175	*/
176	if (c == 0xc1) c = 0xc2;
177	}
178	}
179	return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
180	}
181
182