Fossil SCM

proposed new invalid_utf8 function

sdr 2016-06-10 08:07 trunk

Commit e58334a00799c303c6ad187e6c84c2151457b146

Parent 314cdab0d49d742…

1 file changed +73 -36

M src/lookslike.c

+73 -36

		--- src/lookslike.c
		+++ src/lookslike.c
		@@ -136,49 +136,86 @@
136	136
137	137
138	138	/*
139	139	** Checks for proper UTF-8. It uses the method described in:
140	140	** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
141		-** except for the "overlong form" of \u0000 which is not considered invalid
142		-** here: Some languages like Java and Tcl use it. For UTF-8 characters
143		-** > 7f, the variable 'c2' not necessary means the previous character.
144		-** It's number of higher 1-bits indicate the number of continuation bytes
145		-** that are expected to be followed. E.g. when 'c2' has a value in the range
146		-** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
147		-** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
148		-** more continuation byte is expected.
	141	+** except for the "overlong form" of \u0000 (Modified UTF-8)
	142	+** which is not considered invalid here: Some languages like
	143	+** Java and Tcl use it. This function also considers valid
	144	+** the derivatives CESU-8 & WTF-8 (as described in the same
	145	+** wikipedia article referenced previously).
149	146	*/
150	147
151		-int invalid_utf8(const Blob *pContent){
152		- const unsigned char *z = (unsigned char *) blob_buffer(pContent);
	148	+int invalid_utf8(const Blob *pContent)
	149	+{
	150	+ /* definitions for various utf-8 sequence lengths */
	151	+ static unsigned char def_1a[] = { 1, 0x00, 0x7F };
	152	+ static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
	153	+ static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
	154	+ static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
	155	+ static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
	156	+ static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
	157	+ static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
	158	+ static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };
	159	+
	160	+ /* an array of all the definitions */
	161	+ static unsigned char* def_arr[] = { def_1a, def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };
	162	+
	163	+ /* a table used for quick lookup of the definition that goes with a particular lead byte */
	164	+ static unsigned char* lb_tab[256] = { NULL };
	165	+
	166	+ /* a pointer to the table; NULL means not yet setup */
	167	+ static unsigned char** lb_ptr = NULL;
	168	+
	169	+ /* if the table pointer hasn't been initialized */
	170	+ if (lb_ptr == NULL)
	171	+ {
	172	+ lb_ptr = lb_tab;
	173	+
	174	+ /* for each definition, set the lead byte table pointer to the proper definition */
	175	+ unsigned char** pp = def_arr;
	176	+ while (*pp != NULL)
	177	+ {
	178	+ unsigned char lo = pp[0][1];
	179	+ unsigned char hi = pp[0][2];
	180	+ unsigned char i;
	181	+ for (i = lo; i <= hi; ++i)
	182	+ lb_ptr[i] = pp[0];
	183	+ ++pp;
	184	+ }
	185	+ }
	186	+
	187	+ /* buffer pointer and size */
	188	+ const unsigned char *z = (unsigned char *)blob_buffer(pContent);
153	189	unsigned int n = blob_size(pContent);
154		- unsigned char c, c2;
155		-
156		- if( n==0 ) return 0; /* Empty file -> OK */
157		- c = *z;
158		- while( --n>0 ){
159		- c2 = c;
160		- c = *++z;
161		- if( c2>=0x80 ){
162		- if( ((c2<0xc2) || (c2>=0xf4) || ((c&0xc0)!=0x80)) &&
163		- (((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80))) ){
164		- return LOOK_INVALID; /* Invalid UTF-8 */
165		- }
166		- /* the first byte of the sequence is okay
167		- ** but we need to check the rest
168		- ** convert next byte to a prefix byte of the next shorter sequence
169		- ** or a simple space character if the two byte seq was valid
170		- */
171		- c = (c2 >= 0xe0) ? (c2<<1)+1 : ' ';
172		- /* edge case: if three byte sequence started with 0xe0
173		- ** it becomes 0xc1, which is a too short two byte sequence
174		- ** so fix it up to be the start of a valid two byte sequence
175		- */
176		- if (c == 0xc1) c = 0xc2;
177		- }
178		- }
179		- return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
	190	+
	191	+ /* while we haven't checked all the bytes in the buffer */
	192	+ while (n > 0)
	193	+ {
	194	+ /* get the definition for this lead byte */
	195	+ unsigned char* def = lb_ptr[*z];
	196	+ unsigned char i;
	197	+
	198	+ /* if the definition doesn't exist, or there aren't enough bytes left, return invalid */
	199	+ if (!def || (n < def[0]))
	200	+ return LOOK_INVALID;
	201	+
	202	+ /* we already know byte #0 is good, so check the remaining bytes */
	203	+ for (i = 1; i < def[0]; ++i)
	204	+ {
	205	+ /* if the byte is outside the allowed range for this definition, return invalid */
	206	+ if ((z[i] < def[1 + i * 2 + 0]) || (z[i] > def[1 + i * 2 + 1]))
	207	+ return LOOK_INVALID;
	208	+ }
	209	+
	210	+ /* advance to the next sequence */
	211	+ z += def[0];
	212	+ n -= def[0];
	213	+ }
	214	+
	215	+ /* we made it all the way through the buffer so it's not invalid */
	216	+ return 0;
180	217	}
181	218
182	219
183	220	/*
184	221	** Define the type needed to represent a Unicode (UTF-16) character.
185	222

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -136,49 +136,86 @@
136
137
138	/*
139	** Checks for proper UTF-8. It uses the method described in:
140	** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
141	** except for the "overlong form" of \u0000 which is not considered invalid
142	** here: Some languages like Java and Tcl use it. For UTF-8 characters
143	** > 7f, the variable 'c2' not necessary means the previous character.
144	** It's number of higher 1-bits indicate the number of continuation bytes
145	** that are expected to be followed. E.g. when 'c2' has a value in the range
146	** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
147	** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
148	** more continuation byte is expected.
149	*/
150
151	int invalid_utf8(const Blob *pContent){
152	const unsigned char *z = (unsigned char *) blob_buffer(pContent);







































153	unsigned int n = blob_size(pContent);
154	unsigned char c, c2;
155
156	if( n==0 ) return 0; /* Empty file -> OK */
157	c = *z;
158	while( --n>0 ){
159	c2 = c;
160	c = *++z;
161	if( c2>=0x80 ){
162	if( ((c2<0xc2) || (c2>=0xf4) || ((c&0xc0)!=0x80)) &&
163	(((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80))) ){
164	return LOOK_INVALID; /* Invalid UTF-8 */
165	}
166	/* the first byte of the sequence is okay
167	** but we need to check the rest
168	** convert next byte to a prefix byte of the next shorter sequence
169	** or a simple space character if the two byte seq was valid
170	*/
171	c = (c2 >= 0xe0) ? (c2<<1)+1 : ' ';
172	/* edge case: if three byte sequence started with 0xe0
173	** it becomes 0xc1, which is a too short two byte sequence
174	** so fix it up to be the start of a valid two byte sequence
175	*/
176	if (c == 0xc1) c = 0xc2;
177	}
178	}
179	return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */

180	}
181
182
183	/*
184	** Define the type needed to represent a Unicode (UTF-16) character.
185

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -136,49 +136,86 @@
136
137
138	/*
139	** Checks for proper UTF-8. It uses the method described in:
140	** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
141	** except for the "overlong form" of \u0000 (Modified UTF-8)
142	** which is not considered invalid here: Some languages like
143	** Java and Tcl use it. This function also considers valid
144	** the derivatives CESU-8 & WTF-8 (as described in the same
145	** wikipedia article referenced previously).



146	*/
147
148	int invalid_utf8(const Blob *pContent)
149	{
150	/* definitions for various utf-8 sequence lengths */
151	static unsigned char def_1a[] = { 1, 0x00, 0x7F };
152	static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
153	static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
154	static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
155	static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
156	static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
157	static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
158	static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };
159
160	/* an array of all the definitions */
161	static unsigned char* def_arr[] = { def_1a, def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };
162
163	/* a table used for quick lookup of the definition that goes with a particular lead byte */
164	static unsigned char* lb_tab[256] = { NULL };
165
166	/* a pointer to the table; NULL means not yet setup */
167	static unsigned char** lb_ptr = NULL;
168
169	/* if the table pointer hasn't been initialized */
170	if (lb_ptr == NULL)
171	{
172	lb_ptr = lb_tab;
173
174	/* for each definition, set the lead byte table pointer to the proper definition */
175	unsigned char** pp = def_arr;
176	while (*pp != NULL)
177	{
178	unsigned char lo = pp[0][1];
179	unsigned char hi = pp[0][2];
180	unsigned char i;
181	for (i = lo; i <= hi; ++i)
182	lb_ptr[i] = pp[0];
183	++pp;
184	}
185	}
186
187	/* buffer pointer and size */
188	const unsigned char *z = (unsigned char *)blob_buffer(pContent);
189	unsigned int n = blob_size(pContent);
190
191	/* while we haven't checked all the bytes in the buffer */
192	while (n > 0)
193	{
194	/* get the definition for this lead byte */
195	unsigned char* def = lb_ptr[*z];
196	unsigned char i;
197
198	/* if the definition doesn't exist, or there aren't enough bytes left, return invalid */
199	if (!def || (n < def[0]))
200	return LOOK_INVALID;
201
202	/* we already know byte #0 is good, so check the remaining bytes */
203	for (i = 1; i < def[0]; ++i)
204	{
205	/* if the byte is outside the allowed range for this definition, return invalid */
206	if ((z[i] < def[1 + i * 2 + 0]) || (z[i] > def[1 + i * 2 + 1]))
207	return LOOK_INVALID;
208	}
209
210	/* advance to the next sequence */
211	z += def[0];
212	n -= def[0];
213	}
214
215	/* we made it all the way through the buffer so it's not invalid */
216	return 0;
217	}
218
219
220	/*
221	** Define the type needed to represent a Unicode (UTF-16) character.
222

Fossil SCM

Keyboard Shortcuts