Fossil SCM

performance optimizations

sdr 2016-06-10 20:45 invalid_utf8_table

Commit 635f3b0300cffc2aa01ece178fe9684ca8120f0c

Parent 8a65d6f05c51962…

1 file changed +39 -22

M src/lookslike.c

+39 -22

		--- src/lookslike.c
		+++ src/lookslike.c
		@@ -143,24 +143,23 @@
143	143	** Java and Tcl use it. This function also considers valid
144	144	** the derivatives CESU-8 & WTF-8 (as described in the same
145	145	** wikipedia article referenced previously).
146	146	*/
147	147
148		-int invalid_utf8(const Blob *pContent)
	148	+int invalid_utf8_b(const Blob *pContent)
149	149	{
150	150	/* definitions for various utf-8 sequence lengths */
151		- static unsigned char def_1a[] = { 1, 0x00, 0x7F };
152	151	static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
153	152	static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
154	153	static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
155	154	static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
156	155	static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
157	156	static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
158	157	static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };
159	158
160	159	/* an array of all the definitions */
161		- static unsigned char* def_arr[] = { def_1a, def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };
	160	+ static unsigned char* def_arr[] = { def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };
162	161
163	162	/* a table used for quick lookup of the definition that goes with a particular lead byte */
164	163	static unsigned char* lb_tab[256] = { NULL };
165	164
166	165	/* a pointer to the table; NULL means not yet setup */
		@@ -189,29 +188,47 @@
189	188	unsigned int n = blob_size(pContent);
190	189
191	190	/* while we haven't checked all the bytes in the buffer */
192	191	while (n > 0)
193	192	{
194		- /* get the definition for this lead byte */
195		- unsigned char* def = lb_ptr[*z];
196		- unsigned char i;
197		-
198		- /* if the definition doesn't exist, or there aren't enough bytes left, return invalid */
199		- if (!def || (n < def[0]))
200		- return LOOK_INVALID;
201		-
202		- /* we already know byte #0 is good, so check the remaining bytes */
203		- for (i = 1; i < def[0]; ++i)
204		- {
205		- /* if the byte is outside the allowed range for this definition, return invalid */
206		- if ((z[i] < def[1 + i * 2 + 0]) || (z[i] > def[1 + i * 2 + 1]))
207		- return LOOK_INVALID;
208		- }
209		-
210		- /* advance to the next sequence */
211		- z += def[0];
212		- n -= def[0];
	193	+ /* ascii is trivial */
	194	+ if (*z < 0x80)
	195	+ {
	196	+ ++z;
	197	+ --n;
	198	+ }
	199	+ else
	200	+ {
	201	+ /* get the definition for this lead byte */
	202	+ unsigned char* def = lb_ptr[*z++];
	203	+ unsigned char i, len;
	204	+
	205	+ /* if the definition doesn't exist, return invalid */
	206	+ if (!def)
	207	+ return LOOK_INVALID;
	208	+
	209	+ /* get the expected sequence length */
	210	+ len = *def;
	211	+
	212	+ /* if there aren't enough bytes left, return invalid */
	213	+ if (n < len)
	214	+ return LOOK_INVALID;
	215	+
	216	+ /* skip the length & lead byte range */
	217	+ def += 3;
	218	+
	219	+ /* we already know byte #0 is good, so check the remaining bytes */
	220	+ for (i = 1; i < len; ++i)
	221	+ {
	222	+ /* if the byte is outside the allowed range for this definition, return invalid */
	223	+ if ((*z < *def++) || (*z++ > *def++))
	224	+ return LOOK_INVALID;
	225	+ }
	226	+
	227	+ /* advance to the next sequence */
	228	+ n -= len;
	229	+ }
213	230	}
214	231
215	232	/* we made it all the way through the buffer so it's not invalid */
216	233	return 0;
217	234	}
218	235

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -143,24 +143,23 @@
143	** Java and Tcl use it. This function also considers valid
144	** the derivatives CESU-8 & WTF-8 (as described in the same
145	** wikipedia article referenced previously).
146	*/
147
148	int invalid_utf8(const Blob *pContent)
149	{
150	/* definitions for various utf-8 sequence lengths */
151	static unsigned char def_1a[] = { 1, 0x00, 0x7F };
152	static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
153	static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
154	static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
155	static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
156	static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
157	static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
158	static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };
159
160	/* an array of all the definitions */
161	static unsigned char* def_arr[] = { def_1a, def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };
162
163	/* a table used for quick lookup of the definition that goes with a particular lead byte */
164	static unsigned char* lb_tab[256] = { NULL };
165
166	/* a pointer to the table; NULL means not yet setup */
	@@ -189,29 +188,47 @@
189	unsigned int n = blob_size(pContent);
190
191	/* while we haven't checked all the bytes in the buffer */
192	while (n > 0)
193	{
194	/* get the definition for this lead byte */
195	unsigned char* def = lb_ptr[*z];
196	unsigned char i;
197
198	/* if the definition doesn't exist, or there aren't enough bytes left, return invalid */
199	if (!def || (n < def[0]))
200	return LOOK_INVALID;
201
202	/* we already know byte #0 is good, so check the remaining bytes */
203	for (i = 1; i < def[0]; ++i)
204	{
205	/* if the byte is outside the allowed range for this definition, return invalid */
206	if ((z[i] < def[1 + i * 2 + 0]) || (z[i] > def[1 + i * 2 + 1]))
207	return LOOK_INVALID;
208	}
209
210	/* advance to the next sequence */
211	z += def[0];
212	n -= def[0];


















213	}
214
215	/* we made it all the way through the buffer so it's not invalid */
216	return 0;
217	}
218

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -143,24 +143,23 @@
143	** Java and Tcl use it. This function also considers valid
144	** the derivatives CESU-8 & WTF-8 (as described in the same
145	** wikipedia article referenced previously).
146	*/
147
148	int invalid_utf8_b(const Blob *pContent)
149	{
150	/* definitions for various utf-8 sequence lengths */

151	static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
152	static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
153	static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
154	static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
155	static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
156	static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
157	static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };
158
159	/* an array of all the definitions */
160	static unsigned char* def_arr[] = { def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };
161
162	/* a table used for quick lookup of the definition that goes with a particular lead byte */
163	static unsigned char* lb_tab[256] = { NULL };
164
165	/* a pointer to the table; NULL means not yet setup */
	@@ -189,29 +188,47 @@
188	unsigned int n = blob_size(pContent);
189
190	/* while we haven't checked all the bytes in the buffer */
191	while (n > 0)
192	{
193	/* ascii is trivial */
194	if (*z < 0x80)
195	{
196	++z;
197	--n;
198	}
199	else
200	{
201	/* get the definition for this lead byte */
202	unsigned char* def = lb_ptr[*z++];
203	unsigned char i, len;
204
205	/* if the definition doesn't exist, return invalid */
206	if (!def)
207	return LOOK_INVALID;
208
209	/* get the expected sequence length */
210	len = *def;
211
212	/* if there aren't enough bytes left, return invalid */
213	if (n < len)
214	return LOOK_INVALID;
215
216	/* skip the length & lead byte range */
217	def += 3;
218
219	/* we already know byte #0 is good, so check the remaining bytes */
220	for (i = 1; i < len; ++i)
221	{
222	/* if the byte is outside the allowed range for this definition, return invalid */
223	if ((*z < *def++) || (*z++ > *def++))
224	return LOOK_INVALID;
225	}
226
227	/* advance to the next sequence */
228	n -= len;
229	}
230	}
231
232	/* we made it all the way through the buffer so it's not invalid */
233	return 0;
234	}
235

Fossil SCM

Keyboard Shortcuts