Fossil SCM

Restructured invalid_utf8 so that it no longer has to initialize the lookup table on the first pass, and shrank the size of the table.

sdr 2016-06-14 18:06 invalid_utf8_table

Commit d3fc377276b80b413d1d0f2eed2cd5d57517d029

Parent d22c72bc9248b5e…

1 file changed +48 -58

~ src/lookslike.c

M src/lookslike.c

+48 -58

		--- src/lookslike.c
		+++ src/lookslike.c
		@@ -141,81 +141,71 @@
141	141	** which is not considered invalid here: Some languages like
142	142	** Java and Tcl use it. This function also considers valid
143	143	** the derivatives CESU-8 & WTF-8 (as described in the same
144	144	** wikipedia article referenced previously).
145	145	*/
	146	+
	147	+/* definitions for various UTF-8 sequence lengths */
	148	+static const unsigned char us2a[] = {
	149	+ 2, 0xC0, 0xC0, 0x80, 0x80
	150	+};
	151	+static const unsigned char us2b[] = {
	152	+ 2, 0xC2, 0xDF, 0x80, 0xBF
	153	+};
	154	+static const unsigned char us3a[] = {
	155	+ 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF
	156	+};
	157	+static const unsigned char us3b[] = {
	158	+ 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF
	159	+};
	160	+static const unsigned char us4a[] = {
	161	+ 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
	162	+};
	163	+static const unsigned char us4b[] = {
	164	+ 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
	165	+};
	166	+static const unsigned char us4c[] = {
	167	+ 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
	168	+};
	169	+
	170	+/* a table used for quick lookup of the definition that goes with a
	171	+ * particular lead byte */
	172	+static const unsigned char* lb_tab[] = {
	173	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	174	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	175	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	176	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	177	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	178	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	179	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	180	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	181	+ us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
	182	+ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
	183	+ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
	184	+ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
	185	+ us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
	186	+ us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
	187	+ us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
	188	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
	189	+};
146	190
147	191	int invalid_utf8(
148	192	const Blob *pContent
149	193	){
150		- /* definitions for various UTF-8 sequence lengths */
151		- static unsigned char def_2a[] = {
152		- 2, 0xC0, 0xC0, 0x80, 0x80
153		- };
154		- static unsigned char def_2b[] = {
155		- 2, 0xC2, 0xDF, 0x80, 0xBF
156		- };
157		- static unsigned char def_3a[] = {
158		- 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF
159		- };
160		- static unsigned char def_3b[] = {
161		- 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF
162		- };
163		- static unsigned char def_4a[] = {
164		- 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
165		- };
166		- static unsigned char def_4b[] = {
167		- 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
168		- };
169		- static unsigned char def_4c[] = {
170		- 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
171		- };
172		-
173		- /* an array of all the definitions */
174		- static unsigned char* def_arr[] = {
175		- def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL
176		- };
177		-
178		- /* a table used for quick lookup of the definition that goes with a
179		- * particular lead byte */
180		- static unsigned char* lb_tab[256] = { NULL };
181		-
182		- /* a pointer to the table; NULL means not yet setup */
183		- static unsigned char** lb_ptr = NULL;
184		-
185	194	/* buffer pointer and size */
186		- const unsigned char *z;
187		- unsigned int n;
188		-
189		- /* if the table pointer hasn't been initialized */
190		- if( lb_ptr==NULL ){
191		- unsigned char** pp;
192		- /* for each definition, set the lead byte table pointer to the
193		- * proper definition */
194		- lb_ptr = lb_tab;
195		- pp = def_arr;
196		- while( *pp!=NULL ){
197		- unsigned char lo = pp[0][1];
198		- unsigned char hi = pp[0][2];
199		- unsigned char i;
200		- for(i=lo; i<=hi; ++i){
201		- lb_ptr[i] = pp[0];
202		- }
203		- ++pp;
204		- }
205		- }
206		- z = (unsigned char *)blob_buffer(pContent);
207		- n = blob_size(pContent);
	195	+ const unsigned char *z = (unsigned char *)blob_buffer(pContent);
	196	+ unsigned int n = blob_size(pContent);
	197	+
208	198	/* while we haven't checked all the bytes in the buffer */
209	199	while( n>0 ){
210	200	/* ascii is trivial */
211	201	if( *z<0x80 ){
212	202	++z;
213	203	--n;
214	204	}else{
215	205	/* get the definition for this lead byte */
216		- unsigned char* def = lb_ptr[*z++];
	206	+ unsigned char* def = lb_tab[(*z++)-0x80];
217	207	unsigned char i, len;
218	208
219	209	/* if the definition doesn't exist, return invalid */
220	210	if( !def ) return LOOK_INVALID;
221	211	/* get the expected sequence length */
222	212

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -141,81 +141,71 @@
141	** which is not considered invalid here: Some languages like
142	** Java and Tcl use it. This function also considers valid
143	** the derivatives CESU-8 & WTF-8 (as described in the same
144	** wikipedia article referenced previously).
145	*/












































146
147	int invalid_utf8(
148	const Blob *pContent
149	){
150	/* definitions for various UTF-8 sequence lengths */
151	static unsigned char def_2a[] = {
152	2, 0xC0, 0xC0, 0x80, 0x80
153	};
154	static unsigned char def_2b[] = {
155	2, 0xC2, 0xDF, 0x80, 0xBF
156	};
157	static unsigned char def_3a[] = {
158	3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF
159	};
160	static unsigned char def_3b[] = {
161	3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF
162	};
163	static unsigned char def_4a[] = {
164	4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
165	};
166	static unsigned char def_4b[] = {
167	4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
168	};
169	static unsigned char def_4c[] = {
170	4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
171	};
172
173	/* an array of all the definitions */
174	static unsigned char* def_arr[] = {
175	def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL
176	};
177
178	/* a table used for quick lookup of the definition that goes with a
179	* particular lead byte */
180	static unsigned char* lb_tab[256] = { NULL };
181
182	/* a pointer to the table; NULL means not yet setup */
183	static unsigned char** lb_ptr = NULL;
184
185	/* buffer pointer and size */
186	const unsigned char *z;
187	unsigned int n;
188
189	/* if the table pointer hasn't been initialized */
190	if( lb_ptr==NULL ){
191	unsigned char** pp;
192	/* for each definition, set the lead byte table pointer to the
193	* proper definition */
194	lb_ptr = lb_tab;
195	pp = def_arr;
196	while( *pp!=NULL ){
197	unsigned char lo = pp[0][1];
198	unsigned char hi = pp[0][2];
199	unsigned char i;
200	for(i=lo; i<=hi; ++i){
201	lb_ptr[i] = pp[0];
202	}
203	++pp;
204	}
205	}
206	z = (unsigned char *)blob_buffer(pContent);
207	n = blob_size(pContent);
208	/* while we haven't checked all the bytes in the buffer */
209	while( n>0 ){
210	/* ascii is trivial */
211	if( *z<0x80 ){
212	++z;
213	--n;
214	}else{
215	/* get the definition for this lead byte */
216	unsigned char* def = lb_ptr[*z++];
217	unsigned char i, len;
218
219	/* if the definition doesn't exist, return invalid */
220	if( !def ) return LOOK_INVALID;
221	/* get the expected sequence length */
222

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -141,81 +141,71 @@
141	** which is not considered invalid here: Some languages like
142	** Java and Tcl use it. This function also considers valid
143	** the derivatives CESU-8 & WTF-8 (as described in the same
144	** wikipedia article referenced previously).
145	*/
146
147	/* definitions for various UTF-8 sequence lengths */
148	static const unsigned char us2a[] = {
149	2, 0xC0, 0xC0, 0x80, 0x80
150	};
151	static const unsigned char us2b[] = {
152	2, 0xC2, 0xDF, 0x80, 0xBF
153	};
154	static const unsigned char us3a[] = {
155	3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF
156	};
157	static const unsigned char us3b[] = {
158	3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF
159	};
160	static const unsigned char us4a[] = {
161	4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
162	};
163	static const unsigned char us4b[] = {
164	4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
165	};
166	static const unsigned char us4c[] = {
167	4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
168	};
169
170	/* a table used for quick lookup of the definition that goes with a
171	* particular lead byte */
172	static const unsigned char* lb_tab[] = {
173	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
174	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
175	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
176	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
177	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
178	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
180	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
181	us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
182	us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
183	us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
184	us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
185	us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
186	us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
187	us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
188	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
189	};
190
191	int invalid_utf8(
192	const Blob *pContent
193	){



































194	/* buffer pointer and size */
195	const unsigned char *z = (unsigned char *)blob_buffer(pContent);
196	unsigned int n = blob_size(pContent);
197



















198	/* while we haven't checked all the bytes in the buffer */
199	while( n>0 ){
200	/* ascii is trivial */
201	if( *z<0x80 ){
202	++z;
203	--n;
204	}else{
205	/* get the definition for this lead byte */
206	unsigned char* def = lb_tab[(*z++)-0x80];
207	unsigned char i, len;
208
209	/* if the definition doesn't exist, return invalid */
210	if( !def ) return LOOK_INVALID;
211	/* get the expected sequence length */
212

Fossil SCM

Keyboard Shortcuts