Fossil SCM

Further coding style improvements for the new invalid_utf8() function.

mistachkin 2016-06-11 05:23 UTC invalid_utf8_table

Commit 2fb7d59beed17f94613e3108de7460681d0e0a1c

Parent dd3bb22cd726f26…

1 file changed +57 -41

M src/lookslike.c

+57 -41

		--- src/lookslike.c
		+++ src/lookslike.c
		@@ -132,11 +132,10 @@
132	132	flags |= LOOK_LONG; /* Very long line -> binary */
133	133	}
134	134	return flags;
135	135	}
136	136
137		-
138	137	/*
139	138	** Checks for proper UTF-8. It uses the method described in:
140	139	** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
141	140	** except for the "overlong form" of \u0000 (Modified UTF-8)
142	141	** which is not considered invalid here: Some languages like
		@@ -143,88 +142,105 @@
143	142	** Java and Tcl use it. This function also considers valid
144	143	** the derivatives CESU-8 & WTF-8 (as described in the same
145	144	** wikipedia article referenced previously).
146	145	*/
147	146
148		-int invalid_utf8(const Blob *pContent) {
149		- /* definitions for various utf-8 sequence lengths */
150		- static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
151		- static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
152		- static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
153		- static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
154		- static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
155		- static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
156		- static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };
	147	+int invalid_utf8(
	148	+ const Blob *pContent
	149	+){
	150	+ /* definitions for various UTF-8 sequence lengths */
	151	+ static unsigned char def_2a[] = {
	152	+ 2, 0xC0, 0xC0, 0x80, 0x80
	153	+ };
	154	+ static unsigned char def_2b[] = {
	155	+ 2, 0xC2, 0xDF, 0x80, 0xBF
	156	+ };
	157	+ static unsigned char def_3a[] = {
	158	+ 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF
	159	+ };
	160	+ static unsigned char def_3b[] = {
	161	+ 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF
	162	+ };
	163	+ static unsigned char def_4a[] = {
	164	+ 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
	165	+ };
	166	+ static unsigned char def_4b[] = {
	167	+ 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
	168	+ };
	169	+ static unsigned char def_4c[] = {
	170	+ 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
	171	+ };
157	172
158	173	/* an array of all the definitions */
159		- static unsigned char* def_arr[] = { def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };
	174	+ static unsigned char* def_arr[] = {
	175	+ def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL
	176	+ };
160	177
161		- /* a table used for quick lookup of the definition that goes with a particular lead byte */
	178	+ /* a table used for quick lookup of the definition that goes with a
	179	+ * particular lead byte */
162	180	static unsigned char* lb_tab[256] = { NULL };
163	181
164	182	/* a pointer to the table; NULL means not yet setup */
165	183	static unsigned char** lb_ptr = NULL;
	184	+
	185	+ /* buffer pointer and size */
	186	+ const unsigned char *z;
	187	+ unsigned int n;
166	188
167	189	/* if the table pointer hasn't been initialized */
168		- if (lb_ptr == NULL) {
	190	+ if( lb_ptr==NULL ){
	191	+ unsigned char** pp;
	192	+ /* for each definition, set the lead byte table pointer to the
	193	+ * proper definition */
169	194	lb_ptr = lb_tab;
170		-
171		- /* for each definition, set the lead byte table pointer to the proper definition */
172		- unsigned char** pp = def_arr;
173		- while (*pp != NULL) {
	195	+ pp = def_arr;
	196	+ while( *pp!=NULL ){
174	197	unsigned char lo = pp[0][1];
175	198	unsigned char hi = pp[0][2];
176	199	unsigned char i;
177		- for (i = lo; i <= hi; ++i)
	200	+ for(i=lo; i<=hi; ++i){
178	201	lb_ptr[i] = pp[0];
	202	+ }
179	203	++pp;
180	204	}
181	205	}
182		-
183		- /* buffer pointer and size */
184		- const unsigned char *z = (unsigned char *)blob_buffer(pContent);
185		- unsigned int n = blob_size(pContent);
186		-
	206	+ z = (unsigned char *)blob_buffer(pContent);
	207	+ n = blob_size(pContent);
187	208	/* while we haven't checked all the bytes in the buffer */
188		- while (n > 0) {
189		-
	209	+ while( n>0 ){
190	210	/* ascii is trivial */
191		- if (*z < 0x80) {
	211	+ if( *z<0x80 ){
192	212	++z;
193	213	--n;
194		- } else {
	214	+ }else{
195	215	/* get the definition for this lead byte */
196	216	unsigned char* def = lb_ptr[*z++];
197	217	unsigned char i, len;
198	218
199	219	/* if the definition doesn't exist, return invalid */
200		- if (!def) return LOOK_INVALID;
201		-
	220	+ if( !def ) return LOOK_INVALID;
202	221	/* get the expected sequence length */
203	222	len = *def;
204		-
205	223	/* if there aren't enough bytes left, return invalid */
206		- if (n < len) return LOOK_INVALID;
207		-
	224	+ if( n<len ) return LOOK_INVALID;
208	225	/* skip the length & lead byte range */
209	226	def += 3;
210		-
211	227	/* we already know byte #0 is good, so check the remaining bytes */
212		- for (i = 1; i < len; ++i)
213		- /* if the byte is outside the allowed range for this definition, return invalid */
214		- if ((*z < *def++) || (*z++ > *def++))
	228	+ for(i=1; i<len; ++i){
	229	+ /* if the byte is outside the allowed range for this definition,
	230	+ * return invalid */
	231	+ if( (*z<*def++) || (*z++>*def++) ){
215	232	return LOOK_INVALID;
216		-
	233	+ }
	234	+ }
217	235	/* advance to the next sequence */
218	236	n -= len;
219	237	}
220	238	}
221		-
222	239	/* we made it all the way through the buffer so it's not invalid */
223		- return 0;
	240	+ return LOOK_NONE;
224	241	}
225		-
226	242
227	243	/*
228	244	** Define the type needed to represent a Unicode (UTF-16) character.
229	245	*/
230	246	#ifndef WCHAR_T
		@@ -452,11 +468,11 @@
452	468	fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
453	469	}
454	470	if( fUnicode ){
455	471	lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
456	472	}else{
457		- lookFlags = looks_like_utf8(&blob, 0)|invalid_utf8(&blob);
	473	+ lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob);
458	474	}
459	475	}
460	476	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
461	477	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
462	478	fossil_print("Starts with UTF-16 BOM: %s\n",
463	479

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -132,11 +132,10 @@
132	flags |= LOOK_LONG; /* Very long line -> binary */
133	}
134	return flags;
135	}
136
137
138	/*
139	** Checks for proper UTF-8. It uses the method described in:
140	** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
141	** except for the "overlong form" of \u0000 (Modified UTF-8)
142	** which is not considered invalid here: Some languages like
	@@ -143,88 +142,105 @@
143	** Java and Tcl use it. This function also considers valid
144	** the derivatives CESU-8 & WTF-8 (as described in the same
145	** wikipedia article referenced previously).
146	*/
147
148	int invalid_utf8(const Blob *pContent) {
149	/* definitions for various utf-8 sequence lengths */
150	static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
151	static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
152	static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
153	static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
154	static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
155	static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
156	static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };
















157
158	/* an array of all the definitions */
159	static unsigned char* def_arr[] = { def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };


160
161	/* a table used for quick lookup of the definition that goes with a particular lead byte */

162	static unsigned char* lb_tab[256] = { NULL };
163
164	/* a pointer to the table; NULL means not yet setup */
165	static unsigned char** lb_ptr = NULL;




166
167	/* if the table pointer hasn't been initialized */
168	if (lb_ptr == NULL) {



169	lb_ptr = lb_tab;
170
171	/* for each definition, set the lead byte table pointer to the proper definition */
172	unsigned char** pp = def_arr;
173	while (*pp != NULL) {
174	unsigned char lo = pp[0][1];
175	unsigned char hi = pp[0][2];
176	unsigned char i;
177	for (i = lo; i <= hi; ++i)
178	lb_ptr[i] = pp[0];

179	++pp;
180	}
181	}
182
183	/* buffer pointer and size */
184	const unsigned char *z = (unsigned char *)blob_buffer(pContent);
185	unsigned int n = blob_size(pContent);
186
187	/* while we haven't checked all the bytes in the buffer */
188	while (n > 0) {
189
190	/* ascii is trivial */
191	if (*z < 0x80) {
192	++z;
193	--n;
194	} else {
195	/* get the definition for this lead byte */
196	unsigned char* def = lb_ptr[*z++];
197	unsigned char i, len;
198
199	/* if the definition doesn't exist, return invalid */
200	if (!def) return LOOK_INVALID;
201
202	/* get the expected sequence length */
203	len = *def;
204
205	/* if there aren't enough bytes left, return invalid */
206	if (n < len) return LOOK_INVALID;
207
208	/* skip the length & lead byte range */
209	def += 3;
210
211	/* we already know byte #0 is good, so check the remaining bytes */
212	for (i = 1; i < len; ++i)
213	/* if the byte is outside the allowed range for this definition, return invalid */
214	if ((*z < *def++) || (*z++ > *def++))

215	return LOOK_INVALID;
216

217	/* advance to the next sequence */
218	n -= len;
219	}
220	}
221
222	/* we made it all the way through the buffer so it's not invalid */
223	return 0;
224	}
225
226
227	/*
228	** Define the type needed to represent a Unicode (UTF-16) character.
229	*/
230	#ifndef WCHAR_T
	@@ -452,11 +468,11 @@
452	fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
453	}
454	if( fUnicode ){
455	lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
456	}else{
457	lookFlags = looks_like_utf8(&blob, 0)|invalid_utf8(&blob);
458	}
459	}
460	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
461	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
462	fossil_print("Starts with UTF-16 BOM: %s\n",
463

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -132,11 +132,10 @@
132	flags |= LOOK_LONG; /* Very long line -> binary */
133	}
134	return flags;
135	}
136

137	/*
138	** Checks for proper UTF-8. It uses the method described in:
139	** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
140	** except for the "overlong form" of \u0000 (Modified UTF-8)
141	** which is not considered invalid here: Some languages like
	@@ -143,88 +142,105 @@
142	** Java and Tcl use it. This function also considers valid
143	** the derivatives CESU-8 & WTF-8 (as described in the same
144	** wikipedia article referenced previously).
145	*/
146
147	int invalid_utf8(
148	const Blob *pContent
149	){
150	/* definitions for various UTF-8 sequence lengths */
151	static unsigned char def_2a[] = {
152	2, 0xC0, 0xC0, 0x80, 0x80
153	};
154	static unsigned char def_2b[] = {
155	2, 0xC2, 0xDF, 0x80, 0xBF
156	};
157	static unsigned char def_3a[] = {
158	3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF
159	};
160	static unsigned char def_3b[] = {
161	3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF
162	};
163	static unsigned char def_4a[] = {
164	4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
165	};
166	static unsigned char def_4b[] = {
167	4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
168	};
169	static unsigned char def_4c[] = {
170	4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
171	};
172
173	/* an array of all the definitions */
174	static unsigned char* def_arr[] = {
175	def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL
176	};
177
178	/* a table used for quick lookup of the definition that goes with a
179	* particular lead byte */
180	static unsigned char* lb_tab[256] = { NULL };
181
182	/* a pointer to the table; NULL means not yet setup */
183	static unsigned char** lb_ptr = NULL;
184
185	/* buffer pointer and size */
186	const unsigned char *z;
187	unsigned int n;
188
189	/* if the table pointer hasn't been initialized */
190	if( lb_ptr==NULL ){
191	unsigned char** pp;
192	/* for each definition, set the lead byte table pointer to the
193	* proper definition */
194	lb_ptr = lb_tab;
195	pp = def_arr;
196	while( *pp!=NULL ){


197	unsigned char lo = pp[0][1];
198	unsigned char hi = pp[0][2];
199	unsigned char i;
200	for(i=lo; i<=hi; ++i){
201	lb_ptr[i] = pp[0];
202	}
203	++pp;
204	}
205	}
206	z = (unsigned char *)blob_buffer(pContent);
207	n = blob_size(pContent);



208	/* while we haven't checked all the bytes in the buffer */
209	while( n>0 ){

210	/* ascii is trivial */
211	if( *z<0x80 ){
212	++z;
213	--n;
214	}else{
215	/* get the definition for this lead byte */
216	unsigned char* def = lb_ptr[*z++];
217	unsigned char i, len;
218
219	/* if the definition doesn't exist, return invalid */
220	if( !def ) return LOOK_INVALID;

221	/* get the expected sequence length */
222	len = *def;

223	/* if there aren't enough bytes left, return invalid */
224	if( n<len ) return LOOK_INVALID;

225	/* skip the length & lead byte range */
226	def += 3;

227	/* we already know byte #0 is good, so check the remaining bytes */
228	for(i=1; i<len; ++i){
229	/* if the byte is outside the allowed range for this definition,
230	* return invalid */
231	if( (*z<*def++) || (*z++>*def++) ){
232	return LOOK_INVALID;
233	}
234	}
235	/* advance to the next sequence */
236	n -= len;
237	}
238	}

239	/* we made it all the way through the buffer so it's not invalid */
240	return LOOK_NONE;
241	}

242
243	/*
244	** Define the type needed to represent a Unicode (UTF-16) character.
245	*/
246	#ifndef WCHAR_T
	@@ -452,11 +468,11 @@
468	fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
469	}
470	if( fUnicode ){
471	lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
472	}else{
473	lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob);
474	}
475	}
476	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
477	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
478	fossil_print("Starts with UTF-16 BOM: %s\n",
479

Fossil SCM

Keyboard Shortcuts