Fossil SCM

Use a faster table-based approach when checking for invalid UTF-8, instead of complex bit operations.

jan.nijtmans 2016-06-16 09:13 trunk

Commit 60349a6617490676a2ea1a31fdce56decb641dc6

Parent bd559ff0d0db4e8…

1 file changed +53 -10

M src/lookslike.c

+53 -10

		--- src/lookslike.c
		+++ src/lookslike.c
		@@ -132,25 +132,72 @@
132	132	flags \|= LOOK_LONG; /* Very long line -> binary */
133	133	}
134	134	return flags;
135	135	}
136	136
137		-
138	137	/*
139	138	** Checks for proper UTF-8. It uses the method described in:
140	139	** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
141	140	** except for the "overlong form" of \u0000 which is not considered invalid
142		-** here: Some languages like Java and Tcl use it. For UTF-8 characters
	141	+** here: Some languages like Java and Tcl use it. This function also
	142	+** considers valid the derivatives CESU-8 & WTF-8 (as described in the
	143	+** same wikipedia article referenced previously). For UTF-8 characters
143	144	** > 7f, the variable 'c2' not necessary means the previous character.
144	145	** It's number of higher 1-bits indicate the number of continuation bytes
145	146	** that are expected to be followed. E.g. when 'c2' has a value in the range
146	147	** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
147	148	** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
148	149	** more continuation byte is expected.
149	150	*/
150	151
151		-int invalid_utf8(const Blob *pContent){
	152	+/* definitions for various UTF-8 sequence lengths */
	153	+static const unsigned char us2a[] = { /* for lead byte 0xC0 */
	154	+ 0x80, 0x80
	155	+};
	156	+static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */
	157	+ 0x80, 0xBF
	158	+};
	159	+static const unsigned char us3a[] = { /* for lead byte 0xE0 */
	160	+ 0xA0, 0xBF
	161	+};
	162	+static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */
	163	+ 0x80, 0xBF
	164	+};
	165	+static const unsigned char us4a[] = { /* for lead byte 0xF0 */
	166	+ 0x90, 0xBF
	167	+};
	168	+static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */
	169	+ 0x80, 0xBF
	170	+};
	171	+static const unsigned char us4c[] = { /* for lead byte 0xF4 */
	172	+ 0x80, 0x8F
	173	+};
	174	+
	175	+/* a table used for quick lookup of the definition that goes with a
	176	+ * particular lead byte */
	177	+static const unsigned char* const lb_tab[] = {
	178	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	179	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	180	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	181	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	182	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	183	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	184	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	185	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
	186	+ us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
	187	+ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
	188	+ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
	189	+ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
	190	+ us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
	191	+ us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
	192	+ us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
	193	+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
	194	+};
	195	+
	196	+int invalid_utf8(
	197	+ const Blob *pContent
	198	+){
152	199	const unsigned char *z = (unsigned char *) blob_buffer(pContent);
153	200	unsigned int n = blob_size(pContent);
154	201	unsigned char c, c2;
155	202
156	203	if( n==0 ) return 0; /* Empty file -> OK */
		@@ -157,27 +204,23 @@
157	204	c = *z;
158	205	while( --n>0 ){
159	206	c2 = c;
160	207	c = *++z;
161	208	if( c2>=0x80 ){
162		- if( ((c&0xc0)!=0x80) \|\| (((c2<0xc2) \|\| (c2>=0xf4)) &&
163		- (((c2!=0xf4) \|\| (c>=0x90)) && ((c2!=0xc0) \|\| (c!=0x80)))) ){
	209	+ const unsigned char *def = lb_tab[(c2)-0x80];
	210	+ if( !def \|\| (c<*def++) \|\| (c>*def++) ){
164	211	return LOOK_INVALID; /* Invalid UTF-8 */
165	212	}
166	213	if( c2>=0xe0 ){
167		- if ((c2==0xf0 && c<0x90)\|\|(c2==0xe0 && c<0xa0) ){
168		- return LOOK_INVALID; /* Invalid UTF-8, too short */
169		- }
170	214	c = (c2<<1)\|3;
171	215	}else{
172	216	c = ' ';
173	217	}
174	218	}
175	219	}
176	220	return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
177	221	}
178		-
179	222
180	223	/*
181	224	** Define the type needed to represent a Unicode (UTF-16) character.
182	225	*/
183	226	#ifndef WCHAR_T
		@@ -405,11 +448,11 @@
405	448	fUnicode = could_be_utf16(&blob, 0) \|\| fForceUtf16;
406	449	}
407	450	if( fUnicode ){
408	451	lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
409	452	}else{
410		- lookFlags = looks_like_utf8(&blob, 0)\|invalid_utf8(&blob);
	453	+ lookFlags = looks_like_utf8(&blob, 0) \| invalid_utf8(&blob);
411	454	}
412	455	}
413	456	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
414	457	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
415	458	fossil_print("Starts with UTF-16 BOM: %s\n",
416	459

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -132,25 +132,72 @@
132	flags \|= LOOK_LONG; /* Very long line -> binary */
133	}
134	return flags;
135	}
136
137
138	/*
139	** Checks for proper UTF-8. It uses the method described in:
140	** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
141	** except for the "overlong form" of \u0000 which is not considered invalid
142	** here: Some languages like Java and Tcl use it. For UTF-8 characters


143	** > 7f, the variable 'c2' not necessary means the previous character.
144	** It's number of higher 1-bits indicate the number of continuation bytes
145	** that are expected to be followed. E.g. when 'c2' has a value in the range
146	** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
147	** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
148	** more continuation byte is expected.
149	*/
150
151	int invalid_utf8(const Blob *pContent){














































152	const unsigned char *z = (unsigned char *) blob_buffer(pContent);
153	unsigned int n = blob_size(pContent);
154	unsigned char c, c2;
155
156	if( n==0 ) return 0; /* Empty file -> OK */
	@@ -157,27 +204,23 @@
157	c = *z;
158	while( --n>0 ){
159	c2 = c;
160	c = *++z;
161	if( c2>=0x80 ){
162	if( ((c&0xc0)!=0x80) \|\| (((c2<0xc2) \|\| (c2>=0xf4)) &&
163	(((c2!=0xf4) \|\| (c>=0x90)) && ((c2!=0xc0) \|\| (c!=0x80)))) ){
164	return LOOK_INVALID; /* Invalid UTF-8 */
165	}
166	if( c2>=0xe0 ){
167	if ((c2==0xf0 && c<0x90)\|\|(c2==0xe0 && c<0xa0) ){
168	return LOOK_INVALID; /* Invalid UTF-8, too short */
169	}
170	c = (c2<<1)\|3;
171	}else{
172	c = ' ';
173	}
174	}
175	}
176	return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
177	}
178
179
180	/*
181	** Define the type needed to represent a Unicode (UTF-16) character.
182	*/
183	#ifndef WCHAR_T
	@@ -405,11 +448,11 @@
405	fUnicode = could_be_utf16(&blob, 0) \|\| fForceUtf16;
406	}
407	if( fUnicode ){
408	lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
409	}else{
410	lookFlags = looks_like_utf8(&blob, 0)\|invalid_utf8(&blob);
411	}
412	}
413	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
414	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
415	fossil_print("Starts with UTF-16 BOM: %s\n",
416

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -132,25 +132,72 @@
132	flags \|= LOOK_LONG; /* Very long line -> binary */
133	}
134	return flags;
135	}
136

137	/*
138	** Checks for proper UTF-8. It uses the method described in:
139	** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
140	** except for the "overlong form" of \u0000 which is not considered invalid
141	** here: Some languages like Java and Tcl use it. This function also
142	** considers valid the derivatives CESU-8 & WTF-8 (as described in the
143	** same wikipedia article referenced previously). For UTF-8 characters
144	** > 7f, the variable 'c2' not necessary means the previous character.
145	** It's number of higher 1-bits indicate the number of continuation bytes
146	** that are expected to be followed. E.g. when 'c2' has a value in the range
147	** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
148	** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149	** more continuation byte is expected.
150	*/
151
152	/* definitions for various UTF-8 sequence lengths */
153	static const unsigned char us2a[] = { /* for lead byte 0xC0 */
154	0x80, 0x80
155	};
156	static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */
157	0x80, 0xBF
158	};
159	static const unsigned char us3a[] = { /* for lead byte 0xE0 */
160	0xA0, 0xBF
161	};
162	static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */
163	0x80, 0xBF
164	};
165	static const unsigned char us4a[] = { /* for lead byte 0xF0 */
166	0x90, 0xBF
167	};
168	static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */
169	0x80, 0xBF
170	};
171	static const unsigned char us4c[] = { /* for lead byte 0xF4 */
172	0x80, 0x8F
173	};
174
175	/* a table used for quick lookup of the definition that goes with a
176	* particular lead byte */
177	static const unsigned char* const lb_tab[] = {
178	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
180	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
181	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
182	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
183	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
184	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
185	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
186	us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
187	us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
188	us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
189	us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
190	us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
191	us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
192	us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
193	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
194	};
195
196	int invalid_utf8(
197	const Blob *pContent
198	){
199	const unsigned char *z = (unsigned char *) blob_buffer(pContent);
200	unsigned int n = blob_size(pContent);
201	unsigned char c, c2;
202
203	if( n==0 ) return 0; /* Empty file -> OK */
	@@ -157,27 +204,23 @@
204	c = *z;
205	while( --n>0 ){
206	c2 = c;
207	c = *++z;
208	if( c2>=0x80 ){
209	const unsigned char *def = lb_tab[(c2)-0x80];
210	if( !def \|\| (c<*def++) \|\| (c>*def++) ){
211	return LOOK_INVALID; /* Invalid UTF-8 */
212	}
213	if( c2>=0xe0 ){



214	c = (c2<<1)\|3;
215	}else{
216	c = ' ';
217	}
218	}
219	}
220	return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
221	}

222
223	/*
224	** Define the type needed to represent a Unicode (UTF-16) character.
225	*/
226	#ifndef WCHAR_T
	@@ -405,11 +448,11 @@
448	fUnicode = could_be_utf16(&blob, 0) \|\| fForceUtf16;
449	}
450	if( fUnicode ){
451	lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
452	}else{
453	lookFlags = looks_like_utf8(&blob, 0) \| invalid_utf8(&blob);
454	}
455	}
456	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
457	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
458	fossil_print("Starts with UTF-16 BOM: %s\n",
459

Fossil SCM

Keyboard Shortcuts