Fossil SCM

micro-optimizing invalid_utf8 function, should be as fast as possible now

jan.nijtmans 2016-06-26 17:05 trunk merge

Commit 7c08a68503a45da327ac55ca9252d9a71b43ff17

Parent e1034c4c35195ef…

1 file changed +52 -47

M src/lookslike.c

+52 -47

		--- src/lookslike.c
		+++ src/lookslike.c
		@@ -50,10 +50,41 @@
50	50	#define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */
51	51	#define LOOK_BINARY (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
52	52	#define LOOK_EOL (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */
53	53	#endif /* INTERFACE */
54	54
	55	+/* definitions for various UTF-8 sequence lengths, encoded as start value
	56	+ * and size of each valid range belonging to some lead byte*/
	57	+#define US2A 0x80, 0x01 /* for lead byte 0xC0 */
	58	+#define US2B 0x80, 0x40 /* for lead bytes 0xC2-0xDF */
	59	+#define US3A 0xA0, 0x20 /* for lead byte 0xE0 */
	60	+#define US3B 0x80, 0x40 /* for lead bytes 0xE1-0xEF */
	61	+#define US4A 0x90, 0x30 /* for lead byte 0xF0 */
	62	+#define US4B 0x80, 0x40 /* for lead bytes 0xF1-0xF3 */
	63	+#define US4C 0x80, 0x10 /* for lead byte 0xF4 */
	64	+#define US0A 0x00, 0x00 /* for any other lead byte */
	65	+
	66	+/* a table used for quick lookup of the definition that goes with a
	67	+ * particular lead byte */
	68	+static const unsigned char lb_tab[] = {
	69	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	70	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	71	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	72	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	73	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	74	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	75	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	76	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	77	+ US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
	78	+ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
	79	+ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
	80	+ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
	81	+ US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
	82	+ US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
	83	+ US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
	84	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
	85	+};
55	86
56	87	/*
57	88	** This function attempts to scan each logical line within the blob to
58	89	** determine the type of content it appears to contain. The return value
59	90	** is a combination of one or more of the LOOK_XXX flags (see above):
		@@ -135,72 +166,46 @@
135	166	}
136	167
137	168	/*
138	169	** Checks for proper UTF-8. It uses the method described in:
139	170	** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
140		-** except for the "overlong form" of \u0000 which is not considered invalid
141		-** here: Some languages like Java and Tcl use it. This function also
142		-** considers valid the derivatives CESU-8 & WTF-8 (as described in the
143		-** same wikipedia article referenced previously). For UTF-8 characters
144		-** > 7f, the variable 'c2' not necessary means the previous character.
145		-** It's number of higher 1-bits indicate the number of continuation bytes
146		-** that are expected to be followed. E.g. when 'c2' has a value in the range
147		-** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
148		-** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149		-** more continuation byte is expected.
	171	+** except for the "overlong form" of \u0000 which is not considered
	172	+** invalid here: Some languages like Java and Tcl use it. This function
	173	+** also considers valid the derivatives CESU-8 & WTF-8 (as described in
	174	+** the same wikipedia article referenced previously). For UTF-8 characters
	175	+** > 0x7f, the variable 'c' not necessary means the real lead byte.
	176	+** It's number of higher 1-bits indicate the number of continuation
	177	+** bytes that are expected to be followed. E.g. when 'c' has a value
	178	+** in the range 0xc0..0xdf it means that after 'c' a single continuation
	179	+** byte is expected. A value 0xe0..0xef means that after 'c' two more
	180	+** continuation bytes are expected.
150	181	*/
151	182
152		-/* definitions for various UTF-8 sequence lengths */
153		-#define US2A 0x80, 0x80 /* for lead byte 0xC0 */
154		-#define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */
155		-#define US3A 0xA0, 0xBF /* for lead byte 0xE0 */
156		-#define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */
157		-#define US4A 0x90, 0xBF /* for lead byte 0xF0 */
158		-#define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
159		-#define US4C 0x80, 0x8F /* for lead byte 0xF4 */
160		-#define US0A 0xFF, 0x00 /* for any other lead byte */
161		-
162		-/* a table used for quick lookup of the definition that goes with a
163		- * particular lead byte */
164		-static const unsigned char lb_tab[] = {
165		- US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
166		- US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
167		- US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
168		- US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
169		- US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
170		- US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
171		- US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
172		- US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
173		-};
174		-
175	183	int invalid_utf8(
176	184	const Blob *pContent
177	185	){
178	186	const unsigned char *z = (unsigned char *) blob_buffer(pContent);
179	187	unsigned int n = blob_size(pContent);
180		- unsigned char c, c2;
	188	+ unsigned char c; /* lead byte to be handled. */
181	189
182	190	if( n==0 ) return 0; /* Empty file -> OK */
183	191	c = *z;
184	192	while( --n>0 ){
185		- c2 = c;
186		- c = *++z;
187		- if( c2>=0xC0 ){
188		- const unsigned char *def = &lb_tab[(2*c2)-0x180];
189		- if( (c<*def) || (c>*++def) ){
	193	+ if( c>=0x80 ){
	194	+ const unsigned char *def; /* pointer to range table*/
	195	+
	196	+ c <<= 1; /* multiply by 2 and get rid of highest bit */
	197	+ def = &lb_tab[c]; /* search fb's valid range in table */
	198	+ if( (unsigned int)(*++z-def[0])>=def[1] ){
190	199	return LOOK_INVALID; /* Invalid UTF-8 */
191	200	}
192		- if( c2>=0xe0 ){
193		- c = (c2<<1)|3;
194		- }else{
195		- c = ' ';
196		- }
197		- }else if( c2>=0x80 ){
198		- return LOOK_INVALID;
	201	+ c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
	202	+ } else {
	203	+ c = *++z;
199	204	}
200	205	}
201		- return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
	206	+ return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */
202	207	}
203	208
204	209	/*
205	210	** Define the type needed to represent a Unicode (UTF-16) character.
206	211	*/
207	212

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -50,10 +50,41 @@
50	#define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */
51	#define LOOK_BINARY (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
52	#define LOOK_EOL (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */
53	#endif /* INTERFACE */
54































55
56	/*
57	** This function attempts to scan each logical line within the blob to
58	** determine the type of content it appears to contain. The return value
59	** is a combination of one or more of the LOOK_XXX flags (see above):
	@@ -135,72 +166,46 @@
135	}
136
137	/*
138	** Checks for proper UTF-8. It uses the method described in:
139	** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
140	** except for the "overlong form" of \u0000 which is not considered invalid
141	** here: Some languages like Java and Tcl use it. This function also
142	** considers valid the derivatives CESU-8 & WTF-8 (as described in the
143	** same wikipedia article referenced previously). For UTF-8 characters
144	** > 7f, the variable 'c2' not necessary means the previous character.
145	** It's number of higher 1-bits indicate the number of continuation bytes
146	** that are expected to be followed. E.g. when 'c2' has a value in the range
147	** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
148	** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149	** more continuation byte is expected.
150	*/
151
152	/* definitions for various UTF-8 sequence lengths */
153	#define US2A 0x80, 0x80 /* for lead byte 0xC0 */
154	#define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */
155	#define US3A 0xA0, 0xBF /* for lead byte 0xE0 */
156	#define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */
157	#define US4A 0x90, 0xBF /* for lead byte 0xF0 */
158	#define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
159	#define US4C 0x80, 0x8F /* for lead byte 0xF4 */
160	#define US0A 0xFF, 0x00 /* for any other lead byte */
161
162	/* a table used for quick lookup of the definition that goes with a
163	* particular lead byte */
164	static const unsigned char lb_tab[] = {
165	US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
166	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
167	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
168	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
169	US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
170	US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
171	US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
172	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
173	};
174
175	int invalid_utf8(
176	const Blob *pContent
177	){
178	const unsigned char *z = (unsigned char *) blob_buffer(pContent);
179	unsigned int n = blob_size(pContent);
180	unsigned char c, c2;
181
182	if( n==0 ) return 0; /* Empty file -> OK */
183	c = *z;
184	while( --n>0 ){
185	c2 = c;
186	c = *++z;
187	if( c2>=0xC0 ){
188	const unsigned char *def = &lb_tab[(2*c2)-0x180];
189	if( (c<*def) || (c>*++def) ){

190	return LOOK_INVALID; /* Invalid UTF-8 */
191	}
192	if( c2>=0xe0 ){
193	c = (c2<<1)|3;
194	}else{
195	c = ' ';
196	}
197	}else if( c2>=0x80 ){
198	return LOOK_INVALID;
199	}
200	}
201	return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
202	}
203
204	/*
205	** Define the type needed to represent a Unicode (UTF-16) character.
206	*/
207

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -50,10 +50,41 @@
50	#define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */
51	#define LOOK_BINARY (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
52	#define LOOK_EOL (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */
53	#endif /* INTERFACE */
54
55	/* definitions for various UTF-8 sequence lengths, encoded as start value
56	* and size of each valid range belonging to some lead byte*/
57	#define US2A 0x80, 0x01 /* for lead byte 0xC0 */
58	#define US2B 0x80, 0x40 /* for lead bytes 0xC2-0xDF */
59	#define US3A 0xA0, 0x20 /* for lead byte 0xE0 */
60	#define US3B 0x80, 0x40 /* for lead bytes 0xE1-0xEF */
61	#define US4A 0x90, 0x30 /* for lead byte 0xF0 */
62	#define US4B 0x80, 0x40 /* for lead bytes 0xF1-0xF3 */
63	#define US4C 0x80, 0x10 /* for lead byte 0xF4 */
64	#define US0A 0x00, 0x00 /* for any other lead byte */
65
66	/* a table used for quick lookup of the definition that goes with a
67	* particular lead byte */
68	static const unsigned char lb_tab[] = {
69	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
70	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
71	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
72	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
73	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
74	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
75	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
76	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
77	US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
78	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
79	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
80	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
81	US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
82	US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
83	US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
84	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
85	};
86
87	/*
88	** This function attempts to scan each logical line within the blob to
89	** determine the type of content it appears to contain. The return value
90	** is a combination of one or more of the LOOK_XXX flags (see above):
	@@ -135,72 +166,46 @@
166	}
167
168	/*
169	** Checks for proper UTF-8. It uses the method described in:
170	** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
171	** except for the "overlong form" of \u0000 which is not considered
172	** invalid here: Some languages like Java and Tcl use it. This function
173	** also considers valid the derivatives CESU-8 & WTF-8 (as described in
174	** the same wikipedia article referenced previously). For UTF-8 characters
175	** > 0x7f, the variable 'c' not necessary means the real lead byte.
176	** It's number of higher 1-bits indicate the number of continuation
177	** bytes that are expected to be followed. E.g. when 'c' has a value
178	** in the range 0xc0..0xdf it means that after 'c' a single continuation
179	** byte is expected. A value 0xe0..0xef means that after 'c' two more
180	** continuation bytes are expected.
181	*/
182























183	int invalid_utf8(
184	const Blob *pContent
185	){
186	const unsigned char *z = (unsigned char *) blob_buffer(pContent);
187	unsigned int n = blob_size(pContent);
188	unsigned char c; /* lead byte to be handled. */
189
190	if( n==0 ) return 0; /* Empty file -> OK */
191	c = *z;
192	while( --n>0 ){
193	if( c>=0x80 ){
194	const unsigned char *def; /* pointer to range table*/
195
196	c <<= 1; /* multiply by 2 and get rid of highest bit */
197	def = &lb_tab[c]; /* search fb's valid range in table */
198	if( (unsigned int)(*++z-def[0])>=def[1] ){
199	return LOOK_INVALID; /* Invalid UTF-8 */
200	}
201	c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
202	} else {
203	c = *++z;




204	}
205	}
206	return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */
207	}
208
209	/*
210	** Define the type needed to represent a Unicode (UTF-16) character.
211	*/
212

Fossil SCM

Keyboard Shortcuts