Fossil SCM

More optimizations, taken over from trunk.

jan.nijtmans 2016-06-16 12:14 invalid_utf8_table merge

Commit ec7f6b2e71c5001416d37d2d5a88b63ab1b0da23

Parent c22ec007ea9a62a…

2 files changed +28 -43 +28 -43

M src/lookslike.c

+28 -43

		--- src/lookslike.c
		+++ src/lookslike.c
		@@ -143,51 +143,38 @@
143	143	** the derivatives CESU-8 & WTF-8 (as described in the same
144	144	** wikipedia article referenced previously).
145	145	*/
146	146
147	147	/* definitions for various UTF-8 sequence lengths */
148		-static const unsigned char us2a[] = { /* for lead byte 0xC0 */
149		- 2, 0x80, 0x80
150		-};
151		-static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */
152		- 2, 0x80, 0xBF
153		-};
154		-static const unsigned char us3a[] = { /* for lead byte 0xE0 */
155		- 3, 0xA0, 0xBF
156		-};
157		-static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */
158		- 3, 0x80, 0xBF
159		-};
160		-static const unsigned char us4a[] = { /* for lead byte 0xF0 */
161		- 4, 0x90, 0xBF
162		-};
163		-static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */
164		- 4, 0x80, 0xBF
165		-};
166		-static const unsigned char us4c[] = { /* for lead byte 0xF4 */
167		- 4, 0x80, 0x8F
168		-};
	148	+#define US2A 2, 0x80, 0x80 /* for lead byte 0xC0 */
	149	+#define US2B 2, 0x80, 0xBF /* for lead bytes 0xC2-0xDF */
	150	+#define US3A 3, 0xA0, 0xBF /* for lead byte 0xE0 */
	151	+#define US3B 3, 0x80, 0xBF /* for lead bytes 0xE1-0xEF */
	152	+#define US4A 4, 0x90, 0xBF /* for lead byte 0xF0 */
	153	+#define US4B 4, 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
	154	+#define US4C 4, 0x80, 0x8F /* for lead byte 0xF4 */
	155	+#define US0A 0xFF, 0xFF, 0x00 /* for any other lead byte */
169	156
170	157	/* a table used for quick lookup of the definition that goes with a
171	158	* particular lead byte */
172		-static const unsigned char* const lb_tab[] = {
173		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
174		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
175		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
176		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
177		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
178		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
180		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
181		- us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
182		- us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
183		- us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
184		- us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
185		- us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
186		- us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
187		- us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
188		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
	159	+static const unsigned char lb_tab[] = {
	160	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	161	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	162	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	163	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	164	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	165	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	166	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	167	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	168	+ US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
	169	+ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
	170	+ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
	171	+ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
	172	+ US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
	173	+ US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
	174	+ US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
	175	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
189	176	};
190	177
191	178	int invalid_utf8(
192	179	const Blob *pContent
193	180	){
		@@ -201,23 +188,21 @@
201	188	if( *z<0x80 ){
202	189	++z;
203	190	--n;
204	191	}else{
205	192	/* get the definition for this lead byte */
206		- const unsigned char* def = lb_tab[(*z++)-0x80];
	193	+ const unsigned char* def = &lb_tab[(3 * *z++)-0x180];
207	194	unsigned char len;
208	195
209		- /* if the definition doesn't exist, return invalid */
210		- if( !def ) return LOOK_INVALID;
211	196	/* get the expected sequence length */
212		- len = *def++;
	197	+ len = *def;
213	198	/* if there aren't enough bytes left, return invalid */
214	199	if( n<len ) {
215	200	return LOOK_INVALID;
216	201	}
217	202	/* we already know byte #0 is good, so check the remaining bytes */
218		- if( (*z<*def++) || (*z++>*def++) ){
	203	+ if( (*z<*++def) || (*z++>*++def) ){
219	204	/* if the byte is outside the allowed range for this definition,
220	205	* return invalid */
221	206	return LOOK_INVALID;
222	207	}
223	208	if( len > 2 ){
224	209

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -143,51 +143,38 @@
143	** the derivatives CESU-8 & WTF-8 (as described in the same
144	** wikipedia article referenced previously).
145	*/
146
147	/* definitions for various UTF-8 sequence lengths */
148	static const unsigned char us2a[] = { /* for lead byte 0xC0 */
149	2, 0x80, 0x80
150	};
151	static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */
152	2, 0x80, 0xBF
153	};
154	static const unsigned char us3a[] = { /* for lead byte 0xE0 */
155	3, 0xA0, 0xBF
156	};
157	static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */
158	3, 0x80, 0xBF
159	};
160	static const unsigned char us4a[] = { /* for lead byte 0xF0 */
161	4, 0x90, 0xBF
162	};
163	static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */
164	4, 0x80, 0xBF
165	};
166	static const unsigned char us4c[] = { /* for lead byte 0xF4 */
167	4, 0x80, 0x8F
168	};
169
170	/* a table used for quick lookup of the definition that goes with a
171	* particular lead byte */
172	static const unsigned char* const lb_tab[] = {
173	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
174	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
175	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
176	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
177	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
178	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
180	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
181	us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
182	us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
183	us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
184	us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
185	us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
186	us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
187	us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
188	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
189	};
190
191	int invalid_utf8(
192	const Blob *pContent
193	){
	@@ -201,23 +188,21 @@
201	if( *z<0x80 ){
202	++z;
203	--n;
204	}else{
205	/* get the definition for this lead byte */
206	const unsigned char* def = lb_tab[(*z++)-0x80];
207	unsigned char len;
208
209	/* if the definition doesn't exist, return invalid */
210	if( !def ) return LOOK_INVALID;
211	/* get the expected sequence length */
212	len = *def++;
213	/* if there aren't enough bytes left, return invalid */
214	if( n<len ) {
215	return LOOK_INVALID;
216	}
217	/* we already know byte #0 is good, so check the remaining bytes */
218	if( (*z<*def++) || (*z++>*def++) ){
219	/* if the byte is outside the allowed range for this definition,
220	* return invalid */
221	return LOOK_INVALID;
222	}
223	if( len > 2 ){
224

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -143,51 +143,38 @@
143	** the derivatives CESU-8 & WTF-8 (as described in the same
144	** wikipedia article referenced previously).
145	*/
146
147	/* definitions for various UTF-8 sequence lengths */
148	#define US2A 2, 0x80, 0x80 /* for lead byte 0xC0 */
149	#define US2B 2, 0x80, 0xBF /* for lead bytes 0xC2-0xDF */
150	#define US3A 3, 0xA0, 0xBF /* for lead byte 0xE0 */
151	#define US3B 3, 0x80, 0xBF /* for lead bytes 0xE1-0xEF */
152	#define US4A 4, 0x90, 0xBF /* for lead byte 0xF0 */
153	#define US4B 4, 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
154	#define US4C 4, 0x80, 0x8F /* for lead byte 0xF4 */
155	#define US0A 0xFF, 0xFF, 0x00 /* for any other lead byte */













156
157	/* a table used for quick lookup of the definition that goes with a
158	* particular lead byte */
159	static const unsigned char lb_tab[] = {
160	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
161	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
162	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
163	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
164	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
165	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
166	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
167	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
168	US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
169	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
170	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
171	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
172	US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
173	US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
174	US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
175	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
176	};
177
178	int invalid_utf8(
179	const Blob *pContent
180	){
	@@ -201,23 +188,21 @@
188	if( *z<0x80 ){
189	++z;
190	--n;
191	}else{
192	/* get the definition for this lead byte */
193	const unsigned char* def = &lb_tab[(3 * *z++)-0x180];
194	unsigned char len;
195


196	/* get the expected sequence length */
197	len = *def;
198	/* if there aren't enough bytes left, return invalid */
199	if( n<len ) {
200	return LOOK_INVALID;
201	}
202	/* we already know byte #0 is good, so check the remaining bytes */
203	if( (*z<*++def) || (*z++>*++def) ){
204	/* if the byte is outside the allowed range for this definition,
205	* return invalid */
206	return LOOK_INVALID;
207	}
208	if( len > 2 ){
209

M src/lookslike.c

+28 -43

		--- src/lookslike.c
		+++ src/lookslike.c
		@@ -143,51 +143,38 @@
143	143	** the derivatives CESU-8 & WTF-8 (as described in the same
144	144	** wikipedia article referenced previously).
145	145	*/
146	146
147	147	/* definitions for various UTF-8 sequence lengths */
148		-static const unsigned char us2a[] = { /* for lead byte 0xC0 */
149		- 2, 0x80, 0x80
150		-};
151		-static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */
152		- 2, 0x80, 0xBF
153		-};
154		-static const unsigned char us3a[] = { /* for lead byte 0xE0 */
155		- 3, 0xA0, 0xBF
156		-};
157		-static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */
158		- 3, 0x80, 0xBF
159		-};
160		-static const unsigned char us4a[] = { /* for lead byte 0xF0 */
161		- 4, 0x90, 0xBF
162		-};
163		-static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */
164		- 4, 0x80, 0xBF
165		-};
166		-static const unsigned char us4c[] = { /* for lead byte 0xF4 */
167		- 4, 0x80, 0x8F
168		-};
	148	+#define US2A 2, 0x80, 0x80 /* for lead byte 0xC0 */
	149	+#define US2B 2, 0x80, 0xBF /* for lead bytes 0xC2-0xDF */
	150	+#define US3A 3, 0xA0, 0xBF /* for lead byte 0xE0 */
	151	+#define US3B 3, 0x80, 0xBF /* for lead bytes 0xE1-0xEF */
	152	+#define US4A 4, 0x90, 0xBF /* for lead byte 0xF0 */
	153	+#define US4B 4, 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
	154	+#define US4C 4, 0x80, 0x8F /* for lead byte 0xF4 */
	155	+#define US0A 0xFF, 0xFF, 0x00 /* for any other lead byte */
169	156
170	157	/* a table used for quick lookup of the definition that goes with a
171	158	* particular lead byte */
172		-static const unsigned char* const lb_tab[] = {
173		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
174		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
175		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
176		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
177		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
178		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
180		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
181		- us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
182		- us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
183		- us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
184		- us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
185		- us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
186		- us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
187		- us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
188		- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
	159	+static const unsigned char lb_tab[] = {
	160	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	161	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	162	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	163	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	164	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	165	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	166	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	167	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	168	+ US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
	169	+ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
	170	+ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
	171	+ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
	172	+ US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
	173	+ US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
	174	+ US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
	175	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
189	176	};
190	177
191	178	int invalid_utf8(
192	179	const Blob *pContent
193	180	){
		@@ -201,23 +188,21 @@
201	188	if( *z<0x80 ){
202	189	++z;
203	190	--n;
204	191	}else{
205	192	/* get the definition for this lead byte */
206		- const unsigned char* def = lb_tab[(*z++)-0x80];
	193	+ const unsigned char* def = &lb_tab[(3 * *z++)-0x180];
207	194	unsigned char len;
208	195
209		- /* if the definition doesn't exist, return invalid */
210		- if( !def ) return LOOK_INVALID;
211	196	/* get the expected sequence length */
212		- len = *def++;
	197	+ len = *def;
213	198	/* if there aren't enough bytes left, return invalid */
214	199	if( n<len ) {
215	200	return LOOK_INVALID;
216	201	}
217	202	/* we already know byte #0 is good, so check the remaining bytes */
218		- if( (*z<*def++) || (*z++>*def++) ){
	203	+ if( (*z<*++def) || (*z++>*++def) ){
219	204	/* if the byte is outside the allowed range for this definition,
220	205	* return invalid */
221	206	return LOOK_INVALID;
222	207	}
223	208	if( len > 2 ){
224	209

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -143,51 +143,38 @@
143	** the derivatives CESU-8 & WTF-8 (as described in the same
144	** wikipedia article referenced previously).
145	*/
146
147	/* definitions for various UTF-8 sequence lengths */
148	static const unsigned char us2a[] = { /* for lead byte 0xC0 */
149	2, 0x80, 0x80
150	};
151	static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */
152	2, 0x80, 0xBF
153	};
154	static const unsigned char us3a[] = { /* for lead byte 0xE0 */
155	3, 0xA0, 0xBF
156	};
157	static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */
158	3, 0x80, 0xBF
159	};
160	static const unsigned char us4a[] = { /* for lead byte 0xF0 */
161	4, 0x90, 0xBF
162	};
163	static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */
164	4, 0x80, 0xBF
165	};
166	static const unsigned char us4c[] = { /* for lead byte 0xF4 */
167	4, 0x80, 0x8F
168	};
169
170	/* a table used for quick lookup of the definition that goes with a
171	* particular lead byte */
172	static const unsigned char* const lb_tab[] = {
173	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
174	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
175	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
176	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
177	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
178	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
180	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
181	us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
182	us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
183	us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
184	us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
185	us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
186	us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
187	us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
188	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
189	};
190
191	int invalid_utf8(
192	const Blob *pContent
193	){
	@@ -201,23 +188,21 @@
201	if( *z<0x80 ){
202	++z;
203	--n;
204	}else{
205	/* get the definition for this lead byte */
206	const unsigned char* def = lb_tab[(*z++)-0x80];
207	unsigned char len;
208
209	/* if the definition doesn't exist, return invalid */
210	if( !def ) return LOOK_INVALID;
211	/* get the expected sequence length */
212	len = *def++;
213	/* if there aren't enough bytes left, return invalid */
214	if( n<len ) {
215	return LOOK_INVALID;
216	}
217	/* we already know byte #0 is good, so check the remaining bytes */
218	if( (*z<*def++) || (*z++>*def++) ){
219	/* if the byte is outside the allowed range for this definition,
220	* return invalid */
221	return LOOK_INVALID;
222	}
223	if( len > 2 ){
224

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -143,51 +143,38 @@
143	** the derivatives CESU-8 & WTF-8 (as described in the same
144	** wikipedia article referenced previously).
145	*/
146
147	/* definitions for various UTF-8 sequence lengths */
148	#define US2A 2, 0x80, 0x80 /* for lead byte 0xC0 */
149	#define US2B 2, 0x80, 0xBF /* for lead bytes 0xC2-0xDF */
150	#define US3A 3, 0xA0, 0xBF /* for lead byte 0xE0 */
151	#define US3B 3, 0x80, 0xBF /* for lead bytes 0xE1-0xEF */
152	#define US4A 4, 0x90, 0xBF /* for lead byte 0xF0 */
153	#define US4B 4, 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
154	#define US4C 4, 0x80, 0x8F /* for lead byte 0xF4 */
155	#define US0A 0xFF, 0xFF, 0x00 /* for any other lead byte */













156
157	/* a table used for quick lookup of the definition that goes with a
158	* particular lead byte */
159	static const unsigned char lb_tab[] = {
160	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
161	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
162	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
163	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
164	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
165	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
166	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
167	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
168	US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
169	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
170	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
171	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
172	US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
173	US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
174	US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
175	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
176	};
177
178	int invalid_utf8(
179	const Blob *pContent
180	){
	@@ -201,23 +188,21 @@
188	if( *z<0x80 ){
189	++z;
190	--n;
191	}else{
192	/* get the definition for this lead byte */
193	const unsigned char* def = &lb_tab[(3 * *z++)-0x180];
194	unsigned char len;
195


196	/* get the expected sequence length */
197	len = *def;
198	/* if there aren't enough bytes left, return invalid */
199	if( n<len ) {
200	return LOOK_INVALID;
201	}
202	/* we already know byte #0 is good, so check the remaining bytes */
203	if( (*z<*++def) || (*z++>*++def) ){
204	/* if the byte is outside the allowed range for this definition,
205	* return invalid */
206	return LOOK_INVALID;
207	}
208	if( len > 2 ){
209

Fossil SCM

Keyboard Shortcuts