Fossil SCM

Juggle variables and code around, making it as efficient and readable as possible. Also add more comments.

jan.nijtmans 2016-06-18 14:44 trunk

Commit 7f067f29400dea123adce3f822e43e19bf278dc4

Parent 6bcfe1d22c13281…

1 file changed +27 -21

M src/lookslike.c

+27 -21

		--- src/lookslike.c
		+++ src/lookslike.c
		@@ -148,22 +148,30 @@
148	148	** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149	149	** more continuation byte is expected.
150	150	*/
151	151
152	152	/* definitions for various UTF-8 sequence lengths */
153		-#define US2A 0x80, 0x80 /* for lead byte 0xC0 */
154		-#define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */
155		-#define US3A 0xA0, 0xBF /* for lead byte 0xE0 */
156		-#define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */
157		-#define US4A 0x90, 0xBF /* for lead byte 0xF0 */
158		-#define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
159		-#define US4C 0x80, 0x8F /* for lead byte 0xF4 */
	153	+#define US2A 0x7F, 0x80 /* for lead byte 0xC0 */
	154	+#define US2B 0x7F, 0xBF /* for lead bytes 0xC2-0xDF */
	155	+#define US3A 0x9F, 0xBF /* for lead byte 0xE0 */
	156	+#define US3B 0x7F, 0xBF /* for lead bytes 0xE1-0xEF */
	157	+#define US4A 0x8F, 0xBF /* for lead byte 0xF0 */
	158	+#define US4B 0x7F, 0xBF /* for lead bytes 0xF1-0xF3 */
	159	+#define US4C 0x7F, 0x8F /* for lead byte 0xF4 */
160	160	#define US0A 0xFF, 0x00 /* for any other lead byte */
161	161
162	162	/* a table used for quick lookup of the definition that goes with a
163	163	* particular lead byte */
164	164	static const unsigned char lb_tab[] = {
	165	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	166	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	167	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	168	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	169	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	170	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	171	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
	172	+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
165	173	US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
166	174	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
167	175	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
168	176	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
169	177	US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
		@@ -175,32 +183,30 @@
175	183	int invalid_utf8(
176	184	const Blob *pContent
177	185	){
178	186	const unsigned char *z = (unsigned char *) blob_buffer(pContent);
179	187	unsigned int n = blob_size(pContent);
180		- unsigned char c, c2;
	188	+ unsigned char c; /* lead byte to be handled. */
181	189
182	190	if( n==0 ) return 0; /* Empty file -> OK */
183	191	c = *z;
184	192	while( --n>0 ){
185		- c2 = c;
186		- c = *++z;
187		- if( c2>=0xC0 ){
188		- const unsigned char *def = &lb_tab[(2*c2)-0x180];
189		- if( (c<*def) || (c>*++def) ){
	193	+ if( c>=0x80 ){
	194	+ unsigned char fb = *++z; /* follow-up byte after lead byte */
	195	+ const unsigned char *def; /* pointer to range table*/
	196	+
	197	+ c <<= 1; /* multiply by 2 and get rid of highest bit */
	198	+ def = &lb_tab[c]; /* search fb's valid range in table */
	199	+ if( (fb<=def[0]) || (fb>def[1]) ){
190	200	return LOOK_INVALID; /* Invalid UTF-8 */
191	201	}
192		- if( c2>=0xe0 ){
193		- c = (c2<<1)|3;
194		- }else{
195		- c = ' ';
196		- }
197		- }else if( c2>=0x80 ){
198		- return LOOK_INVALID;
	202	+ c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
	203	+ } else {
	204	+ c = *++z;
199	205	}
200	206	}
201		- return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
	207	+ return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */
202	208	}
203	209
204	210	/*
205	211	** Define the type needed to represent a Unicode (UTF-16) character.
206	212	*/
207	213

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -148,22 +148,30 @@
148	** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149	** more continuation byte is expected.
150	*/
151
152	/* definitions for various UTF-8 sequence lengths */
153	#define US2A 0x80, 0x80 /* for lead byte 0xC0 */
154	#define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */
155	#define US3A 0xA0, 0xBF /* for lead byte 0xE0 */
156	#define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */
157	#define US4A 0x90, 0xBF /* for lead byte 0xF0 */
158	#define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
159	#define US4C 0x80, 0x8F /* for lead byte 0xF4 */
160	#define US0A 0xFF, 0x00 /* for any other lead byte */
161
162	/* a table used for quick lookup of the definition that goes with a
163	* particular lead byte */
164	static const unsigned char lb_tab[] = {








165	US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
166	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
167	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
168	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
169	US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
	@@ -175,32 +183,30 @@
175	int invalid_utf8(
176	const Blob *pContent
177	){
178	const unsigned char *z = (unsigned char *) blob_buffer(pContent);
179	unsigned int n = blob_size(pContent);
180	unsigned char c, c2;
181
182	if( n==0 ) return 0; /* Empty file -> OK */
183	c = *z;
184	while( --n>0 ){
185	c2 = c;
186	c = *++z;
187	if( c2>=0xC0 ){
188	const unsigned char *def = &lb_tab[(2*c2)-0x180];
189	if( (c<*def) || (c>*++def) ){


190	return LOOK_INVALID; /* Invalid UTF-8 */
191	}
192	if( c2>=0xe0 ){
193	c = (c2<<1)|3;
194	}else{
195	c = ' ';
196	}
197	}else if( c2>=0x80 ){
198	return LOOK_INVALID;
199	}
200	}
201	return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
202	}
203
204	/*
205	** Define the type needed to represent a Unicode (UTF-16) character.
206	*/
207

	--- src/lookslike.c
	+++ src/lookslike.c
	@@ -148,22 +148,30 @@
148	** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149	** more continuation byte is expected.
150	*/
151
152	/* definitions for various UTF-8 sequence lengths */
153	#define US2A 0x7F, 0x80 /* for lead byte 0xC0 */
154	#define US2B 0x7F, 0xBF /* for lead bytes 0xC2-0xDF */
155	#define US3A 0x9F, 0xBF /* for lead byte 0xE0 */
156	#define US3B 0x7F, 0xBF /* for lead bytes 0xE1-0xEF */
157	#define US4A 0x8F, 0xBF /* for lead byte 0xF0 */
158	#define US4B 0x7F, 0xBF /* for lead bytes 0xF1-0xF3 */
159	#define US4C 0x7F, 0x8F /* for lead byte 0xF4 */
160	#define US0A 0xFF, 0x00 /* for any other lead byte */
161
162	/* a table used for quick lookup of the definition that goes with a
163	* particular lead byte */
164	static const unsigned char lb_tab[] = {
165	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
166	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
167	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
168	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
169	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
170	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
171	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
172	US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
173	US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
174	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
175	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
176	US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
177	US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
	@@ -175,32 +183,30 @@
183	int invalid_utf8(
184	const Blob *pContent
185	){
186	const unsigned char *z = (unsigned char *) blob_buffer(pContent);
187	unsigned int n = blob_size(pContent);
188	unsigned char c; /* lead byte to be handled. */
189
190	if( n==0 ) return 0; /* Empty file -> OK */
191	c = *z;
192	while( --n>0 ){
193	if( c>=0x80 ){
194	unsigned char fb = *++z; /* follow-up byte after lead byte */
195	const unsigned char *def; /* pointer to range table*/
196
197	c <<= 1; /* multiply by 2 and get rid of highest bit */
198	def = &lb_tab[c]; /* search fb's valid range in table */
199	if( (fb<=def[0]) || (fb>def[1]) ){
200	return LOOK_INVALID; /* Invalid UTF-8 */
201	}
202	c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
203	} else {
204	c = *++z;




205	}
206	}
207	return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */
208	}
209
210	/*
211	** Define the type needed to represent a Unicode (UTF-16) character.
212	*/
213

Fossil SCM

Keyboard Shortcuts