Fossil SCM

Juggle variables and code around, making it as efficient and readable as possible. Also add more comments.

jan.nijtmans 2016-06-18 14:44 trunk
Commit 7f067f29400dea123adce3f822e43e19bf278dc4
1 file changed +27 -21
+27 -21
--- src/lookslike.c
+++ src/lookslike.c
@@ -148,22 +148,30 @@
148148
** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149149
** more continuation byte is expected.
150150
*/
151151
152152
/* definitions for various UTF-8 sequence lengths */
153
-#define US2A 0x80, 0x80 /* for lead byte 0xC0 */
154
-#define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */
155
-#define US3A 0xA0, 0xBF /* for lead byte 0xE0 */
156
-#define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */
157
-#define US4A 0x90, 0xBF /* for lead byte 0xF0 */
158
-#define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
159
-#define US4C 0x80, 0x8F /* for lead byte 0xF4 */
153
+#define US2A 0x7F, 0x80 /* for lead byte 0xC0 */
154
+#define US2B 0x7F, 0xBF /* for lead bytes 0xC2-0xDF */
155
+#define US3A 0x9F, 0xBF /* for lead byte 0xE0 */
156
+#define US3B 0x7F, 0xBF /* for lead bytes 0xE1-0xEF */
157
+#define US4A 0x8F, 0xBF /* for lead byte 0xF0 */
158
+#define US4B 0x7F, 0xBF /* for lead bytes 0xF1-0xF3 */
159
+#define US4C 0x7F, 0x8F /* for lead byte 0xF4 */
160160
#define US0A 0xFF, 0x00 /* for any other lead byte */
161161
162162
/* a table used for quick lookup of the definition that goes with a
163163
* particular lead byte */
164164
static const unsigned char lb_tab[] = {
165
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
166
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
167
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
168
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
169
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
170
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
171
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
172
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
165173
US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
166174
US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
167175
US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
168176
US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
169177
US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
@@ -175,32 +183,30 @@
175183
int invalid_utf8(
176184
const Blob *pContent
177185
){
178186
const unsigned char *z = (unsigned char *) blob_buffer(pContent);
179187
unsigned int n = blob_size(pContent);
180
- unsigned char c, c2;
188
+ unsigned char c; /* lead byte to be handled. */
181189
182190
if( n==0 ) return 0; /* Empty file -> OK */
183191
c = *z;
184192
while( --n>0 ){
185
- c2 = c;
186
- c = *++z;
187
- if( c2>=0xC0 ){
188
- const unsigned char *def = &lb_tab[(2*c2)-0x180];
189
- if( (c<*def) || (c>*++def) ){
193
+ if( c>=0x80 ){
194
+ unsigned char fb = *++z; /* follow-up byte after lead byte */
195
+ const unsigned char *def; /* pointer to range table*/
196
+
197
+ c <<= 1; /* multiply by 2 and get rid of highest bit */
198
+ def = &lb_tab[c]; /* search fb's valid range in table */
199
+ if( (fb<=def[0]) || (fb>def[1]) ){
190200
return LOOK_INVALID; /* Invalid UTF-8 */
191201
}
192
- if( c2>=0xe0 ){
193
- c = (c2<<1)|3;
194
- }else{
195
- c = ' ';
196
- }
197
- }else if( c2>=0x80 ){
198
- return LOOK_INVALID;
202
+ c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
203
+ } else {
204
+ c = *++z;
199205
}
200206
}
201
- return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
207
+ return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */
202208
}
203209
204210
/*
205211
** Define the type needed to represent a Unicode (UTF-16) character.
206212
*/
207213
--- src/lookslike.c
+++ src/lookslike.c
@@ -148,22 +148,30 @@
148 ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149 ** more continuation byte is expected.
150 */
151
152 /* definitions for various UTF-8 sequence lengths */
153 #define US2A 0x80, 0x80 /* for lead byte 0xC0 */
154 #define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */
155 #define US3A 0xA0, 0xBF /* for lead byte 0xE0 */
156 #define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */
157 #define US4A 0x90, 0xBF /* for lead byte 0xF0 */
158 #define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
159 #define US4C 0x80, 0x8F /* for lead byte 0xF4 */
160 #define US0A 0xFF, 0x00 /* for any other lead byte */
161
162 /* a table used for quick lookup of the definition that goes with a
163 * particular lead byte */
164 static const unsigned char lb_tab[] = {
 
 
 
 
 
 
 
 
165 US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
166 US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
167 US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
168 US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
169 US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
@@ -175,32 +183,30 @@
175 int invalid_utf8(
176 const Blob *pContent
177 ){
178 const unsigned char *z = (unsigned char *) blob_buffer(pContent);
179 unsigned int n = blob_size(pContent);
180 unsigned char c, c2;
181
182 if( n==0 ) return 0; /* Empty file -> OK */
183 c = *z;
184 while( --n>0 ){
185 c2 = c;
186 c = *++z;
187 if( c2>=0xC0 ){
188 const unsigned char *def = &lb_tab[(2*c2)-0x180];
189 if( (c<*def) || (c>*++def) ){
 
 
190 return LOOK_INVALID; /* Invalid UTF-8 */
191 }
192 if( c2>=0xe0 ){
193 c = (c2<<1)|3;
194 }else{
195 c = ' ';
196 }
197 }else if( c2>=0x80 ){
198 return LOOK_INVALID;
199 }
200 }
201 return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
202 }
203
204 /*
205 ** Define the type needed to represent a Unicode (UTF-16) character.
206 */
207
--- src/lookslike.c
+++ src/lookslike.c
@@ -148,22 +148,30 @@
148 ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149 ** more continuation byte is expected.
150 */
151
152 /* definitions for various UTF-8 sequence lengths */
153 #define US2A 0x7F, 0x80 /* for lead byte 0xC0 */
154 #define US2B 0x7F, 0xBF /* for lead bytes 0xC2-0xDF */
155 #define US3A 0x9F, 0xBF /* for lead byte 0xE0 */
156 #define US3B 0x7F, 0xBF /* for lead bytes 0xE1-0xEF */
157 #define US4A 0x8F, 0xBF /* for lead byte 0xF0 */
158 #define US4B 0x7F, 0xBF /* for lead bytes 0xF1-0xF3 */
159 #define US4C 0x7F, 0x8F /* for lead byte 0xF4 */
160 #define US0A 0xFF, 0x00 /* for any other lead byte */
161
162 /* a table used for quick lookup of the definition that goes with a
163 * particular lead byte */
164 static const unsigned char lb_tab[] = {
165 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
166 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
167 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
168 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
169 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
170 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
171 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
172 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
173 US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
174 US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
175 US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
176 US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
177 US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
@@ -175,32 +183,30 @@
183 int invalid_utf8(
184 const Blob *pContent
185 ){
186 const unsigned char *z = (unsigned char *) blob_buffer(pContent);
187 unsigned int n = blob_size(pContent);
188 unsigned char c; /* lead byte to be handled. */
189
190 if( n==0 ) return 0; /* Empty file -> OK */
191 c = *z;
192 while( --n>0 ){
193 if( c>=0x80 ){
194 unsigned char fb = *++z; /* follow-up byte after lead byte */
195 const unsigned char *def; /* pointer to range table*/
196
197 c <<= 1; /* multiply by 2 and get rid of highest bit */
198 def = &lb_tab[c]; /* search fb's valid range in table */
199 if( (fb<=def[0]) || (fb>def[1]) ){
200 return LOOK_INVALID; /* Invalid UTF-8 */
201 }
202 c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
203 } else {
204 c = *++z;
 
 
 
 
205 }
206 }
207 return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */
208 }
209
210 /*
211 ** Define the type needed to represent a Unicode (UTF-16) character.
212 */
213

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button