Fossil SCM

Further coding style improvements for the new invalid_utf8() function.

mistachkin 2016-06-11 05:23 UTC invalid_utf8_table
Commit 2fb7d59beed17f94613e3108de7460681d0e0a1c
1 file changed +57 -41
+57 -41
--- src/lookslike.c
+++ src/lookslike.c
@@ -132,11 +132,10 @@
132132
flags |= LOOK_LONG; /* Very long line -> binary */
133133
}
134134
return flags;
135135
}
136136
137
-
138137
/*
139138
** Checks for proper UTF-8. It uses the method described in:
140139
** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
141140
** except for the "overlong form" of \u0000 (Modified UTF-8)
142141
** which is not considered invalid here: Some languages like
@@ -143,88 +142,105 @@
143142
** Java and Tcl use it. This function also considers valid
144143
** the derivatives CESU-8 & WTF-8 (as described in the same
145144
** wikipedia article referenced previously).
146145
*/
147146
148
-int invalid_utf8(const Blob *pContent) {
149
- /* definitions for various utf-8 sequence lengths */
150
- static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
151
- static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
152
- static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
153
- static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
154
- static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
155
- static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
156
- static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };
147
+int invalid_utf8(
148
+ const Blob *pContent
149
+){
150
+ /* definitions for various UTF-8 sequence lengths */
151
+ static unsigned char def_2a[] = {
152
+ 2, 0xC0, 0xC0, 0x80, 0x80
153
+ };
154
+ static unsigned char def_2b[] = {
155
+ 2, 0xC2, 0xDF, 0x80, 0xBF
156
+ };
157
+ static unsigned char def_3a[] = {
158
+ 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF
159
+ };
160
+ static unsigned char def_3b[] = {
161
+ 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF
162
+ };
163
+ static unsigned char def_4a[] = {
164
+ 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
165
+ };
166
+ static unsigned char def_4b[] = {
167
+ 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
168
+ };
169
+ static unsigned char def_4c[] = {
170
+ 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
171
+ };
157172
158173
/* an array of all the definitions */
159
- static unsigned char* def_arr[] = { def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };
174
+ static unsigned char* def_arr[] = {
175
+ def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL
176
+ };
160177
161
- /* a table used for quick lookup of the definition that goes with a particular lead byte */
178
+ /* a table used for quick lookup of the definition that goes with a
179
+ * particular lead byte */
162180
static unsigned char* lb_tab[256] = { NULL };
163181
164182
/* a pointer to the table; NULL means not yet set up */
165183
static unsigned char** lb_ptr = NULL;
184
+
185
+ /* buffer pointer and size */
186
+ const unsigned char *z;
187
+ unsigned int n;
166188
167189
/* if the table pointer hasn't been initialized */
168
- if (lb_ptr == NULL) {
190
+ if( lb_ptr==NULL ){
191
+ unsigned char** pp;
192
+ /* for each definition, set the lead byte table pointer to the
193
+ * proper definition */
169194
lb_ptr = lb_tab;
170
-
171
- /* for each definition, set the lead byte table pointer to the proper definition */
172
- unsigned char** pp = def_arr;
173
- while (*pp != NULL) {
195
+ pp = def_arr;
196
+ while( *pp!=NULL ){
174197
unsigned char lo = pp[0][1];
175198
unsigned char hi = pp[0][2];
176199
unsigned char i;
177
- for (i = lo; i <= hi; ++i)
200
+ for(i=lo; i<=hi; ++i){
178201
lb_ptr[i] = pp[0];
202
+ }
179203
++pp;
180204
}
181205
}
182
-
183
- /* buffer pointer and size */
184
- const unsigned char *z = (unsigned char *)blob_buffer(pContent);
185
- unsigned int n = blob_size(pContent);
186
-
206
+ z = (unsigned char *)blob_buffer(pContent);
207
+ n = blob_size(pContent);
187208
/* while we haven't checked all the bytes in the buffer */
188
- while (n > 0) {
189
-
209
+ while( n>0 ){
190210
/* ascii is trivial */
191
- if (*z < 0x80) {
211
+ if( *z<0x80 ){
192212
++z;
193213
--n;
194
- } else {
214
+ }else{
195215
/* get the definition for this lead byte */
196216
unsigned char* def = lb_ptr[*z++];
197217
unsigned char i, len;
198218
199219
/* if the definition doesn't exist, return invalid */
200
- if (!def) return LOOK_INVALID;
201
-
220
+ if( !def ) return LOOK_INVALID;
202221
/* get the expected sequence length */
203222
len = *def;
204
-
205223
/* if there aren't enough bytes left, return invalid */
206
- if (n < len) return LOOK_INVALID;
207
-
224
+ if( n<len ) return LOOK_INVALID;
208225
/* skip the length & lead byte range */
209226
def += 3;
210
-
211227
/* we already know byte #0 is good, so check the remaining bytes */
212
- for (i = 1; i < len; ++i)
213
- /* if the byte is outside the allowed range for this definition, return invalid */
214
- if ((*z < *def++) || (*z++ > *def++))
228
+ for(i=1; i<len; ++i){
229
+ /* if the byte is outside the allowed range for this definition,
230
+ * return invalid */
231
+ if( (*z<*def++) || (*z++>*def++) ){
215232
return LOOK_INVALID;
216
-
233
+ }
234
+ }
217235
/* advance to the next sequence */
218236
n -= len;
219237
}
220238
}
221
-
222239
/* we made it all the way through the buffer so it's not invalid */
223
- return 0;
240
+ return LOOK_NONE;
224241
}
225
-
226242
227243
/*
228244
** Define the type needed to represent a Unicode (UTF-16) character.
229245
*/
230246
#ifndef WCHAR_T
@@ -452,11 +468,11 @@
452468
fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
453469
}
454470
if( fUnicode ){
455471
lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
456472
}else{
457
- lookFlags = looks_like_utf8(&blob, 0)|invalid_utf8(&blob);
473
+ lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob);
458474
}
459475
}
460476
fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
461477
fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
462478
fossil_print("Starts with UTF-16 BOM: %s\n",
463479
--- src/lookslike.c
+++ src/lookslike.c
@@ -132,11 +132,10 @@
132 flags |= LOOK_LONG; /* Very long line -> binary */
133 }
134 return flags;
135 }
136
137
138 /*
139 ** Checks for proper UTF-8. It uses the method described in:
140 ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
141 ** except for the "overlong form" of \u0000 (Modified UTF-8)
142 ** which is not considered invalid here: Some languages like
@@ -143,88 +142,105 @@
143 ** Java and Tcl use it. This function also considers valid
144 ** the derivatives CESU-8 & WTF-8 (as described in the same
145 ** wikipedia article referenced previously).
146 */
147
148 int invalid_utf8(const Blob *pContent) {
149 /* definitions for various utf-8 sequence lengths */
150 static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
151 static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
152 static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
153 static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
154 static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
155 static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
156 static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
158 /* an array of all the definitions */
159 static unsigned char* def_arr[] = { def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };
 
 
160
161 /* a table used for quick lookup of the definition that goes with a particular lead byte */
 
162 static unsigned char* lb_tab[256] = { NULL };
163
164 /* a pointer to the table; NULL means not yet set up */
165 static unsigned char** lb_ptr = NULL;
 
 
 
 
166
167 /* if the table pointer hasn't been initialized */
168 if (lb_ptr == NULL) {
 
 
 
169 lb_ptr = lb_tab;
170
171 /* for each definition, set the lead byte table pointer to the proper definition */
172 unsigned char** pp = def_arr;
173 while (*pp != NULL) {
174 unsigned char lo = pp[0][1];
175 unsigned char hi = pp[0][2];
176 unsigned char i;
177 for (i = lo; i <= hi; ++i)
178 lb_ptr[i] = pp[0];
 
179 ++pp;
180 }
181 }
182
183 /* buffer pointer and size */
184 const unsigned char *z = (unsigned char *)blob_buffer(pContent);
185 unsigned int n = blob_size(pContent);
186
187 /* while we haven't checked all the bytes in the buffer */
188 while (n > 0) {
189
190 /* ascii is trivial */
191 if (*z < 0x80) {
192 ++z;
193 --n;
194 } else {
195 /* get the definition for this lead byte */
196 unsigned char* def = lb_ptr[*z++];
197 unsigned char i, len;
198
199 /* if the definition doesn't exist, return invalid */
200 if (!def) return LOOK_INVALID;
201
202 /* get the expected sequence length */
203 len = *def;
204
205 /* if there aren't enough bytes left, return invalid */
206 if (n < len) return LOOK_INVALID;
207
208 /* skip the length & lead byte range */
209 def += 3;
210
211 /* we already know byte #0 is good, so check the remaining bytes */
212 for (i = 1; i < len; ++i)
213 /* if the byte is outside the allowed range for this definition, return invalid */
214 if ((*z < *def++) || (*z++ > *def++))
 
215 return LOOK_INVALID;
216
 
217 /* advance to the next sequence */
218 n -= len;
219 }
220 }
221
222 /* we made it all the way through the buffer so it's not invalid */
223 return 0;
224 }
225
226
227 /*
228 ** Define the type needed to represent a Unicode (UTF-16) character.
229 */
230 #ifndef WCHAR_T
@@ -452,11 +468,11 @@
452 fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
453 }
454 if( fUnicode ){
455 lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
456 }else{
457 lookFlags = looks_like_utf8(&blob, 0)|invalid_utf8(&blob);
458 }
459 }
460 fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
461 fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
462 fossil_print("Starts with UTF-16 BOM: %s\n",
463
--- src/lookslike.c
+++ src/lookslike.c
@@ -132,11 +132,10 @@
132 flags |= LOOK_LONG; /* Very long line -> binary */
133 }
134 return flags;
135 }
136
 
137 /*
138 ** Checks for proper UTF-8. It uses the method described in:
139 ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
140 ** except for the "overlong form" of \u0000 (Modified UTF-8)
141 ** which is not considered invalid here: Some languages like
@@ -143,88 +142,105 @@
142 ** Java and Tcl use it. This function also considers valid
143 ** the derivatives CESU-8 & WTF-8 (as described in the same
144 ** wikipedia article referenced previously).
145 */
146
147 int invalid_utf8(
148 const Blob *pContent
149 ){
150 /* definitions for various UTF-8 sequence lengths */
151 static unsigned char def_2a[] = {
152 2, 0xC0, 0xC0, 0x80, 0x80
153 };
154 static unsigned char def_2b[] = {
155 2, 0xC2, 0xDF, 0x80, 0xBF
156 };
157 static unsigned char def_3a[] = {
158 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF
159 };
160 static unsigned char def_3b[] = {
161 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF
162 };
163 static unsigned char def_4a[] = {
164 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
165 };
166 static unsigned char def_4b[] = {
167 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
168 };
169 static unsigned char def_4c[] = {
170 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
171 };
172
173 /* an array of all the definitions */
174 static unsigned char* def_arr[] = {
175 def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL
176 };
177
178 /* a table used for quick lookup of the definition that goes with a
179 * particular lead byte */
180 static unsigned char* lb_tab[256] = { NULL };
181
182 /* a pointer to the table; NULL means not yet set up */
183 static unsigned char** lb_ptr = NULL;
184
185 /* buffer pointer and size */
186 const unsigned char *z;
187 unsigned int n;
188
189 /* if the table pointer hasn't been initialized */
190 if( lb_ptr==NULL ){
191 unsigned char** pp;
192 /* for each definition, set the lead byte table pointer to the
193 * proper definition */
194 lb_ptr = lb_tab;
195 pp = def_arr;
196 while( *pp!=NULL ){
 
 
197 unsigned char lo = pp[0][1];
198 unsigned char hi = pp[0][2];
199 unsigned char i;
200 for(i=lo; i<=hi; ++i){
201 lb_ptr[i] = pp[0];
202 }
203 ++pp;
204 }
205 }
206 z = (unsigned char *)blob_buffer(pContent);
207 n = blob_size(pContent);
 
 
 
208 /* while we haven't checked all the bytes in the buffer */
209 while( n>0 ){
 
210 /* ascii is trivial */
211 if( *z<0x80 ){
212 ++z;
213 --n;
214 }else{
215 /* get the definition for this lead byte */
216 unsigned char* def = lb_ptr[*z++];
217 unsigned char i, len;
218
219 /* if the definition doesn't exist, return invalid */
220 if( !def ) return LOOK_INVALID;
 
221 /* get the expected sequence length */
222 len = *def;
 
223 /* if there aren't enough bytes left, return invalid */
224 if( n<len ) return LOOK_INVALID;
 
225 /* skip the length & lead byte range */
226 def += 3;
 
227 /* we already know byte #0 is good, so check the remaining bytes */
228 for(i=1; i<len; ++i){
229 /* if the byte is outside the allowed range for this definition,
230 * return invalid */
231 if( (*z<*def++) || (*z++>*def++) ){
232 return LOOK_INVALID;
233 }
234 }
235 /* advance to the next sequence */
236 n -= len;
237 }
238 }
 
239 /* we made it all the way through the buffer so it's not invalid */
240 return LOOK_NONE;
241 }
 
242
243 /*
244 ** Define the type needed to represent a Unicode (UTF-16) character.
245 */
246 #ifndef WCHAR_T
@@ -452,11 +468,11 @@
468 fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
469 }
470 if( fUnicode ){
471 lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
472 }else{
473 lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob);
474 }
475 }
476 fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
477 fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
478 fossil_print("Starts with UTF-16 BOM: %s\n",
479

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button