Fossil SCM
Further coding style improvements for the new invalid_utf8() function.
Commit
2fb7d59beed17f94613e3108de7460681d0e0a1c
Parent
dd3bb22cd726f26…
1 file changed (src/lookslike.c): 57 insertions(+), 41 deletions(-)
| --- src/lookslike.c | ||
| +++ src/lookslike.c | ||
| @@ -132,11 +132,10 @@ | ||
| 132 | 132 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 133 | 133 | } |
| 134 | 134 | return flags; |
| 135 | 135 | } |
| 136 | 136 | |
| 137 | - | |
| 138 | 137 | /* |
| 139 | 138 | ** Checks for proper UTF-8. It uses the method described in: |
| 140 | 139 | ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences |
| 141 | 140 | ** except for the "overlong form" of \u0000 (Modified UTF-8) |
| 142 | 141 | ** which is not considered invalid here: Some languages like |
| @@ -143,88 +142,105 @@ | ||
| 143 | 142 | ** Java and Tcl use it. This function also considers valid |
| 144 | 143 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 145 | 144 | ** wikipedia article referenced previously). |
| 146 | 145 | */ |
| 147 | 146 | |
| 148 | -int invalid_utf8(const Blob *pContent) { | |
| 149 | - /* definitions for various utf-8 sequence lengths */ | |
| 150 | - static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 }; | |
| 151 | - static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF }; | |
| 152 | - static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF }; | |
| 153 | - static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF }; | |
| 154 | - static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF }; | |
| 155 | - static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF }; | |
| 156 | - static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF }; | |
| 147 | +int invalid_utf8( | |
| 148 | + const Blob *pContent | |
| 149 | +){ | |
| 150 | + /* definitions for various UTF-8 sequence lengths */ | |
| 151 | + static unsigned char def_2a[] = { | |
| 152 | + 2, 0xC0, 0xC0, 0x80, 0x80 | |
| 153 | + }; | |
| 154 | + static unsigned char def_2b[] = { | |
| 155 | + 2, 0xC2, 0xDF, 0x80, 0xBF | |
| 156 | + }; | |
| 157 | + static unsigned char def_3a[] = { | |
| 158 | + 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF | |
| 159 | + }; | |
| 160 | + static unsigned char def_3b[] = { | |
| 161 | + 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF | |
| 162 | + }; | |
| 163 | + static unsigned char def_4a[] = { | |
| 164 | + 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF | |
| 165 | + }; | |
| 166 | + static unsigned char def_4b[] = { | |
| 167 | + 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF | |
| 168 | + }; | |
| 169 | + static unsigned char def_4c[] = { | |
| 170 | + 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF | |
| 171 | + }; | |
| 157 | 172 | |
| 158 | 173 | /* an array of all the definitions */ |
| 159 | - static unsigned char* def_arr[] = { def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL }; | |
| 174 | + static unsigned char* def_arr[] = { | |
| 175 | + def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL | |
| 176 | + }; | |
| 160 | 177 | |
| 161 | - /* a table used for quick lookup of the definition that goes with a particular lead byte */ | |
| 178 | + /* a table used for quick lookup of the definition that goes with a | |
| 179 | + * particular lead byte */ | |
| 162 | 180 | static unsigned char* lb_tab[256] = { NULL }; |
| 163 | 181 | |
| 164 | 182 | /* a pointer to the table; NULL means not yet setup */ |
| 165 | 183 | static unsigned char** lb_ptr = NULL; |
| 184 | + | |
| 185 | + /* buffer pointer and size */ | |
| 186 | + const unsigned char *z; | |
| 187 | + unsigned int n; | |
| 166 | 188 | |
| 167 | 189 | /* if the table pointer hasn't been initialized */ |
| 168 | - if (lb_ptr == NULL) { | |
| 190 | + if( lb_ptr==NULL ){ | |
| 191 | + unsigned char** pp; | |
| 192 | + /* for each definition, set the lead byte table pointer to the | |
| 193 | + * proper definition */ | |
| 169 | 194 | lb_ptr = lb_tab; |
| 170 | - | |
| 171 | - /* for each definition, set the lead byte table pointer to the proper definition */ | |
| 172 | - unsigned char** pp = def_arr; | |
| 173 | - while (*pp != NULL) { | |
| 195 | + pp = def_arr; | |
| 196 | + while( *pp!=NULL ){ | |
| 174 | 197 | unsigned char lo = pp[0][1]; |
| 175 | 198 | unsigned char hi = pp[0][2]; |
| 176 | 199 | unsigned char i; |
| 177 | - for (i = lo; i <= hi; ++i) | |
| 200 | + for(i=lo; i<=hi; ++i){ | |
| 178 | 201 | lb_ptr[i] = pp[0]; |
| 202 | + } | |
| 179 | 203 | ++pp; |
| 180 | 204 | } |
| 181 | 205 | } |
| 182 | - | |
| 183 | - /* buffer pointer and size */ | |
| 184 | - const unsigned char *z = (unsigned char *)blob_buffer(pContent); | |
| 185 | - unsigned int n = blob_size(pContent); | |
| 186 | - | |
| 206 | + z = (unsigned char *)blob_buffer(pContent); | |
| 207 | + n = blob_size(pContent); | |
| 187 | 208 | /* while we haven't checked all the bytes in the buffer */ |
| 188 | - while (n > 0) { | |
| 189 | - | |
| 209 | + while( n>0 ){ | |
| 190 | 210 | /* ascii is trivial */ |
| 191 | - if (*z < 0x80) { | |
| 211 | + if( *z<0x80 ){ | |
| 192 | 212 | ++z; |
| 193 | 213 | --n; |
| 194 | - } else { | |
| 214 | + }else{ | |
| 195 | 215 | /* get the definition for this lead byte */ |
| 196 | 216 | unsigned char* def = lb_ptr[*z++]; |
| 197 | 217 | unsigned char i, len; |
| 198 | 218 | |
| 199 | 219 | /* if the definition doesn't exist, return invalid */ |
| 200 | - if (!def) return LOOK_INVALID; | |
| 201 | - | |
| 220 | + if( !def ) return LOOK_INVALID; | |
| 202 | 221 | /* get the expected sequence length */ |
| 203 | 222 | len = *def; |
| 204 | - | |
| 205 | 223 | /* if there aren't enough bytes left, return invalid */ |
| 206 | - if (n < len) return LOOK_INVALID; | |
| 207 | - | |
| 224 | + if( n<len ) return LOOK_INVALID; | |
| 208 | 225 | /* skip the length & lead byte range */ |
| 209 | 226 | def += 3; |
| 210 | - | |
| 211 | 227 | /* we already know byte #0 is good, so check the remaining bytes */ |
| 212 | - for (i = 1; i < len; ++i) | |
| 213 | - /* if the byte is outside the allowed range for this definition, return invalid */ | |
| 214 | - if ((*z < *def++) || (*z++ > *def++)) | |
| 228 | + for(i=1; i<len; ++i){ | |
| 229 | + /* if the byte is outside the allowed range for this definition, | |
| 230 | + * return invalid */ | |
| 231 | + if( (*z<*def++) || (*z++>*def++) ){ | |
| 215 | 232 | return LOOK_INVALID; |
| 216 | - | |
| 233 | + } | |
| 234 | + } | |
| 217 | 235 | /* advance to the next sequence */ |
| 218 | 236 | n -= len; |
| 219 | 237 | } |
| 220 | 238 | } |
| 221 | - | |
| 222 | 239 | /* we made it all the way through the buffer so it's not invalid */ |
| 223 | - return 0; | |
| 240 | + return LOOK_NONE; | |
| 224 | 241 | } |
| 225 | - | |
| 226 | 242 | |
| 227 | 243 | /* |
| 228 | 244 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 229 | 245 | */ |
| 230 | 246 | #ifndef WCHAR_T |
| @@ -452,11 +468,11 @@ | ||
| 452 | 468 | fUnicode = could_be_utf16(&blob, 0) || fForceUtf16; |
| 453 | 469 | } |
| 454 | 470 | if( fUnicode ){ |
| 455 | 471 | lookFlags = looks_like_utf16(&blob, bRevUtf16, 0); |
| 456 | 472 | }else{ |
| 457 | - lookFlags = looks_like_utf8(&blob, 0)|invalid_utf8(&blob); | |
| 473 | + lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob); | |
| 458 | 474 | } |
| 459 | 475 | } |
| 460 | 476 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 461 | 477 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 462 | 478 | fossil_print("Starts with UTF-16 BOM: %s\n", |
| 463 | 479 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -132,11 +132,10 @@ | |
| 132 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 133 | } |
| 134 | return flags; |
| 135 | } |
| 136 | |
| 137 | |
| 138 | /* |
| 139 | ** Checks for proper UTF-8. It uses the method described in: |
| 140 | ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences |
| 141 | ** except for the "overlong form" of \u0000 (Modified UTF-8) |
| 142 | ** which is not considered invalid here: Some languages like |
| @@ -143,88 +142,105 @@ | |
| 143 | ** Java and Tcl use it. This function also considers valid |
| 144 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 145 | ** wikipedia article referenced previously). |
| 146 | */ |
| 147 | |
| 148 | int invalid_utf8(const Blob *pContent) { |
| 149 | /* definitions for various utf-8 sequence lengths */ |
| 150 | static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 }; |
| 151 | static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF }; |
| 152 | static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF }; |
| 153 | static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF }; |
| 154 | static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF }; |
| 155 | static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF }; |
| 156 | static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF }; |
| 157 | |
| 158 | /* an array of all the definitions */ |
| 159 | static unsigned char* def_arr[] = { def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL }; |
| 160 | |
| 161 | /* a table used for quick lookup of the definition that goes with a particular lead byte */ |
| 162 | static unsigned char* lb_tab[256] = { NULL }; |
| 163 | |
| 164 | /* a pointer to the table; NULL means not yet setup */ |
| 165 | static unsigned char** lb_ptr = NULL; |
| 166 | |
| 167 | /* if the table pointer hasn't been initialized */ |
| 168 | if (lb_ptr == NULL) { |
| 169 | lb_ptr = lb_tab; |
| 170 | |
| 171 | /* for each definition, set the lead byte table pointer to the proper definition */ |
| 172 | unsigned char** pp = def_arr; |
| 173 | while (*pp != NULL) { |
| 174 | unsigned char lo = pp[0][1]; |
| 175 | unsigned char hi = pp[0][2]; |
| 176 | unsigned char i; |
| 177 | for (i = lo; i <= hi; ++i) |
| 178 | lb_ptr[i] = pp[0]; |
| 179 | ++pp; |
| 180 | } |
| 181 | } |
| 182 | |
| 183 | /* buffer pointer and size */ |
| 184 | const unsigned char *z = (unsigned char *)blob_buffer(pContent); |
| 185 | unsigned int n = blob_size(pContent); |
| 186 | |
| 187 | /* while we haven't checked all the bytes in the buffer */ |
| 188 | while (n > 0) { |
| 189 | |
| 190 | /* ascii is trivial */ |
| 191 | if (*z < 0x80) { |
| 192 | ++z; |
| 193 | --n; |
| 194 | } else { |
| 195 | /* get the definition for this lead byte */ |
| 196 | unsigned char* def = lb_ptr[*z++]; |
| 197 | unsigned char i, len; |
| 198 | |
| 199 | /* if the definition doesn't exist, return invalid */ |
| 200 | if (!def) return LOOK_INVALID; |
| 201 | |
| 202 | /* get the expected sequence length */ |
| 203 | len = *def; |
| 204 | |
| 205 | /* if there aren't enough bytes left, return invalid */ |
| 206 | if (n < len) return LOOK_INVALID; |
| 207 | |
| 208 | /* skip the length & lead byte range */ |
| 209 | def += 3; |
| 210 | |
| 211 | /* we already know byte #0 is good, so check the remaining bytes */ |
| 212 | for (i = 1; i < len; ++i) |
| 213 | /* if the byte is outside the allowed range for this definition, return invalid */ |
| 214 | if ((*z < *def++) || (*z++ > *def++)) |
| 215 | return LOOK_INVALID; |
| 216 | |
| 217 | /* advance to the next sequence */ |
| 218 | n -= len; |
| 219 | } |
| 220 | } |
| 221 | |
| 222 | /* we made it all the way through the buffer so it's not invalid */ |
| 223 | return 0; |
| 224 | } |
| 225 | |
| 226 | |
| 227 | /* |
| 228 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 229 | */ |
| 230 | #ifndef WCHAR_T |
| @@ -452,11 +468,11 @@ | |
| 452 | fUnicode = could_be_utf16(&blob, 0) || fForceUtf16; |
| 453 | } |
| 454 | if( fUnicode ){ |
| 455 | lookFlags = looks_like_utf16(&blob, bRevUtf16, 0); |
| 456 | }else{ |
| 457 | lookFlags = looks_like_utf8(&blob, 0)|invalid_utf8(&blob); |
| 458 | } |
| 459 | } |
| 460 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 461 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 462 | fossil_print("Starts with UTF-16 BOM: %s\n", |
| 463 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -132,11 +132,10 @@ | |
| 132 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 133 | } |
| 134 | return flags; |
| 135 | } |
| 136 | |
| 137 | /* |
| 138 | ** Checks for proper UTF-8. It uses the method described in: |
| 139 | ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences |
| 140 | ** except for the "overlong form" of \u0000 (Modified UTF-8) |
| 141 | ** which is not considered invalid here: Some languages like |
| @@ -143,88 +142,105 @@ | |
| 142 | ** Java and Tcl use it. This function also considers valid |
| 143 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 144 | ** wikipedia article referenced previously). |
| 145 | */ |
| 146 | |
| 147 | int invalid_utf8( |
| 148 | const Blob *pContent |
| 149 | ){ |
| 150 | /* definitions for various UTF-8 sequence lengths */ |
| 151 | static unsigned char def_2a[] = { |
| 152 | 2, 0xC0, 0xC0, 0x80, 0x80 |
| 153 | }; |
| 154 | static unsigned char def_2b[] = { |
| 155 | 2, 0xC2, 0xDF, 0x80, 0xBF |
| 156 | }; |
| 157 | static unsigned char def_3a[] = { |
| 158 | 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF |
| 159 | }; |
| 160 | static unsigned char def_3b[] = { |
| 161 | 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF |
| 162 | }; |
| 163 | static unsigned char def_4a[] = { |
| 164 | 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF |
| 165 | }; |
| 166 | static unsigned char def_4b[] = { |
| 167 | 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF |
| 168 | }; |
| 169 | static unsigned char def_4c[] = { |
| 170 | 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF |
| 171 | }; |
| 172 | |
| 173 | /* an array of all the definitions */ |
| 174 | static unsigned char* def_arr[] = { |
| 175 | def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL |
| 176 | }; |
| 177 | |
| 178 | /* a table used for quick lookup of the definition that goes with a |
| 179 | * particular lead byte */ |
| 180 | static unsigned char* lb_tab[256] = { NULL }; |
| 181 | |
| 182 | /* a pointer to the table; NULL means not yet setup */ |
| 183 | static unsigned char** lb_ptr = NULL; |
| 184 | |
| 185 | /* buffer pointer and size */ |
| 186 | const unsigned char *z; |
| 187 | unsigned int n; |
| 188 | |
| 189 | /* if the table pointer hasn't been initialized */ |
| 190 | if( lb_ptr==NULL ){ |
| 191 | unsigned char** pp; |
| 192 | /* for each definition, set the lead byte table pointer to the |
| 193 | * proper definition */ |
| 194 | lb_ptr = lb_tab; |
| 195 | pp = def_arr; |
| 196 | while( *pp!=NULL ){ |
| 197 | unsigned char lo = pp[0][1]; |
| 198 | unsigned char hi = pp[0][2]; |
| 199 | unsigned char i; |
| 200 | for(i=lo; i<=hi; ++i){ |
| 201 | lb_ptr[i] = pp[0]; |
| 202 | } |
| 203 | ++pp; |
| 204 | } |
| 205 | } |
| 206 | z = (unsigned char *)blob_buffer(pContent); |
| 207 | n = blob_size(pContent); |
| 208 | /* while we haven't checked all the bytes in the buffer */ |
| 209 | while( n>0 ){ |
| 210 | /* ascii is trivial */ |
| 211 | if( *z<0x80 ){ |
| 212 | ++z; |
| 213 | --n; |
| 214 | }else{ |
| 215 | /* get the definition for this lead byte */ |
| 216 | unsigned char* def = lb_ptr[*z++]; |
| 217 | unsigned char i, len; |
| 218 | |
| 219 | /* if the definition doesn't exist, return invalid */ |
| 220 | if( !def ) return LOOK_INVALID; |
| 221 | /* get the expected sequence length */ |
| 222 | len = *def; |
| 223 | /* if there aren't enough bytes left, return invalid */ |
| 224 | if( n<len ) return LOOK_INVALID; |
| 225 | /* skip the length & lead byte range */ |
| 226 | def += 3; |
| 227 | /* we already know byte #0 is good, so check the remaining bytes */ |
| 228 | for(i=1; i<len; ++i){ |
| 229 | /* if the byte is outside the allowed range for this definition, |
| 230 | * return invalid */ |
| 231 | if( (*z<*def++) || (*z++>*def++) ){ |
| 232 | return LOOK_INVALID; |
| 233 | } |
| 234 | } |
| 235 | /* advance to the next sequence */ |
| 236 | n -= len; |
| 237 | } |
| 238 | } |
| 239 | /* we made it all the way through the buffer so it's not invalid */ |
| 240 | return LOOK_NONE; |
| 241 | } |
| 242 | |
| 243 | /* |
| 244 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 245 | */ |
| 246 | #ifndef WCHAR_T |
| @@ -452,11 +468,11 @@ | |
| 468 | fUnicode = could_be_utf16(&blob, 0) || fForceUtf16; |
| 469 | } |
| 470 | if( fUnicode ){ |
| 471 | lookFlags = looks_like_utf16(&blob, bRevUtf16, 0); |
| 472 | }else{ |
| 473 | lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob); |
| 474 | } |
| 475 | } |
| 476 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 477 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 478 | fossil_print("Starts with UTF-16 BOM: %s\n", |
| 479 |