Fossil SCM
Use a faster table-based approach when checking for invalid UTF-8, instead of complex bit-operations.
Commit
60349a6617490676a2ea1a31fdce56decb641dc6
Parent
bd559ff0d0db4e8…
1 file changed
+53
-10
+53
-10
| --- src/lookslike.c | ||
| +++ src/lookslike.c | ||
| @@ -132,25 +132,72 @@ | ||
| 132 | 132 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 133 | 133 | } |
| 134 | 134 | return flags; |
| 135 | 135 | } |
| 136 | 136 | |
| 137 | - | |
| 138 | 137 | /* |
| 139 | 138 | ** Checks for proper UTF-8. It uses the method described in: |
| 140 | 139 | ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences |
| 141 | 140 | ** except for the "overlong form" of \u0000 which is not considered invalid |
| 142 | -** here: Some languages like Java and Tcl use it. For UTF-8 characters | |
| 141 | +** here: Some languages like Java and Tcl use it. This function also | |
| 142 | +** considers valid the derivatives CESU-8 & WTF-8 (as described in the | |
| 143 | +** same wikipedia article referenced previously). For UTF-8 characters | |
| 143 | 144 | ** > 7f, the variable 'c2' not necessary means the previous character. |
| 144 | 145 | ** It's number of higher 1-bits indicate the number of continuation bytes |
| 145 | 146 | ** that are expected to be followed. E.g. when 'c2' has a value in the range |
| 146 | 147 | ** 0xc0..0xdf it means that 'c' is expected to contain the last continuation |
| 147 | 148 | ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one |
| 148 | 149 | ** more continuation byte is expected. |
| 149 | 150 | */ |
| 150 | 151 | |
| 151 | -int invalid_utf8(const Blob *pContent){ | |
| 152 | +/* definitions for various UTF-8 sequence lengths */ | |
| 153 | +static const unsigned char us2a[] = { /* for lead byte 0xC0 */ | |
| 154 | + 0x80, 0x80 | |
| 155 | +}; | |
| 156 | +static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */ | |
| 157 | + 0x80, 0xBF | |
| 158 | +}; | |
| 159 | +static const unsigned char us3a[] = { /* for lead byte 0xE0 */ | |
| 160 | + 0xA0, 0xBF | |
| 161 | +}; | |
| 162 | +static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */ | |
| 163 | + 0x80, 0xBF | |
| 164 | +}; | |
| 165 | +static const unsigned char us4a[] = { /* for lead byte 0xF0 */ | |
| 166 | + 0x90, 0xBF | |
| 167 | +}; | |
| 168 | +static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */ | |
| 169 | + 0x80, 0xBF | |
| 170 | +}; | |
| 171 | +static const unsigned char us4c[] = { /* for lead byte 0xF4 */ | |
| 172 | + 0x80, 0x8F | |
| 173 | +}; | |
| 174 | + | |
| 175 | +/* a table used for quick lookup of the definition that goes with a | |
| 176 | + * particular lead byte */ | |
| 177 | +static const unsigned char* const lb_tab[] = { | |
| 178 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 179 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 180 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 181 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 182 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 183 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 184 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 185 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 186 | + us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 187 | + us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 188 | + us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 189 | + us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 190 | + us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b, | |
| 191 | + us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b, | |
| 192 | + us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL, | |
| 193 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL | |
| 194 | +}; | |
| 195 | + | |
| 196 | +int invalid_utf8( | |
| 197 | + const Blob *pContent | |
| 198 | +){ | |
| 152 | 199 | const unsigned char *z = (unsigned char *) blob_buffer(pContent); |
| 153 | 200 | unsigned int n = blob_size(pContent); |
| 154 | 201 | unsigned char c, c2; |
| 155 | 202 | |
| 156 | 203 | if( n==0 ) return 0; /* Empty file -> OK */ |
| @@ -157,27 +204,23 @@ | ||
| 157 | 204 | c = *z; |
| 158 | 205 | while( --n>0 ){ |
| 159 | 206 | c2 = c; |
| 160 | 207 | c = *++z; |
| 161 | 208 | if( c2>=0x80 ){ |
| 162 | - if( ((c&0xc0)!=0x80) || (((c2<0xc2) || (c2>=0xf4)) && | |
| 163 | - (((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80)))) ){ | |
| 209 | + const unsigned char *def = lb_tab[(c2)-0x80]; | |
| 210 | + if( !def || (c<*def++) || (c>*def++) ){ | |
| 164 | 211 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 165 | 212 | } |
| 166 | 213 | if( c2>=0xe0 ){ |
| 167 | - if ((c2==0xf0 && c<0x90)||(c2==0xe0 && c<0xa0) ){ | |
| 168 | - return LOOK_INVALID; /* Invalid UTF-8, too short */ | |
| 169 | - } | |
| 170 | 214 | c = (c2<<1)|3; |
| 171 | 215 | }else{ |
| 172 | 216 | c = ' '; |
| 173 | 217 | } |
| 174 | 218 | } |
| 175 | 219 | } |
| 176 | 220 | return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */ |
| 177 | 221 | } |
| 178 | - | |
| 179 | 222 | |
| 180 | 223 | /* |
| 181 | 224 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 182 | 225 | */ |
| 183 | 226 | #ifndef WCHAR_T |
| @@ -405,11 +448,11 @@ | ||
| 405 | 448 | fUnicode = could_be_utf16(&blob, 0) || fForceUtf16; |
| 406 | 449 | } |
| 407 | 450 | if( fUnicode ){ |
| 408 | 451 | lookFlags = looks_like_utf16(&blob, bRevUtf16, 0); |
| 409 | 452 | }else{ |
| 410 | - lookFlags = looks_like_utf8(&blob, 0)|invalid_utf8(&blob); | |
| 453 | + lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob); | |
| 411 | 454 | } |
| 412 | 455 | } |
| 413 | 456 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 414 | 457 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 415 | 458 | fossil_print("Starts with UTF-16 BOM: %s\n", |
| 416 | 459 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -132,25 +132,72 @@ | |
| 132 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 133 | } |
| 134 | return flags; |
| 135 | } |
| 136 | |
| 137 | |
| 138 | /* |
| 139 | ** Checks for proper UTF-8. It uses the method described in: |
| 140 | ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences |
| 141 | ** except for the "overlong form" of \u0000 which is not considered invalid |
| 142 | ** here: Some languages like Java and Tcl use it. For UTF-8 characters |
| 143 | ** > 7f, the variable 'c2' not necessary means the previous character. |
| 144 | ** It's number of higher 1-bits indicate the number of continuation bytes |
| 145 | ** that are expected to be followed. E.g. when 'c2' has a value in the range |
| 146 | ** 0xc0..0xdf it means that 'c' is expected to contain the last continuation |
| 147 | ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one |
| 148 | ** more continuation byte is expected. |
| 149 | */ |
| 150 | |
| 151 | int invalid_utf8(const Blob *pContent){ |
| 152 | const unsigned char *z = (unsigned char *) blob_buffer(pContent); |
| 153 | unsigned int n = blob_size(pContent); |
| 154 | unsigned char c, c2; |
| 155 | |
| 156 | if( n==0 ) return 0; /* Empty file -> OK */ |
| @@ -157,27 +204,23 @@ | |
| 157 | c = *z; |
| 158 | while( --n>0 ){ |
| 159 | c2 = c; |
| 160 | c = *++z; |
| 161 | if( c2>=0x80 ){ |
| 162 | if( ((c&0xc0)!=0x80) || (((c2<0xc2) || (c2>=0xf4)) && |
| 163 | (((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80)))) ){ |
| 164 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 165 | } |
| 166 | if( c2>=0xe0 ){ |
| 167 | if ((c2==0xf0 && c<0x90)||(c2==0xe0 && c<0xa0) ){ |
| 168 | return LOOK_INVALID; /* Invalid UTF-8, too short */ |
| 169 | } |
| 170 | c = (c2<<1)|3; |
| 171 | }else{ |
| 172 | c = ' '; |
| 173 | } |
| 174 | } |
| 175 | } |
| 176 | return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */ |
| 177 | } |
| 178 | |
| 179 | |
| 180 | /* |
| 181 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 182 | */ |
| 183 | #ifndef WCHAR_T |
| @@ -405,11 +448,11 @@ | |
| 405 | fUnicode = could_be_utf16(&blob, 0) || fForceUtf16; |
| 406 | } |
| 407 | if( fUnicode ){ |
| 408 | lookFlags = looks_like_utf16(&blob, bRevUtf16, 0); |
| 409 | }else{ |
| 410 | lookFlags = looks_like_utf8(&blob, 0)|invalid_utf8(&blob); |
| 411 | } |
| 412 | } |
| 413 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 414 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 415 | fossil_print("Starts with UTF-16 BOM: %s\n", |
| 416 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -132,25 +132,72 @@ | |
| 132 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 133 | } |
| 134 | return flags; |
| 135 | } |
| 136 | |
| 137 | /* |
| 138 | ** Checks for proper UTF-8. It uses the method described in: |
| 139 | ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences |
| 140 | ** except for the "overlong form" of \u0000 which is not considered invalid |
| 141 | ** here: Some languages like Java and Tcl use it. This function also |
| 142 | ** considers valid the derivatives CESU-8 & WTF-8 (as described in the |
| 143 | ** same wikipedia article referenced previously). For UTF-8 characters |
| 144 | ** > 7f, the variable 'c2' not necessary means the previous character. |
| 145 | ** It's number of higher 1-bits indicate the number of continuation bytes |
| 146 | ** that are expected to be followed. E.g. when 'c2' has a value in the range |
| 147 | ** 0xc0..0xdf it means that 'c' is expected to contain the last continuation |
| 148 | ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one |
| 149 | ** more continuation byte is expected. |
| 150 | */ |
| 151 | |
| 152 | /* definitions for various UTF-8 sequence lengths */ |
| 153 | static const unsigned char us2a[] = { /* for lead byte 0xC0 */ |
| 154 | 0x80, 0x80 |
| 155 | }; |
| 156 | static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */ |
| 157 | 0x80, 0xBF |
| 158 | }; |
| 159 | static const unsigned char us3a[] = { /* for lead byte 0xE0 */ |
| 160 | 0xA0, 0xBF |
| 161 | }; |
| 162 | static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */ |
| 163 | 0x80, 0xBF |
| 164 | }; |
| 165 | static const unsigned char us4a[] = { /* for lead byte 0xF0 */ |
| 166 | 0x90, 0xBF |
| 167 | }; |
| 168 | static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */ |
| 169 | 0x80, 0xBF |
| 170 | }; |
| 171 | static const unsigned char us4c[] = { /* for lead byte 0xF4 */ |
| 172 | 0x80, 0x8F |
| 173 | }; |
| 174 | |
| 175 | /* a table used for quick lookup of the definition that goes with a |
| 176 | * particular lead byte */ |
| 177 | static const unsigned char* const lb_tab[] = { |
| 178 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 179 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 180 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 181 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 182 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 183 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 184 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 185 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 186 | us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b, |
| 187 | us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, |
| 188 | us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, |
| 189 | us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, |
| 190 | us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b, |
| 191 | us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b, |
| 192 | us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL, |
| 193 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL |
| 194 | }; |
| 195 | |
| 196 | int invalid_utf8( |
| 197 | const Blob *pContent |
| 198 | ){ |
| 199 | const unsigned char *z = (unsigned char *) blob_buffer(pContent); |
| 200 | unsigned int n = blob_size(pContent); |
| 201 | unsigned char c, c2; |
| 202 | |
| 203 | if( n==0 ) return 0; /* Empty file -> OK */ |
| @@ -157,27 +204,23 @@ | |
| 204 | c = *z; |
| 205 | while( --n>0 ){ |
| 206 | c2 = c; |
| 207 | c = *++z; |
| 208 | if( c2>=0x80 ){ |
| 209 | const unsigned char *def = lb_tab[(c2)-0x80]; |
| 210 | if( !def || (c<*def++) || (c>*def++) ){ |
| 211 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 212 | } |
| 213 | if( c2>=0xe0 ){ |
| 214 | c = (c2<<1)|3; |
| 215 | }else{ |
| 216 | c = ' '; |
| 217 | } |
| 218 | } |
| 219 | } |
| 220 | return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */ |
| 221 | } |
| 222 | |
| 223 | /* |
| 224 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 225 | */ |
| 226 | #ifndef WCHAR_T |
| @@ -405,11 +448,11 @@ | |
| 448 | fUnicode = could_be_utf16(&blob, 0) || fForceUtf16; |
| 449 | } |
| 450 | if( fUnicode ){ |
| 451 | lookFlags = looks_like_utf16(&blob, bRevUtf16, 0); |
| 452 | }else{ |
| 453 | lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob); |
| 454 | } |
| 455 | } |
| 456 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 457 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 458 | fossil_print("Starts with UTF-16 BOM: %s\n", |
| 459 |