Fossil SCM
proposed new invalid_utf8 function
Commit
e58334a00799c303c6ad187e6c84c2151457b146
Parent
314cdab0d49d742…
1 file changed
+73
-36
+73
-36
| --- src/lookslike.c | ||
| +++ src/lookslike.c | ||
| @@ -136,49 +136,86 @@ | ||
| 136 | 136 | |
| 137 | 137 | |
| 138 | 138 | /* |
| 139 | 139 | ** Checks for proper UTF-8. It uses the method described in: |
| 140 | 140 | ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences |
| 141 | -** except for the "overlong form" of \u0000, which is not considered invalid | |
| 142 | -** here: some languages like Java and Tcl use it. For UTF-8 characters | |
| 143 | -** > 0x7f, the variable 'c2' does not necessarily hold the previous character. | |
| 144 | -** Its number of high-order 1-bits indicates how many continuation bytes | |
| 145 | -** are expected to follow. E.g. when 'c2' has a value in the range | |
| 146 | -** 0xc0..0xdf it means that 'c' is expected to contain the last continuation | |
| 147 | -** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one | |
| 148 | -** more continuation byte is expected. Returns LOOK_INVALID for invalid UTF-8, else 0. | |
| 141 | +** except for the "overlong form" of \u0000 (Modified UTF-8), | |
| 142 | +** which is not considered invalid here: some languages like | |
| 143 | +** Java and Tcl use it. This function also considers valid | |
| 144 | +** the derivatives CESU-8 & WTF-8 (as described in the same | |
| 145 | +** Wikipedia article referenced above). Returns LOOK_INVALID for invalid UTF-8, else 0. | |
| 149 | 146 | */ |
| 150 | 147 | |
| 151 | -int invalid_utf8(const Blob *pContent){ | |
| 152 | - const unsigned char *z = (unsigned char *) blob_buffer(pContent); | |
| 148 | +int invalid_utf8(const Blob *pContent) | |
| 149 | +{ | |
| 150 | + /* definitions for the various UTF-8 sequence forms: { length, lo, hi, ... } with one inclusive lo/hi pair per byte, lead byte first */ | |
| 151 | + static unsigned char def_1a[] = { 1, 0x00, 0x7F }; | |
| 152 | + static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 }; | |
| 153 | + static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF }; | |
| 154 | + static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF }; | |
| 155 | + static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF }; | |
| 156 | + static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF }; | |
| 157 | + static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF }; | |
| 158 | + static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF }; | |
| 159 | + | |
| 160 | + /* a NULL-terminated array of all the definitions */ | |
| 161 | + static unsigned char* def_arr[] = { def_1a, def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL }; | |
| 162 | + | |
| 163 | + /* a table used for quick lookup of the definition that goes with a particular lead byte */ | |
| 164 | + static unsigned char* lb_tab[256] = { NULL }; | |
| 165 | + | |
| 166 | + /* a pointer to the table; NULL means not yet set up */ | |
| 167 | + static unsigned char** lb_ptr = NULL; | |
| 168 | + | |
| 169 | + /* if the table pointer hasn't been initialized, fill in the table (first call only) */ | |
| 170 | + if (lb_ptr == NULL) | |
| 171 | + { | |
| 172 | + lb_ptr = lb_tab; | |
| 173 | + | |
| 174 | + /* for each definition, point every lead byte in its lo..hi range at that definition; NOTE(review): 'pp' is declared after a statement (needs C99 or later), and the unsigned char index 'i' relies on hi < 0xFF to terminate */ | |
| 175 | + unsigned char** pp = def_arr; | |
| 176 | + while (*pp != NULL) | |
| 177 | + { | |
| 178 | + unsigned char lo = pp[0][1]; | |
| 179 | + unsigned char hi = pp[0][2]; | |
| 180 | + unsigned char i; | |
| 181 | + for (i = lo; i <= hi; ++i) | |
| 182 | + lb_ptr[i] = pp[0]; | |
| 183 | + ++pp; | |
| 184 | + } | |
| 185 | + } | |
| 186 | + | |
| 187 | + /* buffer pointer and size; NOTE(review): another declaration after statements (needs C99 or later) */ | |
| 188 | + const unsigned char *z = (unsigned char *)blob_buffer(pContent); | |
| 153 | 189 | unsigned int n = blob_size(pContent); |
| 154 | - unsigned char c, c2; | |
| 155 | - | |
| 156 | - if( n==0 ) return 0; /* Empty file -> OK */ | |
| 157 | - c = *z; | |
| 158 | - while( --n>0 ){ | |
| 159 | - c2 = c; | |
| 160 | - c = *++z; | |
| 161 | - if( c2>=0x80 ){ | |
| 162 | - if( ((c2<0xc2) || (c2>=0xf4) || ((c&0xc0)!=0x80)) && | |
| 163 | - (((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80))) ){ | |
| 164 | - return LOOK_INVALID; /* Invalid UTF-8 */ | |
| 165 | - } | |
| 166 | - /* The first byte of the sequence is okay, | |
| 167 | - ** but we still need to check the rest: | |
| 168 | - ** replace 'c' with a lead byte of the next shorter sequence, | |
| 169 | - ** or with a plain space character if the two-byte sequence was valid. | |
| 170 | - */ | |
| 171 | - c = (c2 >= 0xe0) ? (c2<<1)+1 : ' '; | |
| 172 | - /* Edge case: if a three-byte sequence started with 0xe0, | |
| 173 | - ** 'c' becomes 0xc1, which the lead-byte check above would reject, | |
| 174 | - ** so fix it up to be the start of a valid two-byte sequence. | |
| 175 | - */ | |
| 176 | - if (c == 0xc1) c = 0xc2; | |
| 177 | - } | |
| 178 | - } | |
| 179 | - return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */ | |
| 190 | + | |
| 191 | + /* while we haven't checked all the bytes in the buffer */ | |
| 192 | + while (n > 0) | |
| 193 | + { | |
| 194 | + /* get the definition for this lead byte */ | |
| 195 | + unsigned char* def = lb_ptr[*z]; | |
| 196 | + unsigned char i; | |
| 197 | + | |
| 198 | + /* if no definition exists for this lead byte (0x80..0xBF, 0xC1, 0xF5..0xFF), or there aren't enough bytes left, return invalid */ | |
| 199 | + if (!def || (n < def[0])) | |
| 200 | + return LOOK_INVALID; | |
| 201 | + | |
| 202 | + /* we already know byte #0 is good, so check the remaining bytes */ | |
| 203 | + for (i = 1; i < def[0]; ++i) | |
| 204 | + { | |
| 205 | + /* def[1 + i*2] and def[2 + i*2] are the inclusive bounds for byte #i; if the byte is outside them, return invalid */ | |
| 206 | + if ((z[i] < def[1 + i * 2 + 0]) || (z[i] > def[1 + i * 2 + 1])) | |
| 207 | + return LOOK_INVALID; | |
| 208 | + } | |
| 209 | + | |
| 210 | + /* advance to the next sequence */ | |
| 211 | + z += def[0]; | |
| 212 | + n -= def[0]; | |
| 213 | + } | |
| 214 | + | |
| 215 | + /* we made it all the way through the buffer so it's not invalid */ | |
| 216 | + return 0; | |
| 180 | 217 | } |
| 181 | 218 | |
| 182 | 219 | |
| 183 | 220 | /* |
| 184 | 221 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 185 | 222 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -136,49 +136,86 @@ | |
| 136 | |
| 137 | |
| 138 | /*
| 139 | ** Checks for proper UTF-8. It uses the method described in:
| 140 | ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
| 141 | ** except for the "overlong form" of \u0000, which is not considered invalid |
| 142 | ** here: some languages like Java and Tcl use it. For UTF-8 characters |
| 143 | ** > 0x7f, the variable 'c2' does not necessarily hold the previous character. |
| 144 | ** Its number of high-order 1-bits indicates how many continuation bytes |
| 145 | ** are expected to follow. E.g. when 'c2' has a value in the range |
| 146 | ** 0xc0..0xdf it means that 'c' is expected to contain the last continuation |
| 147 | ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one |
| 148 | ** more continuation byte is expected. Returns LOOK_INVALID for invalid UTF-8, else 0. |
| 149 | */
| 150 | |
| 151 | int invalid_utf8(const Blob *pContent){ |
| 152 | const unsigned char *z = (unsigned char *) blob_buffer(pContent); |
| 153 | unsigned int n = blob_size(pContent);
| 154 | unsigned char c, c2; |
| 155 | |
| 156 | if( n==0 ) return 0; /* Empty file -> OK */ |
| 157 | c = *z; |
| 158 | while( --n>0 ){ |
| 159 | c2 = c; |
| 160 | c = *++z; |
| 161 | if( c2>=0x80 ){ |
| 162 | if( ((c2<0xc2) || (c2>=0xf4) || ((c&0xc0)!=0x80)) && |
| 163 | (((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80))) ){ |
| 164 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 165 | } |
| 166 | /* The first byte of the sequence is okay, |
| 167 | ** but we still need to check the rest: |
| 168 | ** replace 'c' with a lead byte of the next shorter sequence, |
| 169 | ** or with a plain space character if the two-byte sequence was valid. |
| 170 | */ |
| 171 | c = (c2 >= 0xe0) ? (c2<<1)+1 : ' '; |
| 172 | /* Edge case: if a three-byte sequence started with 0xe0, |
| 173 | ** 'c' becomes 0xc1, which the lead-byte check above would reject, |
| 174 | ** so fix it up to be the start of a valid two-byte sequence. |
| 175 | */ |
| 176 | if (c == 0xc1) c = 0xc2; |
| 177 | } |
| 178 | } |
| 179 | return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */ |
| 180 | }
| 181 | |
| 182 | |
| 183 | /* |
| 184 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 185 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -136,49 +136,86 @@ | |
| 136 | |
| 137 | |
| 138 | /*
| 139 | ** Checks for proper UTF-8. It uses the method described in:
| 140 | ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
| 141 | ** except for the "overlong form" of \u0000 (Modified UTF-8), |
| 142 | ** which is not considered invalid here: some languages like |
| 143 | ** Java and Tcl use it. This function also considers valid |
| 144 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 145 | ** Wikipedia article referenced above). Returns LOOK_INVALID for invalid UTF-8, else 0. |
| 146 | */
| 147 |
| 148 | int invalid_utf8(const Blob *pContent) |
| 149 | { |
| 150 | /* definitions for the various UTF-8 sequence forms: { length, lo, hi, ... } with one inclusive lo/hi pair per byte, lead byte first */ |
| 151 | static unsigned char def_1a[] = { 1, 0x00, 0x7F }; |
| 152 | static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 }; |
| 153 | static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF }; |
| 154 | static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF }; |
| 155 | static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF }; |
| 156 | static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF }; |
| 157 | static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF }; |
| 158 | static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF }; |
| 159 | |
| 160 | /* a NULL-terminated array of all the definitions */ |
| 161 | static unsigned char* def_arr[] = { def_1a, def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL }; |
| 162 | |
| 163 | /* a table used for quick lookup of the definition that goes with a particular lead byte */ |
| 164 | static unsigned char* lb_tab[256] = { NULL }; |
| 165 | |
| 166 | /* a pointer to the table; NULL means not yet set up */ |
| 167 | static unsigned char** lb_ptr = NULL; |
| 168 | |
| 169 | /* if the table pointer hasn't been initialized, fill in the table (first call only) */ |
| 170 | if (lb_ptr == NULL) |
| 171 | { |
| 172 | lb_ptr = lb_tab; |
| 173 | |
| 174 | /* for each definition, point every lead byte in its lo..hi range at that definition; NOTE(review): 'pp' is declared after a statement (needs C99 or later), and the unsigned char index 'i' relies on hi < 0xFF to terminate */ |
| 175 | unsigned char** pp = def_arr; |
| 176 | while (*pp != NULL) |
| 177 | { |
| 178 | unsigned char lo = pp[0][1]; |
| 179 | unsigned char hi = pp[0][2]; |
| 180 | unsigned char i; |
| 181 | for (i = lo; i <= hi; ++i) |
| 182 | lb_ptr[i] = pp[0]; |
| 183 | ++pp; |
| 184 | } |
| 185 | } |
| 186 | |
| 187 | /* buffer pointer and size; NOTE(review): another declaration after statements (needs C99 or later) */ |
| 188 | const unsigned char *z = (unsigned char *)blob_buffer(pContent); |
| 189 | unsigned int n = blob_size(pContent);
| 190 | |
| 191 | /* while we haven't checked all the bytes in the buffer */ |
| 192 | while (n > 0) |
| 193 | { |
| 194 | /* get the definition for this lead byte */ |
| 195 | unsigned char* def = lb_ptr[*z]; |
| 196 | unsigned char i; |
| 197 | |
| 198 | /* if no definition exists for this lead byte (0x80..0xBF, 0xC1, 0xF5..0xFF), or there aren't enough bytes left, return invalid */ |
| 199 | if (!def || (n < def[0])) |
| 200 | return LOOK_INVALID; |
| 201 | |
| 202 | /* we already know byte #0 is good, so check the remaining bytes */ |
| 203 | for (i = 1; i < def[0]; ++i) |
| 204 | { |
| 205 | /* def[1 + i*2] and def[2 + i*2] are the inclusive bounds for byte #i; if the byte is outside them, return invalid */ |
| 206 | if ((z[i] < def[1 + i * 2 + 0]) || (z[i] > def[1 + i * 2 + 1])) |
| 207 | return LOOK_INVALID; |
| 208 | } |
| 209 | |
| 210 | /* advance to the next sequence */ |
| 211 | z += def[0]; |
| 212 | n -= def[0]; |
| 213 | } |
| 214 | |
| 215 | /* we made it all the way through the buffer so it's not invalid */ |
| 216 | return 0; |
| 217 | }
| 218 | |
| 219 | |
| 220 | /* |
| 221 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 222 |