| | @@ -168,10 +168,46 @@ |
| 168 | 168 | |
| 169 | 169 | /* Return results */ |
| 170 | 170 | *pnLine = nLine; |
| 171 | 171 | return a; |
| 172 | 172 | } |
| 173 | + |
| 174 | +/* |
| 175 | +** Macro which checks for proper UTF-8 when the lead byte c is >= 0x80. |
| 176 | +** It uses the method described in: |
| 177 | +** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences |
| 178 | +** except that the "overlong form" is not considered invalid, because |
| 179 | +** some languages like Java and Tcl use it. |
| 180 | +** |
| 181 | +** The macro uses the caller's locals result, n, j and z; z points at c. |
| 182 | +** An invalid sequence sets the 0x04 bit of result (result |= 4) and leaves |
| 183 | +** n, j and z alone; a valid one steps them past its continuation bytes. |
| 184 | + */ |
| 185 | +#define CHECKUTF8(c) \ |
| 186 | +if( c<0xC0 ){ \ |
| 187 | + result |= 4; /* Continuation byte in lead position, keep scanning */ \ |
| 188 | +}else if( c<0xE0 ){ \ |
| 189 | + if( n<2 || ((z[1]&0xC0)!=0x80) ){ \ |
| 190 | + result |= 4; /* Truncated or bad 2-byte sequence, keep scanning */ \ |
| 191 | + }else{ \ |
| 192 | + --n; ++j; ++z; /* step over 1 continuation byte */ \ |
| 193 | + } \ |
| 194 | +}else if( c<0xF0 ){ \ |
| 195 | + if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){ \ |
| 196 | + result |= 4; /* Truncated or bad 3-byte sequence, keep scanning */ \ |
| 197 | + }else{ \ |
| 198 | + n-=2; j+=2; z+=2; /* step over 2 continuation bytes */ \ |
| 199 | + } \ |
| 200 | +}else if( c<0xF8 ){ \ |
| 201 | + if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){ \ |
| 202 | + result |= 4; /* Truncated or bad 4-byte sequence, keep scanning */ \ |
| 203 | + }else{ \ |
| 204 | + n-=3; j+=3; z+=3; /* step over 3 continuation bytes */ \ |
| 205 | + } \ |
| 206 | +}else{ \ |
| 207 | + result |= 4; /* c>=0xF8 is not accepted as a lead byte, keep scanning */ \ |
| 208 | +} |
| 173 | 209 | |
| 174 | 210 | /* |
| 175 | 211 | ** This function attempts to scan each logical line within the blob to |
| 176 | 212 | ** determine the type of content it appears to contain. Possible return |
| 177 | 213 | ** values are: |
| | @@ -195,14 +231,11 @@ |
| 195 | 231 | ** delimited by carriage-return, line-feed pairs; however, the |
| 196 | 232 | ** encoding is not UTF-8 or ASCII. |
| 197 | 233 | ** |
| 198 | 234 | ************************************ WARNING ********************************** |
| 199 | 235 | ** |
| 200 | | -** This function does not validate that the blob content is properly formed |
| 201 | | -** UTF-8. It assumes that all code points are the same size. It does not |
| 202 | | -** validate any code points. It makes no attempt to detect if any [invalid] |
| 203 | | -** switches between UTF-8 and other encodings occur. |
| 236 | +** This function does not validate any code points. |
| 204 | 237 | ** |
| 205 | 238 | ** The only code points that this function cares about are the NUL character, |
| 206 | 239 | ** carriage-return, and line-feed. |
| 207 | 240 | ** |
| 208 | 241 | ************************************ WARNING ********************************** |
| | @@ -218,67 +251,29 @@ |
| 218 | 251 | /* Check individual lines. |
| 219 | 252 | */ |
| 220 | 253 | if( n==0 ) return 1; /* Empty file -> text */ |
| 221 | 254 | c = *z; |
| 222 | 255 | j = (c!='\n'); |
| 223 | | - if( c<0x80 ){ |
| 224 | | - if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 225 | | - }else if( c<0xC0 ){ |
| 226 | | - result |= 4; /* Invalid UTF-8, continue */ |
| 227 | | - }else if( c<0xE0 ){ |
| 228 | | - if( n<2 || ((z[1]&0xC0)!=0x80) ){ |
| 229 | | - result |= 4; /* Invalid 2-byte UTF-8, continue */ |
| 230 | | - }else{ |
| 231 | | - --n; ++j; ++z; |
| 232 | | - } |
| 233 | | - }else if( c<0xF0 ){ |
| 234 | | - if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){ |
| 235 | | - result |= 4; /* Invalid 3-byte UTF-8, continue */ |
| 236 | | - }else{ |
| 237 | | - n-=2; j+=2; z+=2; |
| 238 | | - } |
| 239 | | - }else if( c<0xF8 ){ |
| 240 | | - if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){ |
| 241 | | - result |= 4; /* Invalid 4-byte UTF-8, continue */ |
| 242 | | - }else{ |
| 243 | | - n-=3; j+=3; z+=3; |
| 244 | | - } |
| 245 | | - }else{ |
| 246 | | - result |= 4; /* Invalid multi-byte UTF-8, continue */ |
| 256 | + if( c>=0x80 ){ |
| 257 | + CHECKUTF8(c) |
| 258 | + } else if( c==0 ){ |
| 259 | + return 0; /* Zero byte in a file -> binary */ \ |
| 247 | 260 | } |
| 248 | 261 | while( --n>0 ){ |
| 249 | 262 | c = *++z; ++j; |
| 250 | | - if( c<0x80 ){ |
| 251 | | - if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 252 | | - if( c=='\n' ){ |
| 253 | | - if( z[-1]=='\r'){ |
| 254 | | - result |= 2; /* Contains CR/NL, continue */ |
| 255 | | - } |
| 256 | | - if( j>LENGTH_MASK ){ |
| 257 | | - return 0; /* Very long line -> binary */ |
| 258 | | - } |
| 259 | | - j = 0; |
| 260 | | - } |
| 261 | | - }else if( c<0xC0 ){ |
| 262 | | - result |= 4; /* Invalid UTF-8, continue */ |
| 263 | | - }else if( c<0xE0 ){ |
| 264 | | - if( n<2 || ((z[1]&0xC0)!=0x80) ){ |
| 265 | | - result |= 4; continue; /* Invalid 2-byte UTF-8, continue */ |
| 266 | | - } |
| 267 | | - --n; ++j; ++z; |
| 268 | | - }else if( c<0xF0 ){ |
| 269 | | - if( n<3 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) ){ |
| 270 | | - result |= 4; continue; /* Invalid 3-byte UTF-8, continue */ |
| 271 | | - } |
| 272 | | - n-=2; j+=2; z+=2; |
| 273 | | - }else if( c<0xF8 ){ |
| 274 | | - if( n<4 || ((z[1]&0xC0)!=0x80) || ((z[2]&0xC0)!=0x80) || ((z[3]&0xC0)!=0x80) ){ |
| 275 | | - result |= 4; continue; /* Invalid 4-byte UTF-8, continue */ |
| 276 | | - } |
| 277 | | - n-=3; j+=3; z+=3; |
| 278 | | - }else{ |
| 279 | | - result |= 4; /* Invalid multi-byte UTF-8, continue */ |
| 263 | + if( c>=0x80 ){ |
| 264 | + CHECKUTF8(c) |
| 265 | + } else if( c==0 ){ |
| 266 | + return 0; /* Zero byte in a file -> binary */ \ |
| 267 | + } else if( c=='\n' ){ |
| 268 | + if( z[-1]=='\r' ){ |
| 269 | + result |= 2; /* Contains CR/NL, continue */ |
| 270 | + } |
| 271 | + if( j>LENGTH_MASK ){ |
| 272 | + return 0; /* Very long line -> binary */ |
| 273 | + } |
| 274 | + j = 0; |
| 280 | 275 | } |
| 281 | 276 | } |
| 282 | 277 | if( j>LENGTH_MASK ){ |
| 283 | 278 | return 0; /* Very long line -> binary */ |
| 284 | 279 | } |
| 285 | 280 | |