| | @@ -69,13 +69,15 @@ |
| 69 | 69 | ** Output flags for the looks_like_utf8() and looks_like_utf16() routines used |
| 70 | 70 | ** to convey status information about the blob content. |
| 71 | 71 | */ |
| 72 | 72 | #define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */ |
| 73 | 73 | #define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */ |
| 74 | | -#define LOOK_LF ((int)0x00000002) /* One or more LF chars were found. */ |
| 75 | | -#define LOOK_CRLF ((int)0x00000004) /* One or more CR/LF pairs were found. */ |
| 76 | | -#define LOOK_LENGTH ((int)0x00000008) /* An over length line was found. */ |
| 74 | +#define LOOK_CR ((int)0x00000002) /* One or more CR chars were found. */ |
| 75 | +#define LOOK_LF ((int)0x00000004) /* One or more LF chars were found. */ |
| 76 | +#define LOOK_CRLF ((int)0x00000008) /* One or more CR/LF pairs were found. */ |
| 77 | +#define LOOK_LENGTH ((int)0x00000010) /* An over length line was found. */ |
| 78 | +#define LOOK_ODD ((int)0x00000020) /* An odd number of bytes was found. */ |
| 77 | 79 | #endif /* INTERFACE */ |
| 78 | 80 | |
| 79 | 81 | /* |
| 80 | 82 | ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes) |
| 81 | 83 | */ |
| | @@ -217,31 +219,34 @@ |
| 217 | 219 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 218 | 220 | ** switches between UTF-8 and other encodings occur. |
| 219 | 221 | ** |
| 220 | 222 | ** The only code points that this function cares about are the NUL character, |
| 221 | 223 | ** carriage-return, and line-feed. |
| 224 | +** |
| 225 | +** Whether or not this function examines the entire contents of the blob is |
| 226 | +** officially unspecified. |
| 222 | 227 | ** |
| 223 | 228 | ************************************ WARNING ********************************** |
| 224 | 229 | */ |
| 225 | 230 | int looks_like_utf8(const Blob *pContent, int *pFlags){ |
| 226 | 231 | const char *z = blob_buffer(pContent); |
| 227 | 232 | unsigned int n = blob_size(pContent); |
| 228 | | - int j, c; |
| 233 | + int j, c, result = 1; /* Assume UTF-8 text, prove otherwise */ |
| 229 | 234 | |
| 230 | 235 | if( pFlags ) *pFlags = LOOK_NONE; /* Flags are reset before any check */ |
| 231 | | - if( n==0 ) return 1; /* Empty file -> text */ |
| 236 | + if( n==0 ) return result; /* Empty file -> text */ |
| 232 | 237 | c = *z; /* NOTE(review): first byte is tested for NUL and LF only, never CR */ |
| 233 | 238 | if( c==0 ){ |
| 234 | 239 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 235 | | - return 0; /* NUL character in a file -> binary */ |
| 240 | + result = 0; /* NUL character in a file -> binary; scanning continues */ |
| 236 | 241 | } |
| 237 | 242 | j = (c!='\n'); /* j = bytes seen so far on the current line */ |
| 238 | 243 | while( --n>0 ){ /* Visit the remaining n-1 bytes */ |
| 239 | 244 | c = *++z; ++j; |
| 240 | 245 | if( c==0 ){ |
| 241 | 246 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 242 | | - return 0; /* NUL character in a file -> binary */ |
| 247 | + result = 0; /* NUL character in a file -> binary; scanning continues */ |
| 243 | 248 | } |
| 244 | 249 | if( c=='\n' ){ |
| 245 | 250 | int c2 = z[-1]; /* Byte immediately before this LF */ |
| 246 | 251 | if( pFlags ){ |
| 247 | 252 | *pFlags |= LOOK_LF; |
| | @@ -249,20 +254,22 @@ |
| 249 | 254 | *pFlags |= LOOK_CRLF; |
| 250 | 255 | } |
| 251 | 256 | } |
| 252 | 257 | if( j>LENGTH_MASK ){ |
| 253 | 258 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 254 | | - return 0; /* Very long line -> binary */ |
| 259 | + result = 0; /* Very long line -> binary; scanning continues */ |
| 255 | 260 | } |
| 256 | 261 | j = 0; /* A new line starts after the LF */ |
| 262 | + }else if( c=='\r' ){ |
| 263 | + if( pFlags ) *pFlags |= LOOK_CR; /* Any CR, followed by LF or not */ |
| 257 | 264 | } |
| 258 | 265 | } |
| 259 | 266 | if( j>LENGTH_MASK ){ /* Final line, which may lack a trailing LF */ |
| 260 | 267 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 261 | | - return 0; /* Very long line -> binary */ |
| 268 | + result = 0; /* Very long line -> binary */ |
| 262 | 269 | } |
| 263 | | - return 1; /* No problems seen -> not binary */ |
| 270 | + return result; /* 1 = no problems seen (not binary), 0 = binary */ |
| 264 | 271 | } |
| 265 | 272 | |
| 266 | 273 | /* |
| 267 | 274 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 268 | 275 | */ |
| | @@ -311,32 +318,38 @@ |
| 311 | 318 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 312 | 319 | ** switches between the UTF-16be and UTF-16le encodings occur. |
| 313 | 320 | ** |
| 314 | 321 | ** The only code points that this function cares about are the NUL character, |
| 315 | 322 | ** carriage-return, and line-feed. |
| 323 | +** |
| 324 | +** Whether or not this function examines the entire contents of the blob is |
| 325 | +** officially unspecified. |
| 316 | 326 | ** |
| 317 | 327 | ************************************ WARNING ********************************** |
| 318 | 328 | */ |
| 319 | 329 | int looks_like_utf16(const Blob *pContent, int *pFlags){ |
| 320 | 330 | const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
| 321 | 331 | unsigned int n = blob_size(pContent); |
| 322 | | - int j, c; |
| 332 | + int j, c, result = 1; /* Assume UTF-16 text, prove otherwise */ |
| 323 | 333 | |
| 324 | 334 | if( pFlags ) *pFlags = LOOK_NONE; /* Flags are reset before any check */ |
| 325 | | - if( n==0 ) return 1; /* Empty file -> text */ |
| 326 | | - if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */ |
| 335 | + if( n==0 ) return result; /* Empty file -> text */ |
| 336 | + if( n%2 ){ |
| 337 | + if( pFlags ) *pFlags |= LOOK_ODD; |
| 338 | + return 0; /* Odd number of bytes -> binary (or UTF-8); content is not scanned */ |
| 339 | + } |
| 327 | 340 | c = *z; /* NOTE(review): first unit is tested for NUL and LF only, never CR */ |
| 328 | 341 | if( c==0 ){ |
| 329 | 342 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 330 | | - return 0; /* NUL character in a file -> binary */ |
| 343 | + result = 0; /* NUL character in a file -> binary; scanning continues */ |
| 331 | 344 | } |
| 332 | 345 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); /* j = WCHAR_T units seen so far on the current line */ |
| 333 | 346 | while( (n-=2)>0 ){ /* n is even here, so the unsigned count steps down to exactly 0 */ |
| 334 | 347 | c = *++z; ++j; |
| 335 | 348 | if( c==0 ){ |
| 336 | 349 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 337 | | - return 0; /* NUL character in a file -> binary */ |
| 350 | + result = 0; /* NUL character in a file -> binary; scanning continues */ |
| 338 | 351 | } |
| 339 | 352 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 340 | 353 | int c2 = z[-1]; /* Unit immediately before this LF */ |
| 341 | 354 | if( pFlags ){ |
| 342 | 355 | *pFlags |= LOOK_LF; |
| | @@ -344,20 +357,22 @@ |
| 344 | 357 | *pFlags |= LOOK_CRLF; |
| 345 | 358 | } |
| 346 | 359 | } |
| 347 | 360 | if( j>UTF16_LENGTH_MASK ){ |
| 348 | 361 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 349 | | - return 0; /* Very long line -> binary */ |
| 362 | + result = 0; /* Very long line -> binary; scanning continues */ |
| 350 | 363 | } |
| 351 | 364 | j = 0; /* A new line starts after the LF */ |
| 365 | + }else if( c==UTF16BE_CR || c==UTF16LE_CR ){ |
| 366 | + if( pFlags ) *pFlags |= LOOK_CR; /* Any CR, followed by LF or not */ |
| 352 | 367 | } |
| 353 | 368 | } |
| 354 | 369 | if( j>UTF16_LENGTH_MASK ){ /* Final line, which may lack a trailing LF */ |
| 355 | 370 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 356 | | - return 0; /* Very long line -> binary */ |
| 371 | + result = 0; /* Very long line -> binary */ |
| 357 | 372 | } |
| 358 | | - return 1; /* No problems seen -> not binary */ |
| 373 | + return result; /* 1 = no problems seen (not binary), 0 = binary */ |
| 359 | 374 | } |
| 360 | 375 | |
| 361 | 376 | /* |
| 362 | 377 | ** This function returns an array of bytes representing the byte-order-mark |
| 363 | 378 | ** for UTF-8. |
| | @@ -395,23 +410,24 @@ |
| 395 | 410 | const Blob *pContent, /* IN: Blob content to perform BOM detection on. */ |
| 396 | 411 | int *pnByte, /* OUT: The number of bytes used for the BOM. */ |
| 397 | 412 | int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */ |
| 398 | 413 | ){ |
| 399 | 414 | const unsigned short *z = (unsigned short *)blob_buffer(pContent); |
| 415 | + int bomSize = sizeof(unsigned short); |
| 400 | 416 | int size = blob_size(pContent); |
| 401 | 417 | |
| 402 | | - if( (size<2) || (size%2) |
| 403 | | - || (size>=4 && z[1]==0) ) return 0; |
| 404 | | - if( z[0] == 0xfffe ){ |
| 418 | + if( size<bomSize ) return 0; /* No: cannot read BOM. */ |
| 419 | + if( size>=(2*bomSize) && z[1]==0 ) return 0; /* No: possible UTF-32. */ |
| 420 | + if( z[0]==0xfffe ){ |
| 405 | 421 | if( pbReverse ) *pbReverse = 1; |
| 406 | | - }else if( z[0] == 0xfeff ){ |
| 422 | + }else if( z[0]==0xfeff ){ |
| 407 | 423 | if( pbReverse ) *pbReverse = 0; |
| 408 | 424 | }else{ |
| 409 | | - return 0; |
| 425 | + return 0; /* No: UTF-16 byte-order-mark not found. */ |
| 410 | 426 | } |
| 411 | | - if( pnByte ) *pnByte = 2; |
| 412 | | - return 1; |
| 427 | + if( pnByte ) *pnByte = bomSize; |
| 428 | + return 1; /* Yes. */ |
| 413 | 429 | } |
| 414 | 430 | |
| 415 | 431 | /* |
| 416 | 432 | ** Return true if two DLine elements are identical. |
| 417 | 433 | */ |
| | @@ -2450,5 +2466,38 @@ |
| 2450 | 2466 | for(i=0; i<ann.nOrig; i++){ |
| 2451 | 2467 | fossil_print("%s: %.*s\n", |
| 2452 | 2468 | ann.aOrig[i].zSrc, ann.aOrig[i].n, ann.aOrig[i].z); |
| 2453 | 2469 | } |
| 2454 | 2470 | } |
| 2471 | + |
| 2472 | +/* |
| 2473 | +** COMMAND: test-looks-like-utf |
| 2474 | +** |
| 2475 | +** Usage: %fossil test-looks-like-utf FILENAME |
| 2476 | +** |
| 2477 | +** FILENAME is the name of a file to check for textual content in the UTF-8 |
| 2478 | +** and/or UTF-16 encodings. |
| 2479 | +*/ |
| | 2480 | +void looks_like_utf_test_cmd(void){ |
| | 2481 | + Blob blob; /* contents of the file named by g.argv[2] */ |
| | 2482 | + int eType; /* result of looks_like_utf16() if a UTF-16 BOM was found, else of looks_like_utf8() */ |
| | 2483 | + int fUtf8; /* return value of starts_with_utf8_bom() */ |
| | 2484 | + int fUtf16; /* return value of starts_with_utf16_bom() */ |
| | 2485 | + int lookFlags; /* LOOK_* output flags from whichever looks_like_utf8/utf16() call ran */ |
| | 2486 | + if( g.argc<3 ) usage("FILENAME"); /* A FILENAME argument is required */ |
| | 2487 | + blob_read_from_file(&blob, g.argv[2]); |
| | 2488 | + fUtf8 = starts_with_utf8_bom(&blob, 0); |
| | 2489 | + fUtf16 = starts_with_utf16_bom(&blob, 0, 0); |
| | 2490 | + eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) : /* The UTF-16 BOM test selects the scanner */ |
| | 2491 | + looks_like_utf8(&blob, &lookFlags); |
| | 2492 | + fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| | 2493 | + fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| | 2494 | + fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no"); |
| | 2495 | + fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no"); |
| | 2496 | + fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no"); |
| | 2497 | + fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no"); |
| | 2498 | + fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no"); |
| | 2499 | + fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no"); |
| | 2500 | + fossil_print("Has flag LOOK_LENGTH: %s\n",(lookFlags&LOOK_LENGTH)?"yes":"no"); |
| | 2501 | + fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no"); |
| | 2502 | + blob_reset(&blob); /* presumably releases the blob's storage -- confirm against blob.c */ |
| | 2503 | +} |
| 2455 | 2504 | |