Fossil SCM
Style and clarity revisions to the looks_like_utf*() functions. Correct off-by-one fix for the looks_like_utf16() function. Add --utf8 and --utf16 options to the 'test-looks-like-utf' command.
Commit
4ffaf2ee08a4bc54cd1f1ff2c5dd48aef2d7b491
Parent
bb4776e2e0fdfb6…
1 file changed
+31
-22
+31
-22
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -243,25 +243,26 @@ | ||
| 243 | 243 | unsigned int n = blob_size(pContent); |
| 244 | 244 | int j, c, flags = LOOK_NONE; /* Assume UTF-8 text, prove otherwise */ |
| 245 | 245 | |
| 246 | 246 | if( n==0 ) return flags; /* Empty file -> text */ |
| 247 | 247 | c = *z; |
| 248 | - j = (c!='\n'); | |
| 249 | - if( !j ){ | |
| 250 | - flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ | |
| 251 | - }else if( c==0 ){ | |
| 248 | + if( c==0 ){ | |
| 252 | 249 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 253 | 250 | }else if( c=='\r' ){ |
| 254 | 251 | flags |= LOOK_CR; |
| 255 | 252 | if( n<=1 || z[1]!='\n' ){ |
| 256 | 253 | flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ |
| 257 | 254 | } |
| 258 | 255 | } |
| 256 | + j = (c!='\n'); | |
| 257 | + if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ | |
| 259 | 258 | while( !(flags&stopFlags) && --n>0 ){ |
| 260 | 259 | int c2 = c; |
| 261 | 260 | c = *++z; ++j; |
| 262 | - if( c=='\n' ){ | |
| 261 | + if( c==0 ){ | |
| 262 | + flags |= LOOK_NUL; /* NUL character in a file -> binary */ | |
| 263 | + }else if( c=='\n' ){ | |
| 263 | 264 | flags |= LOOK_LF; |
| 264 | 265 | if( c2=='\r' ){ |
| 265 | 266 | flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */ |
| 266 | 267 | }else{ |
| 267 | 268 | flags |= LOOK_LONE_LF; |
| @@ -268,21 +269,19 @@ | ||
| 268 | 269 | } |
| 269 | 270 | if( j>LENGTH_MASK ){ |
| 270 | 271 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 271 | 272 | } |
| 272 | 273 | j = 0; |
| 273 | - }else if( c==0 ){ | |
| 274 | - flags |= LOOK_NUL; /* NUL character in a file -> binary */ | |
| 275 | 274 | }else if( c=='\r' ){ |
| 276 | 275 | flags |= LOOK_CR; |
| 277 | 276 | if( n<=1 || z[1]!='\n' ){ |
| 278 | 277 | flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ |
| 279 | 278 | } |
| 280 | 279 | } |
| 281 | 280 | } |
| 282 | 281 | if( n ){ |
| 283 | - flags |= LOOK_SHORT; /* Not the whole blob is examined */ | |
| 282 | + flags |= LOOK_SHORT; /* The whole blob was not examined */ | |
| 284 | 283 | } |
| 285 | 284 | if( j>LENGTH_MASK ){ |
| 286 | 285 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 287 | 286 | } |
| 288 | 287 | return flags; |
| @@ -358,31 +357,33 @@ | ||
| 358 | 357 | } |
| 359 | 358 | c = *z; |
| 360 | 359 | if( bReverse ){ |
| 361 | 360 | c = UTF16_SWAP(c); |
| 362 | 361 | } |
| 363 | - j = (c!='\n'); | |
| 364 | - if( !j ){ | |
| 365 | - flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ | |
| 366 | - }else if( c==0 ){ | |
| 362 | + if( c==0 ){ | |
| 367 | 363 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 368 | 364 | }else if( c=='\r' ){ |
| 369 | 365 | flags |= LOOK_CR; |
| 370 | - if( n<2*sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ | |
| 366 | + if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ | |
| 371 | 367 | flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ |
| 372 | 368 | } |
| 373 | 369 | } |
| 370 | + j = (c!='\n'); | |
| 371 | + if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ | |
| 374 | 372 | while( 1 ){ |
| 375 | 373 | int c2 = c; |
| 374 | + if( flags&stopFlags ) break; | |
| 376 | 375 | n -= sizeof(WCHAR_T); |
| 377 | - if( (flags&stopFlags) || n<sizeof(WCHAR_T) ) break; | |
| 376 | + if( n<sizeof(WCHAR_T) ) break; | |
| 378 | 377 | c = *++z; |
| 379 | 378 | if( bReverse ){ |
| 380 | 379 | c = UTF16_SWAP(c); |
| 381 | 380 | } |
| 382 | 381 | ++j; |
| 383 | - if( c=='\n' ){ | |
| 382 | + if( c==0 ){ | |
| 383 | + flags |= LOOK_NUL; /* NUL character in a file -> binary */ | |
| 384 | + }else if( c=='\n' ){ | |
| 384 | 385 | flags |= LOOK_LF; |
| 385 | 386 | if( c2=='\r' ){ |
| 386 | 387 | flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */ |
| 387 | 388 | }else{ |
| 388 | 389 | flags |= LOOK_LONE_LF; |
| @@ -389,21 +390,19 @@ | ||
| 389 | 390 | } |
| 390 | 391 | if( j>UTF16_LENGTH_MASK ){ |
| 391 | 392 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 392 | 393 | } |
| 393 | 394 | j = 0; |
| 394 | - }else if( c==0 ){ | |
| 395 | - flags |= LOOK_NUL; /* NUL character in a file -> binary */ | |
| 396 | 395 | }else if( c=='\r' ){ |
| 397 | 396 | flags |= LOOK_CR; |
| 398 | - if( n<2*sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ | |
| 397 | + if( n<(2*sizeof(WCHAR_T)) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ | |
| 399 | 398 | flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ |
| 400 | 399 | } |
| 401 | 400 | } |
| 402 | 401 | } |
| 403 | 402 | if( n ){ |
| 404 | - flags |= LOOK_SHORT; /* Not the whole blob is examined */ | |
| 403 | + flags |= LOOK_SHORT; /* The whole blob was not examined */ | |
| 405 | 404 | } |
| 406 | 405 | if( j>UTF16_LENGTH_MASK ){ |
| 407 | 406 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 408 | 407 | } |
| 409 | 408 | return flags; |
| @@ -440,11 +439,11 @@ | ||
| 440 | 439 | ** byte-order-mark (BOM), either in the endianness of the machine |
| 441 | 440 | ** or in reversed byte order. The UTF-32 BOM is ruled out by checking |
| 442 | 441 | ** if the UTF-16 BOM is not immediately followed by (utf16) 0. |
| 443 | 442 | ** pnByte is only set when the function returns 1. |
| 444 | 443 | ** |
| 445 | -** pbReverse is always set, even when no BOM is found. Without BOM, | |
| 444 | +** pbReverse is always set, even when no BOM is found. Without a BOM, | |
| 446 | 445 | ** it is set to 1 on little-endian and 0 on big-endian platforms. See |
| 447 | 446 | ** clause D98 of conformance (section 3.10) of the Unicode standard. |
| 448 | 447 | */ |
| 449 | 448 | int starts_with_utf16_bom( |
| 450 | 449 | const Blob *pContent, /* IN: Blob content to perform BOM detection on. */ |
| @@ -452,19 +451,19 @@ | ||
| 452 | 451 | int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */ |
| 453 | 452 | ){ |
| 454 | 453 | const unsigned short *z = (unsigned short *)blob_buffer(pContent); |
| 455 | 454 | int bomSize = sizeof(unsigned short); |
| 456 | 455 | int size = blob_size(pContent); |
| 457 | - static const int one = 1; | |
| 458 | 456 | |
| 459 | 457 | if( size<bomSize ) goto noBom; /* No: cannot read BOM. */ |
| 460 | 458 | if( size>=(2*bomSize) && z[1]==0 ) goto noBom; /* No: possible UTF-32. */ |
| 461 | 459 | if( z[0]==0xfeff ){ |
| 462 | 460 | if( pbReverse ) *pbReverse = 0; |
| 463 | 461 | }else if( z[0]==0xfffe ){ |
| 464 | 462 | if( pbReverse ) *pbReverse = 1; |
| 465 | 463 | }else{ |
| 464 | + static const int one = 1; | |
| 466 | 465 | noBom: |
| 467 | 466 | if( pbReverse ) *pbReverse = *(char *) &one; |
| 468 | 467 | return 0; /* No: UTF-16 byte-order-mark not found. */ |
| 469 | 468 | } |
| 470 | 469 | if( pnByte ) *pnByte = bomSize; |
| @@ -2586,10 +2585,14 @@ | ||
| 2586 | 2585 | /* |
| 2587 | 2586 | ** COMMAND: test-looks-like-utf |
| 2588 | 2587 | ** |
| 2589 | 2588 | ** Usage: %fossil test-looks-like-utf FILENAME |
| 2590 | 2589 | ** |
| 2590 | +** Options: | |
| 2591 | +** --utf8 Ignoring BOM and file size, force UTF-8 checking | |
| 2592 | +** --utf16 Ignoring BOM and file size, force UTF-16 checking | |
| 2593 | +** | |
| 2591 | 2594 | ** FILENAME is the name of a file to check for textual content in the UTF-8 |
| 2592 | 2595 | ** and/or UTF-16 encodings. |
| 2593 | 2596 | */ |
| 2594 | 2597 | void looks_like_utf_test_cmd(void){ |
| 2595 | 2598 | Blob blob; /* the contents of the specified file */ |
| @@ -2597,15 +2600,21 @@ | ||
| 2597 | 2600 | int fUtf16; /* return value of starts_with_utf16_bom() */ |
| 2598 | 2601 | int fUnicode; /* return value of could_be_utf16() */ |
| 2599 | 2602 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 2600 | 2603 | int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */ |
| 2601 | 2604 | int bRevUnicode = 0; /* non-zero -> UTF-16 byte order reversed */ |
| 2605 | + int fForceUtf8 = find_option("utf8",0,0)!=0; | |
| 2606 | + int fForceUtf16 = find_option("utf16",0,0)!=0; | |
| 2602 | 2607 | if( g.argc!=3 ) usage("FILENAME"); |
| 2603 | 2608 | blob_read_from_file(&blob, g.argv[2]); |
| 2604 | 2609 | fUtf8 = starts_with_utf8_bom(&blob, 0); |
| 2605 | 2610 | fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16); |
| 2606 | - fUnicode = could_be_utf16(&blob, &bRevUnicode); | |
| 2611 | + if( fForceUtf8 ){ | |
| 2612 | + fUnicode = 0; | |
| 2613 | + }else{ | |
| 2614 | + fUnicode = fForceUtf16 || could_be_utf16(&blob, &bRevUnicode); | |
| 2615 | + } | |
| 2607 | 2616 | lookFlags = fUnicode ? looks_like_utf16(&blob, bRevUnicode, 0) : |
| 2608 | 2617 | looks_like_utf8(&blob, 0); |
| 2609 | 2618 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 2610 | 2619 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 2611 | 2620 | fossil_print("Starts with UTF-16 BOM: %s\n", |
| 2612 | 2621 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -243,25 +243,26 @@ | |
| 243 | unsigned int n = blob_size(pContent); |
| 244 | int j, c, flags = LOOK_NONE; /* Assume UTF-8 text, prove otherwise */ |
| 245 | |
| 246 | if( n==0 ) return flags; /* Empty file -> text */ |
| 247 | c = *z; |
| 248 | j = (c!='\n'); |
| 249 | if( !j ){ |
| 250 | flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ |
| 251 | }else if( c==0 ){ |
| 252 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 253 | }else if( c=='\r' ){ |
| 254 | flags |= LOOK_CR; |
| 255 | if( n<=1 || z[1]!='\n' ){ |
| 256 | flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ |
| 257 | } |
| 258 | } |
| 259 | while( !(flags&stopFlags) && --n>0 ){ |
| 260 | int c2 = c; |
| 261 | c = *++z; ++j; |
| 262 | if( c=='\n' ){ |
| 263 | flags |= LOOK_LF; |
| 264 | if( c2=='\r' ){ |
| 265 | flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */ |
| 266 | }else{ |
| 267 | flags |= LOOK_LONE_LF; |
| @@ -268,21 +269,19 @@ | |
| 268 | } |
| 269 | if( j>LENGTH_MASK ){ |
| 270 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 271 | } |
| 272 | j = 0; |
| 273 | }else if( c==0 ){ |
| 274 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 275 | }else if( c=='\r' ){ |
| 276 | flags |= LOOK_CR; |
| 277 | if( n<=1 || z[1]!='\n' ){ |
| 278 | flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ |
| 279 | } |
| 280 | } |
| 281 | } |
| 282 | if( n ){ |
| 283 | flags |= LOOK_SHORT; /* Not the whole blob is examined */ |
| 284 | } |
| 285 | if( j>LENGTH_MASK ){ |
| 286 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 287 | } |
| 288 | return flags; |
| @@ -358,31 +357,33 @@ | |
| 358 | } |
| 359 | c = *z; |
| 360 | if( bReverse ){ |
| 361 | c = UTF16_SWAP(c); |
| 362 | } |
| 363 | j = (c!='\n'); |
| 364 | if( !j ){ |
| 365 | flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ |
| 366 | }else if( c==0 ){ |
| 367 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 368 | }else if( c=='\r' ){ |
| 369 | flags |= LOOK_CR; |
| 370 | if( n<2*sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ |
| 371 | flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ |
| 372 | } |
| 373 | } |
| 374 | while( 1 ){ |
| 375 | int c2 = c; |
| 376 | n -= sizeof(WCHAR_T); |
| 377 | if( (flags&stopFlags) || n<sizeof(WCHAR_T) ) break; |
| 378 | c = *++z; |
| 379 | if( bReverse ){ |
| 380 | c = UTF16_SWAP(c); |
| 381 | } |
| 382 | ++j; |
| 383 | if( c=='\n' ){ |
| 384 | flags |= LOOK_LF; |
| 385 | if( c2=='\r' ){ |
| 386 | flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */ |
| 387 | }else{ |
| 388 | flags |= LOOK_LONE_LF; |
| @@ -389,21 +390,19 @@ | |
| 389 | } |
| 390 | if( j>UTF16_LENGTH_MASK ){ |
| 391 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 392 | } |
| 393 | j = 0; |
| 394 | }else if( c==0 ){ |
| 395 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 396 | }else if( c=='\r' ){ |
| 397 | flags |= LOOK_CR; |
| 398 | if( n<2*sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ |
| 399 | flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ |
| 400 | } |
| 401 | } |
| 402 | } |
| 403 | if( n ){ |
| 404 | flags |= LOOK_SHORT; /* Not the whole blob is examined */ |
| 405 | } |
| 406 | if( j>UTF16_LENGTH_MASK ){ |
| 407 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 408 | } |
| 409 | return flags; |
| @@ -440,11 +439,11 @@ | |
| 440 | ** byte-order-mark (BOM), either in the endianness of the machine |
| 441 | ** or in reversed byte order. The UTF-32 BOM is ruled out by checking |
| 442 | ** if the UTF-16 BOM is not immediately followed by (utf16) 0. |
| 443 | ** pnByte is only set when the function returns 1. |
| 444 | ** |
| 445 | ** pbReverse is always set, even when no BOM is found. Without BOM, |
| 446 | ** it is set to 1 on little-endian and 0 on big-endian platforms. See |
| 447 | ** clause D98 of conformance (section 3.10) of the Unicode standard. |
| 448 | */ |
| 449 | int starts_with_utf16_bom( |
| 450 | const Blob *pContent, /* IN: Blob content to perform BOM detection on. */ |
| @@ -452,19 +451,19 @@ | |
| 452 | int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */ |
| 453 | ){ |
| 454 | const unsigned short *z = (unsigned short *)blob_buffer(pContent); |
| 455 | int bomSize = sizeof(unsigned short); |
| 456 | int size = blob_size(pContent); |
| 457 | static const int one = 1; |
| 458 | |
| 459 | if( size<bomSize ) goto noBom; /* No: cannot read BOM. */ |
| 460 | if( size>=(2*bomSize) && z[1]==0 ) goto noBom; /* No: possible UTF-32. */ |
| 461 | if( z[0]==0xfeff ){ |
| 462 | if( pbReverse ) *pbReverse = 0; |
| 463 | }else if( z[0]==0xfffe ){ |
| 464 | if( pbReverse ) *pbReverse = 1; |
| 465 | }else{ |
| 466 | noBom: |
| 467 | if( pbReverse ) *pbReverse = *(char *) &one; |
| 468 | return 0; /* No: UTF-16 byte-order-mark not found. */ |
| 469 | } |
| 470 | if( pnByte ) *pnByte = bomSize; |
| @@ -2586,10 +2585,14 @@ | |
| 2586 | /* |
| 2587 | ** COMMAND: test-looks-like-utf |
| 2588 | ** |
| 2589 | ** Usage: %fossil test-looks-like-utf FILENAME |
| 2590 | ** |
| 2591 | ** FILENAME is the name of a file to check for textual content in the UTF-8 |
| 2592 | ** and/or UTF-16 encodings. |
| 2593 | */ |
| 2594 | void looks_like_utf_test_cmd(void){ |
| 2595 | Blob blob; /* the contents of the specified file */ |
| @@ -2597,15 +2600,21 @@ | |
| 2597 | int fUtf16; /* return value of starts_with_utf16_bom() */ |
| 2598 | int fUnicode; /* return value of could_be_utf16() */ |
| 2599 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 2600 | int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */ |
| 2601 | int bRevUnicode = 0; /* non-zero -> UTF-16 byte order reversed */ |
| 2602 | if( g.argc!=3 ) usage("FILENAME"); |
| 2603 | blob_read_from_file(&blob, g.argv[2]); |
| 2604 | fUtf8 = starts_with_utf8_bom(&blob, 0); |
| 2605 | fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16); |
| 2606 | fUnicode = could_be_utf16(&blob, &bRevUnicode); |
| 2607 | lookFlags = fUnicode ? looks_like_utf16(&blob, bRevUnicode, 0) : |
| 2608 | looks_like_utf8(&blob, 0); |
| 2609 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 2610 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 2611 | fossil_print("Starts with UTF-16 BOM: %s\n", |
| 2612 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -243,25 +243,26 @@ | |
| 243 | unsigned int n = blob_size(pContent); |
| 244 | int j, c, flags = LOOK_NONE; /* Assume UTF-8 text, prove otherwise */ |
| 245 | |
| 246 | if( n==0 ) return flags; /* Empty file -> text */ |
| 247 | c = *z; |
| 248 | if( c==0 ){ |
| 249 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 250 | }else if( c=='\r' ){ |
| 251 | flags |= LOOK_CR; |
| 252 | if( n<=1 || z[1]!='\n' ){ |
| 253 | flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ |
| 254 | } |
| 255 | } |
| 256 | j = (c!='\n'); |
| 257 | if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ |
| 258 | while( !(flags&stopFlags) && --n>0 ){ |
| 259 | int c2 = c; |
| 260 | c = *++z; ++j; |
| 261 | if( c==0 ){ |
| 262 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 263 | }else if( c=='\n' ){ |
| 264 | flags |= LOOK_LF; |
| 265 | if( c2=='\r' ){ |
| 266 | flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */ |
| 267 | }else{ |
| 268 | flags |= LOOK_LONE_LF; |
| @@ -268,21 +269,19 @@ | |
| 269 | } |
| 270 | if( j>LENGTH_MASK ){ |
| 271 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 272 | } |
| 273 | j = 0; |
| 274 | }else if( c=='\r' ){ |
| 275 | flags |= LOOK_CR; |
| 276 | if( n<=1 || z[1]!='\n' ){ |
| 277 | flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ |
| 278 | } |
| 279 | } |
| 280 | } |
| 281 | if( n ){ |
| 282 | flags |= LOOK_SHORT; /* The whole blob was not examined */ |
| 283 | } |
| 284 | if( j>LENGTH_MASK ){ |
| 285 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 286 | } |
| 287 | return flags; |
| @@ -358,31 +357,33 @@ | |
| 357 | } |
| 358 | c = *z; |
| 359 | if( bReverse ){ |
| 360 | c = UTF16_SWAP(c); |
| 361 | } |
| 362 | if( c==0 ){ |
| 363 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 364 | }else if( c=='\r' ){ |
| 365 | flags |= LOOK_CR; |
| 366 | if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ |
| 367 | flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ |
| 368 | } |
| 369 | } |
| 370 | j = (c!='\n'); |
| 371 | if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ |
| 372 | while( 1 ){ |
| 373 | int c2 = c; |
| 374 | if( flags&stopFlags ) break; |
| 375 | n -= sizeof(WCHAR_T); |
| 376 | if( n<sizeof(WCHAR_T) ) break; |
| 377 | c = *++z; |
| 378 | if( bReverse ){ |
| 379 | c = UTF16_SWAP(c); |
| 380 | } |
| 381 | ++j; |
| 382 | if( c==0 ){ |
| 383 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 384 | }else if( c=='\n' ){ |
| 385 | flags |= LOOK_LF; |
| 386 | if( c2=='\r' ){ |
| 387 | flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */ |
| 388 | }else{ |
| 389 | flags |= LOOK_LONE_LF; |
| @@ -389,21 +390,19 @@ | |
| 390 | } |
| 391 | if( j>UTF16_LENGTH_MASK ){ |
| 392 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 393 | } |
| 394 | j = 0; |
| 395 | }else if( c=='\r' ){ |
| 396 | flags |= LOOK_CR; |
| 397 | if( n<(2*sizeof(WCHAR_T)) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ |
| 398 | flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ |
| 399 | } |
| 400 | } |
| 401 | } |
| 402 | if( n ){ |
| 403 | flags |= LOOK_SHORT; /* The whole blob was not examined */ |
| 404 | } |
| 405 | if( j>UTF16_LENGTH_MASK ){ |
| 406 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 407 | } |
| 408 | return flags; |
| @@ -440,11 +439,11 @@ | |
| 439 | ** byte-order-mark (BOM), either in the endianness of the machine |
| 440 | ** or in reversed byte order. The UTF-32 BOM is ruled out by checking |
| 441 | ** if the UTF-16 BOM is not immediately followed by (utf16) 0. |
| 442 | ** pnByte is only set when the function returns 1. |
| 443 | ** |
| 444 | ** pbReverse is always set, even when no BOM is found. Without a BOM, |
| 445 | ** it is set to 1 on little-endian and 0 on big-endian platforms. See |
| 446 | ** clause D98 of conformance (section 3.10) of the Unicode standard. |
| 447 | */ |
| 448 | int starts_with_utf16_bom( |
| 449 | const Blob *pContent, /* IN: Blob content to perform BOM detection on. */ |
| @@ -452,19 +451,19 @@ | |
| 451 | int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */ |
| 452 | ){ |
| 453 | const unsigned short *z = (unsigned short *)blob_buffer(pContent); |
| 454 | int bomSize = sizeof(unsigned short); |
| 455 | int size = blob_size(pContent); |
| 456 | |
| 457 | if( size<bomSize ) goto noBom; /* No: cannot read BOM. */ |
| 458 | if( size>=(2*bomSize) && z[1]==0 ) goto noBom; /* No: possible UTF-32. */ |
| 459 | if( z[0]==0xfeff ){ |
| 460 | if( pbReverse ) *pbReverse = 0; |
| 461 | }else if( z[0]==0xfffe ){ |
| 462 | if( pbReverse ) *pbReverse = 1; |
| 463 | }else{ |
| 464 | static const int one = 1; |
| 465 | noBom: |
| 466 | if( pbReverse ) *pbReverse = *(char *) &one; |
| 467 | return 0; /* No: UTF-16 byte-order-mark not found. */ |
| 468 | } |
| 469 | if( pnByte ) *pnByte = bomSize; |
| @@ -2586,10 +2585,14 @@ | |
| 2585 | /* |
| 2586 | ** COMMAND: test-looks-like-utf |
| 2587 | ** |
| 2588 | ** Usage: %fossil test-looks-like-utf FILENAME |
| 2589 | ** |
| 2590 | ** Options: |
| 2591 | ** --utf8 Ignoring BOM and file size, force UTF-8 checking |
| 2592 | ** --utf16 Ignoring BOM and file size, force UTF-16 checking |
| 2593 | ** |
| 2594 | ** FILENAME is the name of a file to check for textual content in the UTF-8 |
| 2595 | ** and/or UTF-16 encodings. |
| 2596 | */ |
| 2597 | void looks_like_utf_test_cmd(void){ |
| 2598 | Blob blob; /* the contents of the specified file */ |
| @@ -2597,15 +2600,21 @@ | |
| 2600 | int fUtf16; /* return value of starts_with_utf16_bom() */ |
| 2601 | int fUnicode; /* return value of could_be_utf16() */ |
| 2602 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 2603 | int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */ |
| 2604 | int bRevUnicode = 0; /* non-zero -> UTF-16 byte order reversed */ |
| 2605 | int fForceUtf8 = find_option("utf8",0,0)!=0; |
| 2606 | int fForceUtf16 = find_option("utf16",0,0)!=0; |
| 2607 | if( g.argc!=3 ) usage("FILENAME"); |
| 2608 | blob_read_from_file(&blob, g.argv[2]); |
| 2609 | fUtf8 = starts_with_utf8_bom(&blob, 0); |
| 2610 | fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16); |
| 2611 | if( fForceUtf8 ){ |
| 2612 | fUnicode = 0; |
| 2613 | }else{ |
| 2614 | fUnicode = fForceUtf16 || could_be_utf16(&blob, &bRevUnicode); |
| 2615 | } |
| 2616 | lookFlags = fUnicode ? looks_like_utf16(&blob, bRevUnicode, 0) : |
| 2617 | looks_like_utf8(&blob, 0); |
| 2618 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 2619 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 2620 | fossil_print("Starts with UTF-16 BOM: %s\n", |
| 2621 |