Fossil SCM
Merge UTF-16 byte swapping fix and test-looks-like-utf command enhancements.
Commit
b4bec3753dd6106eaa840275aa891277609a33f2
Parent
f58bc2dfc73a847…
1 file changed: +33 -24
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -258,11 +258,11 @@ | ||
| 258 | 258 | if( c==0 ){ |
| 259 | 259 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 260 | 260 | }else if( c=='\n' ){ |
| 261 | 261 | flags |= LOOK_LF; |
| 262 | 262 | if( c2=='\r' ){ |
| 263 | - flags |= LOOK_CRLF; /* Found LF preceded by CR */ | |
| 263 | + flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */ | |
| 264 | 264 | }else{ |
| 265 | 265 | flags |= LOOK_LONE_LF; |
| 266 | 266 | } |
| 267 | 267 | if( j>LENGTH_MASK ){ |
| 268 | 268 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| @@ -295,18 +295,19 @@ | ||
| 295 | 295 | /* |
| 296 | 296 | ** Maximum length of a line in a text file, in UTF-16 characters. (4096) |
| 297 | 297 | ** The number of bytes represented by this value cannot exceed LENGTH_MASK |
| 298 | 298 | ** bytes, because that is the line buffer size used by the diff engine. |
| 299 | 299 | */ |
| 300 | -#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char))) | |
| 301 | -#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) | |
| 300 | +#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char))) | |
| 301 | +#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) | |
| 302 | 302 | |
| 303 | 303 | /* |
| 304 | 304 | ** This macro is used to swap the byte order of a UTF-16 character in the |
| 305 | 305 | ** looks_like_utf16() function. |
| 306 | 306 | */ |
| 307 | -#define UTF16_SWAP(ch) (((ch) << 8) & 0xFF00) | (((ch) >> 8) & 0xFF) | |
| 307 | +#define UTF16_SWAP(ch) ((((ch) << 8) & 0xFF00) | (((ch) >> 8) & 0xFF)) | |
| 308 | +#define UTF16_SWAP_IF(expr,ch) ((expr) ? UTF16_SWAP((ch)) : (ch)) | |
| 308 | 309 | |
| 309 | 310 | /* |
| 310 | 311 | ** This function attempts to scan each logical line within the blob to |
| 311 | 312 | ** determine the type of content it appears to contain. The return value |
| 312 | 313 | ** is a combination of one or more of the LOOK_XXX flags (see above): |
| @@ -347,45 +348,52 @@ | ||
| 347 | 348 | if( n%sizeof(WCHAR_T) ){ |
| 348 | 349 | flags |= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */ |
| 349 | 350 | if( n<sizeof(WCHAR_T) ) return flags; /* One byte -> binary (UTF-8?) */ |
| 350 | 351 | } |
| 351 | 352 | c = *z; |
| 353 | + if( bReverse ){ | |
| 354 | + c = UTF16_SWAP(c); | |
| 355 | + } | |
| 352 | 356 | if( c==0 ){ |
| 353 | 357 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 354 | - }else if( bReverse ){ | |
| 355 | - c = UTF16_SWAP(c); | |
| 358 | + }else if( c=='\r' ){ | |
| 359 | + flags |= LOOK_CR; | |
| 360 | + if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ | |
| 361 | + flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ | |
| 362 | + } | |
| 356 | 363 | } |
| 357 | 364 | j = (c!='\n'); |
| 358 | 365 | if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ |
| 359 | 366 | while( 1 ){ |
| 360 | 367 | int c2 = c; |
| 361 | 368 | n -= sizeof(WCHAR_T); |
| 362 | 369 | if( n<sizeof(WCHAR_T) ) break; |
| 363 | 370 | c = *++z; |
| 371 | + if( bReverse ){ | |
| 372 | + c = UTF16_SWAP(c); | |
| 373 | + } | |
| 364 | 374 | ++j; |
| 365 | 375 | if( c==0 ){ |
| 366 | 376 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 367 | - }else if( bReverse ){ | |
| 368 | - c = UTF16_SWAP(c); | |
| 369 | - } | |
| 370 | - if( c=='\n' ){ | |
| 377 | + }else if( c=='\n' ){ | |
| 378 | + flags |= LOOK_LF; | |
| 371 | 379 | if( c2=='\r' ){ |
| 372 | - flags |= (LOOK_CRLF | LOOK_CR | LOOK_LF); | |
| 380 | + flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */ | |
| 373 | 381 | }else{ |
| 374 | - flags |= (LOOK_LONE_LF | LOOK_LF); | |
| 382 | + flags |= LOOK_LONE_LF; | |
| 375 | 383 | } |
| 376 | 384 | if( j>UTF16_LENGTH_MASK ){ |
| 377 | 385 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 378 | 386 | } |
| 379 | 387 | j = 0; |
| 380 | - }else if( c2=='\r' ){ | |
| 381 | - flags |= (LOOK_CR | LOOK_LONE_CR); | |
| 388 | + }else if( c=='\r' ){ | |
| 389 | + flags |= LOOK_CR; | |
| 390 | + if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ | |
| 391 | + flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ | |
| 392 | + } | |
| 382 | 393 | } |
| 383 | 394 | } |
| 384 | - if( c=='\r' ){ | |
| 385 | - flags |= (LOOK_CR | LOOK_LONE_CR); /* Found CR as last char */ | |
| 386 | - } | |
| 387 | 395 | if( j>UTF16_LENGTH_MASK ){ |
| 388 | 396 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 389 | 397 | } |
| 390 | 398 | return flags; |
| 391 | 399 | } |
| @@ -2524,22 +2532,23 @@ | ||
| 2524 | 2532 | Blob blob; /* the contents of the specified file */ |
| 2525 | 2533 | int fUtf8; /* return value of starts_with_utf8_bom() */ |
| 2526 | 2534 | int fUtf16; /* return value of starts_with_utf16_bom() */ |
| 2527 | 2535 | int fUnicode; /* return value of could_be_utf16() */ |
| 2528 | 2536 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 2529 | - int bReverse = 0; /* non-zero -> UTF-16 byte order reversed */ | |
| 2530 | - if( g.argc<3 ) usage("FILENAME"); | |
| 2537 | + int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */ | |
| 2538 | + int bRevUnicode = 0; /* non-zero -> UTF-16 byte order reversed */ | |
| 2539 | + if( g.argc!=3 ) usage("FILENAME"); | |
| 2531 | 2540 | blob_read_from_file(&blob, g.argv[2]); |
| 2532 | 2541 | fUtf8 = starts_with_utf8_bom(&blob, 0); |
| 2533 | - fUtf16 = starts_with_utf16_bom(&blob, 0, &bReverse); | |
| 2534 | - fUnicode = could_be_utf16(&blob, &bReverse); | |
| 2535 | - lookFlags = fUnicode ? looks_like_utf16(&blob, bReverse) : | |
| 2536 | - looks_like_utf8(&blob); | |
| 2542 | + fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16); | |
| 2543 | + fUnicode = could_be_utf16(&blob, &bRevUnicode); | |
| 2544 | + lookFlags = fUnicode ? looks_like_utf16(&blob, bRevUnicode) : | |
| 2545 | + looks_like_utf8(&blob); | |
| 2537 | 2546 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 2538 | 2547 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 2539 | 2548 | fossil_print("Starts with UTF-16 BOM: %s\n", |
| 2540 | - fUtf16?(bReverse?"reversed":"yes"):"no"); | |
| 2549 | + fUtf16?(bRevUtf16?"reversed":"yes"):"no"); | |
| 2541 | 2550 | fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8", |
| 2542 | 2551 | (lookFlags&LOOK_BINARY)?"no":"yes"); |
| 2543 | 2552 | fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no"); |
| 2544 | 2553 | fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no"); |
| 2545 | 2554 | fossil_print("Has flag LOOK_LONE_CR: %s\n", |
| 2546 | 2555 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -258,11 +258,11 @@ | |
| 258 | if( c==0 ){ |
| 259 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 260 | }else if( c=='\n' ){ |
| 261 | flags |= LOOK_LF; |
| 262 | if( c2=='\r' ){ |
| 263 | flags |= LOOK_CRLF; /* Found LF preceded by CR */ |
| 264 | }else{ |
| 265 | flags |= LOOK_LONE_LF; |
| 266 | } |
| 267 | if( j>LENGTH_MASK ){ |
| 268 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| @@ -295,18 +295,19 @@ | |
| 295 | /* |
| 296 | ** Maximum length of a line in a text file, in UTF-16 characters. (4096) |
| 297 | ** The number of bytes represented by this value cannot exceed LENGTH_MASK |
| 298 | ** bytes, because that is the line buffer size used by the diff engine. |
| 299 | */ |
| 300 | #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char))) |
| 301 | #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) |
| 302 | |
| 303 | /* |
| 304 | ** This macro is used to swap the byte order of a UTF-16 character in the |
| 305 | ** looks_like_utf16() function. |
| 306 | */ |
| 307 | #define UTF16_SWAP(ch) (((ch) << 8) & 0xFF00) | (((ch) >> 8) & 0xFF) |
| 308 | |
| 309 | /* |
| 310 | ** This function attempts to scan each logical line within the blob to |
| 311 | ** determine the type of content it appears to contain. The return value |
| 312 | ** is a combination of one or more of the LOOK_XXX flags (see above): |
| @@ -347,45 +348,52 @@ | |
| 347 | if( n%sizeof(WCHAR_T) ){ |
| 348 | flags |= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */ |
| 349 | if( n<sizeof(WCHAR_T) ) return flags; /* One byte -> binary (UTF-8?) */ |
| 350 | } |
| 351 | c = *z; |
| 352 | if( c==0 ){ |
| 353 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 354 | }else if( bReverse ){ |
| 355 | c = UTF16_SWAP(c); |
| 356 | } |
| 357 | j = (c!='\n'); |
| 358 | if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ |
| 359 | while( 1 ){ |
| 360 | int c2 = c; |
| 361 | n -= sizeof(WCHAR_T); |
| 362 | if( n<sizeof(WCHAR_T) ) break; |
| 363 | c = *++z; |
| 364 | ++j; |
| 365 | if( c==0 ){ |
| 366 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 367 | }else if( bReverse ){ |
| 368 | c = UTF16_SWAP(c); |
| 369 | } |
| 370 | if( c=='\n' ){ |
| 371 | if( c2=='\r' ){ |
| 372 | flags |= (LOOK_CRLF | LOOK_CR | LOOK_LF); |
| 373 | }else{ |
| 374 | flags |= (LOOK_LONE_LF | LOOK_LF); |
| 375 | } |
| 376 | if( j>UTF16_LENGTH_MASK ){ |
| 377 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 378 | } |
| 379 | j = 0; |
| 380 | }else if( c2=='\r' ){ |
| 381 | flags |= (LOOK_CR | LOOK_LONE_CR); |
| 382 | } |
| 383 | } |
| 384 | if( c=='\r' ){ |
| 385 | flags |= (LOOK_CR | LOOK_LONE_CR); /* Found CR as last char */ |
| 386 | } |
| 387 | if( j>UTF16_LENGTH_MASK ){ |
| 388 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 389 | } |
| 390 | return flags; |
| 391 | } |
| @@ -2524,22 +2532,23 @@ | |
| 2524 | Blob blob; /* the contents of the specified file */ |
| 2525 | int fUtf8; /* return value of starts_with_utf8_bom() */ |
| 2526 | int fUtf16; /* return value of starts_with_utf16_bom() */ |
| 2527 | int fUnicode; /* return value of could_be_utf16() */ |
| 2528 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 2529 | int bReverse = 0; /* non-zero -> UTF-16 byte order reversed */ |
| 2530 | if( g.argc<3 ) usage("FILENAME"); |
| 2531 | blob_read_from_file(&blob, g.argv[2]); |
| 2532 | fUtf8 = starts_with_utf8_bom(&blob, 0); |
| 2533 | fUtf16 = starts_with_utf16_bom(&blob, 0, &bReverse); |
| 2534 | fUnicode = could_be_utf16(&blob, &bReverse); |
| 2535 | lookFlags = fUnicode ? looks_like_utf16(&blob, bReverse) : |
| 2536 | looks_like_utf8(&blob); |
| 2537 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 2538 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 2539 | fossil_print("Starts with UTF-16 BOM: %s\n", |
| 2540 | fUtf16?(bReverse?"reversed":"yes"):"no"); |
| 2541 | fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8", |
| 2542 | (lookFlags&LOOK_BINARY)?"no":"yes"); |
| 2543 | fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no"); |
| 2544 | fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no"); |
| 2545 | fossil_print("Has flag LOOK_LONE_CR: %s\n", |
| 2546 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -258,11 +258,11 @@ | |
| 258 | if( c==0 ){ |
| 259 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 260 | }else if( c=='\n' ){ |
| 261 | flags |= LOOK_LF; |
| 262 | if( c2=='\r' ){ |
| 263 | flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */ |
| 264 | }else{ |
| 265 | flags |= LOOK_LONE_LF; |
| 266 | } |
| 267 | if( j>LENGTH_MASK ){ |
| 268 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| @@ -295,18 +295,19 @@ | |
| 295 | /* |
| 296 | ** Maximum length of a line in a text file, in UTF-16 characters. (4096) |
| 297 | ** The number of bytes represented by this value cannot exceed LENGTH_MASK |
| 298 | ** bytes, because that is the line buffer size used by the diff engine. |
| 299 | */ |
| 300 | #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char))) |
| 301 | #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) |
| 302 | |
| 303 | /* |
| 304 | ** This macro is used to swap the byte order of a UTF-16 character in the |
| 305 | ** looks_like_utf16() function. |
| 306 | */ |
| 307 | #define UTF16_SWAP(ch) ((((ch) << 8) & 0xFF00) | (((ch) >> 8) & 0xFF)) |
| 308 | #define UTF16_SWAP_IF(expr,ch) ((expr) ? UTF16_SWAP((ch)) : (ch)) |
| 309 | |
| 310 | /* |
| 311 | ** This function attempts to scan each logical line within the blob to |
| 312 | ** determine the type of content it appears to contain. The return value |
| 313 | ** is a combination of one or more of the LOOK_XXX flags (see above): |
| @@ -347,45 +348,52 @@ | |
| 348 | if( n%sizeof(WCHAR_T) ){ |
| 349 | flags |= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */ |
| 350 | if( n<sizeof(WCHAR_T) ) return flags; /* One byte -> binary (UTF-8?) */ |
| 351 | } |
| 352 | c = *z; |
| 353 | if( bReverse ){ |
| 354 | c = UTF16_SWAP(c); |
| 355 | } |
| 356 | if( c==0 ){ |
| 357 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 358 | }else if( c=='\r' ){ |
| 359 | flags |= LOOK_CR; |
| 360 | if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ |
| 361 | flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ |
| 362 | } |
| 363 | } |
| 364 | j = (c!='\n'); |
| 365 | if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ |
| 366 | while( 1 ){ |
| 367 | int c2 = c; |
| 368 | n -= sizeof(WCHAR_T); |
| 369 | if( n<sizeof(WCHAR_T) ) break; |
| 370 | c = *++z; |
| 371 | if( bReverse ){ |
| 372 | c = UTF16_SWAP(c); |
| 373 | } |
| 374 | ++j; |
| 375 | if( c==0 ){ |
| 376 | flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
| 377 | }else if( c=='\n' ){ |
| 378 | flags |= LOOK_LF; |
| 379 | if( c2=='\r' ){ |
| 380 | flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */ |
| 381 | }else{ |
| 382 | flags |= LOOK_LONE_LF; |
| 383 | } |
| 384 | if( j>UTF16_LENGTH_MASK ){ |
| 385 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 386 | } |
| 387 | j = 0; |
| 388 | }else if( c=='\r' ){ |
| 389 | flags |= LOOK_CR; |
| 390 | if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ |
| 391 | flags |= LOOK_LONE_CR; /* More chars, next char is not LF */ |
| 392 | } |
| 393 | } |
| 394 | } |
| 395 | if( j>UTF16_LENGTH_MASK ){ |
| 396 | flags |= LOOK_LONG; /* Very long line -> binary */ |
| 397 | } |
| 398 | return flags; |
| 399 | } |
| @@ -2524,22 +2532,23 @@ | |
| 2532 | Blob blob; /* the contents of the specified file */ |
| 2533 | int fUtf8; /* return value of starts_with_utf8_bom() */ |
| 2534 | int fUtf16; /* return value of starts_with_utf16_bom() */ |
| 2535 | int fUnicode; /* return value of could_be_utf16() */ |
| 2536 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 2537 | int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */ |
| 2538 | int bRevUnicode = 0; /* non-zero -> UTF-16 byte order reversed */ |
| 2539 | if( g.argc!=3 ) usage("FILENAME"); |
| 2540 | blob_read_from_file(&blob, g.argv[2]); |
| 2541 | fUtf8 = starts_with_utf8_bom(&blob, 0); |
| 2542 | fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16); |
| 2543 | fUnicode = could_be_utf16(&blob, &bRevUnicode); |
| 2544 | lookFlags = fUnicode ? looks_like_utf16(&blob, bRevUnicode) : |
| 2545 | looks_like_utf8(&blob); |
| 2546 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 2547 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 2548 | fossil_print("Starts with UTF-16 BOM: %s\n", |
| 2549 | fUtf16?(bRevUtf16?"reversed":"yes"):"no"); |
| 2550 | fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8", |
| 2551 | (lookFlags&LOOK_BINARY)?"no":"yes"); |
| 2552 | fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no"); |
| 2553 | fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no"); |
| 2554 | fossil_print("Has flag LOOK_LONE_CR: %s\n", |
| 2555 |