Fossil SCM
For the looks_like_utf*() functions, continue to examine blob content in order to fully set the output flags, even if it appears to be binary. Also, increase the strictness of starts_with_utf16_bom() and make it more accurate.
Commit
13fac7f74a95059f3ac42246676d80045df829c3
Parent
a93b58cf83ce00d…
1 file changed
+35
-23
+35
-23
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -72,10 +72,11 @@ | ||
| 72 | 72 | #define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */ |
| 73 | 73 | #define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */ |
| 74 | 74 | #define LOOK_LF ((int)0x00000002) /* One or more LF chars were found. */ |
| 75 | 75 | #define LOOK_CRLF ((int)0x00000004) /* One or more CR/LF pairs were found. */ |
| 76 | 76 | #define LOOK_LENGTH ((int)0x00000008) /* An over length line was found. */ |
| 77 | +#define LOOK_ODD ((int)0x00000010) /* An odd number of bytes was found. */ | |
| 77 | 78 | #endif /* INTERFACE */ |
| 78 | 79 | |
| 79 | 80 | /* |
| 80 | 81 | ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes) |
| 81 | 82 | */ |
| @@ -217,31 +218,34 @@ | ||
| 217 | 218 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 218 | 219 | ** switches between UTF-8 and other encodings occur. |
| 219 | 220 | ** |
| 220 | 221 | ** The only code points that this function cares about are the NUL character, |
| 221 | 222 | ** carriage-return, and line-feed. |
| 223 | +** | |
| 224 | +** Whether or not this function examines the entire contents of the blob is | |
| 225 | +** officially unspecified. | |
| 222 | 226 | ** |
| 223 | 227 | ************************************ WARNING ********************************** |
| 224 | 228 | */ |
| 225 | 229 | int looks_like_utf8(const Blob *pContent, int *pFlags){ |
| 226 | 230 | const char *z = blob_buffer(pContent); |
| 227 | 231 | unsigned int n = blob_size(pContent); |
| 228 | - int j, c; | |
| 232 | + int j, c, result = 1; /* Assume UTF-8 text, prove otherwise */ | |
| 229 | 233 | |
| 230 | 234 | if( pFlags ) *pFlags = LOOK_NONE; |
| 231 | - if( n==0 ) return 1; /* Empty file -> text */ | |
| 235 | + if( n==0 ) return result; /* Empty file -> text */ | |
| 232 | 236 | c = *z; |
| 233 | 237 | if( c==0 ){ |
| 234 | 238 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 235 | - return 0; /* NUL character in a file -> binary */ | |
| 239 | + result = 0; /* NUL character in a file -> binary */ | |
| 236 | 240 | } |
| 237 | 241 | j = (c!='\n'); |
| 238 | 242 | while( --n>0 ){ |
| 239 | 243 | c = *++z; ++j; |
| 240 | 244 | if( c==0 ){ |
| 241 | 245 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 242 | - return 0; /* NUL character in a file -> binary */ | |
| 246 | + result = 0; /* NUL character in a file -> binary */ | |
| 243 | 247 | } |
| 244 | 248 | if( c=='\n' ){ |
| 245 | 249 | int c2 = z[-1]; |
| 246 | 250 | if( pFlags ){ |
| 247 | 251 | *pFlags |= LOOK_LF; |
| @@ -249,20 +253,20 @@ | ||
| 249 | 253 | *pFlags |= LOOK_CRLF; |
| 250 | 254 | } |
| 251 | 255 | } |
| 252 | 256 | if( j>LENGTH_MASK ){ |
| 253 | 257 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 254 | - return 0; /* Very long line -> binary */ | |
| 258 | + result = 0; /* Very long line -> binary */ | |
| 255 | 259 | } |
| 256 | 260 | j = 0; |
| 257 | 261 | } |
| 258 | 262 | } |
| 259 | 263 | if( j>LENGTH_MASK ){ |
| 260 | 264 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 261 | - return 0; /* Very long line -> binary */ | |
| 265 | + result = 0; /* Very long line -> binary */ | |
| 262 | 266 | } |
| 263 | - return 1; /* No problems seen -> not binary */ | |
| 267 | + return result; /* No problems seen -> not binary */ | |
| 264 | 268 | } |
| 265 | 269 | |
| 266 | 270 | /* |
| 267 | 271 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 268 | 272 | */ |
| @@ -311,32 +315,38 @@ | ||
| 311 | 315 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 312 | 316 | ** switches between the UTF-16be and UTF-16le encodings occur. |
| 313 | 317 | ** |
| 314 | 318 | ** The only code points that this function cares about are the NUL character, |
| 315 | 319 | ** carriage-return, and line-feed. |
| 320 | +** | |
| 321 | +** Whether or not this function examines the entire contents of the blob is | |
| 322 | +** officially unspecified. | |
| 316 | 323 | ** |
| 317 | 324 | ************************************ WARNING ********************************** |
| 318 | 325 | */ |
| 319 | 326 | int looks_like_utf16(const Blob *pContent, int *pFlags){ |
| 320 | 327 | const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
| 321 | 328 | unsigned int n = blob_size(pContent); |
| 322 | - int j, c; | |
| 329 | + int j, c, result = 1; /* Assume UTF-16 text, prove otherwise */ | |
| 323 | 330 | |
| 324 | 331 | if( pFlags ) *pFlags = LOOK_NONE; |
| 325 | - if( n==0 ) return 1; /* Empty file -> text */ | |
| 326 | - if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */ | |
| 332 | + if( n==0 ) return result; /* Empty file -> text */ | |
| 333 | + if( n%2 ){ | |
| 334 | + if( pFlags ) *pFlags |= LOOK_ODD; | |
| 335 | + result = 0; /* Odd number of bytes -> binary (or UTF-8) */ | |
| 336 | + } | |
| 327 | 337 | c = *z; |
| 328 | 338 | if( c==0 ){ |
| 329 | 339 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 330 | - return 0; /* NUL character in a file -> binary */ | |
| 340 | + result = 0; /* NUL character in a file -> binary */ | |
| 331 | 341 | } |
| 332 | 342 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 333 | 343 | while( (n-=2)>0 ){ |
| 334 | 344 | c = *++z; ++j; |
| 335 | 345 | if( c==0 ){ |
| 336 | 346 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 337 | - return 0; /* NUL character in a file -> binary */ | |
| 347 | + result = 0; /* NUL character in a file -> binary */ | |
| 338 | 348 | } |
| 339 | 349 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 340 | 350 | int c2 = z[-1]; |
| 341 | 351 | if( pFlags ){ |
| 342 | 352 | *pFlags |= LOOK_LF; |
| @@ -344,20 +354,20 @@ | ||
| 344 | 354 | *pFlags |= LOOK_CRLF; |
| 345 | 355 | } |
| 346 | 356 | } |
| 347 | 357 | if( j>UTF16_LENGTH_MASK ){ |
| 348 | 358 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 349 | - return 0; /* Very long line -> binary */ | |
| 359 | + result = 0; /* Very long line -> binary */ | |
| 350 | 360 | } |
| 351 | 361 | j = 0; |
| 352 | 362 | } |
| 353 | 363 | } |
| 354 | 364 | if( j>UTF16_LENGTH_MASK ){ |
| 355 | 365 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 356 | - return 0; /* Very long line -> binary */ | |
| 366 | + result = 0; /* Very long line -> binary */ | |
| 357 | 367 | } |
| 358 | - return 1; /* No problems seen -> not binary */ | |
| 368 | + return result; /* No problems seen -> not binary */ | |
| 359 | 369 | } |
| 360 | 370 | |
| 361 | 371 | /* |
| 362 | 372 | ** This function returns an array of bytes representing the byte-order-mark |
| 363 | 373 | ** for UTF-8. |
| @@ -395,23 +405,24 @@ | ||
| 395 | 405 | const Blob *pContent, /* IN: Blob content to perform BOM detection on. */ |
| 396 | 406 | int *pnByte, /* OUT: The number of bytes used for the BOM. */ |
| 397 | 407 | int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */ |
| 398 | 408 | ){ |
| 399 | 409 | const unsigned short *z = (unsigned short *)blob_buffer(pContent); |
| 410 | + int bomSize = sizeof(unsigned short); | |
| 400 | 411 | int size = blob_size(pContent); |
| 401 | 412 | |
| 402 | - if( (size<2) || (size%2) | |
| 403 | - || (size>=4 && z[1]==0) ) return 0; | |
| 404 | - if( z[0] == 0xfffe ){ | |
| 413 | + if( size<bomSize ) return 0; /* No: cannot read BOM. */ | |
| 414 | + if( size>=(2*bomSize) && z[1]==0 ) return 0; /* No: possible UTF-32. */ | |
| 415 | + if( z[0]==0xfffe ){ | |
| 405 | 416 | if( pbReverse ) *pbReverse = 1; |
| 406 | - }else if( z[0] == 0xfeff ){ | |
| 417 | + }else if( z[0]==0xfeff ){ | |
| 407 | 418 | if( pbReverse ) *pbReverse = 0; |
| 408 | 419 | }else{ |
| 409 | - return 0; | |
| 420 | + return 0; /* No: UTF-16 byte-order-mark not found. */ | |
| 410 | 421 | } |
| 411 | - if( pnByte ) *pnByte = 2; | |
| 412 | - return 1; | |
| 422 | + if( pnByte ) *pnByte = bomSize; | |
| 423 | + return 1; /* Yes. */ | |
| 413 | 424 | } |
| 414 | 425 | |
| 415 | 426 | /* |
| 416 | 427 | ** Return true if two DLine elements are identical. |
| 417 | 428 | */ |
| @@ -2474,12 +2485,13 @@ | ||
| 2474 | 2485 | eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) : |
| 2475 | 2486 | looks_like_utf8(&blob, &lookFlags); |
| 2476 | 2487 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 2477 | 2488 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 2478 | 2489 | fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no"); |
| 2479 | - fossil_print("Looks like UTF-%s: %s\n", fUtf16?"16":"8",eType?"yes":"no"); | |
| 2490 | + fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no"); | |
| 2480 | 2491 | fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no"); |
| 2481 | 2492 | fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no"); |
| 2482 | 2493 | fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no"); |
| 2483 | 2494 | fossil_print("Has flag LOOK_LENGTH: %s\n",(lookFlags&LOOK_LENGTH)?"yes":"no"); |
| 2495 | + fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no"); | |
| 2484 | 2496 | blob_reset(&blob); |
| 2485 | 2497 | } |
| 2486 | 2498 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -72,10 +72,11 @@ | |
| 72 | #define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */ |
| 73 | #define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */ |
| 74 | #define LOOK_LF ((int)0x00000002) /* One or more LF chars were found. */ |
| 75 | #define LOOK_CRLF ((int)0x00000004) /* One or more CR/LF pairs were found. */ |
| 76 | #define LOOK_LENGTH ((int)0x00000008) /* An over length line was found. */ |
| 77 | #endif /* INTERFACE */ |
| 78 | |
| 79 | /* |
| 80 | ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes) |
| 81 | */ |
| @@ -217,31 +218,34 @@ | |
| 217 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 218 | ** switches between UTF-8 and other encodings occur. |
| 219 | ** |
| 220 | ** The only code points that this function cares about are the NUL character, |
| 221 | ** carriage-return, and line-feed. |
| 222 | ** |
| 223 | ************************************ WARNING ********************************** |
| 224 | */ |
| 225 | int looks_like_utf8(const Blob *pContent, int *pFlags){ |
| 226 | const char *z = blob_buffer(pContent); |
| 227 | unsigned int n = blob_size(pContent); |
| 228 | int j, c; |
| 229 | |
| 230 | if( pFlags ) *pFlags = LOOK_NONE; |
| 231 | if( n==0 ) return 1; /* Empty file -> text */ |
| 232 | c = *z; |
| 233 | if( c==0 ){ |
| 234 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 235 | return 0; /* NUL character in a file -> binary */ |
| 236 | } |
| 237 | j = (c!='\n'); |
| 238 | while( --n>0 ){ |
| 239 | c = *++z; ++j; |
| 240 | if( c==0 ){ |
| 241 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 242 | return 0; /* NUL character in a file -> binary */ |
| 243 | } |
| 244 | if( c=='\n' ){ |
| 245 | int c2 = z[-1]; |
| 246 | if( pFlags ){ |
| 247 | *pFlags |= LOOK_LF; |
| @@ -249,20 +253,20 @@ | |
| 249 | *pFlags |= LOOK_CRLF; |
| 250 | } |
| 251 | } |
| 252 | if( j>LENGTH_MASK ){ |
| 253 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 254 | return 0; /* Very long line -> binary */ |
| 255 | } |
| 256 | j = 0; |
| 257 | } |
| 258 | } |
| 259 | if( j>LENGTH_MASK ){ |
| 260 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 261 | return 0; /* Very long line -> binary */ |
| 262 | } |
| 263 | return 1; /* No problems seen -> not binary */ |
| 264 | } |
| 265 | |
| 266 | /* |
| 267 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 268 | */ |
| @@ -311,32 +315,38 @@ | |
| 311 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 312 | ** switches between the UTF-16be and UTF-16le encodings occur. |
| 313 | ** |
| 314 | ** The only code points that this function cares about are the NUL character, |
| 315 | ** carriage-return, and line-feed. |
| 316 | ** |
| 317 | ************************************ WARNING ********************************** |
| 318 | */ |
| 319 | int looks_like_utf16(const Blob *pContent, int *pFlags){ |
| 320 | const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
| 321 | unsigned int n = blob_size(pContent); |
| 322 | int j, c; |
| 323 | |
| 324 | if( pFlags ) *pFlags = LOOK_NONE; |
| 325 | if( n==0 ) return 1; /* Empty file -> text */ |
| 326 | if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */ |
| 327 | c = *z; |
| 328 | if( c==0 ){ |
| 329 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 330 | return 0; /* NUL character in a file -> binary */ |
| 331 | } |
| 332 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 333 | while( (n-=2)>0 ){ |
| 334 | c = *++z; ++j; |
| 335 | if( c==0 ){ |
| 336 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 337 | return 0; /* NUL character in a file -> binary */ |
| 338 | } |
| 339 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 340 | int c2 = z[-1]; |
| 341 | if( pFlags ){ |
| 342 | *pFlags |= LOOK_LF; |
| @@ -344,20 +354,20 @@ | |
| 344 | *pFlags |= LOOK_CRLF; |
| 345 | } |
| 346 | } |
| 347 | if( j>UTF16_LENGTH_MASK ){ |
| 348 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 349 | return 0; /* Very long line -> binary */ |
| 350 | } |
| 351 | j = 0; |
| 352 | } |
| 353 | } |
| 354 | if( j>UTF16_LENGTH_MASK ){ |
| 355 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 356 | return 0; /* Very long line -> binary */ |
| 357 | } |
| 358 | return 1; /* No problems seen -> not binary */ |
| 359 | } |
| 360 | |
| 361 | /* |
| 362 | ** This function returns an array of bytes representing the byte-order-mark |
| 363 | ** for UTF-8. |
| @@ -395,23 +405,24 @@ | |
| 395 | const Blob *pContent, /* IN: Blob content to perform BOM detection on. */ |
| 396 | int *pnByte, /* OUT: The number of bytes used for the BOM. */ |
| 397 | int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */ |
| 398 | ){ |
| 399 | const unsigned short *z = (unsigned short *)blob_buffer(pContent); |
| 400 | int size = blob_size(pContent); |
| 401 | |
| 402 | if( (size<2) || (size%2) |
| 403 | || (size>=4 && z[1]==0) ) return 0; |
| 404 | if( z[0] == 0xfffe ){ |
| 405 | if( pbReverse ) *pbReverse = 1; |
| 406 | }else if( z[0] == 0xfeff ){ |
| 407 | if( pbReverse ) *pbReverse = 0; |
| 408 | }else{ |
| 409 | return 0; |
| 410 | } |
| 411 | if( pnByte ) *pnByte = 2; |
| 412 | return 1; |
| 413 | } |
| 414 | |
| 415 | /* |
| 416 | ** Return true if two DLine elements are identical. |
| 417 | */ |
| @@ -2474,12 +2485,13 @@ | |
| 2474 | eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) : |
| 2475 | looks_like_utf8(&blob, &lookFlags); |
| 2476 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 2477 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 2478 | fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no"); |
| 2479 | fossil_print("Looks like UTF-%s: %s\n", fUtf16?"16":"8",eType?"yes":"no"); |
| 2480 | fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no"); |
| 2481 | fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no"); |
| 2482 | fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no"); |
| 2483 | fossil_print("Has flag LOOK_LENGTH: %s\n",(lookFlags&LOOK_LENGTH)?"yes":"no"); |
| 2484 | blob_reset(&blob); |
| 2485 | } |
| 2486 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -72,10 +72,11 @@ | |
| 72 | #define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */ |
| 73 | #define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */ |
| 74 | #define LOOK_LF ((int)0x00000002) /* One or more LF chars were found. */ |
| 75 | #define LOOK_CRLF ((int)0x00000004) /* One or more CR/LF pairs were found. */ |
| 76 | #define LOOK_LENGTH ((int)0x00000008) /* An over length line was found. */ |
| 77 | #define LOOK_ODD ((int)0x00000010) /* An odd number of bytes was found. */ |
| 78 | #endif /* INTERFACE */ |
| 79 | |
| 80 | /* |
| 81 | ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes) |
| 82 | */ |
| @@ -217,31 +218,34 @@ | |
| 218 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 219 | ** switches between UTF-8 and other encodings occur. |
| 220 | ** |
| 221 | ** The only code points that this function cares about are the NUL character, |
| 222 | ** carriage-return, and line-feed. |
| 223 | ** |
| 224 | ** Whether or not this function examines the entire contents of the blob is |
| 225 | ** officially unspecified. |
| 226 | ** |
| 227 | ************************************ WARNING ********************************** |
| 228 | */ |
| 229 | int looks_like_utf8(const Blob *pContent, int *pFlags){ |
| 230 | const char *z = blob_buffer(pContent); |
| 231 | unsigned int n = blob_size(pContent); |
| 232 | int j, c, result = 1; /* Assume UTF-8 text, prove otherwise */ |
| 233 | |
| 234 | if( pFlags ) *pFlags = LOOK_NONE; |
| 235 | if( n==0 ) return result; /* Empty file -> text */ |
| 236 | c = *z; |
| 237 | if( c==0 ){ |
| 238 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 239 | result = 0; /* NUL character in a file -> binary */ |
| 240 | } |
| 241 | j = (c!='\n'); |
| 242 | while( --n>0 ){ |
| 243 | c = *++z; ++j; |
| 244 | if( c==0 ){ |
| 245 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 246 | result = 0; /* NUL character in a file -> binary */ |
| 247 | } |
| 248 | if( c=='\n' ){ |
| 249 | int c2 = z[-1]; |
| 250 | if( pFlags ){ |
| 251 | *pFlags |= LOOK_LF; |
| @@ -249,20 +253,20 @@ | |
| 253 | *pFlags |= LOOK_CRLF; |
| 254 | } |
| 255 | } |
| 256 | if( j>LENGTH_MASK ){ |
| 257 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 258 | result = 0; /* Very long line -> binary */ |
| 259 | } |
| 260 | j = 0; |
| 261 | } |
| 262 | } |
| 263 | if( j>LENGTH_MASK ){ |
| 264 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 265 | result = 0; /* Very long line -> binary */ |
| 266 | } |
| 267 | return result; /* No problems seen -> not binary */ |
| 268 | } |
| 269 | |
| 270 | /* |
| 271 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 272 | */ |
| @@ -311,32 +315,38 @@ | |
| 315 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 316 | ** switches between the UTF-16be and UTF-16le encodings occur. |
| 317 | ** |
| 318 | ** The only code points that this function cares about are the NUL character, |
| 319 | ** carriage-return, and line-feed. |
| 320 | ** |
| 321 | ** Whether or not this function examines the entire contents of the blob is |
| 322 | ** officially unspecified. |
| 323 | ** |
| 324 | ************************************ WARNING ********************************** |
| 325 | */ |
| 326 | int looks_like_utf16(const Blob *pContent, int *pFlags){ |
| 327 | const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
| 328 | unsigned int n = blob_size(pContent); |
| 329 | int j, c, result = 1; /* Assume UTF-16 text, prove otherwise */ |
| 330 | |
| 331 | if( pFlags ) *pFlags = LOOK_NONE; |
| 332 | if( n==0 ) return result; /* Empty file -> text */ |
| 333 | if( n%2 ){ |
| 334 | if( pFlags ) *pFlags |= LOOK_ODD; |
| 335 | result = 0; /* Odd number of bytes -> binary (or UTF-8) */ |
| 336 | } |
| 337 | c = *z; |
| 338 | if( c==0 ){ |
| 339 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 340 | result = 0; /* NUL character in a file -> binary */ |
| 341 | } |
| 342 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 343 | while( (n-=2)>0 ){ |
| 344 | c = *++z; ++j; |
| 345 | if( c==0 ){ |
| 346 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 347 | result = 0; /* NUL character in a file -> binary */ |
| 348 | } |
| 349 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 350 | int c2 = z[-1]; |
| 351 | if( pFlags ){ |
| 352 | *pFlags |= LOOK_LF; |
| @@ -344,20 +354,20 @@ | |
| 354 | *pFlags |= LOOK_CRLF; |
| 355 | } |
| 356 | } |
| 357 | if( j>UTF16_LENGTH_MASK ){ |
| 358 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 359 | result = 0; /* Very long line -> binary */ |
| 360 | } |
| 361 | j = 0; |
| 362 | } |
| 363 | } |
| 364 | if( j>UTF16_LENGTH_MASK ){ |
| 365 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 366 | result = 0; /* Very long line -> binary */ |
| 367 | } |
| 368 | return result; /* No problems seen -> not binary */ |
| 369 | } |
| 370 | |
| 371 | /* |
| 372 | ** This function returns an array of bytes representing the byte-order-mark |
| 373 | ** for UTF-8. |
| @@ -395,23 +405,24 @@ | |
| 405 | const Blob *pContent, /* IN: Blob content to perform BOM detection on. */ |
| 406 | int *pnByte, /* OUT: The number of bytes used for the BOM. */ |
| 407 | int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */ |
| 408 | ){ |
| 409 | const unsigned short *z = (unsigned short *)blob_buffer(pContent); |
| 410 | int bomSize = sizeof(unsigned short); |
| 411 | int size = blob_size(pContent); |
| 412 | |
| 413 | if( size<bomSize ) return 0; /* No: cannot read BOM. */ |
| 414 | if( size>=(2*bomSize) && z[1]==0 ) return 0; /* No: possible UTF-32. */ |
| 415 | if( z[0]==0xfffe ){ |
| 416 | if( pbReverse ) *pbReverse = 1; |
| 417 | }else if( z[0]==0xfeff ){ |
| 418 | if( pbReverse ) *pbReverse = 0; |
| 419 | }else{ |
| 420 | return 0; /* No: UTF-16 byte-order-mark not found. */ |
| 421 | } |
| 422 | if( pnByte ) *pnByte = bomSize; |
| 423 | return 1; /* Yes. */ |
| 424 | } |
| 425 | |
| 426 | /* |
| 427 | ** Return true if two DLine elements are identical. |
| 428 | */ |
| @@ -2474,12 +2485,13 @@ | |
| 2485 | eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) : |
| 2486 | looks_like_utf8(&blob, &lookFlags); |
| 2487 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 2488 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 2489 | fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no"); |
| 2490 | fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no"); |
| 2491 | fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no"); |
| 2492 | fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no"); |
| 2493 | fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no"); |
| 2494 | fossil_print("Has flag LOOK_LENGTH: %s\n",(lookFlags&LOOK_LENGTH)?"yes":"no"); |
| 2495 | fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no"); |
| 2496 | blob_reset(&blob); |
| 2497 | } |
| 2498 |