Fossil SCM
looks_like_utf8/16 no longer decide whether the blob is text or binary; they simply return the LOOK_XXX flags, and calling code makes that decision based on the returned flags. This simplifies looks_like_utf8/16 a lot.
Commit
276b34955bd4880495400e1f22214c583605270d
Parent
4b2c2a519f02525…
2 files changed
+5
-6
+45
-74
+5
-6
| --- src/checkin.c | ||
| +++ src/checkin.c | ||
| @@ -905,11 +905,10 @@ | ||
| 905 | 905 | int crnlOk, /* Non-zero if CR/NL warnings should be disabled. */ |
| 906 | 906 | int binOk, /* Non-zero if binary warnings should be disabled. */ |
| 907 | 907 | int encodingOk, /* Non-zero if encoding warnings should be disabled. */ |
| 908 | 908 | const char *zFilename /* The full name of the file being committed. */ |
| 909 | 909 | ){ |
| 910 | - int eType; /* return value of looks_like_utf8/utf16() */ | |
| 911 | 910 | int fUnicode; /* return value of starts_with_utf16_bom() */ |
| 912 | 911 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 913 | 912 | int fHasNul; /* the blob contains one or more NUL chars */ |
| 914 | 913 | int fHasCrLf; /* the blob contains one or more CR/LF pairs */ |
| 915 | 914 | int fHasLength; /* the blob contains an overly long line */ |
| @@ -918,31 +917,31 @@ | ||
| 918 | 917 | static int allOk = 0; /* Set to true to disable this routine */ |
| 919 | 918 | |
| 920 | 919 | if( allOk ) return 0; |
| 921 | 920 | fUnicode = starts_with_utf16_bom(p, 0, 0); |
| 922 | 921 | if( fUnicode ){ |
| 923 | - eType = looks_like_utf16(p, &lookFlags); | |
| 922 | + lookFlags = looks_like_utf16(p); | |
| 924 | 923 | if( lookFlags&LOOK_ODD ){ |
| 925 | 924 | /* Content with an odd number of bytes cannot be UTF-16. */ |
| 926 | 925 | fUnicode = 0; |
| 927 | 926 | /* Therefore, check if the content appears to be UTF-8. */ |
| 928 | - eType = looks_like_utf8(p, &lookFlags); | |
| 927 | + lookFlags = looks_like_utf8(p); | |
| 929 | 928 | } |
| 930 | 929 | }else{ |
| 931 | - eType = looks_like_utf8(p, &lookFlags); | |
| 930 | + lookFlags = looks_like_utf8(p); | |
| 932 | 931 | } |
| 933 | 932 | fHasNul = (lookFlags & LOOK_NUL); |
| 934 | 933 | fHasCrLf = (lookFlags & LOOK_CRLF); |
| 935 | 934 | fHasLength = (lookFlags & LOOK_LENGTH); |
| 936 | - if( eType==0 || fHasCrLf || fUnicode ){ | |
| 935 | + if( fHasNul || fHasLength || fHasCrLf || fUnicode ){ | |
| 937 | 936 | const char *zWarning; |
| 938 | 937 | const char *zDisable; |
| 939 | 938 | const char *zConvert = "c=convert/"; |
| 940 | 939 | Blob ans; |
| 941 | 940 | char cReply; |
| 942 | 941 | |
| 943 | - if( eType==0 ){ | |
| 942 | + if( fHasNul || fHasLength ){ | |
| 944 | 943 | if( binOk ){ |
| 945 | 944 | return 0; /* We don't want binary warnings for this file. */ |
| 946 | 945 | } |
| 947 | 946 | if( !fHasNul && fHasLength ){ |
| 948 | 947 | zWarning = "long lines"; |
| 949 | 948 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -905,11 +905,10 @@ | |
| 905 | int crnlOk, /* Non-zero if CR/NL warnings should be disabled. */ |
| 906 | int binOk, /* Non-zero if binary warnings should be disabled. */ |
| 907 | int encodingOk, /* Non-zero if encoding warnings should be disabled. */ |
| 908 | const char *zFilename /* The full name of the file being committed. */ |
| 909 | ){ |
| 910 | int eType; /* return value of looks_like_utf8/utf16() */ |
| 911 | int fUnicode; /* return value of starts_with_utf16_bom() */ |
| 912 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 913 | int fHasNul; /* the blob contains one or more NUL chars */ |
| 914 | int fHasCrLf; /* the blob contains one or more CR/LF pairs */ |
| 915 | int fHasLength; /* the blob contains an overly long line */ |
| @@ -918,31 +917,31 @@ | |
| 918 | static int allOk = 0; /* Set to true to disable this routine */ |
| 919 | |
| 920 | if( allOk ) return 0; |
| 921 | fUnicode = starts_with_utf16_bom(p, 0, 0); |
| 922 | if( fUnicode ){ |
| 923 | eType = looks_like_utf16(p, &lookFlags); |
| 924 | if( lookFlags&LOOK_ODD ){ |
| 925 | /* Content with an odd number of bytes cannot be UTF-16. */ |
| 926 | fUnicode = 0; |
| 927 | /* Therefore, check if the content appears to be UTF-8. */ |
| 928 | eType = looks_like_utf8(p, &lookFlags); |
| 929 | } |
| 930 | }else{ |
| 931 | eType = looks_like_utf8(p, &lookFlags); |
| 932 | } |
| 933 | fHasNul = (lookFlags & LOOK_NUL); |
| 934 | fHasCrLf = (lookFlags & LOOK_CRLF); |
| 935 | fHasLength = (lookFlags & LOOK_LENGTH); |
| 936 | if( eType==0 || fHasCrLf || fUnicode ){ |
| 937 | const char *zWarning; |
| 938 | const char *zDisable; |
| 939 | const char *zConvert = "c=convert/"; |
| 940 | Blob ans; |
| 941 | char cReply; |
| 942 | |
| 943 | if( eType==0 ){ |
| 944 | if( binOk ){ |
| 945 | return 0; /* We don't want binary warnings for this file. */ |
| 946 | } |
| 947 | if( !fHasNul && fHasLength ){ |
| 948 | zWarning = "long lines"; |
| 949 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -905,11 +905,10 @@ | |
| 905 | int crnlOk, /* Non-zero if CR/NL warnings should be disabled. */ |
| 906 | int binOk, /* Non-zero if binary warnings should be disabled. */ |
| 907 | int encodingOk, /* Non-zero if encoding warnings should be disabled. */ |
| 908 | const char *zFilename /* The full name of the file being committed. */ |
| 909 | ){ |
| 910 | int fUnicode; /* return value of starts_with_utf16_bom() */ |
| 911 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 912 | int fHasNul; /* the blob contains one or more NUL chars */ |
| 913 | int fHasCrLf; /* the blob contains one or more CR/LF pairs */ |
| 914 | int fHasLength; /* the blob contains an overly long line */ |
| @@ -918,31 +917,31 @@ | |
| 917 | static int allOk = 0; /* Set to true to disable this routine */ |
| 918 | |
| 919 | if( allOk ) return 0; |
| 920 | fUnicode = starts_with_utf16_bom(p, 0, 0); |
| 921 | if( fUnicode ){ |
| 922 | lookFlags = looks_like_utf16(p); |
| 923 | if( lookFlags&LOOK_ODD ){ |
| 924 | /* Content with an odd number of bytes cannot be UTF-16. */ |
| 925 | fUnicode = 0; |
| 926 | /* Therefore, check if the content appears to be UTF-8. */ |
| 927 | lookFlags = looks_like_utf8(p); |
| 928 | } |
| 929 | }else{ |
| 930 | lookFlags = looks_like_utf8(p); |
| 931 | } |
| 932 | fHasNul = (lookFlags & LOOK_NUL); |
| 933 | fHasCrLf = (lookFlags & LOOK_CRLF); |
| 934 | fHasLength = (lookFlags & LOOK_LENGTH); |
| 935 | if( fHasNul || fHasLength || fHasCrLf || fUnicode ){ |
| 936 | const char *zWarning; |
| 937 | const char *zDisable; |
| 938 | const char *zConvert = "c=convert/"; |
| 939 | Blob ans; |
| 940 | char cReply; |
| 941 | |
| 942 | if( fHasNul || fHasLength ){ |
| 943 | if( binOk ){ |
| 944 | return 0; /* We don't want binary warnings for this file. */ |
| 945 | } |
| 946 | if( !fHasNul && fHasLength ){ |
| 947 | zWarning = "long lines"; |
| 948 |
+45
-74
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -61,11 +61,11 @@ | ||
| 61 | 61 | |
| 62 | 62 | /* |
| 63 | 63 | ** This macro is designed to return non-zero if the specified blob contains |
| 64 | 64 | ** data that MAY be binary in nature; otherwise, zero will be returned. |
| 65 | 65 | */ |
| 66 | -#define looks_like_binary(blob) (looks_like_utf8((blob), 0) == 0) | |
| 66 | +#define looks_like_binary(blob) !(looks_like_utf8(blob)&(LOOK_LENGTH|LOOK_NUL)) | |
| 67 | 67 | |
| 68 | 68 | /* |
| 69 | 69 | ** Output flags for the looks_like_utf8() and looks_like_utf16() routines used |
| 70 | 70 | ** to convey status information about the blob content. |
| 71 | 71 | */ |
| @@ -202,20 +202,12 @@ | ||
| 202 | 202 | return a; |
| 203 | 203 | } |
| 204 | 204 | |
| 205 | 205 | /* |
| 206 | 206 | ** This function attempts to scan each logical line within the blob to |
| 207 | -** determine the type of content it appears to contain. Possible return | |
| 208 | -** values are: | |
| 209 | -** | |
| 210 | -** (1) -- The content appears to consist entirely of text; however, the | |
| 211 | -** encoding may not be UTF-8. | |
| 212 | -** | |
| 213 | -** (0) -- The content appears to be binary because it contains embedded | |
| 214 | -** NUL characters or an extremely long line. Since this function | |
| 215 | -** does not understand UTF-16, it may falsely consider UTF-16 text | |
| 216 | -** to be binary. | |
| 207 | +** determine the type of content it appears to contain. Its return | |
| 208 | +** value is a combination of the LOOK_XXX flags above. | |
| 217 | 209 | ** |
| 218 | 210 | ************************************ WARNING ********************************** |
| 219 | 211 | ** |
| 220 | 212 | ** This function does not validate that the blob content is properly formed |
| 221 | 213 | ** UTF-8. It assumes that all code points are the same size. It does not |
| @@ -228,52 +220,45 @@ | ||
| 228 | 220 | ** Whether or not this function examines the entire contents of the blob is |
| 229 | 221 | ** officially unspecified. |
| 230 | 222 | ** |
| 231 | 223 | ************************************ WARNING ********************************** |
| 232 | 224 | */ |
| 233 | -int looks_like_utf8(const Blob *pContent, int *pFlags){ | |
| 225 | +int looks_like_utf8(const Blob *pContent){ | |
| 234 | 226 | const char *z = blob_buffer(pContent); |
| 235 | 227 | unsigned int n = blob_size(pContent); |
| 236 | - int j, c, result = 1; /* Assume UTF-8 text, prove otherwise */ | |
| 228 | + int j, c, flags = LOOK_NONE; | |
| 237 | 229 | |
| 238 | - if( pFlags ) *pFlags = LOOK_NONE; | |
| 239 | - if( n==0 ) return result; /* Empty file -> text */ | |
| 230 | + if( n==0 ) return flags; /* Empty file -> text */ | |
| 240 | 231 | c = *z; |
| 241 | 232 | if( c==0 ){ |
| 242 | - if( pFlags ) *pFlags |= LOOK_NUL; | |
| 243 | - result = 0; /* NUL character in a file -> binary */ | |
| 233 | + flags |= LOOK_NUL; | |
| 244 | 234 | } |
| 245 | 235 | j = (c!='\n'); |
| 246 | - if( !j && pFlags ) *pFlags |= LOOK_LONE_LF; | |
| 236 | + if( !j ) flags |= LOOK_LONE_LF; | |
| 247 | 237 | while( --n>0 ){ |
| 248 | 238 | int c2 = c; |
| 249 | 239 | c = *++z; ++j; |
| 250 | 240 | if( c==0 ){ |
| 251 | - if( pFlags ) *pFlags |= LOOK_NUL; | |
| 252 | - result = 0; /* NUL character in a file -> binary */ | |
| 241 | + flags |= LOOK_NUL; | |
| 253 | 242 | } |
| 254 | 243 | if( c=='\n' ){ |
| 255 | - if( pFlags ){ | |
| 256 | - *pFlags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF; | |
| 257 | - } | |
| 244 | + flags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF; | |
| 258 | 245 | if( j>LENGTH_MASK ){ |
| 259 | - if( pFlags ) *pFlags |= LOOK_LENGTH; | |
| 260 | - result = 0; /* Very long line -> binary */ | |
| 246 | + flags |= LOOK_LENGTH; | |
| 261 | 247 | } |
| 262 | 248 | j = 0; |
| 263 | - }else if( c2=='\r' && pFlags ){ | |
| 264 | - *pFlags |= LOOK_LONE_CR; | |
| 249 | + }else if( c2=='\r' ){ | |
| 250 | + flags |= LOOK_LONE_CR; | |
| 265 | 251 | } |
| 266 | 252 | } |
| 267 | - if( c=='\r' && pFlags ){ | |
| 268 | - *pFlags |= LOOK_LONE_CR; | |
| 253 | + if( c=='\r' ){ | |
| 254 | + flags |= LOOK_LONE_CR; | |
| 269 | 255 | } |
| 270 | 256 | if( j>LENGTH_MASK ){ |
| 271 | - if( pFlags ) *pFlags |= LOOK_LENGTH; | |
| 272 | - result = 0; /* Very long line -> binary */ | |
| 257 | + flags |= LOOK_LENGTH; | |
| 273 | 258 | } |
| 274 | - return result; /* No problems seen -> not binary */ | |
| 259 | + return flags; | |
| 275 | 260 | } |
| 276 | 261 | |
| 277 | 262 | /* |
| 278 | 263 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 279 | 264 | */ |
| @@ -293,20 +278,12 @@ | ||
| 293 | 278 | #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char))) |
| 294 | 279 | #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) |
| 295 | 280 | |
| 296 | 281 | /* |
| 297 | 282 | ** This function attempts to scan each logical line within the blob to |
| 298 | -** determine the type of content it appears to contain. Possible return | |
| 299 | -** values are: | |
| 300 | -** | |
| 301 | -** (1) -- The content appears to consist entirely of text; however, the | |
| 302 | -** encoding may not be UTF-16. | |
| 303 | -** | |
| 304 | -** (0) -- The content appears to be binary because it contains embedded | |
| 305 | -** NUL characters or an extremely long line. Since this function | |
| 306 | -** does not understand UTF-8, it may falsely consider UTF-8 text | |
| 307 | -** to be binary. | |
| 283 | +** determine the type of content it appears to contain. Its return | |
| 284 | +** value is a combination of the LOOK_XXX flags above. | |
| 308 | 285 | ** |
| 309 | 286 | ************************************ WARNING ********************************** |
| 310 | 287 | ** |
| 311 | 288 | ** This function does not validate that the blob content is properly formed |
| 312 | 289 | ** UTF-16. It assumes that all code points are the same size. It does not |
| @@ -319,54 +296,47 @@ | ||
| 319 | 296 | ** Whether or not this function examines the entire contents of the blob is |
| 320 | 297 | ** officially unspecified. |
| 321 | 298 | ** |
| 322 | 299 | ************************************ WARNING ********************************** |
| 323 | 300 | */ |
| 324 | -int looks_like_utf16(const Blob *pContent, int *pFlags){ | |
| 301 | +int looks_like_utf16(const Blob *pContent){ | |
| 325 | 302 | const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
| 326 | 303 | unsigned int n = blob_size(pContent); |
| 327 | - int j = 1, c, result = 1; /* Assume UTF-16 text, prove otherwise */ | |
| 304 | + int j = 1, c, flags = LOOK_NONE; | |
| 328 | 305 | |
| 329 | - if( !starts_with_utf16_bom(pContent, 0, pFlags) ) return 0; /* Not UTF-16. */ | |
| 306 | + if( !starts_with_utf16_bom(pContent, 0, &flags) ) return flags; | |
| 330 | 307 | if( n%sizeof(WCHAR_T) ){ |
| 331 | - if( pFlags ) *pFlags |= LOOK_ODD; | |
| 332 | - result = 0; /* Odd number of bytes -> binary (UTF-8?) */ | |
| 308 | + flags |= LOOK_ODD; | |
| 333 | 309 | } |
| 334 | 310 | c = *z; |
| 335 | - while( 1 ){ | |
| 311 | + while( n>=sizeof(WCHAR_T) ){ | |
| 336 | 312 | int c2 = c; |
| 337 | - if( n<sizeof(WCHAR_T) ) break; | |
| 338 | 313 | n -= sizeof(WCHAR_T); |
| 339 | 314 | c = *++z; ++j; |
| 340 | - if (pFlags && ((*pFlags)&LOOK_REVERSE) ){ | |
| 315 | + if( flags&LOOK_REVERSE ){ | |
| 341 | 316 | c = ((c<<8)&0xff00) | ((c>>8)&0xff); |
| 342 | 317 | } |
| 343 | 318 | if( c==0 ){ |
| 344 | - if( pFlags ) *pFlags |= LOOK_NUL; | |
| 345 | - result = 0; /* NUL character in a file -> binary */ | |
| 319 | + flags |= LOOK_NUL; | |
| 346 | 320 | } |
| 347 | 321 | if( c=='\n' ){ |
| 348 | - if( pFlags ){ | |
| 349 | - *pFlags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF; | |
| 350 | - } | |
| 351 | - if( j>UTF16_LENGTH_MASK ){ | |
| 352 | - if( pFlags ) *pFlags |= LOOK_LENGTH; | |
| 353 | - result = 0; /* Very long line -> binary */ | |
| 354 | - } | |
| 355 | - j = 0; | |
| 356 | - }else if( (c2=='\r') && pFlags ){ | |
| 357 | - *pFlags |= LOOK_LONE_CR; | |
| 358 | - } | |
| 359 | - } | |
| 360 | - if( (c=='\r') && pFlags ){ | |
| 361 | - *pFlags |= LOOK_LONE_CR; | |
| 362 | - } | |
| 363 | - if( j>UTF16_LENGTH_MASK ){ | |
| 364 | - if( pFlags ) *pFlags |= LOOK_LENGTH; | |
| 365 | - result = 0; /* Very long line -> binary */ | |
| 366 | - } | |
| 367 | - return result; /* No problems seen -> not binary */ | |
| 322 | + flags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF; | |
| 323 | + if( j>UTF16_LENGTH_MASK ){ | |
| 324 | + flags |= LOOK_LENGTH; | |
| 325 | + } | |
| 326 | + j = 0; | |
| 327 | + }else if( c2=='\r' ){ | |
| 328 | + flags |= LOOK_LONE_CR; | |
| 329 | + } | |
| 330 | + } | |
| 331 | + if( c=='\r' ){ | |
| 332 | + flags |= LOOK_LONE_CR; | |
| 333 | + } | |
| 334 | + if( j>UTF16_LENGTH_MASK ){ | |
| 335 | + flags |= LOOK_LENGTH; | |
| 336 | + } | |
| 337 | + return flags; | |
| 368 | 338 | } |
| 369 | 339 | |
| 370 | 340 | /* |
| 371 | 341 | ** This function returns an array of bytes representing the byte-order-mark |
| 372 | 342 | ** for UTF-8. |
| @@ -2497,12 +2467,13 @@ | ||
| 2497 | 2467 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 2498 | 2468 | if( g.argc<3 ) usage("FILENAME"); |
| 2499 | 2469 | blob_read_from_file(&blob, g.argv[2]); |
| 2500 | 2470 | fUtf8 = starts_with_utf8_bom(&blob, 0); |
| 2501 | 2471 | fUtf16 = starts_with_utf16_bom(&blob, 0, 0); |
| 2502 | - eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) : | |
| 2503 | - looks_like_utf8(&blob, &lookFlags); | |
| 2472 | + lookFlags = fUtf16 ? looks_like_utf16(&blob) : | |
| 2473 | + looks_like_utf8(&blob); | |
| 2474 | + eType = !(lookFlags&(LOOK_NUL|LOOK_LENGTH|LOOK_ODD)); | |
| 2504 | 2475 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 2505 | 2476 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 2506 | 2477 | fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no"); |
| 2507 | 2478 | fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no"); |
| 2508 | 2479 | fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no"); |
| 2509 | 2480 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -61,11 +61,11 @@ | |
| 61 | |
| 62 | /* |
| 63 | ** This macro is designed to return non-zero if the specified blob contains |
| 64 | ** data that MAY be binary in nature; otherwise, zero will be returned. |
| 65 | */ |
| 66 | #define looks_like_binary(blob) (looks_like_utf8((blob), 0) == 0) |
| 67 | |
| 68 | /* |
| 69 | ** Output flags for the looks_like_utf8() and looks_like_utf16() routines used |
| 70 | ** to convey status information about the blob content. |
| 71 | */ |
| @@ -202,20 +202,12 @@ | |
| 202 | return a; |
| 203 | } |
| 204 | |
| 205 | /* |
| 206 | ** This function attempts to scan each logical line within the blob to |
| 207 | ** determine the type of content it appears to contain. Possible return |
| 208 | ** values are: |
| 209 | ** |
| 210 | ** (1) -- The content appears to consist entirely of text; however, the |
| 211 | ** encoding may not be UTF-8. |
| 212 | ** |
| 213 | ** (0) -- The content appears to be binary because it contains embedded |
| 214 | ** NUL characters or an extremely long line. Since this function |
| 215 | ** does not understand UTF-16, it may falsely consider UTF-16 text |
| 216 | ** to be binary. |
| 217 | ** |
| 218 | ************************************ WARNING ********************************** |
| 219 | ** |
| 220 | ** This function does not validate that the blob content is properly formed |
| 221 | ** UTF-8. It assumes that all code points are the same size. It does not |
| @@ -228,52 +220,45 @@ | |
| 228 | ** Whether or not this function examines the entire contents of the blob is |
| 229 | ** officially unspecified. |
| 230 | ** |
| 231 | ************************************ WARNING ********************************** |
| 232 | */ |
| 233 | int looks_like_utf8(const Blob *pContent, int *pFlags){ |
| 234 | const char *z = blob_buffer(pContent); |
| 235 | unsigned int n = blob_size(pContent); |
| 236 | int j, c, result = 1; /* Assume UTF-8 text, prove otherwise */ |
| 237 | |
| 238 | if( pFlags ) *pFlags = LOOK_NONE; |
| 239 | if( n==0 ) return result; /* Empty file -> text */ |
| 240 | c = *z; |
| 241 | if( c==0 ){ |
| 242 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 243 | result = 0; /* NUL character in a file -> binary */ |
| 244 | } |
| 245 | j = (c!='\n'); |
| 246 | if( !j && pFlags ) *pFlags |= LOOK_LONE_LF; |
| 247 | while( --n>0 ){ |
| 248 | int c2 = c; |
| 249 | c = *++z; ++j; |
| 250 | if( c==0 ){ |
| 251 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 252 | result = 0; /* NUL character in a file -> binary */ |
| 253 | } |
| 254 | if( c=='\n' ){ |
| 255 | if( pFlags ){ |
| 256 | *pFlags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF; |
| 257 | } |
| 258 | if( j>LENGTH_MASK ){ |
| 259 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 260 | result = 0; /* Very long line -> binary */ |
| 261 | } |
| 262 | j = 0; |
| 263 | }else if( c2=='\r' && pFlags ){ |
| 264 | *pFlags |= LOOK_LONE_CR; |
| 265 | } |
| 266 | } |
| 267 | if( c=='\r' && pFlags ){ |
| 268 | *pFlags |= LOOK_LONE_CR; |
| 269 | } |
| 270 | if( j>LENGTH_MASK ){ |
| 271 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 272 | result = 0; /* Very long line -> binary */ |
| 273 | } |
| 274 | return result; /* No problems seen -> not binary */ |
| 275 | } |
| 276 | |
| 277 | /* |
| 278 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 279 | */ |
| @@ -293,20 +278,12 @@ | |
| 293 | #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char))) |
| 294 | #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) |
| 295 | |
| 296 | /* |
| 297 | ** This function attempts to scan each logical line within the blob to |
| 298 | ** determine the type of content it appears to contain. Possible return |
| 299 | ** values are: |
| 300 | ** |
| 301 | ** (1) -- The content appears to consist entirely of text; however, the |
| 302 | ** encoding may not be UTF-16. |
| 303 | ** |
| 304 | ** (0) -- The content appears to be binary because it contains embedded |
| 305 | ** NUL characters or an extremely long line. Since this function |
| 306 | ** does not understand UTF-8, it may falsely consider UTF-8 text |
| 307 | ** to be binary. |
| 308 | ** |
| 309 | ************************************ WARNING ********************************** |
| 310 | ** |
| 311 | ** This function does not validate that the blob content is properly formed |
| 312 | ** UTF-16. It assumes that all code points are the same size. It does not |
| @@ -319,54 +296,47 @@ | |
| 319 | ** Whether or not this function examines the entire contents of the blob is |
| 320 | ** officially unspecified. |
| 321 | ** |
| 322 | ************************************ WARNING ********************************** |
| 323 | */ |
| 324 | int looks_like_utf16(const Blob *pContent, int *pFlags){ |
| 325 | const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
| 326 | unsigned int n = blob_size(pContent); |
| 327 | int j = 1, c, result = 1; /* Assume UTF-16 text, prove otherwise */ |
| 328 | |
| 329 | if( !starts_with_utf16_bom(pContent, 0, pFlags) ) return 0; /* Not UTF-16. */ |
| 330 | if( n%sizeof(WCHAR_T) ){ |
| 331 | if( pFlags ) *pFlags |= LOOK_ODD; |
| 332 | result = 0; /* Odd number of bytes -> binary (UTF-8?) */ |
| 333 | } |
| 334 | c = *z; |
| 335 | while( 1 ){ |
| 336 | int c2 = c; |
| 337 | if( n<sizeof(WCHAR_T) ) break; |
| 338 | n -= sizeof(WCHAR_T); |
| 339 | c = *++z; ++j; |
| 340 | if (pFlags && ((*pFlags)&LOOK_REVERSE) ){ |
| 341 | c = ((c<<8)&0xff00) | ((c>>8)&0xff); |
| 342 | } |
| 343 | if( c==0 ){ |
| 344 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 345 | result = 0; /* NUL character in a file -> binary */ |
| 346 | } |
| 347 | if( c=='\n' ){ |
| 348 | if( pFlags ){ |
| 349 | *pFlags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF; |
| 350 | } |
| 351 | if( j>UTF16_LENGTH_MASK ){ |
| 352 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 353 | result = 0; /* Very long line -> binary */ |
| 354 | } |
| 355 | j = 0; |
| 356 | }else if( (c2=='\r') && pFlags ){ |
| 357 | *pFlags |= LOOK_LONE_CR; |
| 358 | } |
| 359 | } |
| 360 | if( (c=='\r') && pFlags ){ |
| 361 | *pFlags |= LOOK_LONE_CR; |
| 362 | } |
| 363 | if( j>UTF16_LENGTH_MASK ){ |
| 364 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 365 | result = 0; /* Very long line -> binary */ |
| 366 | } |
| 367 | return result; /* No problems seen -> not binary */ |
| 368 | } |
| 369 | |
| 370 | /* |
| 371 | ** This function returns an array of bytes representing the byte-order-mark |
| 372 | ** for UTF-8. |
| @@ -2497,12 +2467,13 @@ | |
| 2497 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 2498 | if( g.argc<3 ) usage("FILENAME"); |
| 2499 | blob_read_from_file(&blob, g.argv[2]); |
| 2500 | fUtf8 = starts_with_utf8_bom(&blob, 0); |
| 2501 | fUtf16 = starts_with_utf16_bom(&blob, 0, 0); |
| 2502 | eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) : |
| 2503 | looks_like_utf8(&blob, &lookFlags); |
| 2504 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 2505 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 2506 | fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no"); |
| 2507 | fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no"); |
| 2508 | fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no"); |
| 2509 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -61,11 +61,11 @@ | |
| 61 | |
| 62 | /* |
| 63 | ** This macro is designed to return non-zero if the specified blob contains |
| 64 | ** data that MAY be binary in nature; otherwise, zero will be returned. |
| 65 | */ |
| 66 | #define looks_like_binary(blob) !(looks_like_utf8(blob)&(LOOK_LENGTH|LOOK_NUL)) |
| 67 | |
| 68 | /* |
| 69 | ** Output flags for the looks_like_utf8() and looks_like_utf16() routines used |
| 70 | ** to convey status information about the blob content. |
| 71 | */ |
| @@ -202,20 +202,12 @@ | |
| 202 | return a; |
| 203 | } |
| 204 | |
| 205 | /* |
| 206 | ** This function attempts to scan each logical line within the blob to |
| 207 | ** determine the type of content it appears to contain. Its return |
| 208 | ** value is a combination of the LOOK_XXX flags above. |
| 209 | ** |
| 210 | ************************************ WARNING ********************************** |
| 211 | ** |
| 212 | ** This function does not validate that the blob content is properly formed |
| 213 | ** UTF-8. It assumes that all code points are the same size. It does not |
| @@ -228,52 +220,45 @@ | |
| 220 | ** Whether or not this function examines the entire contents of the blob is |
| 221 | ** officially unspecified. |
| 222 | ** |
| 223 | ************************************ WARNING ********************************** |
| 224 | */ |
| 225 | int looks_like_utf8(const Blob *pContent){ |
| 226 | const char *z = blob_buffer(pContent); |
| 227 | unsigned int n = blob_size(pContent); |
| 228 | int j, c, flags = LOOK_NONE; |
| 229 | |
| 230 | if( n==0 ) return flags; /* Empty file -> text */ |
| 231 | c = *z; |
| 232 | if( c==0 ){ |
| 233 | flags |= LOOK_NUL; |
| 234 | } |
| 235 | j = (c!='\n'); |
| 236 | if( !j ) flags |= LOOK_LONE_LF; |
| 237 | while( --n>0 ){ |
| 238 | int c2 = c; |
| 239 | c = *++z; ++j; |
| 240 | if( c==0 ){ |
| 241 | flags |= LOOK_NUL; |
| 242 | } |
| 243 | if( c=='\n' ){ |
| 244 | flags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF; |
| 245 | if( j>LENGTH_MASK ){ |
| 246 | flags |= LOOK_LENGTH; |
| 247 | } |
| 248 | j = 0; |
| 249 | }else if( c2=='\r' ){ |
| 250 | flags |= LOOK_LONE_CR; |
| 251 | } |
| 252 | } |
| 253 | if( c=='\r' ){ |
| 254 | flags |= LOOK_LONE_CR; |
| 255 | } |
| 256 | if( j>LENGTH_MASK ){ |
| 257 | flags |= LOOK_LENGTH; |
| 258 | } |
| 259 | return flags; |
| 260 | } |
| 261 | |
| 262 | /* |
| 263 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 264 | */ |
| @@ -293,20 +278,12 @@ | |
| 278 | #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char))) |
| 279 | #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) |
| 280 | |
| 281 | /* |
| 282 | ** This function attempts to scan each logical line within the blob to |
| 283 | ** determine the type of content it appears to contain. Its return |
| 284 | ** value is a combination of the LOOK_XXX flags above. |
| 285 | ** |
| 286 | ************************************ WARNING ********************************** |
| 287 | ** |
| 288 | ** This function does not validate that the blob content is properly formed |
| 289 | ** UTF-16. It assumes that all code points are the same size. It does not |
| @@ -319,54 +296,47 @@ | |
| 296 | ** Whether or not this function examines the entire contents of the blob is |
| 297 | ** officially unspecified. |
| 298 | ** |
| 299 | ************************************ WARNING ********************************** |
| 300 | */ |
| 301 | int looks_like_utf16(const Blob *pContent){ |
| 302 | const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
| 303 | unsigned int n = blob_size(pContent); |
| 304 | int j = 1, c, flags = LOOK_NONE; |
| 305 | |
| 306 | if( !starts_with_utf16_bom(pContent, 0, &flags) ) return flags; |
| 307 | if( n%sizeof(WCHAR_T) ){ |
| 308 | flags |= LOOK_ODD; |
| 309 | } |
| 310 | c = *z; |
| 311 | while( n>=sizeof(WCHAR_T) ){ |
| 312 | int c2 = c; |
| 313 | n -= sizeof(WCHAR_T); |
| 314 | c = *++z; ++j; |
| 315 | if( flags&LOOK_REVERSE ){ |
| 316 | c = ((c<<8)&0xff00) | ((c>>8)&0xff); |
| 317 | } |
| 318 | if( c==0 ){ |
| 319 | flags |= LOOK_NUL; |
| 320 | } |
| 321 | if( c=='\n' ){ |
| 322 | flags |= (c2=='\r')?LOOK_CRLF:LOOK_LONE_LF; |
| 323 | if( j>UTF16_LENGTH_MASK ){ |
| 324 | flags |= LOOK_LENGTH; |
| 325 | } |
| 326 | j = 0; |
| 327 | }else if( c2=='\r' ){ |
| 328 | flags |= LOOK_LONE_CR; |
| 329 | } |
| 330 | } |
| 331 | if( c=='\r' ){ |
| 332 | flags |= LOOK_LONE_CR; |
| 333 | } |
| 334 | if( j>UTF16_LENGTH_MASK ){ |
| 335 | flags |= LOOK_LENGTH; |
| 336 | } |
| 337 | return flags; |
| 338 | } |
| 339 | |
| 340 | /* |
| 341 | ** This function returns an array of bytes representing the byte-order-mark |
| 342 | ** for UTF-8. |
| @@ -2497,12 +2467,13 @@ | |
| 2467 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 2468 | if( g.argc<3 ) usage("FILENAME"); |
| 2469 | blob_read_from_file(&blob, g.argv[2]); |
| 2470 | fUtf8 = starts_with_utf8_bom(&blob, 0); |
| 2471 | fUtf16 = starts_with_utf16_bom(&blob, 0, 0); |
| 2472 | lookFlags = fUtf16 ? looks_like_utf16(&blob) : |
| 2473 | looks_like_utf8(&blob); |
| 2474 | eType = !(lookFlags&(LOOK_NUL|LOOK_LENGTH|LOOK_ODD)); |
| 2475 | fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
| 2476 | fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
| 2477 | fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no"); |
| 2478 | fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no"); |
| 2479 | fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no"); |
| 2480 |