Fossil SCM
Refactor the looks_like_utf*() functions to use a single output flags argument to convey the various pieces of blob status information.
Commit
30a63b8b66f176dc0402e6e6f1b3b2e4a17af2fb
Parent
0e5f0da7eb9541d…
2 files changed
+11
-8
+57
-41
+11
-8
| --- src/checkin.c | ||
| +++ src/checkin.c | ||
| @@ -905,44 +905,47 @@ | ||
| 905 | 905 | int encodingOk, /* Non-zero if encoding warnings should be disabled. */ |
| 906 | 906 | const char *zFilename /* The full name of the file being committed. */ |
| 907 | 907 | ){ |
| 908 | 908 | int eType; /* return value of looks_like_utf8/utf16() */ |
| 909 | 909 | int fUnicode; /* return value of starts_with_utf16_bom() */ |
| 910 | - int longLine = 0; /* non-zero if blob has "long lines" */ | |
| 911 | - int crlf = 0; /* non-zero if blob has "crlf" */ | |
| 910 | + int lookFlags; /* output flags from looks_like_utf8/utf16() */ | |
| 911 | + int fHasCrLf; /* the blob contains one or more CR/LF pairs */ | |
| 912 | + int fHasLength; /* the blob contains an overly long line */ | |
| 912 | 913 | char *zMsg; /* Warning message */ |
| 913 | 914 | Blob fname; /* Relative pathname of the file */ |
| 914 | 915 | static int allOk = 0; /* Set to true to disable this routine */ |
| 915 | 916 | |
| 916 | 917 | if( allOk ) return 0; |
| 917 | 918 | fUnicode = starts_with_utf16_bom(p, 0, 0); |
| 918 | - eType = fUnicode ? looks_like_utf16(p, &longLine, &crlf) : | |
| 919 | - looks_like_utf8(p, &longLine, &crlf); | |
| 920 | - if( eType==0 || crlf || fUnicode ){ | |
| 919 | + eType = fUnicode ? looks_like_utf16(p, &lookFlags) : | |
| 920 | + looks_like_utf8(p, &lookFlags); | |
| 921 | + fHasCrLf = (lookFlags & LOOK_CRLF); | |
| 922 | + fHasLength = (lookFlags & LOOK_LENGTH); | |
| 923 | + if( eType==0 || fHasCrLf || fUnicode ){ | |
| 921 | 924 | const char *zWarning; |
| 922 | 925 | const char *zDisable; |
| 923 | 926 | const char *zConvert = "c=convert/"; |
| 924 | 927 | Blob ans; |
| 925 | 928 | char cReply; |
| 926 | 929 | |
| 927 | - if( crlf && fUnicode ){ | |
| 930 | + if( fHasCrLf && fUnicode ){ | |
| 928 | 931 | if ( crnlOk && encodingOk ){ |
| 929 | 932 | return 0; /* We don't want CR/NL and Unicode warnings for this file. */ |
| 930 | 933 | } |
| 931 | 934 | zWarning = "CR/NL line endings and Unicode"; |
| 932 | 935 | zDisable = "\"crnl-glob\" and \"encoding-glob\" settings"; |
| 933 | - }else if( crlf ){ | |
| 936 | + }else if( fHasCrLf ){ | |
| 934 | 937 | if( crnlOk ){ |
| 935 | 938 | return 0; /* We don't want CR/NL warnings for this file. */ |
| 936 | 939 | } |
| 937 | 940 | zWarning = "CR/NL line endings"; |
| 938 | 941 | zDisable = "\"crnl-glob\" setting"; |
| 939 | 942 | }else if( eType==0 ){ |
| 940 | 943 | if( binOk ){ |
| 941 | 944 | return 0; /* We don't want binary warnings for this file. */ |
| 942 | 945 | } |
| 943 | - if( longLine ){ | |
| 946 | + if( fHasLength ){ | |
| 944 | 947 | zWarning = "long lines"; |
| 945 | 948 | }else{ |
| 946 | 949 | zWarning = "binary data"; |
| 947 | 950 | } |
| 948 | 951 | zDisable = "\"binary-glob\" setting"; |
| 949 | 952 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -905,44 +905,47 @@ | |
| 905 | int encodingOk, /* Non-zero if encoding warnings should be disabled. */ |
| 906 | const char *zFilename /* The full name of the file being committed. */ |
| 907 | ){ |
| 908 | int eType; /* return value of looks_like_utf8/utf16() */ |
| 909 | int fUnicode; /* return value of starts_with_utf16_bom() */ |
| 910 | int longLine = 0; /* non-zero if blob has "long lines" */ |
| 911 | int crlf = 0; /* non-zero if blob has "crlf" */ |
| 912 | char *zMsg; /* Warning message */ |
| 913 | Blob fname; /* Relative pathname of the file */ |
| 914 | static int allOk = 0; /* Set to true to disable this routine */ |
| 915 | |
| 916 | if( allOk ) return 0; |
| 917 | fUnicode = starts_with_utf16_bom(p, 0, 0); |
| 918 | eType = fUnicode ? looks_like_utf16(p, &longLine, &crlf) : |
| 919 | looks_like_utf8(p, &longLine, &crlf); |
| 920 | if( eType==0 || crlf || fUnicode ){ |
| 921 | const char *zWarning; |
| 922 | const char *zDisable; |
| 923 | const char *zConvert = "c=convert/"; |
| 924 | Blob ans; |
| 925 | char cReply; |
| 926 | |
| 927 | if( crlf && fUnicode ){ |
| 928 | if ( crnlOk && encodingOk ){ |
| 929 | return 0; /* We don't want CR/NL and Unicode warnings for this file. */ |
| 930 | } |
| 931 | zWarning = "CR/NL line endings and Unicode"; |
| 932 | zDisable = "\"crnl-glob\" and \"encoding-glob\" settings"; |
| 933 | }else if( crlf ){ |
| 934 | if( crnlOk ){ |
| 935 | return 0; /* We don't want CR/NL warnings for this file. */ |
| 936 | } |
| 937 | zWarning = "CR/NL line endings"; |
| 938 | zDisable = "\"crnl-glob\" setting"; |
| 939 | }else if( eType==0 ){ |
| 940 | if( binOk ){ |
| 941 | return 0; /* We don't want binary warnings for this file. */ |
| 942 | } |
| 943 | if( longLine ){ |
| 944 | zWarning = "long lines"; |
| 945 | }else{ |
| 946 | zWarning = "binary data"; |
| 947 | } |
| 948 | zDisable = "\"binary-glob\" setting"; |
| 949 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -905,44 +905,47 @@ | |
| 905 | int encodingOk, /* Non-zero if encoding warnings should be disabled. */ |
| 906 | const char *zFilename /* The full name of the file being committed. */ |
| 907 | ){ |
| 908 | int eType; /* return value of looks_like_utf8/utf16() */ |
| 909 | int fUnicode; /* return value of starts_with_utf16_bom() */ |
| 910 | int lookFlags; /* output flags from looks_like_utf8/utf16() */ |
| 911 | int fHasCrLf; /* the blob contains one or more CR/LF pairs */ |
| 912 | int fHasLength; /* the blob contains an overly long line */ |
| 913 | char *zMsg; /* Warning message */ |
| 914 | Blob fname; /* Relative pathname of the file */ |
| 915 | static int allOk = 0; /* Set to true to disable this routine */ |
| 916 | |
| 917 | if( allOk ) return 0; |
| 918 | fUnicode = starts_with_utf16_bom(p, 0, 0); |
| 919 | eType = fUnicode ? looks_like_utf16(p, &lookFlags) : |
| 920 | looks_like_utf8(p, &lookFlags); |
| 921 | fHasCrLf = (lookFlags & LOOK_CRLF); |
| 922 | fHasLength = (lookFlags & LOOK_LENGTH); |
| 923 | if( eType==0 || fHasCrLf || fUnicode ){ |
| 924 | const char *zWarning; |
| 925 | const char *zDisable; |
| 926 | const char *zConvert = "c=convert/"; |
| 927 | Blob ans; |
| 928 | char cReply; |
| 929 | |
| 930 | if( fHasCrLf && fUnicode ){ |
| 931 | if ( crnlOk && encodingOk ){ |
| 932 | return 0; /* We don't want CR/NL and Unicode warnings for this file. */ |
| 933 | } |
| 934 | zWarning = "CR/NL line endings and Unicode"; |
| 935 | zDisable = "\"crnl-glob\" and \"encoding-glob\" settings"; |
| 936 | }else if( fHasCrLf ){ |
| 937 | if( crnlOk ){ |
| 938 | return 0; /* We don't want CR/NL warnings for this file. */ |
| 939 | } |
| 940 | zWarning = "CR/NL line endings"; |
| 941 | zDisable = "\"crnl-glob\" setting"; |
| 942 | }else if( eType==0 ){ |
| 943 | if( binOk ){ |
| 944 | return 0; /* We don't want binary warnings for this file. */ |
| 945 | } |
| 946 | if( fHasLength ){ |
| 947 | zWarning = "long lines"; |
| 948 | }else{ |
| 949 | zWarning = "binary data"; |
| 950 | } |
| 951 | zDisable = "\"binary-glob\" setting"; |
| 952 |
+57
-41
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -57,11 +57,25 @@ | ||
| 57 | 57 | "more than 10,000 changes\n" |
| 58 | 58 | |
| 59 | 59 | #define DIFF_TOO_MANY_CHANGES_HTML \ |
| 60 | 60 | "<p class='generalError'>More than 10,000 changes</p>\n" |
| 61 | 61 | |
| 62 | -#define looks_like_binary(blob) (looks_like_utf8((blob), 0, 0) != 1) | |
| 62 | +/* | |
| 63 | +** This macro is designed to return non-zero if the specified blob contains | |
| 64 | +** data that MAY be binary in nature; otherwise, zero will be returned. | |
| 65 | +*/ | |
| 66 | +#define looks_like_binary(blob) (looks_like_utf8((blob), 0) == 0) | |
| 67 | + | |
| 68 | +/* | |
| 69 | +** Output flags for the looks_like_utf8() and looks_like_utf16() routines used | |
| 70 | +** to convey status information about the blob content. | |
| 71 | +*/ | |
| 72 | +#define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */ | |
| 73 | +#define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */ | |
| 74 | +#define LOOK_LF ((int)0x00000002) /* One or more LF chars were found. */ | |
| 75 | +#define LOOK_CRLF ((int)0x00000004) /* One or more CR/LF pairs were found. */ | |
| 76 | +#define LOOK_LENGTH ((int)0x00000008) /* An over length line was found. */ | |
| 63 | 77 | #endif /* INTERFACE */ |
| 64 | 78 | |
| 65 | 79 | /* |
| 66 | 80 | ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes) |
| 67 | 81 | */ |
| @@ -186,12 +200,12 @@ | ||
| 186 | 200 | /* |
| 187 | 201 | ** This function attempts to scan each logical line within the blob to |
| 188 | 202 | ** determine the type of content it appears to contain. Possible return |
| 189 | 203 | ** values are: |
| 190 | 204 | ** |
| 191 | -** (1) -- The content appears to consist entirely of text; | |
| 192 | -** however, the encoding may not be UTF-8. | |
| 205 | +** (1) -- The content appears to consist entirely of text; however, the | |
| 206 | +** encoding may not be UTF-8. | |
| 193 | 207 | ** |
| 194 | 208 | ** (0) -- The content appears to be binary because it contains embedded |
| 195 | 209 | ** NUL characters or an extremely long line. Since this function |
| 196 | 210 | ** does not understand UTF-16, it may falsely consider UTF-16 text |
| 197 | 211 | ** to be binary. |
| @@ -204,49 +218,50 @@ | ||
| 204 | 218 | ** switches between UTF-8 and other encodings occur. |
| 205 | 219 | ** |
| 206 | 220 | ** The only code points that this function cares about are the NUL character, |
| 207 | 221 | ** carriage-return, and line-feed. |
| 208 | 222 | ** |
| 209 | -** If pbLongLine is not NULL and the blob is detected as being binary only because | |
| 210 | -** of long lines, the integer pointed to is set to 1. Otherwise, it is left as is. | |
| 211 | -** If pbCrlf is not NULL and the blob contains crlf, the integer pointed | |
| 212 | -** to is set to 1. Otherwise, it is left as is. | |
| 213 | -** | |
| 214 | 223 | ************************************ WARNING ********************************** |
| 215 | 224 | */ |
| 216 | -int looks_like_utf8(const Blob *pContent, int *pbLongLine, int *pbCrlf){ | |
| 225 | +int looks_like_utf8(const Blob *pContent, int *pFlags){ | |
| 217 | 226 | const char *z = blob_buffer(pContent); |
| 218 | 227 | unsigned int n = blob_size(pContent); |
| 219 | 228 | int j, c; |
| 220 | - int crlf = 0; | |
| 221 | - int longline = 0; | |
| 222 | 229 | |
| 223 | - /* Check individual lines. | |
| 224 | - */ | |
| 230 | + if( pFlags ) *pFlags = LOOK_NONE; | |
| 225 | 231 | if( n==0 ) return 1; /* Empty file -> text */ |
| 226 | 232 | c = *z; |
| 227 | - if( c==0 ) return 0; /* Zero byte in a file -> binary */ | |
| 233 | + if( c==0 ){ | |
| 234 | + if( pFlags ) *pFlags |= LOOK_NUL; | |
| 235 | + return 0; /* NUL character in a file -> binary */ | |
| 236 | + } | |
| 228 | 237 | j = (c!='\n'); |
| 229 | 238 | while( --n>0 ){ |
| 230 | 239 | c = *++z; ++j; |
| 231 | - if( c==0 ) return 0; /* Zero byte in a file -> binary */ | |
| 240 | + if( c==0 ){ | |
| 241 | + if( pFlags ) *pFlags |= LOOK_NUL; | |
| 242 | + return 0; /* NUL character in a file -> binary */ | |
| 243 | + } | |
| 232 | 244 | if( c=='\n' ){ |
| 233 | 245 | int c2 = z[-1]; |
| 234 | - if( c2=='\r' ){ | |
| 235 | - crlf = 1; /* Contains CR/NL, continue */ | |
| 246 | + if( pFlags ){ | |
| 247 | + *pFlags |= LOOK_LF; | |
| 248 | + if( c2=='\r' ){ | |
| 249 | + *pFlags |= LOOK_CRLF; | |
| 250 | + } | |
| 236 | 251 | } |
| 237 | 252 | if( j>LENGTH_MASK ){ |
| 238 | - longline = 1; /* Contains long line, continue */ | |
| 253 | + if( pFlags ) *pFlags |= LOOK_LENGTH; | |
| 254 | + return 0; /* Very long line -> binary */ | |
| 239 | 255 | } |
| 240 | 256 | j = 0; |
| 241 | 257 | } |
| 242 | 258 | } |
| 243 | - if( longline || (j>LENGTH_MASK) ){ | |
| 244 | - if( pbLongLine ) *pbLongLine = 1; | |
| 259 | + if( j>LENGTH_MASK ){ | |
| 260 | + if( pFlags ) *pFlags |= LOOK_LENGTH; | |
| 245 | 261 | return 0; /* Very long line -> binary */ |
| 246 | 262 | } |
| 247 | - if( pbCrlf && crlf) *pbCrlf = 1; | |
| 248 | 263 | return 1; /* No problems seen -> not binary */ |
| 249 | 264 | } |
| 250 | 265 | |
| 251 | 266 | /* |
| 252 | 267 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| @@ -279,12 +294,12 @@ | ||
| 279 | 294 | /* |
| 280 | 295 | ** This function attempts to scan each logical line within the blob to |
| 281 | 296 | ** determine the type of content it appears to contain. Possible return |
| 282 | 297 | ** values are: |
| 283 | 298 | ** |
| 284 | -** (1) -- The content appears to consist entirely of text; | |
| 285 | -** however, the encoding may not be UTF-16. | |
| 299 | +** (1) -- The content appears to consist entirely of text; however, the | |
| 300 | +** encoding may not be UTF-16. | |
| 286 | 301 | ** |
| 287 | 302 | ** (0) -- The content appears to be binary because it contains embedded |
| 288 | 303 | ** NUL characters or an extremely long line. Since this function |
| 289 | 304 | ** does not understand UTF-8, it may falsely consider UTF-8 text |
| 290 | 305 | ** to be binary. |
| @@ -297,50 +312,51 @@ | ||
| 297 | 312 | ** switches between the UTF-16be and UTF-16le encodings occur. |
| 298 | 313 | ** |
| 299 | 314 | ** The only code points that this function cares about are the NUL character, |
| 300 | 315 | ** carriage-return, and line-feed. |
| 301 | 316 | ** |
| 302 | -** If pbLongLine is not NULL and the blob is detected as being binary only because | |
| 303 | -** of long lines, the integer pointed to is set to 1. Otherwise, it is left as is. | |
| 304 | -** If pbCrlf is not NULL and the blob contains crlf, the integer pointed | |
| 305 | -** to is set to 1. Otherwise, it is left as is. | |
| 306 | -** | |
| 307 | 317 | ************************************ WARNING ********************************** |
| 308 | 318 | */ |
| 309 | -int looks_like_utf16(const Blob *pContent, int *pbLongLine, int *pbCrlf){ | |
| 319 | +int looks_like_utf16(const Blob *pContent, int *pFlags){ | |
| 310 | 320 | const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
| 311 | 321 | unsigned int n = blob_size(pContent); |
| 312 | 322 | int j, c; |
| 313 | - int crlf = 0; | |
| 314 | - int longline = 0; | |
| 315 | 323 | |
| 316 | - /* Check individual lines. | |
| 317 | - */ | |
| 324 | + if( pFlags ) *pFlags = LOOK_NONE; | |
| 318 | 325 | if( n==0 ) return 1; /* Empty file -> text */ |
| 319 | 326 | if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */ |
| 320 | 327 | c = *z; |
| 321 | - if( c==0 ) return 0; /* NUL character in a file -> binary */ | |
| 328 | + if( c==0 ){ | |
| 329 | + if( pFlags ) *pFlags |= LOOK_NUL; | |
| 330 | + return 0; /* NUL character in a file -> binary */ | |
| 331 | + } | |
| 322 | 332 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 323 | 333 | while( (n-=2)>0 ){ |
| 324 | 334 | c = *++z; ++j; |
| 325 | - if( c==0 ) return 0; /* NUL character in a file -> binary */ | |
| 335 | + if( c==0 ){ | |
| 336 | + if( pFlags ) *pFlags |= LOOK_NUL; | |
| 337 | + return 0; /* NUL character in a file -> binary */ | |
| 338 | + } | |
| 326 | 339 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 327 | 340 | int c2 = z[-1]; |
| 328 | - if( c2==UTF16BE_CR || c2==UTF16LE_CR ){ | |
| 329 | - crlf = 1; /* Contains CR/NL, continue */ | |
| 341 | + if( pFlags ){ | |
| 342 | + *pFlags |= LOOK_LF; | |
| 343 | + if( c2==UTF16BE_CR || c2==UTF16LE_CR ){ | |
| 344 | + *pFlags |= LOOK_CRLF; | |
| 345 | + } | |
| 330 | 346 | } |
| 331 | 347 | if( j>UTF16_LENGTH_MASK ){ |
| 332 | - longline = 1; /* Contains long line, continue */ | |
| 348 | + if( pFlags ) *pFlags |= LOOK_LENGTH; | |
| 349 | + return 0; /* Very long line -> binary */ | |
| 333 | 350 | } |
| 334 | 351 | j = 0; |
| 335 | 352 | } |
| 336 | 353 | } |
| 337 | - if( longline || j>UTF16_LENGTH_MASK ){ | |
| 338 | - if( pbLongLine ) *pbLongLine = 1; | |
| 354 | + if( j>UTF16_LENGTH_MASK ){ | |
| 355 | + if( pFlags ) *pFlags |= LOOK_LENGTH; | |
| 339 | 356 | return 0; /* Very long line -> binary */ |
| 340 | 357 | } |
| 341 | - if( pbCrlf ) *pbCrlf = crlf; | |
| 342 | 358 | return 1; /* No problems seen -> not binary */ |
| 343 | 359 | } |
| 344 | 360 | |
| 345 | 361 | /* |
| 346 | 362 | ** This function returns an array of bytes representing the byte-order-mark |
| 347 | 363 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -57,11 +57,25 @@ | |
| 57 | "more than 10,000 changes\n" |
| 58 | |
| 59 | #define DIFF_TOO_MANY_CHANGES_HTML \ |
| 60 | "<p class='generalError'>More than 10,000 changes</p>\n" |
| 61 | |
| 62 | #define looks_like_binary(blob) (looks_like_utf8((blob), 0, 0) != 1) |
| 63 | #endif /* INTERFACE */ |
| 64 | |
| 65 | /* |
| 66 | ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes) |
| 67 | */ |
| @@ -186,12 +200,12 @@ | |
| 186 | /* |
| 187 | ** This function attempts to scan each logical line within the blob to |
| 188 | ** determine the type of content it appears to contain. Possible return |
| 189 | ** values are: |
| 190 | ** |
| 191 | ** (1) -- The content appears to consist entirely of text; |
| 192 | ** however, the encoding may not be UTF-8. |
| 193 | ** |
| 194 | ** (0) -- The content appears to be binary because it contains embedded |
| 195 | ** NUL characters or an extremely long line. Since this function |
| 196 | ** does not understand UTF-16, it may falsely consider UTF-16 text |
| 197 | ** to be binary. |
| @@ -204,49 +218,50 @@ | |
| 204 | ** switches between UTF-8 and other encodings occur. |
| 205 | ** |
| 206 | ** The only code points that this function cares about are the NUL character, |
| 207 | ** carriage-return, and line-feed. |
| 208 | ** |
| 209 | ** If pbLongLine is not NULL and the blob is detected as being binary only because |
| 210 | ** of long lines, the integer pointed to is set to 1. Otherwise, it is left as is. |
| 211 | ** If pbCrlf is not NULL and the blob contains crlf, the integer pointed |
| 212 | ** to is set to 1. Otherwise, it is left as is. |
| 213 | ** |
| 214 | ************************************ WARNING ********************************** |
| 215 | */ |
| 216 | int looks_like_utf8(const Blob *pContent, int *pbLongLine, int *pbCrlf){ |
| 217 | const char *z = blob_buffer(pContent); |
| 218 | unsigned int n = blob_size(pContent); |
| 219 | int j, c; |
| 220 | int crlf = 0; |
| 221 | int longline = 0; |
| 222 | |
| 223 | /* Check individual lines. |
| 224 | */ |
| 225 | if( n==0 ) return 1; /* Empty file -> text */ |
| 226 | c = *z; |
| 227 | if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 228 | j = (c!='\n'); |
| 229 | while( --n>0 ){ |
| 230 | c = *++z; ++j; |
| 231 | if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 232 | if( c=='\n' ){ |
| 233 | int c2 = z[-1]; |
| 234 | if( c2=='\r' ){ |
| 235 | crlf = 1; /* Contains CR/NL, continue */ |
| 236 | } |
| 237 | if( j>LENGTH_MASK ){ |
| 238 | longline = 1; /* Contains long line, continue */ |
| 239 | } |
| 240 | j = 0; |
| 241 | } |
| 242 | } |
| 243 | if( longline || (j>LENGTH_MASK) ){ |
| 244 | if( pbLongLine ) *pbLongLine = 1; |
| 245 | return 0; /* Very long line -> binary */ |
| 246 | } |
| 247 | if( pbCrlf && crlf) *pbCrlf = 1; |
| 248 | return 1; /* No problems seen -> not binary */ |
| 249 | } |
| 250 | |
| 251 | /* |
| 252 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| @@ -279,12 +294,12 @@ | |
| 279 | /* |
| 280 | ** This function attempts to scan each logical line within the blob to |
| 281 | ** determine the type of content it appears to contain. Possible return |
| 282 | ** values are: |
| 283 | ** |
| 284 | ** (1) -- The content appears to consist entirely of text; |
| 285 | ** however, the encoding may not be UTF-16. |
| 286 | ** |
| 287 | ** (0) -- The content appears to be binary because it contains embedded |
| 288 | ** NUL characters or an extremely long line. Since this function |
| 289 | ** does not understand UTF-8, it may falsely consider UTF-8 text |
| 290 | ** to be binary. |
| @@ -297,50 +312,51 @@ | |
| 297 | ** switches between the UTF-16be and UTF-16le encodings occur. |
| 298 | ** |
| 299 | ** The only code points that this function cares about are the NUL character, |
| 300 | ** carriage-return, and line-feed. |
| 301 | ** |
| 302 | ** If pbLongLine is not NULL and the blob is detected as being binary only because |
| 303 | ** of long lines, the integer pointed to is set to 1. Otherwise, it is left as is. |
| 304 | ** If pbCrlf is not NULL and the blob contains crlf, the integer pointed |
| 305 | ** to is set to 1. Otherwise, it is left as is. |
| 306 | ** |
| 307 | ************************************ WARNING ********************************** |
| 308 | */ |
| 309 | int looks_like_utf16(const Blob *pContent, int *pbLongLine, int *pbCrlf){ |
| 310 | const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
| 311 | unsigned int n = blob_size(pContent); |
| 312 | int j, c; |
| 313 | int crlf = 0; |
| 314 | int longline = 0; |
| 315 | |
| 316 | /* Check individual lines. |
| 317 | */ |
| 318 | if( n==0 ) return 1; /* Empty file -> text */ |
| 319 | if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */ |
| 320 | c = *z; |
| 321 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 322 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 323 | while( (n-=2)>0 ){ |
| 324 | c = *++z; ++j; |
| 325 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 326 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 327 | int c2 = z[-1]; |
| 328 | if( c2==UTF16BE_CR || c2==UTF16LE_CR ){ |
| 329 | crlf = 1; /* Contains CR/NL, continue */ |
| 330 | } |
| 331 | if( j>UTF16_LENGTH_MASK ){ |
| 332 | longline = 1; /* Contains long line, continue */ |
| 333 | } |
| 334 | j = 0; |
| 335 | } |
| 336 | } |
| 337 | if( longline || j>UTF16_LENGTH_MASK ){ |
| 338 | if( pbLongLine ) *pbLongLine = 1; |
| 339 | return 0; /* Very long line -> binary */ |
| 340 | } |
| 341 | if( pbCrlf ) *pbCrlf = crlf; |
| 342 | return 1; /* No problems seen -> not binary */ |
| 343 | } |
| 344 | |
| 345 | /* |
| 346 | ** This function returns an array of bytes representing the byte-order-mark |
| 347 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -57,11 +57,25 @@ | |
| 57 | "more than 10,000 changes\n" |
| 58 | |
| 59 | #define DIFF_TOO_MANY_CHANGES_HTML \ |
| 60 | "<p class='generalError'>More than 10,000 changes</p>\n" |
| 61 | |
| 62 | /* |
| 63 | ** This macro is designed to return non-zero if the specified blob contains |
| 64 | ** data that MAY be binary in nature; otherwise, zero will be returned. |
| 65 | */ |
| 66 | #define looks_like_binary(blob) (looks_like_utf8((blob), 0) == 0) |
| 67 | |
| 68 | /* |
| 69 | ** Output flags for the looks_like_utf8() and looks_like_utf16() routines used |
| 70 | ** to convey status information about the blob content. |
| 71 | */ |
| 72 | #define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */ |
| 73 | #define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */ |
| 74 | #define LOOK_LF ((int)0x00000002) /* One or more LF chars were found. */ |
| 75 | #define LOOK_CRLF ((int)0x00000004) /* One or more CR/LF pairs were found. */ |
| 76 | #define LOOK_LENGTH ((int)0x00000008) /* An over length line was found. */ |
| 77 | #endif /* INTERFACE */ |
| 78 | |
| 79 | /* |
| 80 | ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes) |
| 81 | */ |
| @@ -186,12 +200,12 @@ | |
| 200 | /* |
| 201 | ** This function attempts to scan each logical line within the blob to |
| 202 | ** determine the type of content it appears to contain. Possible return |
| 203 | ** values are: |
| 204 | ** |
| 205 | ** (1) -- The content appears to consist entirely of text; however, the |
| 206 | ** encoding may not be UTF-8. |
| 207 | ** |
| 208 | ** (0) -- The content appears to be binary because it contains embedded |
| 209 | ** NUL characters or an extremely long line. Since this function |
| 210 | ** does not understand UTF-16, it may falsely consider UTF-16 text |
| 211 | ** to be binary. |
| @@ -204,49 +218,50 @@ | |
| 218 | ** switches between UTF-8 and other encodings occur. |
| 219 | ** |
| 220 | ** The only code points that this function cares about are the NUL character, |
| 221 | ** carriage-return, and line-feed. |
| 222 | ** |
| 223 | ************************************ WARNING ********************************** |
| 224 | */ |
| 225 | int looks_like_utf8(const Blob *pContent, int *pFlags){ |
| 226 | const char *z = blob_buffer(pContent); |
| 227 | unsigned int n = blob_size(pContent); |
| 228 | int j, c; |
| 229 | |
| 230 | if( pFlags ) *pFlags = LOOK_NONE; |
| 231 | if( n==0 ) return 1; /* Empty file -> text */ |
| 232 | c = *z; |
| 233 | if( c==0 ){ |
| 234 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 235 | return 0; /* NUL character in a file -> binary */ |
| 236 | } |
| 237 | j = (c!='\n'); |
| 238 | while( --n>0 ){ |
| 239 | c = *++z; ++j; |
| 240 | if( c==0 ){ |
| 241 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 242 | return 0; /* NUL character in a file -> binary */ |
| 243 | } |
| 244 | if( c=='\n' ){ |
| 245 | int c2 = z[-1]; |
| 246 | if( pFlags ){ |
| 247 | *pFlags |= LOOK_LF; |
| 248 | if( c2=='\r' ){ |
| 249 | *pFlags |= LOOK_CRLF; |
| 250 | } |
| 251 | } |
| 252 | if( j>LENGTH_MASK ){ |
| 253 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 254 | return 0; /* Very long line -> binary */ |
| 255 | } |
| 256 | j = 0; |
| 257 | } |
| 258 | } |
| 259 | if( j>LENGTH_MASK ){ |
| 260 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 261 | return 0; /* Very long line -> binary */ |
| 262 | } |
| 263 | return 1; /* No problems seen -> not binary */ |
| 264 | } |
| 265 | |
| 266 | /* |
| 267 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| @@ -279,12 +294,12 @@ | |
| 294 | /* |
| 295 | ** This function attempts to scan each logical line within the blob to |
| 296 | ** determine the type of content it appears to contain. Possible return |
| 297 | ** values are: |
| 298 | ** |
| 299 | ** (1) -- The content appears to consist entirely of text; however, the |
| 300 | ** encoding may not be UTF-16. |
| 301 | ** |
| 302 | ** (0) -- The content appears to be binary because it contains embedded |
| 303 | ** NUL characters or an extremely long line. Since this function |
| 304 | ** does not understand UTF-8, it may falsely consider UTF-8 text |
| 305 | ** to be binary. |
| @@ -297,50 +312,51 @@ | |
| 312 | ** switches between the UTF-16be and UTF-16le encodings occur. |
| 313 | ** |
| 314 | ** The only code points that this function cares about are the NUL character, |
| 315 | ** carriage-return, and line-feed. |
| 316 | ** |
| 317 | ************************************ WARNING ********************************** |
| 318 | */ |
| 319 | int looks_like_utf16(const Blob *pContent, int *pFlags){ |
| 320 | const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
| 321 | unsigned int n = blob_size(pContent); |
| 322 | int j, c; |
| 323 | |
| 324 | if( pFlags ) *pFlags = LOOK_NONE; |
| 325 | if( n==0 ) return 1; /* Empty file -> text */ |
| 326 | if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */ |
| 327 | c = *z; |
| 328 | if( c==0 ){ |
| 329 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 330 | return 0; /* NUL character in a file -> binary */ |
| 331 | } |
| 332 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 333 | while( (n-=2)>0 ){ |
| 334 | c = *++z; ++j; |
| 335 | if( c==0 ){ |
| 336 | if( pFlags ) *pFlags |= LOOK_NUL; |
| 337 | return 0; /* NUL character in a file -> binary */ |
| 338 | } |
| 339 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 340 | int c2 = z[-1]; |
| 341 | if( pFlags ){ |
| 342 | *pFlags |= LOOK_LF; |
| 343 | if( c2==UTF16BE_CR || c2==UTF16LE_CR ){ |
| 344 | *pFlags |= LOOK_CRLF; |
| 345 | } |
| 346 | } |
| 347 | if( j>UTF16_LENGTH_MASK ){ |
| 348 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 349 | return 0; /* Very long line -> binary */ |
| 350 | } |
| 351 | j = 0; |
| 352 | } |
| 353 | } |
| 354 | if( j>UTF16_LENGTH_MASK ){ |
| 355 | if( pFlags ) *pFlags |= LOOK_LENGTH; |
| 356 | return 0; /* Very long line -> binary */ |
| 357 | } |
| 358 | return 1; /* No problems seen -> not binary */ |
| 359 | } |
| 360 | |
| 361 | /* |
| 362 | ** This function returns an array of bytes representing the byte-order-mark |
| 363 |