Fossil SCM
Don't trigger the long-lines warning if the long line is followed by a null byte: in that case it is a normal binary file. Rewrite looks_like_utf8/16 to handle CRLF the same way as long lines (thanks, Joe, for the long-lines rewrite!)
Commit
ab2920c2b9478b41563d2747e960a02b53c62f59
Parent
10fbcda270363ce…
2 files changed
+7
-6
+33
-33
+7
-6
| --- src/checkin.c | ||
| +++ src/checkin.c | ||
| @@ -902,33 +902,34 @@ | ||
| 902 | 902 | int encodingOk, /* Non-zero if encoding warnings should be disabled. */ |
| 903 | 903 | const char *zFilename /* The full name of the file being committed. */ |
| 904 | 904 | ){ |
| 905 | 905 | int eType; /* return value of looks_like_utf8/utf16() */ |
| 906 | 906 | int fUnicode; /* return value of starts_with_utf16_bom() */ |
| 907 | - int longLine; /* non-zero if blob has "long lines" */ | |
| 907 | + int longLine = 0; /* non-zero if blob has "long lines" */ | |
| 908 | + int crlf = 0; /* non-zero if blob has "crlf" */ | |
| 908 | 909 | char *zMsg; /* Warning message */ |
| 909 | 910 | Blob fname; /* Relative pathname of the file */ |
| 910 | 911 | static int allOk = 0; /* Set to true to disable this routine */ |
| 911 | 912 | |
| 912 | 913 | if( allOk ) return 0; |
| 913 | 914 | fUnicode = starts_with_utf16_bom(p, 0, 0); |
| 914 | - eType = fUnicode ? looks_like_utf16(p, &longLine) : | |
| 915 | - looks_like_utf8(p, &longLine); | |
| 916 | - if( eType==0 || eType==-1 || fUnicode ){ | |
| 915 | + eType = fUnicode ? looks_like_utf16(p, &longLine, &crlf) : | |
| 916 | + looks_like_utf8(p, &longLine, &crlf); | |
| 917 | + if( eType==0 || crlf || fUnicode ){ | |
| 917 | 918 | const char *zWarning; |
| 918 | 919 | const char *zDisable; |
| 919 | 920 | const char *zConvert = "c=convert/"; |
| 920 | 921 | Blob ans; |
| 921 | 922 | char cReply; |
| 922 | 923 | |
| 923 | - if( eType==-1 && fUnicode ){ | |
| 924 | + if( crlf && fUnicode ){ | |
| 924 | 925 | if ( crnlOk && encodingOk ){ |
| 925 | 926 | return 0; /* We don't want CR/NL and Unicode warnings for this file. */ |
| 926 | 927 | } |
| 927 | 928 | zWarning = "CR/NL line endings and Unicode"; |
| 928 | 929 | zDisable = "\"crnl-glob\" and \"encoding-glob\" settings"; |
| 929 | - }else if( eType==-1 ){ | |
| 930 | + }else if( crlf ){ | |
| 930 | 931 | if( crnlOk ){ |
| 931 | 932 | return 0; /* We don't want CR/NL warnings for this file. */ |
| 932 | 933 | } |
| 933 | 934 | zWarning = "CR/NL line endings"; |
| 934 | 935 | zDisable = "\"crnl-glob\" setting"; |
| 935 | 936 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -902,33 +902,34 @@ | |
| 902 | int encodingOk, /* Non-zero if encoding warnings should be disabled. */ |
| 903 | const char *zFilename /* The full name of the file being committed. */ |
| 904 | ){ |
| 905 | int eType; /* return value of looks_like_utf8/utf16() */ |
| 906 | int fUnicode; /* return value of starts_with_utf16_bom() */ |
| 907 | int longLine; /* non-zero if blob has "long lines" */ |
| 908 | char *zMsg; /* Warning message */ |
| 909 | Blob fname; /* Relative pathname of the file */ |
| 910 | static int allOk = 0; /* Set to true to disable this routine */ |
| 911 | |
| 912 | if( allOk ) return 0; |
| 913 | fUnicode = starts_with_utf16_bom(p, 0, 0); |
| 914 | eType = fUnicode ? looks_like_utf16(p, &longLine) : |
| 915 | looks_like_utf8(p, &longLine); |
| 916 | if( eType==0 || eType==-1 || fUnicode ){ |
| 917 | const char *zWarning; |
| 918 | const char *zDisable; |
| 919 | const char *zConvert = "c=convert/"; |
| 920 | Blob ans; |
| 921 | char cReply; |
| 922 | |
| 923 | if( eType==-1 && fUnicode ){ |
| 924 | if ( crnlOk && encodingOk ){ |
| 925 | return 0; /* We don't want CR/NL and Unicode warnings for this file. */ |
| 926 | } |
| 927 | zWarning = "CR/NL line endings and Unicode"; |
| 928 | zDisable = "\"crnl-glob\" and \"encoding-glob\" settings"; |
| 929 | }else if( eType==-1 ){ |
| 930 | if( crnlOk ){ |
| 931 | return 0; /* We don't want CR/NL warnings for this file. */ |
| 932 | } |
| 933 | zWarning = "CR/NL line endings"; |
| 934 | zDisable = "\"crnl-glob\" setting"; |
| 935 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -902,33 +902,34 @@ | |
| 902 | int encodingOk, /* Non-zero if encoding warnings should be disabled. */ |
| 903 | const char *zFilename /* The full name of the file being committed. */ |
| 904 | ){ |
| 905 | int eType; /* return value of looks_like_utf8/utf16() */ |
| 906 | int fUnicode; /* return value of starts_with_utf16_bom() */ |
| 907 | int longLine = 0; /* non-zero if blob has "long lines" */ |
| 908 | int crlf = 0; /* non-zero if blob has "crlf" */ |
| 909 | char *zMsg; /* Warning message */ |
| 910 | Blob fname; /* Relative pathname of the file */ |
| 911 | static int allOk = 0; /* Set to true to disable this routine */ |
| 912 | |
| 913 | if( allOk ) return 0; |
| 914 | fUnicode = starts_with_utf16_bom(p, 0, 0); |
| 915 | eType = fUnicode ? looks_like_utf16(p, &longLine, &crlf) : |
| 916 | looks_like_utf8(p, &longLine, &crlf); |
| 917 | if( eType==0 || crlf || fUnicode ){ |
| 918 | const char *zWarning; |
| 919 | const char *zDisable; |
| 920 | const char *zConvert = "c=convert/"; |
| 921 | Blob ans; |
| 922 | char cReply; |
| 923 | |
| 924 | if( crlf && fUnicode ){ |
| 925 | if ( crnlOk && encodingOk ){ |
| 926 | return 0; /* We don't want CR/NL and Unicode warnings for this file. */ |
| 927 | } |
| 928 | zWarning = "CR/NL line endings and Unicode"; |
| 929 | zDisable = "\"crnl-glob\" and \"encoding-glob\" settings"; |
| 930 | }else if( crlf ){ |
| 931 | if( crnlOk ){ |
| 932 | return 0; /* We don't want CR/NL warnings for this file. */ |
| 933 | } |
| 934 | zWarning = "CR/NL line endings"; |
| 935 | zDisable = "\"crnl-glob\" setting"; |
| 936 |
+33
-33
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -57,11 +57,11 @@ | ||
| 57 | 57 | "more than 10,000 changes\n" |
| 58 | 58 | |
| 59 | 59 | #define DIFF_TOO_MANY_CHANGES_HTML \ |
| 60 | 60 | "<p class='generalError'>More than 10,000 changes</p>\n" |
| 61 | 61 | |
| 62 | -#define looks_like_binary(blob) (looks_like_utf8((blob), 0) == 0) | |
| 62 | +#define looks_like_binary(blob) (looks_like_utf8((blob), 0, 0) != 1) | |
| 63 | 63 | #endif /* INTERFACE */ |
| 64 | 64 | |
| 65 | 65 | /* |
| 66 | 66 | ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes) |
| 67 | 67 | */ |
| @@ -186,68 +186,68 @@ | ||
| 186 | 186 | /* |
| 187 | 187 | ** This function attempts to scan each logical line within the blob to |
| 188 | 188 | ** determine the type of content it appears to contain. Possible return |
| 189 | 189 | ** values are: |
| 190 | 190 | ** |
| 191 | -** (1) -- The content appears to consist entirely of text, with lines | |
| 192 | -** delimited by line-feed characters; however, the encoding may | |
| 193 | -** not be UTF-8. | |
| 191 | +** (1) -- The content appears to consist entirely of text; | |
| 192 | +** however, the encoding may not be UTF-8. | |
| 194 | 193 | ** |
| 195 | 194 | ** (0) -- The content appears to be binary because it contains embedded |
| 196 | 195 | ** NUL characters or an extremely long line. Since this function |
| 197 | 196 | ** does not understand UTF-16, it may falsely consider UTF-16 text |
| 198 | 197 | ** to be binary. |
| 199 | 198 | ** |
| 200 | -** (-1) -- The content appears to consist entirely of text, with lines | |
| 201 | -** delimited by carriage-return, line-feed pairs; however, the | |
| 202 | -** encoding may not be UTF-8. | |
| 203 | -** | |
| 204 | 199 | ************************************ WARNING ********************************** |
| 205 | 200 | ** |
| 206 | 201 | ** This function does not validate that the blob content is properly formed |
| 207 | 202 | ** UTF-8. It assumes that all code points are the same size. It does not |
| 208 | 203 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 209 | 204 | ** switches between UTF-8 and other encodings occur. |
| 210 | 205 | ** |
| 211 | 206 | ** The only code points that this function cares about are the NUL character, |
| 212 | 207 | ** carriage-return, and line-feed. |
| 208 | +** | |
| 209 | +** If pbLongLine is not NULL and the blob is detected as being binary only because | |
| 210 | +** of long lines, the integer pointed to is set to 1. Otherwise, it is left as is. | |
| 211 | +** If pbCrlf is not NULL and the blob contains crlf, the integer pointed | |
| 212 | +** to is set to 1. Otherwise, it is left as is. | |
| 213 | 213 | ** |
| 214 | 214 | ************************************ WARNING ********************************** |
| 215 | 215 | */ |
| 216 | -int looks_like_utf8(const Blob *pContent, int *pbLongLine){ | |
| 216 | +int looks_like_utf8(const Blob *pContent, int *pbLongLine, int *pbCrlf){ | |
| 217 | 217 | const char *z = blob_buffer(pContent); |
| 218 | 218 | unsigned int n = blob_size(pContent); |
| 219 | 219 | int j, c; |
| 220 | - int result = 1; /* Assume UTF-8 text with no CR/NL */ | |
| 220 | + int crlf = 0; | |
| 221 | + int longline = 0; | |
| 221 | 222 | |
| 222 | 223 | /* Check individual lines. |
| 223 | 224 | */ |
| 224 | - if( pbLongLine ) *pbLongLine = 0; | |
| 225 | - if( n==0 ) return result; /* Empty file -> text */ | |
| 225 | + if( n==0 ) return 1; /* Empty file -> text */ | |
| 226 | 226 | c = *z; |
| 227 | 227 | if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 228 | 228 | j = (c!='\n'); |
| 229 | 229 | while( --n>0 ){ |
| 230 | 230 | c = *++z; ++j; |
| 231 | 231 | if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 232 | 232 | if( c=='\n' ){ |
| 233 | 233 | int c2 = z[-1]; |
| 234 | 234 | if( c2=='\r' ){ |
| 235 | - result = -1; /* Contains CR/NL, continue */ | |
| 235 | + crlf = 1; /* Contains CR/NL, continue */ | |
| 236 | 236 | } |
| 237 | 237 | if( j>LENGTH_MASK ){ |
| 238 | - if( pbLongLine ) *pbLongLine = 1; | |
| 239 | - return 0; /* Very long line -> binary */ | |
| 238 | + longline = 1; /* Contains long line, continue */ | |
| 240 | 239 | } |
| 241 | 240 | j = 0; |
| 242 | 241 | } |
| 243 | 242 | } |
| 244 | - if( j>LENGTH_MASK ){ | |
| 243 | + if( longline || (j>LENGTH_MASK) ){ | |
| 245 | 244 | if( pbLongLine ) *pbLongLine = 1; |
| 246 | 245 | return 0; /* Very long line -> binary */ |
| 247 | 246 | } |
| 248 | - return result; /* No problems seen -> not binary */ | |
| 247 | + if( pbCrlf && crlf) *pbCrlf = 1; | |
| 248 | + return 1; /* No problems seen -> not binary */ | |
| 249 | 249 | } |
| 250 | 250 | |
| 251 | 251 | /* |
| 252 | 252 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 253 | 253 | */ |
| @@ -279,45 +279,45 @@ | ||
| 279 | 279 | /* |
| 280 | 280 | ** This function attempts to scan each logical line within the blob to |
| 281 | 281 | ** determine the type of content it appears to contain. Possible return |
| 282 | 282 | ** values are: |
| 283 | 283 | ** |
| 284 | -** (1) -- The content appears to consist entirely of text, with lines | |
| 285 | -** delimited by line-feed characters; however, the encoding may | |
| 286 | -** not be UTF-16. | |
| 284 | +** (1) -- The content appears to consist entirely of text; | |
| 285 | +** however, the encoding may not be UTF-16. | |
| 287 | 286 | ** |
| 288 | 287 | ** (0) -- The content appears to be binary because it contains embedded |
| 289 | 288 | ** NUL characters or an extremely long line. Since this function |
| 290 | 289 | ** does not understand UTF-8, it may falsely consider UTF-8 text |
| 291 | 290 | ** to be binary. |
| 292 | 291 | ** |
| 293 | -** (-1) -- The content appears to consist entirely of text, with lines | |
| 294 | -** delimited by carriage-return, line-feed pairs; however, the | |
| 295 | -** encoding may not be UTF-16. | |
| 296 | -** | |
| 297 | 292 | ************************************ WARNING ********************************** |
| 298 | 293 | ** |
| 299 | 294 | ** This function does not validate that the blob content is properly formed |
| 300 | 295 | ** UTF-16. It assumes that all code points are the same size. It does not |
| 301 | 296 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 302 | 297 | ** switches between the UTF-16be and UTF-16le encodings occur. |
| 303 | 298 | ** |
| 304 | 299 | ** The only code points that this function cares about are the NUL character, |
| 305 | 300 | ** carriage-return, and line-feed. |
| 301 | +** | |
| 302 | +** If pbLongLine is not NULL and the blob is detected as being binary only because | |
| 303 | +** of long lines, the integer pointed to is set to 1. Otherwise, it is left as is. | |
| 304 | +** If pbCrlf is not NULL and the blob contains crlf, the integer pointed | |
| 305 | +** to is set to 1. Otherwise, it is left as is. | |
| 306 | 306 | ** |
| 307 | 307 | ************************************ WARNING ********************************** |
| 308 | 308 | */ |
| 309 | -int looks_like_utf16(const Blob *pContent, int *pbLongLine){ | |
| 309 | +int looks_like_utf16(const Blob *pContent, int *pbLongLine, int *pbCrlf){ | |
| 310 | 310 | const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
| 311 | 311 | unsigned int n = blob_size(pContent); |
| 312 | 312 | int j, c; |
| 313 | - int result = 1; /* Assume UTF-16 text with no CR/NL */ | |
| 313 | + int crlf = 0; | |
| 314 | + int longline = 0; | |
| 314 | 315 | |
| 315 | 316 | /* Check individual lines. |
| 316 | 317 | */ |
| 317 | - if( pbLongLine ) *pbLongLine = 0; | |
| 318 | - if( n==0 ) return result; /* Empty file -> text */ | |
| 318 | + if( n==0 ) return 1; /* Empty file -> text */ | |
| 319 | 319 | if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */ |
| 320 | 320 | c = *z; |
| 321 | 321 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 322 | 322 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 323 | 323 | while( (n-=2)>0 ){ |
| @@ -324,24 +324,24 @@ | ||
| 324 | 324 | c = *++z; ++j; |
| 325 | 325 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 326 | 326 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 327 | 327 | int c2 = z[-1]; |
| 328 | 328 | if( c2==UTF16BE_CR || c2==UTF16LE_CR ){ |
| 329 | - result = -1; /* Contains CR/NL, continue */ | |
| 329 | + crlf = 1; /* Contains CR/NL, continue */ | |
| 330 | 330 | } |
| 331 | 331 | if( j>UTF16_LENGTH_MASK ){ |
| 332 | - if( pbLongLine ) *pbLongLine = 1; | |
| 333 | - return 0; /* Very long line -> binary */ | |
| 332 | + longline = 1; /* Contains long line, continue */ | |
| 334 | 333 | } |
| 335 | 334 | j = 0; |
| 336 | 335 | } |
| 337 | 336 | } |
| 338 | - if( j>UTF16_LENGTH_MASK ){ | |
| 337 | + if( longline || j>UTF16_LENGTH_MASK ){ | |
| 339 | 338 | if( pbLongLine ) *pbLongLine = 1; |
| 340 | 339 | return 0; /* Very long line -> binary */ |
| 341 | 340 | } |
| 342 | - return result; /* No problems seen -> not binary */ | |
| 341 | + if( pbCrlf ) *pbCrlf = crlf; | |
| 342 | + return 1; /* No problems seen -> not binary */ | |
| 343 | 343 | } |
| 344 | 344 | |
| 345 | 345 | /* |
| 346 | 346 | ** This function returns an array of bytes representing the byte-order-mark |
| 347 | 347 | ** for UTF-8. |
| 348 | 348 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -57,11 +57,11 @@ | |
| 57 | "more than 10,000 changes\n" |
| 58 | |
| 59 | #define DIFF_TOO_MANY_CHANGES_HTML \ |
| 60 | "<p class='generalError'>More than 10,000 changes</p>\n" |
| 61 | |
| 62 | #define looks_like_binary(blob) (looks_like_utf8((blob), 0) == 0) |
| 63 | #endif /* INTERFACE */ |
| 64 | |
| 65 | /* |
| 66 | ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes) |
| 67 | */ |
| @@ -186,68 +186,68 @@ | |
| 186 | /* |
| 187 | ** This function attempts to scan each logical line within the blob to |
| 188 | ** determine the type of content it appears to contain. Possible return |
| 189 | ** values are: |
| 190 | ** |
| 191 | ** (1) -- The content appears to consist entirely of text, with lines |
| 192 | ** delimited by line-feed characters; however, the encoding may |
| 193 | ** not be UTF-8. |
| 194 | ** |
| 195 | ** (0) -- The content appears to be binary because it contains embedded |
| 196 | ** NUL characters or an extremely long line. Since this function |
| 197 | ** does not understand UTF-16, it may falsely consider UTF-16 text |
| 198 | ** to be binary. |
| 199 | ** |
| 200 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 201 | ** delimited by carriage-return, line-feed pairs; however, the |
| 202 | ** encoding may not be UTF-8. |
| 203 | ** |
| 204 | ************************************ WARNING ********************************** |
| 205 | ** |
| 206 | ** This function does not validate that the blob content is properly formed |
| 207 | ** UTF-8. It assumes that all code points are the same size. It does not |
| 208 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 209 | ** switches between UTF-8 and other encodings occur. |
| 210 | ** |
| 211 | ** The only code points that this function cares about are the NUL character, |
| 212 | ** carriage-return, and line-feed. |
| 213 | ** |
| 214 | ************************************ WARNING ********************************** |
| 215 | */ |
| 216 | int looks_like_utf8(const Blob *pContent, int *pbLongLine){ |
| 217 | const char *z = blob_buffer(pContent); |
| 218 | unsigned int n = blob_size(pContent); |
| 219 | int j, c; |
| 220 | int result = 1; /* Assume UTF-8 text with no CR/NL */ |
| 221 | |
| 222 | /* Check individual lines. |
| 223 | */ |
| 224 | if( pbLongLine ) *pbLongLine = 0; |
| 225 | if( n==0 ) return result; /* Empty file -> text */ |
| 226 | c = *z; |
| 227 | if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 228 | j = (c!='\n'); |
| 229 | while( --n>0 ){ |
| 230 | c = *++z; ++j; |
| 231 | if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 232 | if( c=='\n' ){ |
| 233 | int c2 = z[-1]; |
| 234 | if( c2=='\r' ){ |
| 235 | result = -1; /* Contains CR/NL, continue */ |
| 236 | } |
| 237 | if( j>LENGTH_MASK ){ |
| 238 | if( pbLongLine ) *pbLongLine = 1; |
| 239 | return 0; /* Very long line -> binary */ |
| 240 | } |
| 241 | j = 0; |
| 242 | } |
| 243 | } |
| 244 | if( j>LENGTH_MASK ){ |
| 245 | if( pbLongLine ) *pbLongLine = 1; |
| 246 | return 0; /* Very long line -> binary */ |
| 247 | } |
| 248 | return result; /* No problems seen -> not binary */ |
| 249 | } |
| 250 | |
| 251 | /* |
| 252 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 253 | */ |
| @@ -279,45 +279,45 @@ | |
| 279 | /* |
| 280 | ** This function attempts to scan each logical line within the blob to |
| 281 | ** determine the type of content it appears to contain. Possible return |
| 282 | ** values are: |
| 283 | ** |
| 284 | ** (1) -- The content appears to consist entirely of text, with lines |
| 285 | ** delimited by line-feed characters; however, the encoding may |
| 286 | ** not be UTF-16. |
| 287 | ** |
| 288 | ** (0) -- The content appears to be binary because it contains embedded |
| 289 | ** NUL characters or an extremely long line. Since this function |
| 290 | ** does not understand UTF-8, it may falsely consider UTF-8 text |
| 291 | ** to be binary. |
| 292 | ** |
| 293 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 294 | ** delimited by carriage-return, line-feed pairs; however, the |
| 295 | ** encoding may not be UTF-16. |
| 296 | ** |
| 297 | ************************************ WARNING ********************************** |
| 298 | ** |
| 299 | ** This function does not validate that the blob content is properly formed |
| 300 | ** UTF-16. It assumes that all code points are the same size. It does not |
| 301 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 302 | ** switches between the UTF-16be and UTF-16le encodings occur. |
| 303 | ** |
| 304 | ** The only code points that this function cares about are the NUL character, |
| 305 | ** carriage-return, and line-feed. |
| 306 | ** |
| 307 | ************************************ WARNING ********************************** |
| 308 | */ |
| 309 | int looks_like_utf16(const Blob *pContent, int *pbLongLine){ |
| 310 | const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
| 311 | unsigned int n = blob_size(pContent); |
| 312 | int j, c; |
| 313 | int result = 1; /* Assume UTF-16 text with no CR/NL */ |
| 314 | |
| 315 | /* Check individual lines. |
| 316 | */ |
| 317 | if( pbLongLine ) *pbLongLine = 0; |
| 318 | if( n==0 ) return result; /* Empty file -> text */ |
| 319 | if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */ |
| 320 | c = *z; |
| 321 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 322 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 323 | while( (n-=2)>0 ){ |
| @@ -324,24 +324,24 @@ | |
| 324 | c = *++z; ++j; |
| 325 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 326 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 327 | int c2 = z[-1]; |
| 328 | if( c2==UTF16BE_CR || c2==UTF16LE_CR ){ |
| 329 | result = -1; /* Contains CR/NL, continue */ |
| 330 | } |
| 331 | if( j>UTF16_LENGTH_MASK ){ |
| 332 | if( pbLongLine ) *pbLongLine = 1; |
| 333 | return 0; /* Very long line -> binary */ |
| 334 | } |
| 335 | j = 0; |
| 336 | } |
| 337 | } |
| 338 | if( j>UTF16_LENGTH_MASK ){ |
| 339 | if( pbLongLine ) *pbLongLine = 1; |
| 340 | return 0; /* Very long line -> binary */ |
| 341 | } |
| 342 | return result; /* No problems seen -> not binary */ |
| 343 | } |
| 344 | |
| 345 | /* |
| 346 | ** This function returns an array of bytes representing the byte-order-mark |
| 347 | ** for UTF-8. |
| 348 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -57,11 +57,11 @@ | |
| 57 | "more than 10,000 changes\n" |
| 58 | |
| 59 | #define DIFF_TOO_MANY_CHANGES_HTML \ |
| 60 | "<p class='generalError'>More than 10,000 changes</p>\n" |
| 61 | |
| 62 | #define looks_like_binary(blob) (looks_like_utf8((blob), 0, 0) != 1) |
| 63 | #endif /* INTERFACE */ |
| 64 | |
| 65 | /* |
| 66 | ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes) |
| 67 | */ |
| @@ -186,68 +186,68 @@ | |
| 186 | /* |
| 187 | ** This function attempts to scan each logical line within the blob to |
| 188 | ** determine the type of content it appears to contain. Possible return |
| 189 | ** values are: |
| 190 | ** |
| 191 | ** (1) -- The content appears to consist entirely of text; |
| 192 | ** however, the encoding may not be UTF-8. |
| 193 | ** |
| 194 | ** (0) -- The content appears to be binary because it contains embedded |
| 195 | ** NUL characters or an extremely long line. Since this function |
| 196 | ** does not understand UTF-16, it may falsely consider UTF-16 text |
| 197 | ** to be binary. |
| 198 | ** |
| 199 | ************************************ WARNING ********************************** |
| 200 | ** |
| 201 | ** This function does not validate that the blob content is properly formed |
| 202 | ** UTF-8. It assumes that all code points are the same size. It does not |
| 203 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 204 | ** switches between UTF-8 and other encodings occur. |
| 205 | ** |
| 206 | ** The only code points that this function cares about are the NUL character, |
| 207 | ** carriage-return, and line-feed. |
| 208 | ** |
| 209 | ** If pbLongLine is not NULL and the blob is detected as being binary only because |
| 210 | ** of long lines, the integer pointed to is set to 1. Otherwise, it is left as is. |
| 211 | ** If pbCrlf is not NULL and the blob contains crlf, the integer pointed |
| 212 | ** to is set to 1. Otherwise, it is left as is. |
| 213 | ** |
| 214 | ************************************ WARNING ********************************** |
| 215 | */ |
| 216 | int looks_like_utf8(const Blob *pContent, int *pbLongLine, int *pbCrlf){ |
| 217 | const char *z = blob_buffer(pContent); |
| 218 | unsigned int n = blob_size(pContent); |
| 219 | int j, c; |
| 220 | int crlf = 0; |
| 221 | int longline = 0; |
| 222 | |
| 223 | /* Check individual lines. |
| 224 | */ |
| 225 | if( n==0 ) return 1; /* Empty file -> text */ |
| 226 | c = *z; |
| 227 | if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 228 | j = (c!='\n'); |
| 229 | while( --n>0 ){ |
| 230 | c = *++z; ++j; |
| 231 | if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 232 | if( c=='\n' ){ |
| 233 | int c2 = z[-1]; |
| 234 | if( c2=='\r' ){ |
| 235 | crlf = 1; /* Contains CR/NL, continue */ |
| 236 | } |
| 237 | if( j>LENGTH_MASK ){ |
| 238 | longline = 1; /* Contains long line, continue */ |
| 239 | } |
| 240 | j = 0; |
| 241 | } |
| 242 | } |
| 243 | if( longline || (j>LENGTH_MASK) ){ |
| 244 | if( pbLongLine ) *pbLongLine = 1; |
| 245 | return 0; /* Very long line -> binary */ |
| 246 | } |
| 247 | if( pbCrlf && crlf) *pbCrlf = 1; |
| 248 | return 1; /* No problems seen -> not binary */ |
| 249 | } |
| 250 | |
| 251 | /* |
| 252 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 253 | */ |
| @@ -279,45 +279,45 @@ | |
| 279 | /* |
| 280 | ** This function attempts to scan each logical line within the blob to |
| 281 | ** determine the type of content it appears to contain. Possible return |
| 282 | ** values are: |
| 283 | ** |
| 284 | ** (1) -- The content appears to consist entirely of text; |
| 285 | ** however, the encoding may not be UTF-16. |
| 286 | ** |
| 287 | ** (0) -- The content appears to be binary because it contains embedded |
| 288 | ** NUL characters or an extremely long line. Since this function |
| 289 | ** does not understand UTF-8, it may falsely consider UTF-8 text |
| 290 | ** to be binary. |
| 291 | ** |
| 292 | ************************************ WARNING ********************************** |
| 293 | ** |
| 294 | ** This function does not validate that the blob content is properly formed |
| 295 | ** UTF-16. It assumes that all code points are the same size. It does not |
| 296 | ** validate any code points. It makes no attempt to detect if any [invalid] |
| 297 | ** switches between the UTF-16be and UTF-16le encodings occur. |
| 298 | ** |
| 299 | ** The only code points that this function cares about are the NUL character, |
| 300 | ** carriage-return, and line-feed. |
| 301 | ** |
| 302 | ** If pbLongLine is not NULL and the blob is detected as being binary only because |
| 303 | ** of long lines, the integer pointed to is set to 1. Otherwise, it is left as is. |
| 304 | ** If pbCrlf is not NULL and the blob contains crlf, the integer pointed |
| 305 | ** to is set to 1. Otherwise, it is left as is. |
| 306 | ** |
| 307 | ************************************ WARNING ********************************** |
| 308 | */ |
| 309 | int looks_like_utf16(const Blob *pContent, int *pbLongLine, int *pbCrlf){ |
| 310 | const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
| 311 | unsigned int n = blob_size(pContent); |
| 312 | int j, c; |
| 313 | int crlf = 0; |
| 314 | int longline = 0; |
| 315 | |
| 316 | /* Check individual lines. |
| 317 | */ |
| 318 | if( n==0 ) return 1; /* Empty file -> text */ |
| 319 | if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */ |
| 320 | c = *z; |
| 321 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 322 | j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 323 | while( (n-=2)>0 ){ |
| @@ -324,24 +324,24 @@ | |
| 324 | c = *++z; ++j; |
| 325 | if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 326 | if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 327 | int c2 = z[-1]; |
| 328 | if( c2==UTF16BE_CR || c2==UTF16LE_CR ){ |
| 329 | crlf = 1; /* Contains CR/NL, continue */ |
| 330 | } |
| 331 | if( j>UTF16_LENGTH_MASK ){ |
| 332 | longline = 1; /* Contains long line, continue */ |
| 333 | } |
| 334 | j = 0; |
| 335 | } |
| 336 | } |
| 337 | if( longline || j>UTF16_LENGTH_MASK ){ |
| 338 | if( pbLongLine ) *pbLongLine = 1; |
| 339 | return 0; /* Very long line -> binary */ |
| 340 | } |
| 341 | if( pbCrlf ) *pbCrlf = crlf; |
| 342 | return 1; /* No problems seen -> not binary */ |
| 343 | } |
| 344 | |
| 345 | /* |
| 346 | ** This function returns an array of bytes representing the byte-order-mark |
| 347 | ** for UTF-8. |
| 348 |