| | @@ -48,15 +48,15 @@ |
| 48 | 48 | "cannot compute difference between binary files\n" |
| 49 | 49 | |
| 50 | 50 | #define DIFF_CANNOT_COMPUTE_SYMLINK \ |
| 51 | 51 | "cannot compute difference between symlink and regular file\n" |
| 52 | 52 | |
| 53 | | -#define looks_like_binary(blob) (looks_like_text((blob)) == 0) |
| 53 | +#define looks_like_binary(blob) (looks_like_utf8((blob)) == 0) |
| 54 | 54 | #endif /* INTERFACE */ |
| 55 | 55 | |
| 56 | 56 | /* |
| 57 | | -** Maximum length of a line in a text file. (8192) |
| 57 | +** Maximum length of a line in a text file, in bytes. (8192) |
| 58 | 58 | */ |
| 59 | 59 | #define LENGTH_MASK_SZ 13 |
| 60 | 60 | #define LENGTH_MASK ((1<<LENGTH_MASK_SZ)-1) |
| 61 | 61 | |
| 62 | 62 | /* |
| | @@ -179,34 +179,34 @@ |
| 179 | 179 | ** (1) -- The content appears to consist entirely of text, with lines |
| 180 | 180 | ** delimited by line-feed characters; however, the encoding may |
| 181 | 181 | ** not be UTF-8. |
| 182 | 182 | ** |
| 183 | 183 | ** (0) -- The content appears to be binary because it contains embedded |
| 184 | | -** NUL (\000) characters or an extremely long line. Since this |
| 185 | | -** function does not understand UTF-16, it may falsely consider |
| 186 | | -** UTF-16 text to be binary. |
| 184 | +** NUL characters or an extremely long line. Since this function |
| 185 | +** does not understand UTF-16, it may falsely consider UTF-16 text |
| 186 | +** to be binary. |
| 187 | 187 | ** |
| 188 | 188 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 189 | 189 | ** delimited by carriage-return, line-feed pairs; however, the |
| 190 | 190 | ** encoding may not be UTF-8. |
| 191 | 191 | ** |
| 192 | 192 | */ |
| 193 | | -int looks_like_text(const Blob *pContent){ |
| 193 | +int looks_like_utf8(const Blob *pContent){ |
| 194 | 194 | const char *z = blob_buffer(pContent); |
| 195 | 195 | unsigned int n = blob_size(pContent); |
| 196 | 196 | int j, c; |
| 197 | | - int result = 1; /* Assume text with no CR/NL */ |
| 197 | + int result = 1; /* Assume UTF-8 text with no CR/NL */ |
| 198 | 198 | |
| 199 | 199 | /* Check individual lines. |
| 200 | 200 | */ |
| 201 | 201 | if( n==0 ) return result; /* Empty file -> text */ |
| 202 | 202 | c = *z; |
| 203 | | - if( c==0 ) return 0; /* \000 byte in a file -> binary */ |
| 203 | + if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 204 | 204 | j = (c!='\n'); |
| 205 | 205 | while( --n>0 ){ |
| 206 | 206 | c = *++z; ++j; |
| 207 | | - if( c==0 ) return 0; /* \000 byte in a file -> binary */ |
| 207 | + if( c==0 ) return 0; /* Zero byte in a file -> binary */ |
| 208 | 208 | if( c=='\n' ){ |
| 209 | 209 | if( z[-1]=='\r' ){ |
| 210 | 210 | result = -1; /* Contains CR/NL, continue */ |
| 211 | 211 | } |
| 212 | 212 | if( j>LENGTH_MASK ){ |
| | @@ -215,10 +215,78 @@ |
| 215 | 215 | j = 0; |
| 216 | 216 | } |
| 217 | 217 | } |
| 218 | 218 | if( j>LENGTH_MASK ){ |
| 219 | 219 | return 0; /* Very long line -> binary */ |
| 220 | + } |
| 221 | + return result; /* No problems seen -> not binary */ |
| 222 | +} |
| 223 | + |
| 224 | +/* |
| 225 | +** Maximum length of a line in a text file, in UTF-16 characters. (4096) |
| 226 | +** The byte length corresponding to this value must not exceed LENGTH_MASK, |
| 227 | +** because that is the line buffer size used by the diff engine. |
| 228 | +*/ |
| 229 | +#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-1) |
| 230 | +#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1) |
| 231 | + |
| 232 | +/* |
| 233 | +** The carriage-return / line-feed characters in the UTF-16be and UTF-16le |
| 234 | +** encodings. |
| 235 | +*/ |
| 236 | +#define UTF16BE_CR ((wchar_t)'\r') |
| 237 | +#define UTF16BE_LF ((wchar_t)'\n') |
| 238 | +#define UTF16LE_CR (((wchar_t)'\r')<<(sizeof(wchar_t)<<2)) |
| 239 | +#define UTF16LE_LF (((wchar_t)'\n')<<(sizeof(wchar_t)<<2)) |
| 240 | + |
| 241 | +/* |
| 242 | +** This function attempts to scan each logical line within the blob to |
| 243 | +** determine the type of content it appears to contain. Possible return |
| 244 | +** values are: |
| 245 | +** |
| 246 | +** (1) -- The content appears to consist entirely of text, with lines |
| 247 | +** delimited by line-feed characters; however, the encoding may |
| 248 | +** not be UTF-16. |
| 249 | +** |
| 250 | +** (0) -- The content appears to be binary because it has an odd number |
| 251 | +** of bytes, contains embedded NUL characters, or contains an |
| 252 | +** extremely long line. Since this function does not understand |
| 253 | +** UTF-8, it may falsely consider UTF-8 text to be binary. |
| 254 | +** |
| 255 | +** (-1) -- The content appears to consist entirely of text, with lines |
| 256 | +** delimited by carriage-return, line-feed pairs; however, the |
| 257 | +** encoding may not be UTF-16. |
| 258 | +** |
| 259 | +*/ |
| 260 | +int looks_like_utf16(const Blob *pContent){ |
| 261 | + const wchar_t *z = (wchar_t *)blob_buffer(pContent); |
| 262 | + unsigned int n = blob_size(pContent); |
| 263 | + int j, c; |
| 264 | + int result = 1; /* Assume UTF-16 text with no CR/NL */ |
| 265 | + |
| 266 | + /* Check individual lines. |
| 267 | + */ |
| 268 | + if( n==0 ) return result; /* Empty file -> text */ |
| 269 | + if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */ |
| 270 | + c = *z; |
| 271 | + if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 272 | + j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF)); |
| 273 | + while( (n-=2)>0 ){ |
| 274 | + c = *++z; ++j; |
| 275 | + if( c==0 ) return 0; /* NUL character in a file -> binary */ |
| 276 | + if( c==UTF16BE_LF || c==UTF16LE_LF ){ |
| 277 | + if( z[-1]==UTF16BE_CR || z[-1]==UTF16LE_CR ){ |
| 278 | + result = -1; /* Contains CR/NL, continue */ |
| 279 | + } |
| 280 | + if( j>UTF16_LENGTH_MASK ){ |
| 281 | + return 0; /* Very long line -> binary */ |
| 282 | + } |
| 283 | + j = 0; |
| 284 | + } |
| 285 | + } |
| 286 | + if( j>UTF16_LENGTH_MASK ){ |
| 287 | + return 0; /* Very long line -> binary */ |
| 220 | 288 | } |
| 221 | 289 | return result; /* No problems seen -> not binary */ |
| 222 | 290 | } |
| 223 | 291 | |
| 224 | 292 | /* |
| 225 | 293 | |