Fossil SCM
Enhance looks_like_text(): <br>- Detect line-length overflow earlier, rather than at the next NL <br>- Implement the same binary and line-length checks for UTF-16 as well <p>For UTF-16, the line-length limit is set to 2/3 of the line-length limit for other text, because UTF-16 -> UTF-8 conversion can increase the line length (in bytes) by at most 50%. This guarantees that a UTF-16 diff can be made by converting the two UTF-16 files to UTF-8 and then doing a normal diff.
Commit
58702daa558730c3b109fe2af115ac950c9e6144
Parent
0ba08f9d26ecb3a…
1 file changed
+44
-18
+44
-18
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -179,21 +179,22 @@ | ||
| 179 | 179 | ** (1) -- The content appears to consist entirely of text, with lines |
| 180 | 180 | ** delimited by line-feed characters; however, the encoding may |
| 181 | 181 | ** not be UTF-8. |
| 182 | 182 | ** |
| 183 | 183 | ** (0) -- The content appears to be binary because it contains embedded |
| 184 | -** NUL (\000) characters or an extremely long line. | |
| 184 | +** non-text (\0x0-\0x8, \0xe-\0x1a, \x01c-\x01f) characters or an | |
| 185 | +** extremely long line. | |
| 185 | 186 | ** |
| 186 | 187 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 187 | 188 | ** delimited by carriage-return, line-feed pairs; however, the |
| 188 | 189 | ** encoding may not be UTF-8. |
| 189 | 190 | ** |
| 190 | 191 | ** (-2) -- The content appears to consist entirely of text, in the |
| 191 | 192 | ** UTF-16 (BE or LE) encoding. |
| 192 | 193 | */ |
| 193 | 194 | int looks_like_text(const Blob *pContent){ |
| 194 | - const unsigned char *z = blob_buffer(pContent); | |
| 195 | + unsigned char *z = (unsigned char *) blob_buffer(pContent); | |
| 195 | 196 | unsigned int n = blob_size(pContent); |
| 196 | 197 | int j; |
| 197 | 198 | unsigned char c; |
| 198 | 199 | int result = 1; /* Assume text with no CR/NL */ |
| 199 | 200 | static const char isBinary[256] = { |
| @@ -205,33 +206,58 @@ | ||
| 205 | 206 | /* Check individual lines. |
| 206 | 207 | */ |
| 207 | 208 | if( n==0 ) return result; /* Empty file -> text */ |
| 208 | 209 | c = *z; |
| 209 | 210 | if( isBinary[c] ) return 0; /* non-text byte in a file -> binary */ |
| 210 | - if ( n > 1 ){ | |
| 211 | - if ( (c==0xff) && (z[1]==0xfe) ){ | |
| 212 | - return -2; | |
| 213 | - } else if ( (c==0xfe) && (z[1]==0xff) ){ | |
| 214 | - return -2; | |
| 211 | + if ( (n&1)==0 ){ /* UTF-16 must have an even blob length */ | |
| 212 | + if ( (c==0xff) && (z[1]==0xfe) ){ /* UTF-16 LE BOM */ | |
| 213 | + result = -2; | |
| 214 | + j = LENGTH_MASK*2/3; | |
| 215 | + while( (n-=2)>0 ){ | |
| 216 | + c = *(z+=2); | |
| 217 | + if( z[1]==0 ){ /* High-byte must be 0 for further checks */ | |
| 218 | + if( isBinary[c] ) return 0; /* non-text char in a file -> binary */ | |
| 219 | + if( c=='\n' ){ | |
| 220 | + j = LENGTH_MASK; | |
| 221 | + } | |
| 222 | + } | |
| 223 | + if( --j==0 ){ | |
| 224 | + return 0; /* Very long line -> binary */ | |
| 225 | + } | |
| 226 | + } | |
| 227 | + return result; | |
| 228 | + } else if ( (c==0xfe) && (z[1]==0xff) ){ /* UTF-16 BE BOM */ | |
| 229 | + result = -2; | |
| 230 | + ++z; j = LENGTH_MASK*2/3; | |
| 231 | + while( (n-=2)>0 ){ | |
| 232 | + c = *(z+=2); | |
| 233 | + if ( z[-1]==0 ){ /* High-byte must be 0 for further checks */ | |
| 234 | + if( isBinary[c] ) return 0; /* non-text char in a file -> binary */ | |
| 235 | + if( c=='\n' ){ | |
| 236 | + j = LENGTH_MASK; | |
| 237 | + } | |
| 238 | + } | |
| 239 | + if( --j==0 ){ | |
| 240 | + return 0; /* Very long line -> binary */ | |
| 241 | + } | |
| 242 | + } | |
| 243 | + return result; | |
| 215 | 244 | } |
| 216 | 245 | } |
| 217 | - j = (c!='\n'); | |
| 246 | + j = LENGTH_MASK - (c!='\n'); | |
| 218 | 247 | while( --n>0 ){ |
| 219 | - c = *++z; ++j; | |
| 220 | - if( isBinary[c] ) return 0; /* \000 byte in a file -> binary */ | |
| 248 | + c = *++z; | |
| 249 | + if( isBinary[c] ) return 0; /* non-text byte in a file -> binary */ | |
| 221 | 250 | if( c=='\n' ){ |
| 222 | 251 | if( z[-1]=='\r' ){ |
| 223 | 252 | result = -1; /* Contains CR/NL, continue */ |
| 224 | 253 | } |
| 225 | - if( j>LENGTH_MASK ){ | |
| 226 | - return 0; /* Very long line -> binary */ | |
| 227 | - } | |
| 228 | - j = 0; | |
| 229 | - } | |
| 230 | - } | |
| 231 | - if( j>LENGTH_MASK ){ | |
| 232 | - return 0; /* Very long line -> binary */ | |
| 254 | + j = LENGTH_MASK; | |
| 255 | + } | |
| 256 | + if( --j==0 ){ | |
| 257 | + return 0; /* Very long line -> binary */ | |
| 258 | + } | |
| 233 | 259 | } |
| 234 | 260 | return result; /* No problems seen -> not binary */ |
| 235 | 261 | } |
| 236 | 262 | |
| 237 | 263 | /* |
| 238 | 264 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -179,21 +179,22 @@ | |
| 179 | ** (1) -- The content appears to consist entirely of text, with lines |
| 180 | ** delimited by line-feed characters; however, the encoding may |
| 181 | ** not be UTF-8. |
| 182 | ** |
| 183 | ** (0) -- The content appears to be binary because it contains embedded |
| 184 | ** NUL (\000) characters or an extremely long line. |
| 185 | ** |
| 186 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 187 | ** delimited by carriage-return, line-feed pairs; however, the |
| 188 | ** encoding may not be UTF-8. |
| 189 | ** |
| 190 | ** (-2) -- The content appears to consist entirely of text, in the |
| 191 | ** UTF-16 (BE or LE) encoding. |
| 192 | */ |
| 193 | int looks_like_text(const Blob *pContent){ |
| 194 | const unsigned char *z = blob_buffer(pContent); |
| 195 | unsigned int n = blob_size(pContent); |
| 196 | int j; |
| 197 | unsigned char c; |
| 198 | int result = 1; /* Assume text with no CR/NL */ |
| 199 | static const char isBinary[256] = { |
| @@ -205,33 +206,58 @@ | |
| 205 | /* Check individual lines. |
| 206 | */ |
| 207 | if( n==0 ) return result; /* Empty file -> text */ |
| 208 | c = *z; |
| 209 | if( isBinary[c] ) return 0; /* non-text byte in a file -> binary */ |
| 210 | if ( n > 1 ){ |
| 211 | if ( (c==0xff) && (z[1]==0xfe) ){ |
| 212 | return -2; |
| 213 | } else if ( (c==0xfe) && (z[1]==0xff) ){ |
| 214 | return -2; |
| 215 | } |
| 216 | } |
| 217 | j = (c!='\n'); |
| 218 | while( --n>0 ){ |
| 219 | c = *++z; ++j; |
| 220 | if( isBinary[c] ) return 0; /* \000 byte in a file -> binary */ |
| 221 | if( c=='\n' ){ |
| 222 | if( z[-1]=='\r' ){ |
| 223 | result = -1; /* Contains CR/NL, continue */ |
| 224 | } |
| 225 | if( j>LENGTH_MASK ){ |
| 226 | return 0; /* Very long line -> binary */ |
| 227 | } |
| 228 | j = 0; |
| 229 | } |
| 230 | } |
| 231 | if( j>LENGTH_MASK ){ |
| 232 | return 0; /* Very long line -> binary */ |
| 233 | } |
| 234 | return result; /* No problems seen -> not binary */ |
| 235 | } |
| 236 | |
| 237 | /* |
| 238 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -179,21 +179,22 @@ | |
| 179 | ** (1) -- The content appears to consist entirely of text, with lines |
| 180 | ** delimited by line-feed characters; however, the encoding may |
| 181 | ** not be UTF-8. |
| 182 | ** |
| 183 | ** (0) -- The content appears to be binary because it contains embedded |
| 184 | ** non-text (\0x0-\0x8, \0xe-\0x1a, \x01c-\x01f) characters or an |
| 185 | ** extremely long line. |
| 186 | ** |
| 187 | ** (-1) -- The content appears to consist entirely of text, with lines |
| 188 | ** delimited by carriage-return, line-feed pairs; however, the |
| 189 | ** encoding may not be UTF-8. |
| 190 | ** |
| 191 | ** (-2) -- The content appears to consist entirely of text, in the |
| 192 | ** UTF-16 (BE or LE) encoding. |
| 193 | */ |
| 194 | int looks_like_text(const Blob *pContent){ |
| 195 | unsigned char *z = (unsigned char *) blob_buffer(pContent); |
| 196 | unsigned int n = blob_size(pContent); |
| 197 | int j; |
| 198 | unsigned char c; |
| 199 | int result = 1; /* Assume text with no CR/NL */ |
| 200 | static const char isBinary[256] = { |
| @@ -205,33 +206,58 @@ | |
| 206 | /* Check individual lines. |
| 207 | */ |
| 208 | if( n==0 ) return result; /* Empty file -> text */ |
| 209 | c = *z; |
| 210 | if( isBinary[c] ) return 0; /* non-text byte in a file -> binary */ |
| 211 | if ( (n&1)==0 ){ /* UTF-16 must have an even blob length */ |
| 212 | if ( (c==0xff) && (z[1]==0xfe) ){ /* UTF-16 LE BOM */ |
| 213 | result = -2; |
| 214 | j = LENGTH_MASK*2/3; |
| 215 | while( (n-=2)>0 ){ |
| 216 | c = *(z+=2); |
| 217 | if( z[1]==0 ){ /* High-byte must be 0 for further checks */ |
| 218 | if( isBinary[c] ) return 0; /* non-text char in a file -> binary */ |
| 219 | if( c=='\n' ){ |
| 220 | j = LENGTH_MASK; |
| 221 | } |
| 222 | } |
| 223 | if( --j==0 ){ |
| 224 | return 0; /* Very long line -> binary */ |
| 225 | } |
| 226 | } |
| 227 | return result; |
| 228 | } else if ( (c==0xfe) && (z[1]==0xff) ){ /* UTF-16 BE BOM */ |
| 229 | result = -2; |
| 230 | ++z; j = LENGTH_MASK*2/3; |
| 231 | while( (n-=2)>0 ){ |
| 232 | c = *(z+=2); |
| 233 | if ( z[-1]==0 ){ /* High-byte must be 0 for further checks */ |
| 234 | if( isBinary[c] ) return 0; /* non-text char in a file -> binary */ |
| 235 | if( c=='\n' ){ |
| 236 | j = LENGTH_MASK; |
| 237 | } |
| 238 | } |
| 239 | if( --j==0 ){ |
| 240 | return 0; /* Very long line -> binary */ |
| 241 | } |
| 242 | } |
| 243 | return result; |
| 244 | } |
| 245 | } |
| 246 | j = LENGTH_MASK - (c!='\n'); |
| 247 | while( --n>0 ){ |
| 248 | c = *++z; |
| 249 | if( isBinary[c] ) return 0; /* non-text byte in a file -> binary */ |
| 250 | if( c=='\n' ){ |
| 251 | if( z[-1]=='\r' ){ |
| 252 | result = -1; /* Contains CR/NL, continue */ |
| 253 | } |
| 254 | j = LENGTH_MASK; |
| 255 | } |
| 256 | if( --j==0 ){ |
| 257 | return 0; /* Very long line -> binary */ |
| 258 | } |
| 259 | } |
| 260 | return result; /* No problems seen -> not binary */ |
| 261 | } |
| 262 | |
| 263 | /* |
| 264 |