Fossil SCM

Enhance looks_like_text(): <br>- Detect line-length overflow earlier, not at the next NL. <br>- Implement the same binary and line-length check for UTF-16 as well. <p>For UTF-16, the line-length limit is set to two-thirds of the line-length limit for other text, because UTF-16 -> UTF-8 conversion can increase the line length (in bytes) by at most 50%. This guarantees that a UTF-16 diff can be made by converting the two UTF-16 files to UTF-8 and then doing a normal diff.

jan.nijtmans 2012-10-31 08:43 UTC improve_looks_like_binary
Commit 58702daa558730c3b109fe2af115ac950c9e6144
1 file changed +44 -18
+44 -18
--- src/diff.c
+++ src/diff.c
@@ -179,21 +179,22 @@
179179
** (1) -- The content appears to consist entirely of text, with lines
180180
** delimited by line-feed characters; however, the encoding may
181181
** not be UTF-8.
182182
**
183183
** (0) -- The content appears to be binary because it contains embedded
184
-** NUL (\000) characters or an extremely long line.
184
+** non-text (\0x0-\0x8, \0xe-\0x1a, \x01c-\x01f) characters or an
185
+** extremely long line.
185186
**
186187
** (-1) -- The content appears to consist entirely of text, with lines
187188
** delimited by carriage-return, line-feed pairs; however, the
188189
** encoding may not be UTF-8.
189190
**
190191
** (-2) -- The content appears to consist entirely of text, in the
191192
** UTF-16 (BE or LE) encoding.
192193
*/
193194
int looks_like_text(const Blob *pContent){
194
- const unsigned char *z = blob_buffer(pContent);
195
+ unsigned char *z = (unsigned char *) blob_buffer(pContent);
195196
unsigned int n = blob_size(pContent);
196197
int j;
197198
unsigned char c;
198199
int result = 1; /* Assume text with no CR/NL */
199200
static const char isBinary[256] = {
@@ -205,33 +206,58 @@
205206
/* Check individual lines.
206207
*/
207208
if( n==0 ) return result; /* Empty file -> text */
208209
c = *z;
209210
if( isBinary[c] ) return 0; /* non-text byte in a file -> binary */
210
- if ( n > 1 ){
211
- if ( (c==0xff) && (z[1]==0xfe) ){
212
- return -2;
213
- } else if ( (c==0xfe) && (z[1]==0xff) ){
214
- return -2;
211
+ if ( (n&1)==0 ){ /* UTF-16 must have an even blob length */
212
+ if ( (c==0xff) && (z[1]==0xfe) ){ /* UTF-16 LE BOM */
213
+ result = -2;
214
+ j = LENGTH_MASK*2/3;
215
+ while( (n-=2)>0 ){
216
+ c = *(z+=2);
217
+ if( z[1]==0 ){ /* High-byte must be 0 for further checks */
218
+ if( isBinary[c] ) return 0; /* non-text char in a file -> binary */
219
+ if( c=='\n' ){
220
+ j = LENGTH_MASK;
221
+ }
222
+ }
223
+ if( --j==0 ){
224
+ return 0; /* Very long line -> binary */
225
+ }
226
+ }
227
+ return result;
228
+ } else if ( (c==0xfe) && (z[1]==0xff) ){ /* UTF-16 BE BOM */
229
+ result = -2;
230
+ ++z; j = LENGTH_MASK*2/3;
231
+ while( (n-=2)>0 ){
232
+ c = *(z+=2);
233
+ if ( z[-1]==0 ){ /* High-byte must be 0 for further checks */
234
+ if( isBinary[c] ) return 0; /* non-text char in a file -> binary */
235
+ if( c=='\n' ){
236
+ j = LENGTH_MASK;
237
+ }
238
+ }
239
+ if( --j==0 ){
240
+ return 0; /* Very long line -> binary */
241
+ }
242
+ }
243
+ return result;
215244
}
216245
}
217
- j = (c!='\n');
246
+ j = LENGTH_MASK - (c!='\n');
218247
while( --n>0 ){
219
- c = *++z; ++j;
220
- if( isBinary[c] ) return 0; /* \000 byte in a file -> binary */
248
+ c = *++z;
249
+ if( isBinary[c] ) return 0; /* non-text byte in a file -> binary */
221250
if( c=='\n' ){
222251
if( z[-1]=='\r' ){
223252
result = -1; /* Contains CR/NL, continue */
224253
}
225
- if( j>LENGTH_MASK ){
226
- return 0; /* Very long line -> binary */
227
- }
228
- j = 0;
229
- }
230
- }
231
- if( j>LENGTH_MASK ){
232
- return 0; /* Very long line -> binary */
254
+ j = LENGTH_MASK;
255
+ }
256
+ if( --j==0 ){
257
+ return 0; /* Very long line -> binary */
258
+ }
233259
}
234260
return result; /* No problems seen -> not binary */
235261
}
236262
237263
/*
238264
--- src/diff.c
+++ src/diff.c
@@ -179,21 +179,22 @@
179 ** (1) -- The content appears to consist entirely of text, with lines
180 ** delimited by line-feed characters; however, the encoding may
181 ** not be UTF-8.
182 **
183 ** (0) -- The content appears to be binary because it contains embedded
184 ** NUL (\000) characters or an extremely long line.
 
185 **
186 ** (-1) -- The content appears to consist entirely of text, with lines
187 ** delimited by carriage-return, line-feed pairs; however, the
188 ** encoding may not be UTF-8.
189 **
190 ** (-2) -- The content appears to consist entirely of text, in the
191 ** UTF-16 (BE or LE) encoding.
192 */
193 int looks_like_text(const Blob *pContent){
194 const unsigned char *z = blob_buffer(pContent);
195 unsigned int n = blob_size(pContent);
196 int j;
197 unsigned char c;
198 int result = 1; /* Assume text with no CR/NL */
199 static const char isBinary[256] = {
@@ -205,33 +206,58 @@
205 /* Check individual lines.
206 */
207 if( n==0 ) return result; /* Empty file -> text */
208 c = *z;
209 if( isBinary[c] ) return 0; /* non-text byte in a file -> binary */
210 if ( n > 1 ){
211 if ( (c==0xff) && (z[1]==0xfe) ){
212 return -2;
213 } else if ( (c==0xfe) && (z[1]==0xff) ){
214 return -2;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215 }
216 }
217 j = (c!='\n');
218 while( --n>0 ){
219 c = *++z; ++j;
220 if( isBinary[c] ) return 0; /* \000 byte in a file -> binary */
221 if( c=='\n' ){
222 if( z[-1]=='\r' ){
223 result = -1; /* Contains CR/NL, continue */
224 }
225 if( j>LENGTH_MASK ){
226 return 0; /* Very long line -> binary */
227 }
228 j = 0;
229 }
230 }
231 if( j>LENGTH_MASK ){
232 return 0; /* Very long line -> binary */
233 }
234 return result; /* No problems seen -> not binary */
235 }
236
237 /*
238
--- src/diff.c
+++ src/diff.c
@@ -179,21 +179,22 @@
179 ** (1) -- The content appears to consist entirely of text, with lines
180 ** delimited by line-feed characters; however, the encoding may
181 ** not be UTF-8.
182 **
183 ** (0) -- The content appears to be binary because it contains embedded
184 ** non-text (\0x0-\0x8, \0xe-\0x1a, \x01c-\x01f) characters or an
185 ** extremely long line.
186 **
187 ** (-1) -- The content appears to consist entirely of text, with lines
188 ** delimited by carriage-return, line-feed pairs; however, the
189 ** encoding may not be UTF-8.
190 **
191 ** (-2) -- The content appears to consist entirely of text, in the
192 ** UTF-16 (BE or LE) encoding.
193 */
194 int looks_like_text(const Blob *pContent){
195 unsigned char *z = (unsigned char *) blob_buffer(pContent);
196 unsigned int n = blob_size(pContent);
197 int j;
198 unsigned char c;
199 int result = 1; /* Assume text with no CR/NL */
200 static const char isBinary[256] = {
@@ -205,33 +206,58 @@
206 /* Check individual lines.
207 */
208 if( n==0 ) return result; /* Empty file -> text */
209 c = *z;
210 if( isBinary[c] ) return 0; /* non-text byte in a file -> binary */
211 if ( (n&1)==0 ){ /* UTF-16 must have an even blob length */
212 if ( (c==0xff) && (z[1]==0xfe) ){ /* UTF-16 LE BOM */
213 result = -2;
214 j = LENGTH_MASK*2/3;
215 while( (n-=2)>0 ){
216 c = *(z+=2);
217 if( z[1]==0 ){ /* High-byte must be 0 for further checks */
218 if( isBinary[c] ) return 0; /* non-text char in a file -> binary */
219 if( c=='\n' ){
220 j = LENGTH_MASK;
221 }
222 }
223 if( --j==0 ){
224 return 0; /* Very long line -> binary */
225 }
226 }
227 return result;
228 } else if ( (c==0xfe) && (z[1]==0xff) ){ /* UTF-16 BE BOM */
229 result = -2;
230 ++z; j = LENGTH_MASK*2/3;
231 while( (n-=2)>0 ){
232 c = *(z+=2);
233 if ( z[-1]==0 ){ /* High-byte must be 0 for further checks */
234 if( isBinary[c] ) return 0; /* non-text char in a file -> binary */
235 if( c=='\n' ){
236 j = LENGTH_MASK;
237 }
238 }
239 if( --j==0 ){
240 return 0; /* Very long line -> binary */
241 }
242 }
243 return result;
244 }
245 }
246 j = LENGTH_MASK - (c!='\n');
247 while( --n>0 ){
248 c = *++z;
249 if( isBinary[c] ) return 0; /* non-text byte in a file -> binary */
250 if( c=='\n' ){
251 if( z[-1]=='\r' ){
252 result = -1; /* Contains CR/NL, continue */
253 }
254 j = LENGTH_MASK;
255 }
256 if( --j==0 ){
257 return 0; /* Very long line -> binary */
258 }
 
 
 
259 }
260 return result; /* No problems seen -> not binary */
261 }
262
263 /*
264

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button