Fossil SCM

Add back checks for FFFE and FFFF in looks_like_text: <br>- FFFE is a reversed BOM. It could arise when a UTF-16LE and a UTF-16BE file are concatenated. This is always a fatal error, so diff should bail out early. <br>- FFFF is a permanently unassigned character as well, which doesn't exist in any font. Uninitialized variables are sometimes filled with either 0000 or FFFF, so this makes it much more likely that binary files are detected early. <p>The earlier check for FEFF was not sensible: A BOM in the middle of a UTF-16 file could easily arise when two UTF-16LE or two UTF-16BE files are concatenated. Nothing goes wrong in that case; the BOM should just be displayed as a zero-width space. Therefore, diff should not bail out.

jan.nijtmans 2012-11-01 14:49 UTC improve_looks_like_binary
Commit 64868f2b9899a593860e3513a8577913d0b0270a
1 file changed +6
+6
--- src/diff.c
+++ src/diff.c
@@ -215,10 +215,13 @@
215215
if( z[1]==0 ){ /* High-byte must be 0 for further checks */
216216
if( c==0 ) return 0; /* \000 char in a file -> binary */
217217
if( c=='\n' ){
218218
j = LENGTH_MASK/3;
219219
}
220
+ }else if( (z[1]==0xff)&&(c>0xfd) ){
221
+ /* FFFE and FFFF are invalid UTF-16. */
222
+ return 0;
220223
}
221224
if( --j==0 ){
222225
return 0; /* Very long line -> binary */
223226
}
224227
}
@@ -231,10 +234,13 @@
231234
if ( z[-1]==0 ){ /* High-byte must be 0 for further checks */
232235
if( c==0 ) return 0; /* \000 char in a file -> binary */
233236
if( c=='\n' ){
234237
j = LENGTH_MASK/3;
235238
}
239
+ }else if( (z[-1]==0xff)&&(c>0xfd) ){
240
+ /* FFFE and FFFF are invalid UTF-16. */
241
+ return 0;
236242
}
237243
if( --j==0 ){
238244
return 0; /* Very long line -> binary */
239245
}
240246
}
241247
--- src/diff.c
+++ src/diff.c
@@ -215,10 +215,13 @@
215 if( z[1]==0 ){ /* High-byte must be 0 for further checks */
216 if( c==0 ) return 0; /* \000 char in a file -> binary */
217 if( c=='\n' ){
218 j = LENGTH_MASK/3;
219 }
 
 
 
220 }
221 if( --j==0 ){
222 return 0; /* Very long line -> binary */
223 }
224 }
@@ -231,10 +234,13 @@
231 if ( z[-1]==0 ){ /* High-byte must be 0 for further checks */
232 if( c==0 ) return 0; /* \000 char in a file -> binary */
233 if( c=='\n' ){
234 j = LENGTH_MASK/3;
235 }
 
 
 
236 }
237 if( --j==0 ){
238 return 0; /* Very long line -> binary */
239 }
240 }
241
--- src/diff.c
+++ src/diff.c
@@ -215,10 +215,13 @@
215 if( z[1]==0 ){ /* High-byte must be 0 for further checks */
216 if( c==0 ) return 0; /* \000 char in a file -> binary */
217 if( c=='\n' ){
218 j = LENGTH_MASK/3;
219 }
220 }else if( (z[1]==0xff)&&(c>0xfd) ){
221 /* FFFE and FFFF are invalid UTF-16. */
222 return 0;
223 }
224 if( --j==0 ){
225 return 0; /* Very long line -> binary */
226 }
227 }
@@ -231,10 +234,13 @@
234 if ( z[-1]==0 ){ /* High-byte must be 0 for further checks */
235 if( c==0 ) return 0; /* \000 char in a file -> binary */
236 if( c=='\n' ){
237 j = LENGTH_MASK/3;
238 }
239 }else if( (z[-1]==0xff)&&(c>0xfd) ){
240 /* FFFE and FFFF are invalid UTF-16. */
241 return 0;
242 }
243 if( --j==0 ){
244 return 0; /* Very long line -> binary */
245 }
246 }
247

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button