Fossil SCM
Add back checks for FFFE and FFFF in looks_like_text: <br>- FFFE is a reversed BOM. It can arise when a UTF-16LE and a UTF-16BE file are concatenated. This is always a fatal error, so diff should bail out early. <br>- FFFF is a permanently unassigned character as well, which doesn't exist in any font. Uninitialized memory is sometimes filled with either 0000 or FFFF, so this check makes it much more likely that binary files are detected early. <p>The earlier check for FEFF was not sensible: a BOM in the middle of a UTF-16 file can easily arise when two UTF-16LE or two UTF-16BE files are concatenated. Nothing goes wrong in that case; the BOM should just be displayed as a zero-width space. Therefore, diff should not bail out.
Commit
64868f2b9899a593860e3513a8577913d0b0270a
Parent
0ccbb44f14790cb…
1 file changed
+6
+6
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -215,10 +215,13 @@ | ||
| 215 | 215 | if( z[1]==0 ){ /* High-byte must be 0 for further checks */ |
| 216 | 216 | if( c==0 ) return 0; /* \000 char in a file -> binary */ |
| 217 | 217 | if( c=='\n' ){ |
| 218 | 218 | j = LENGTH_MASK/3; |
| 219 | 219 | } |
| 220 | + }else if( (z[1]==0xff)&&(c>0xfd) ){ | |
| 221 | + /* FFFE and FFFF are invalid UTF-16. */ | |
| 222 | + return 0; | |
| 220 | 223 | } |
| 221 | 224 | if( --j==0 ){ |
| 222 | 225 | return 0; /* Very long line -> binary */ |
| 223 | 226 | } |
| 224 | 227 | } |
| @@ -231,10 +234,13 @@ | ||
| 231 | 234 | if ( z[-1]==0 ){ /* High-byte must be 0 for further checks */ |
| 232 | 235 | if( c==0 ) return 0; /* \000 char in a file -> binary */ |
| 233 | 236 | if( c=='\n' ){ |
| 234 | 237 | j = LENGTH_MASK/3; |
| 235 | 238 | } |
| 239 | + }else if( (z[-1]==0xff)&&(c>0xfd) ){ | |
| 240 | + /* FFFE and FFFF are invalid UTF-16. */ | |
| 241 | + return 0; | |
| 236 | 242 | } |
| 237 | 243 | if( --j==0 ){ |
| 238 | 244 | return 0; /* Very long line -> binary */ |
| 239 | 245 | } |
| 240 | 246 | } |
| 241 | 247 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -215,10 +215,13 @@ | |
| 215 | if( z[1]==0 ){ /* High-byte must be 0 for further checks */ |
| 216 | if( c==0 ) return 0; /* \000 char in a file -> binary */ |
| 217 | if( c=='\n' ){ |
| 218 | j = LENGTH_MASK/3; |
| 219 | } |
| 220 | } |
| 221 | if( --j==0 ){ |
| 222 | return 0; /* Very long line -> binary */ |
| 223 | } |
| 224 | } |
| @@ -231,10 +234,13 @@ | |
| 231 | if ( z[-1]==0 ){ /* High-byte must be 0 for further checks */ |
| 232 | if( c==0 ) return 0; /* \000 char in a file -> binary */ |
| 233 | if( c=='\n' ){ |
| 234 | j = LENGTH_MASK/3; |
| 235 | } |
| 236 | } |
| 237 | if( --j==0 ){ |
| 238 | return 0; /* Very long line -> binary */ |
| 239 | } |
| 240 | } |
| 241 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -215,10 +215,13 @@ | |
| 215 | if( z[1]==0 ){ /* High-byte must be 0 for further checks */ |
| 216 | if( c==0 ) return 0; /* \000 char in a file -> binary */ |
| 217 | if( c=='\n' ){ |
| 218 | j = LENGTH_MASK/3; |
| 219 | } |
| 220 | }else if( (z[1]==0xff)&&(c>0xfd) ){ |
| 221 | /* FFFE and FFFF are invalid UTF-16. */ |
| 222 | return 0; |
| 223 | } |
| 224 | if( --j==0 ){ |
| 225 | return 0; /* Very long line -> binary */ |
| 226 | } |
| 227 | } |
| @@ -231,10 +234,13 @@ | |
| 234 | if ( z[-1]==0 ){ /* High-byte must be 0 for further checks */ |
| 235 | if( c==0 ) return 0; /* \000 char in a file -> binary */ |
| 236 | if( c=='\n' ){ |
| 237 | j = LENGTH_MASK/3; |
| 238 | } |
| 239 | }else if( (z[-1]==0xff)&&(c>0xfd) ){ |
| 240 | /* FFFE and FFFF are invalid UTF-16. */ |
| 241 | return 0; |
| 242 | } |
| 243 | if( --j==0 ){ |
| 244 | return 0; /* Very long line -> binary */ |
| 245 | } |
| 246 | } |
| 247 |