Fossil SCM

For the looks_like_utf*() functions, continue to examine blob content in order to fully set the output flags, even if it appears to be binary. Also, increase the strictness of starts_with_utf16_bom() and make it more accurate.

mistachkin 2013-03-07 01:12 trunk
Commit 13fac7f74a95059f3ac42246676d80045df829c3
1 file changed +35 -23
+35 -23
--- src/diff.c
+++ src/diff.c
@@ -72,10 +72,11 @@
7272
#define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */
7373
#define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */
7474
#define LOOK_LF ((int)0x00000002) /* One or more LF chars were found. */
7575
#define LOOK_CRLF ((int)0x00000004) /* One or more CR/LF pairs were found. */
7676
#define LOOK_LENGTH ((int)0x00000008) /* An over length line was found. */
77
+#define LOOK_ODD ((int)0x00000010) /* An odd number of bytes was found. */
7778
#endif /* INTERFACE */
7879
7980
/*
8081
** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes)
8182
*/
@@ -217,31 +218,34 @@
217218
** validate any code points. It makes no attempt to detect if any [invalid]
218219
** switches between UTF-8 and other encodings occur.
219220
**
220221
** The only code points that this function cares about are the NUL character,
221222
** carriage-return, and line-feed.
223
+**
224
+** Whether or not this function examines the entire contents of the blob is
225
+** officially unspecified.
222226
**
223227
************************************ WARNING **********************************
224228
*/
225229
int looks_like_utf8(const Blob *pContent, int *pFlags){
226230
const char *z = blob_buffer(pContent);
227231
unsigned int n = blob_size(pContent);
228
- int j, c;
232
+ int j, c, result = 1; /* Assume UTF-8 text, prove otherwise */
229233
230234
if( pFlags ) *pFlags = LOOK_NONE;
231
- if( n==0 ) return 1; /* Empty file -> text */
235
+ if( n==0 ) return result; /* Empty file -> text */
232236
c = *z;
233237
if( c==0 ){
234238
if( pFlags ) *pFlags |= LOOK_NUL;
235
- return 0; /* NUL character in a file -> binary */
239
+ result = 0; /* NUL character in a file -> binary */
236240
}
237241
j = (c!='\n');
238242
while( --n>0 ){
239243
c = *++z; ++j;
240244
if( c==0 ){
241245
if( pFlags ) *pFlags |= LOOK_NUL;
242
- return 0; /* NUL character in a file -> binary */
246
+ result = 0; /* NUL character in a file -> binary */
243247
}
244248
if( c=='\n' ){
245249
int c2 = z[-1];
246250
if( pFlags ){
247251
*pFlags |= LOOK_LF;
@@ -249,20 +253,20 @@
249253
*pFlags |= LOOK_CRLF;
250254
}
251255
}
252256
if( j>LENGTH_MASK ){
253257
if( pFlags ) *pFlags |= LOOK_LENGTH;
254
- return 0; /* Very long line -> binary */
258
+ result = 0; /* Very long line -> binary */
255259
}
256260
j = 0;
257261
}
258262
}
259263
if( j>LENGTH_MASK ){
260264
if( pFlags ) *pFlags |= LOOK_LENGTH;
261
- return 0; /* Very long line -> binary */
265
+ result = 0; /* Very long line -> binary */
262266
}
263
- return 1; /* No problems seen -> not binary */
267
+ return result; /* No problems seen -> not binary */
264268
}
265269
266270
/*
267271
** Define the type needed to represent a Unicode (UTF-16) character.
268272
*/
@@ -311,32 +315,38 @@
311315
** validate any code points. It makes no attempt to detect if any [invalid]
312316
** switches between the UTF-16be and UTF-16le encodings occur.
313317
**
314318
** The only code points that this function cares about are the NUL character,
315319
** carriage-return, and line-feed.
320
+**
321
+** Whether or not this function examines the entire contents of the blob is
322
+** officially unspecified.
316323
**
317324
************************************ WARNING **********************************
318325
*/
319326
int looks_like_utf16(const Blob *pContent, int *pFlags){
320327
const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
321328
unsigned int n = blob_size(pContent);
322
- int j, c;
329
+ int j, c, result = 1; /* Assume UTF-16 text, prove otherwise */
323330
324331
if( pFlags ) *pFlags = LOOK_NONE;
325
- if( n==0 ) return 1; /* Empty file -> text */
326
- if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */
332
+ if( n==0 ) return result; /* Empty file -> text */
333
+ if( n%2 ){
334
+ if( pFlags ) *pFlags |= LOOK_ODD;
335
+ result = 0; /* Odd number of bytes -> binary (or UTF-8) */
336
+ }
327337
c = *z;
328338
if( c==0 ){
329339
if( pFlags ) *pFlags |= LOOK_NUL;
330
- return 0; /* NUL character in a file -> binary */
340
+ result = 0; /* NUL character in a file -> binary */
331341
}
332342
j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
333343
while( (n-=2)>0 ){
334344
c = *++z; ++j;
335345
if( c==0 ){
336346
if( pFlags ) *pFlags |= LOOK_NUL;
337
- return 0; /* NUL character in a file -> binary */
347
+ result = 0; /* NUL character in a file -> binary */
338348
}
339349
if( c==UTF16BE_LF || c==UTF16LE_LF ){
340350
int c2 = z[-1];
341351
if( pFlags ){
342352
*pFlags |= LOOK_LF;
@@ -344,20 +354,20 @@
344354
*pFlags |= LOOK_CRLF;
345355
}
346356
}
347357
if( j>UTF16_LENGTH_MASK ){
348358
if( pFlags ) *pFlags |= LOOK_LENGTH;
349
- return 0; /* Very long line -> binary */
359
+ result = 0; /* Very long line -> binary */
350360
}
351361
j = 0;
352362
}
353363
}
354364
if( j>UTF16_LENGTH_MASK ){
355365
if( pFlags ) *pFlags |= LOOK_LENGTH;
356
- return 0; /* Very long line -> binary */
366
+ result = 0; /* Very long line -> binary */
357367
}
358
- return 1; /* No problems seen -> not binary */
368
+ return result; /* No problems seen -> not binary */
359369
}
360370
361371
/*
362372
** This function returns an array of bytes representing the byte-order-mark
363373
** for UTF-8.
@@ -395,23 +405,24 @@
395405
const Blob *pContent, /* IN: Blob content to perform BOM detection on. */
396406
int *pnByte, /* OUT: The number of bytes used for the BOM. */
397407
int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */
398408
){
399409
const unsigned short *z = (unsigned short *)blob_buffer(pContent);
410
+ int bomSize = sizeof(unsigned short);
400411
int size = blob_size(pContent);
401412
402
- if( (size<2) || (size%2)
403
- || (size>=4 && z[1]==0) ) return 0;
404
- if( z[0] == 0xfffe ){
413
+ if( size<bomSize ) return 0; /* No: cannot read BOM. */
414
+ if( size>=(2*bomSize) && z[1]==0 ) return 0; /* No: possible UTF-32. */
415
+ if( z[0]==0xfffe ){
405416
if( pbReverse ) *pbReverse = 1;
406
- }else if( z[0] == 0xfeff ){
417
+ }else if( z[0]==0xfeff ){
407418
if( pbReverse ) *pbReverse = 0;
408419
}else{
409
- return 0;
420
+ return 0; /* No: UTF-16 byte-order-mark not found. */
410421
}
411
- if( pnByte ) *pnByte = 2;
412
- return 1;
422
+ if( pnByte ) *pnByte = bomSize;
423
+ return 1; /* Yes. */
413424
}
414425
415426
/*
416427
** Return true if two DLine elements are identical.
417428
*/
@@ -2474,12 +2485,13 @@
24742485
eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) :
24752486
looks_like_utf8(&blob, &lookFlags);
24762487
fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
24772488
fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
24782489
fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no");
2479
- fossil_print("Looks like UTF-%s: %s\n", fUtf16?"16":"8",eType?"yes":"no");
2490
+ fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no");
24802491
fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
24812492
fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no");
24822493
fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no");
24832494
fossil_print("Has flag LOOK_LENGTH: %s\n",(lookFlags&LOOK_LENGTH)?"yes":"no");
2495
+ fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no");
24842496
blob_reset(&blob);
24852497
}
24862498
--- src/diff.c
+++ src/diff.c
@@ -72,10 +72,11 @@
72 #define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */
73 #define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */
74 #define LOOK_LF ((int)0x00000002) /* One or more LF chars were found. */
75 #define LOOK_CRLF ((int)0x00000004) /* One or more CR/LF pairs were found. */
76 #define LOOK_LENGTH ((int)0x00000008) /* An over length line was found. */
 
77 #endif /* INTERFACE */
78
79 /*
80 ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes)
81 */
@@ -217,31 +218,34 @@
217 ** validate any code points. It makes no attempt to detect if any [invalid]
218 ** switches between UTF-8 and other encodings occur.
219 **
220 ** The only code points that this function cares about are the NUL character,
221 ** carriage-return, and line-feed.
 
 
 
222 **
223 ************************************ WARNING **********************************
224 */
225 int looks_like_utf8(const Blob *pContent, int *pFlags){
226 const char *z = blob_buffer(pContent);
227 unsigned int n = blob_size(pContent);
228 int j, c;
229
230 if( pFlags ) *pFlags = LOOK_NONE;
231 if( n==0 ) return 1; /* Empty file -> text */
232 c = *z;
233 if( c==0 ){
234 if( pFlags ) *pFlags |= LOOK_NUL;
235 return 0; /* NUL character in a file -> binary */
236 }
237 j = (c!='\n');
238 while( --n>0 ){
239 c = *++z; ++j;
240 if( c==0 ){
241 if( pFlags ) *pFlags |= LOOK_NUL;
242 return 0; /* NUL character in a file -> binary */
243 }
244 if( c=='\n' ){
245 int c2 = z[-1];
246 if( pFlags ){
247 *pFlags |= LOOK_LF;
@@ -249,20 +253,20 @@
249 *pFlags |= LOOK_CRLF;
250 }
251 }
252 if( j>LENGTH_MASK ){
253 if( pFlags ) *pFlags |= LOOK_LENGTH;
254 return 0; /* Very long line -> binary */
255 }
256 j = 0;
257 }
258 }
259 if( j>LENGTH_MASK ){
260 if( pFlags ) *pFlags |= LOOK_LENGTH;
261 return 0; /* Very long line -> binary */
262 }
263 return 1; /* No problems seen -> not binary */
264 }
265
266 /*
267 ** Define the type needed to represent a Unicode (UTF-16) character.
268 */
@@ -311,32 +315,38 @@
311 ** validate any code points. It makes no attempt to detect if any [invalid]
312 ** switches between the UTF-16be and UTF-16le encodings occur.
313 **
314 ** The only code points that this function cares about are the NUL character,
315 ** carriage-return, and line-feed.
 
 
 
316 **
317 ************************************ WARNING **********************************
318 */
319 int looks_like_utf16(const Blob *pContent, int *pFlags){
320 const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
321 unsigned int n = blob_size(pContent);
322 int j, c;
323
324 if( pFlags ) *pFlags = LOOK_NONE;
325 if( n==0 ) return 1; /* Empty file -> text */
326 if( n%2 ) return 0; /* Odd number of bytes -> binary (or UTF-8) */
 
 
 
327 c = *z;
328 if( c==0 ){
329 if( pFlags ) *pFlags |= LOOK_NUL;
330 return 0; /* NUL character in a file -> binary */
331 }
332 j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
333 while( (n-=2)>0 ){
334 c = *++z; ++j;
335 if( c==0 ){
336 if( pFlags ) *pFlags |= LOOK_NUL;
337 return 0; /* NUL character in a file -> binary */
338 }
339 if( c==UTF16BE_LF || c==UTF16LE_LF ){
340 int c2 = z[-1];
341 if( pFlags ){
342 *pFlags |= LOOK_LF;
@@ -344,20 +354,20 @@
344 *pFlags |= LOOK_CRLF;
345 }
346 }
347 if( j>UTF16_LENGTH_MASK ){
348 if( pFlags ) *pFlags |= LOOK_LENGTH;
349 return 0; /* Very long line -> binary */
350 }
351 j = 0;
352 }
353 }
354 if( j>UTF16_LENGTH_MASK ){
355 if( pFlags ) *pFlags |= LOOK_LENGTH;
356 return 0; /* Very long line -> binary */
357 }
358 return 1; /* No problems seen -> not binary */
359 }
360
361 /*
362 ** This function returns an array of bytes representing the byte-order-mark
363 ** for UTF-8.
@@ -395,23 +405,24 @@
395 const Blob *pContent, /* IN: Blob content to perform BOM detection on. */
396 int *pnByte, /* OUT: The number of bytes used for the BOM. */
397 int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */
398 ){
399 const unsigned short *z = (unsigned short *)blob_buffer(pContent);
 
400 int size = blob_size(pContent);
401
402 if( (size<2) || (size%2)
403 || (size>=4 && z[1]==0) ) return 0;
404 if( z[0] == 0xfffe ){
405 if( pbReverse ) *pbReverse = 1;
406 }else if( z[0] == 0xfeff ){
407 if( pbReverse ) *pbReverse = 0;
408 }else{
409 return 0;
410 }
411 if( pnByte ) *pnByte = 2;
412 return 1;
413 }
414
415 /*
416 ** Return true if two DLine elements are identical.
417 */
@@ -2474,12 +2485,13 @@
2474 eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) :
2475 looks_like_utf8(&blob, &lookFlags);
2476 fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
2477 fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
2478 fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no");
2479 fossil_print("Looks like UTF-%s: %s\n", fUtf16?"16":"8",eType?"yes":"no");
2480 fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
2481 fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no");
2482 fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no");
2483 fossil_print("Has flag LOOK_LENGTH: %s\n",(lookFlags&LOOK_LENGTH)?"yes":"no");
 
2484 blob_reset(&blob);
2485 }
2486
--- src/diff.c
+++ src/diff.c
@@ -72,10 +72,11 @@
72 #define LOOK_NONE ((int)0x00000000) /* Nothing special was found. */
73 #define LOOK_NUL ((int)0x00000001) /* One or more NUL chars were found. */
74 #define LOOK_LF ((int)0x00000002) /* One or more LF chars were found. */
75 #define LOOK_CRLF ((int)0x00000004) /* One or more CR/LF pairs were found. */
76 #define LOOK_LENGTH ((int)0x00000008) /* An over length line was found. */
77 #define LOOK_ODD ((int)0x00000010) /* An odd number of bytes was found. */
78 #endif /* INTERFACE */
79
80 /*
81 ** Maximum length of a line in a text file, in bytes. (2**13 = 8192 bytes)
82 */
@@ -217,31 +218,34 @@
218 ** validate any code points. It makes no attempt to detect if any [invalid]
219 ** switches between UTF-8 and other encodings occur.
220 **
221 ** The only code points that this function cares about are the NUL character,
222 ** carriage-return, and line-feed.
223 **
224 ** Whether or not this function examines the entire contents of the blob is
225 ** officially unspecified.
226 **
227 ************************************ WARNING **********************************
228 */
229 int looks_like_utf8(const Blob *pContent, int *pFlags){
230 const char *z = blob_buffer(pContent);
231 unsigned int n = blob_size(pContent);
232 int j, c, result = 1; /* Assume UTF-8 text, prove otherwise */
233
234 if( pFlags ) *pFlags = LOOK_NONE;
235 if( n==0 ) return result; /* Empty file -> text */
236 c = *z;
237 if( c==0 ){
238 if( pFlags ) *pFlags |= LOOK_NUL;
239 result = 0; /* NUL character in a file -> binary */
240 }
241 j = (c!='\n');
242 while( --n>0 ){
243 c = *++z; ++j;
244 if( c==0 ){
245 if( pFlags ) *pFlags |= LOOK_NUL;
246 result = 0; /* NUL character in a file -> binary */
247 }
248 if( c=='\n' ){
249 int c2 = z[-1];
250 if( pFlags ){
251 *pFlags |= LOOK_LF;
@@ -249,20 +253,20 @@
253 *pFlags |= LOOK_CRLF;
254 }
255 }
256 if( j>LENGTH_MASK ){
257 if( pFlags ) *pFlags |= LOOK_LENGTH;
258 result = 0; /* Very long line -> binary */
259 }
260 j = 0;
261 }
262 }
263 if( j>LENGTH_MASK ){
264 if( pFlags ) *pFlags |= LOOK_LENGTH;
265 result = 0; /* Very long line -> binary */
266 }
267 return result; /* No problems seen -> not binary */
268 }
269
270 /*
271 ** Define the type needed to represent a Unicode (UTF-16) character.
272 */
@@ -311,32 +315,38 @@
315 ** validate any code points. It makes no attempt to detect if any [invalid]
316 ** switches between the UTF-16be and UTF-16le encodings occur.
317 **
318 ** The only code points that this function cares about are the NUL character,
319 ** carriage-return, and line-feed.
320 **
321 ** Whether or not this function examines the entire contents of the blob is
322 ** officially unspecified.
323 **
324 ************************************ WARNING **********************************
325 */
326 int looks_like_utf16(const Blob *pContent, int *pFlags){
327 const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
328 unsigned int n = blob_size(pContent);
329 int j, c, result = 1; /* Assume UTF-16 text, prove otherwise */
330
331 if( pFlags ) *pFlags = LOOK_NONE;
332 if( n==0 ) return result; /* Empty file -> text */
333 if( n%2 ){
334 if( pFlags ) *pFlags |= LOOK_ODD;
335 result = 0; /* Odd number of bytes -> binary (or UTF-8) */
336 }
337 c = *z;
338 if( c==0 ){
339 if( pFlags ) *pFlags |= LOOK_NUL;
340 result = 0; /* NUL character in a file -> binary */
341 }
342 j = ((c!=UTF16BE_LF) && (c!=UTF16LE_LF));
343 while( (n-=2)>0 ){
344 c = *++z; ++j;
345 if( c==0 ){
346 if( pFlags ) *pFlags |= LOOK_NUL;
347 result = 0; /* NUL character in a file -> binary */
348 }
349 if( c==UTF16BE_LF || c==UTF16LE_LF ){
350 int c2 = z[-1];
351 if( pFlags ){
352 *pFlags |= LOOK_LF;
@@ -344,20 +354,20 @@
354 *pFlags |= LOOK_CRLF;
355 }
356 }
357 if( j>UTF16_LENGTH_MASK ){
358 if( pFlags ) *pFlags |= LOOK_LENGTH;
359 result = 0; /* Very long line -> binary */
360 }
361 j = 0;
362 }
363 }
364 if( j>UTF16_LENGTH_MASK ){
365 if( pFlags ) *pFlags |= LOOK_LENGTH;
366 result = 0; /* Very long line -> binary */
367 }
368 return result; /* No problems seen -> not binary */
369 }
370
371 /*
372 ** This function returns an array of bytes representing the byte-order-mark
373 ** for UTF-8.
@@ -395,23 +405,24 @@
405 const Blob *pContent, /* IN: Blob content to perform BOM detection on. */
406 int *pnByte, /* OUT: The number of bytes used for the BOM. */
407 int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */
408 ){
409 const unsigned short *z = (unsigned short *)blob_buffer(pContent);
410 int bomSize = sizeof(unsigned short);
411 int size = blob_size(pContent);
412
413 if( size<bomSize ) return 0; /* No: cannot read BOM. */
414 if( size>=(2*bomSize) && z[1]==0 ) return 0; /* No: possible UTF-32. */
415 if( z[0]==0xfffe ){
416 if( pbReverse ) *pbReverse = 1;
417 }else if( z[0]==0xfeff ){
418 if( pbReverse ) *pbReverse = 0;
419 }else{
420 return 0; /* No: UTF-16 byte-order-mark not found. */
421 }
422 if( pnByte ) *pnByte = bomSize;
423 return 1; /* Yes. */
424 }
425
426 /*
427 ** Return true if two DLine elements are identical.
428 */
@@ -2474,12 +2485,13 @@
2485 eType = fUtf16 ? looks_like_utf16(&blob, &lookFlags) :
2486 looks_like_utf8(&blob, &lookFlags);
2487 fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
2488 fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
2489 fossil_print("Starts with UTF-16 BOM: %s\n",fUtf16?"yes":"no");
2490 fossil_print("Looks like UTF-%s: %s\n",fUtf16?"16":"8",eType?"yes":"no");
2491 fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
2492 fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no");
2493 fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no");
2494 fossil_print("Has flag LOOK_LENGTH: %s\n",(lookFlags&LOOK_LENGTH)?"yes":"no");
2495 fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no");
2496 blob_reset(&blob);
2497 }
2498

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button