Fossil SCM

Merge UTF-16 byte swapping fix and test-looks-like-utf command enhancements.

mistachkin 2013-03-19 17:40 trunk merge
Commit b4bec3753dd6106eaa840275aa891277609a33f2
1 file changed (+33 -24)
--- src/diff.c
+++ src/diff.c
@@ -258,11 +258,11 @@
258258
if( c==0 ){
259259
flags |= LOOK_NUL; /* NUL character in a file -> binary */
260260
}else if( c=='\n' ){
261261
flags |= LOOK_LF;
262262
if( c2=='\r' ){
263
- flags |= LOOK_CRLF; /* Found LF preceded by CR */
263
+ flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
264264
}else{
265265
flags |= LOOK_LONE_LF;
266266
}
267267
if( j>LENGTH_MASK ){
268268
flags |= LOOK_LONG; /* Very long line -> binary */
@@ -295,18 +295,19 @@
295295
/*
296296
** Maximum length of a line in a text file, in UTF-16 characters. (4096)
297297
** The number of bytes represented by this value cannot exceed LENGTH_MASK
298298
** bytes, because that is the line buffer size used by the diff engine.
299299
*/
300
-#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
301
-#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
300
+#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
301
+#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
302302
303303
/*
304304
** This macro is used to swap the byte order of a UTF-16 character in the
305305
** looks_like_utf16() function.
306306
*/
307
-#define UTF16_SWAP(ch) (((ch) << 8) & 0xFF00) | (((ch) >> 8) & 0xFF)
307
+#define UTF16_SWAP(ch) ((((ch) << 8) & 0xFF00) | (((ch) >> 8) & 0xFF))
308
+#define UTF16_SWAP_IF(expr,ch) ((expr) ? UTF16_SWAP((ch)) : (ch))
308309
309310
/*
310311
** This function attempts to scan each logical line within the blob to
311312
** determine the type of content it appears to contain. The return value
312313
** is a combination of one or more of the LOOK_XXX flags (see above):
@@ -347,45 +348,52 @@
347348
if( n%sizeof(WCHAR_T) ){
348349
flags |= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */
349350
if( n<sizeof(WCHAR_T) ) return flags; /* One byte -> binary (UTF-8?) */
350351
}
351352
c = *z;
353
+ if( bReverse ){
354
+ c = UTF16_SWAP(c);
355
+ }
352356
if( c==0 ){
353357
flags |= LOOK_NUL; /* NUL character in a file -> binary */
354
- }else if( bReverse ){
355
- c = UTF16_SWAP(c);
358
+ }else if( c=='\r' ){
359
+ flags |= LOOK_CR;
360
+ if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
361
+ flags |= LOOK_LONE_CR; /* More chars, next char is not LF */
362
+ }
356363
}
357364
j = (c!='\n');
358365
if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */
359366
while( 1 ){
360367
int c2 = c;
361368
n -= sizeof(WCHAR_T);
362369
if( n<sizeof(WCHAR_T) ) break;
363370
c = *++z;
371
+ if( bReverse ){
372
+ c = UTF16_SWAP(c);
373
+ }
364374
++j;
365375
if( c==0 ){
366376
flags |= LOOK_NUL; /* NUL character in a file -> binary */
367
- }else if( bReverse ){
368
- c = UTF16_SWAP(c);
369
- }
370
- if( c=='\n' ){
377
+ }else if( c=='\n' ){
378
+ flags |= LOOK_LF;
371379
if( c2=='\r' ){
372
- flags |= (LOOK_CRLF | LOOK_CR | LOOK_LF);
380
+ flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
373381
}else{
374
- flags |= (LOOK_LONE_LF | LOOK_LF);
382
+ flags |= LOOK_LONE_LF;
375383
}
376384
if( j>UTF16_LENGTH_MASK ){
377385
flags |= LOOK_LONG; /* Very long line -> binary */
378386
}
379387
j = 0;
380
- }else if( c2=='\r' ){
381
- flags |= (LOOK_CR | LOOK_LONE_CR);
388
+ }else if( c=='\r' ){
389
+ flags |= LOOK_CR;
390
+ if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
391
+ flags |= LOOK_LONE_CR; /* More chars, next char is not LF */
392
+ }
382393
}
383394
}
384
- if( c=='\r' ){
385
- flags |= (LOOK_CR | LOOK_LONE_CR); /* Found CR as last char */
386
- }
387395
if( j>UTF16_LENGTH_MASK ){
388396
flags |= LOOK_LONG; /* Very long line -> binary */
389397
}
390398
return flags;
391399
}
@@ -2524,22 +2532,23 @@
25242532
Blob blob; /* the contents of the specified file */
25252533
int fUtf8; /* return value of starts_with_utf8_bom() */
25262534
int fUtf16; /* return value of starts_with_utf16_bom() */
25272535
int fUnicode; /* return value of could_be_utf16() */
25282536
int lookFlags; /* output flags from looks_like_utf8/utf16() */
2529
- int bReverse = 0; /* non-zero -> UTF-16 byte order reversed */
2530
- if( g.argc<3 ) usage("FILENAME");
2537
+ int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */
2538
+ int bRevUnicode = 0; /* non-zero -> UTF-16 byte order reversed */
2539
+ if( g.argc!=3 ) usage("FILENAME");
25312540
blob_read_from_file(&blob, g.argv[2]);
25322541
fUtf8 = starts_with_utf8_bom(&blob, 0);
2533
- fUtf16 = starts_with_utf16_bom(&blob, 0, &bReverse);
2534
- fUnicode = could_be_utf16(&blob, &bReverse);
2535
- lookFlags = fUnicode ? looks_like_utf16(&blob, bReverse) :
2536
- looks_like_utf8(&blob);
2542
+ fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16);
2543
+ fUnicode = could_be_utf16(&blob, &bRevUnicode);
2544
+ lookFlags = fUnicode ? looks_like_utf16(&blob, bRevUnicode) :
2545
+ looks_like_utf8(&blob);
25372546
fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
25382547
fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
25392548
fossil_print("Starts with UTF-16 BOM: %s\n",
2540
- fUtf16?(bReverse?"reversed":"yes"):"no");
2549
+ fUtf16?(bRevUtf16?"reversed":"yes"):"no");
25412550
fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",
25422551
(lookFlags&LOOK_BINARY)?"no":"yes");
25432552
fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
25442553
fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no");
25452554
fossil_print("Has flag LOOK_LONE_CR: %s\n",
25462555
--- src/diff.c
+++ src/diff.c
@@ -258,11 +258,11 @@
258 if( c==0 ){
259 flags |= LOOK_NUL; /* NUL character in a file -> binary */
260 }else if( c=='\n' ){
261 flags |= LOOK_LF;
262 if( c2=='\r' ){
263 flags |= LOOK_CRLF; /* Found LF preceded by CR */
264 }else{
265 flags |= LOOK_LONE_LF;
266 }
267 if( j>LENGTH_MASK ){
268 flags |= LOOK_LONG; /* Very long line -> binary */
@@ -295,18 +295,19 @@
295 /*
296 ** Maximum length of a line in a text file, in UTF-16 characters. (4096)
297 ** The number of bytes represented by this value cannot exceed LENGTH_MASK
298 ** bytes, because that is the line buffer size used by the diff engine.
299 */
300 #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
301 #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
302
303 /*
304 ** This macro is used to swap the byte order of a UTF-16 character in the
305 ** looks_like_utf16() function.
306 */
307 #define UTF16_SWAP(ch) (((ch) << 8) & 0xFF00) | (((ch) >> 8) & 0xFF)
 
308
309 /*
310 ** This function attempts to scan each logical line within the blob to
311 ** determine the type of content it appears to contain. The return value
312 ** is a combination of one or more of the LOOK_XXX flags (see above):
@@ -347,45 +348,52 @@
347 if( n%sizeof(WCHAR_T) ){
348 flags |= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */
349 if( n<sizeof(WCHAR_T) ) return flags; /* One byte -> binary (UTF-8?) */
350 }
351 c = *z;
 
 
 
352 if( c==0 ){
353 flags |= LOOK_NUL; /* NUL character in a file -> binary */
354 }else if( bReverse ){
355 c = UTF16_SWAP(c);
 
 
 
356 }
357 j = (c!='\n');
358 if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */
359 while( 1 ){
360 int c2 = c;
361 n -= sizeof(WCHAR_T);
362 if( n<sizeof(WCHAR_T) ) break;
363 c = *++z;
 
 
 
364 ++j;
365 if( c==0 ){
366 flags |= LOOK_NUL; /* NUL character in a file -> binary */
367 }else if( bReverse ){
368 c = UTF16_SWAP(c);
369 }
370 if( c=='\n' ){
371 if( c2=='\r' ){
372 flags |= (LOOK_CRLF | LOOK_CR | LOOK_LF);
373 }else{
374 flags |= (LOOK_LONE_LF | LOOK_LF);
375 }
376 if( j>UTF16_LENGTH_MASK ){
377 flags |= LOOK_LONG; /* Very long line -> binary */
378 }
379 j = 0;
380 }else if( c2=='\r' ){
381 flags |= (LOOK_CR | LOOK_LONE_CR);
 
 
 
382 }
383 }
384 if( c=='\r' ){
385 flags |= (LOOK_CR | LOOK_LONE_CR); /* Found CR as last char */
386 }
387 if( j>UTF16_LENGTH_MASK ){
388 flags |= LOOK_LONG; /* Very long line -> binary */
389 }
390 return flags;
391 }
@@ -2524,22 +2532,23 @@
2524 Blob blob; /* the contents of the specified file */
2525 int fUtf8; /* return value of starts_with_utf8_bom() */
2526 int fUtf16; /* return value of starts_with_utf16_bom() */
2527 int fUnicode; /* return value of could_be_utf16() */
2528 int lookFlags; /* output flags from looks_like_utf8/utf16() */
2529 int bReverse = 0; /* non-zero -> UTF-16 byte order reversed */
2530 if( g.argc<3 ) usage("FILENAME");
 
2531 blob_read_from_file(&blob, g.argv[2]);
2532 fUtf8 = starts_with_utf8_bom(&blob, 0);
2533 fUtf16 = starts_with_utf16_bom(&blob, 0, &bReverse);
2534 fUnicode = could_be_utf16(&blob, &bReverse);
2535 lookFlags = fUnicode ? looks_like_utf16(&blob, bReverse) :
2536 looks_like_utf8(&blob);
2537 fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
2538 fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
2539 fossil_print("Starts with UTF-16 BOM: %s\n",
2540 fUtf16?(bReverse?"reversed":"yes"):"no");
2541 fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",
2542 (lookFlags&LOOK_BINARY)?"no":"yes");
2543 fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
2544 fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no");
2545 fossil_print("Has flag LOOK_LONE_CR: %s\n",
2546
--- src/diff.c
+++ src/diff.c
@@ -258,11 +258,11 @@
258 if( c==0 ){
259 flags |= LOOK_NUL; /* NUL character in a file -> binary */
260 }else if( c=='\n' ){
261 flags |= LOOK_LF;
262 if( c2=='\r' ){
263 flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
264 }else{
265 flags |= LOOK_LONE_LF;
266 }
267 if( j>LENGTH_MASK ){
268 flags |= LOOK_LONG; /* Very long line -> binary */
@@ -295,18 +295,19 @@
295 /*
296 ** Maximum length of a line in a text file, in UTF-16 characters. (4096)
297 ** The number of bytes represented by this value cannot exceed LENGTH_MASK
298 ** bytes, because that is the line buffer size used by the diff engine.
299 */
300 #define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
301 #define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)
302
303 /*
304 ** This macro is used to swap the byte order of a UTF-16 character in the
305 ** looks_like_utf16() function.
306 */
307 #define UTF16_SWAP(ch) ((((ch) << 8) & 0xFF00) | (((ch) >> 8) & 0xFF))
308 #define UTF16_SWAP_IF(expr,ch) ((expr) ? UTF16_SWAP((ch)) : (ch))
309
310 /*
311 ** This function attempts to scan each logical line within the blob to
312 ** determine the type of content it appears to contain. The return value
313 ** is a combination of one or more of the LOOK_XXX flags (see above):
@@ -347,45 +348,52 @@
348 if( n%sizeof(WCHAR_T) ){
349 flags |= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */
350 if( n<sizeof(WCHAR_T) ) return flags; /* One byte -> binary (UTF-8?) */
351 }
352 c = *z;
353 if( bReverse ){
354 c = UTF16_SWAP(c);
355 }
356 if( c==0 ){
357 flags |= LOOK_NUL; /* NUL character in a file -> binary */
358 }else if( c=='\r' ){
359 flags |= LOOK_CR;
360 if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
361 flags |= LOOK_LONE_CR; /* More chars, next char is not LF */
362 }
363 }
364 j = (c!='\n');
365 if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */
366 while( 1 ){
367 int c2 = c;
368 n -= sizeof(WCHAR_T);
369 if( n<sizeof(WCHAR_T) ) break;
370 c = *++z;
371 if( bReverse ){
372 c = UTF16_SWAP(c);
373 }
374 ++j;
375 if( c==0 ){
376 flags |= LOOK_NUL; /* NUL character in a file -> binary */
377 }else if( c=='\n' ){
378 flags |= LOOK_LF;
 
 
379 if( c2=='\r' ){
380 flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
381 }else{
382 flags |= LOOK_LONE_LF;
383 }
384 if( j>UTF16_LENGTH_MASK ){
385 flags |= LOOK_LONG; /* Very long line -> binary */
386 }
387 j = 0;
388 }else if( c=='\r' ){
389 flags |= LOOK_CR;
390 if( n<=sizeof(WCHAR_T) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
391 flags |= LOOK_LONE_CR; /* More chars, next char is not LF */
392 }
393 }
394 }
 
 
 
395 if( j>UTF16_LENGTH_MASK ){
396 flags |= LOOK_LONG; /* Very long line -> binary */
397 }
398 return flags;
399 }
@@ -2524,22 +2532,23 @@
2532 Blob blob; /* the contents of the specified file */
2533 int fUtf8; /* return value of starts_with_utf8_bom() */
2534 int fUtf16; /* return value of starts_with_utf16_bom() */
2535 int fUnicode; /* return value of could_be_utf16() */
2536 int lookFlags; /* output flags from looks_like_utf8/utf16() */
2537 int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */
2538 int bRevUnicode = 0; /* non-zero -> UTF-16 byte order reversed */
2539 if( g.argc!=3 ) usage("FILENAME");
2540 blob_read_from_file(&blob, g.argv[2]);
2541 fUtf8 = starts_with_utf8_bom(&blob, 0);
2542 fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16);
2543 fUnicode = could_be_utf16(&blob, &bRevUnicode);
2544 lookFlags = fUnicode ? looks_like_utf16(&blob, bRevUnicode) :
2545 looks_like_utf8(&blob);
2546 fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
2547 fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
2548 fossil_print("Starts with UTF-16 BOM: %s\n",
2549 fUtf16?(bRevUtf16?"reversed":"yes"):"no");
2550 fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",
2551 (lookFlags&LOOK_BINARY)?"no":"yes");
2552 fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
2553 fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no");
2554 fossil_print("Has flag LOOK_LONE_CR: %s\n",
2555

Keyboard Shortcuts

Open search: /
Next entry (timeline): j
Previous entry (timeline): k
Open focused entry: Enter
Show this help: ?
Toggle theme: Top nav button