Fossil SCM

Combine the 4 "starts_with_utf??_bom" functions into a single, easier-to-use function, "starts_with_bom". In addition, the new function only checks for a UTF-16 BOM if the blob has an even number of bytes.

jan.nijtmans 2013-02-07 08:47 UTC trunk
Commit 6c417d8bf5f6d2e530987ccf7b122d8965b0dc11
3 files changed +12 -17 +2 -2 +13 -58
+12 -17
--- src/blob.c
+++ src/blob.c
@@ -1095,35 +1095,30 @@
10951095
** done. If useMbcs is false and there is no BOM, the input string is assumed
10961096
** to be UTF-8 already, so no conversion is done.
10971097
*/
10981098
void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){
10991099
char *zUtf8;
1100
- int bomSize = 0;
1101
- if( starts_with_utf8_bom(pBlob, &bomSize) ){
1100
+ int bomSize = starts_with_bom(pBlob);
1101
+ if( bomSize == 3 ){
11021102
struct Blob temp;
11031103
zUtf8 = blob_str(pBlob) + bomSize;
11041104
blob_zero(&temp);
11051105
blob_append(&temp, zUtf8, -1);
11061106
blob_swap(pBlob, &temp);
11071107
blob_reset(&temp);
11081108
#ifdef _WIN32
1109
- }else if( starts_with_utf16le_bom(pBlob, &bomSize) ){
1110
- /* Make sure the blob contains two terminating 0-bytes */
1111
- blob_append(pBlob, "", 1);
1112
- zUtf8 = blob_str(pBlob) + bomSize;
1113
- zUtf8 = fossil_unicode_to_utf8(zUtf8);
1114
- blob_zero(pBlob);
1115
- blob_append(pBlob, zUtf8, -1);
1116
- fossil_unicode_free(zUtf8);
1117
- }else if( starts_with_utf16be_bom(pBlob, &bomSize) ){
1118
- unsigned int i = blob_size(pBlob);
1109
+ }else if( bomSize == 2 ){
11191110
zUtf8 = blob_buffer(pBlob);
1120
- while( i > 0 ){
1121
- /* swap bytes of unicode representation */
1122
- char zTemp = zUtf8[--i];
1123
- zUtf8[i] = zUtf8[i-1];
1124
- zUtf8[--i] = zTemp;
1111
+ if (*((unsigned short *)zUtf8) == 0xfffe) {
1112
+ /* Found BOM, but with reversed bytes */
1113
+ unsigned int i = blob_size(pBlob);
1114
+ while( i > 0 ){
1115
+ /* swap bytes of unicode representation */
1116
+ char zTemp = zUtf8[--i];
1117
+ zUtf8[i] = zUtf8[i-1];
1118
+ zUtf8[--i] = zTemp;
1119
+ }
11251120
}
11261121
/* Make sure the blob contains two terminating 0-bytes */
11271122
blob_append(pBlob, "", 1);
11281123
zUtf8 = blob_str(pBlob) + bomSize;
11291124
zUtf8 = fossil_unicode_to_utf8(zUtf8);
11301125
--- src/blob.c
+++ src/blob.c
@@ -1095,35 +1095,30 @@
1095 ** done. If useMbcs is false and there is no BOM, the input string is assumed
1096 ** to be UTF-8 already, so no conversion is done.
1097 */
1098 void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){
1099 char *zUtf8;
1100 int bomSize = 0;
1101 if( starts_with_utf8_bom(pBlob, &bomSize) ){
1102 struct Blob temp;
1103 zUtf8 = blob_str(pBlob) + bomSize;
1104 blob_zero(&temp);
1105 blob_append(&temp, zUtf8, -1);
1106 blob_swap(pBlob, &temp);
1107 blob_reset(&temp);
1108 #ifdef _WIN32
1109 }else if( starts_with_utf16le_bom(pBlob, &bomSize) ){
1110 /* Make sure the blob contains two terminating 0-bytes */
1111 blob_append(pBlob, "", 1);
1112 zUtf8 = blob_str(pBlob) + bomSize;
1113 zUtf8 = fossil_unicode_to_utf8(zUtf8);
1114 blob_zero(pBlob);
1115 blob_append(pBlob, zUtf8, -1);
1116 fossil_unicode_free(zUtf8);
1117 }else if( starts_with_utf16be_bom(pBlob, &bomSize) ){
1118 unsigned int i = blob_size(pBlob);
1119 zUtf8 = blob_buffer(pBlob);
1120 while( i > 0 ){
1121 /* swap bytes of unicode representation */
1122 char zTemp = zUtf8[--i];
1123 zUtf8[i] = zUtf8[i-1];
1124 zUtf8[--i] = zTemp;
 
 
 
 
1125 }
1126 /* Make sure the blob contains two terminating 0-bytes */
1127 blob_append(pBlob, "", 1);
1128 zUtf8 = blob_str(pBlob) + bomSize;
1129 zUtf8 = fossil_unicode_to_utf8(zUtf8);
1130
--- src/blob.c
+++ src/blob.c
@@ -1095,35 +1095,30 @@
1095 ** done. If useMbcs is false and there is no BOM, the input string is assumed
1096 ** to be UTF-8 already, so no conversion is done.
1097 */
1098 void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){
1099 char *zUtf8;
1100 int bomSize = starts_with_bom(pBlob);
1101 if( bomSize == 3 ){
1102 struct Blob temp;
1103 zUtf8 = blob_str(pBlob) + bomSize;
1104 blob_zero(&temp);
1105 blob_append(&temp, zUtf8, -1);
1106 blob_swap(pBlob, &temp);
1107 blob_reset(&temp);
1108 #ifdef _WIN32
1109 }else if( bomSize == 2 ){
 
 
 
 
 
 
 
 
 
1110 zUtf8 = blob_buffer(pBlob);
1111 if (*((unsigned short *)zUtf8) == 0xfffe) {
1112 /* Found BOM, but with reversed bytes */
1113 unsigned int i = blob_size(pBlob);
1114 while( i > 0 ){
1115 /* swap bytes of unicode representation */
1116 char zTemp = zUtf8[--i];
1117 zUtf8[i] = zUtf8[i-1];
1118 zUtf8[--i] = zTemp;
1119 }
1120 }
1121 /* Make sure the blob contains two terminating 0-bytes */
1122 blob_append(pBlob, "", 1);
1123 zUtf8 = blob_str(pBlob) + bomSize;
1124 zUtf8 = fossil_unicode_to_utf8(zUtf8);
1125
+2 -2
--- src/checkin.c
+++ src/checkin.c
@@ -899,17 +899,17 @@
899899
int binOk, /* Non-zero if binary warnings should be disabled. */
900900
int encodingOk, /* Non-zero if encoding warnings should be disabled. */
901901
const char *zFilename /* The full name of the file being committed. */
902902
){
903903
int eType; /* return value of looks_like_utf8/utf16() */
904
- int fUnicode; /* return value of starts_with_utf16_bom() */
904
+ int fUnicode; /* 1 if blob starts with UTF-16 BOM */
905905
char *zMsg; /* Warning message */
906906
Blob fname; /* Relative pathname of the file */
907907
static int allOk = 0; /* Set to true to disable this routine */
908908
909909
if( allOk ) return 0;
910
- fUnicode = starts_with_utf16_bom(p, 0);
910
+ fUnicode = (starts_with_bom(p) == 2);
911911
eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
912912
if( eType==0 || eType==-1 || fUnicode ){
913913
const char *zWarning;
914914
const char *zDisable;
915915
const char *zConvert = "c=convert/";
916916
--- src/checkin.c
+++ src/checkin.c
@@ -899,17 +899,17 @@
899 int binOk, /* Non-zero if binary warnings should be disabled. */
900 int encodingOk, /* Non-zero if encoding warnings should be disabled. */
901 const char *zFilename /* The full name of the file being committed. */
902 ){
903 int eType; /* return value of looks_like_utf8/utf16() */
904 int fUnicode; /* return value of starts_with_utf16_bom() */
905 char *zMsg; /* Warning message */
906 Blob fname; /* Relative pathname of the file */
907 static int allOk = 0; /* Set to true to disable this routine */
908
909 if( allOk ) return 0;
910 fUnicode = starts_with_utf16_bom(p, 0);
911 eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
912 if( eType==0 || eType==-1 || fUnicode ){
913 const char *zWarning;
914 const char *zDisable;
915 const char *zConvert = "c=convert/";
916
--- src/checkin.c
+++ src/checkin.c
@@ -899,17 +899,17 @@
899 int binOk, /* Non-zero if binary warnings should be disabled. */
900 int encodingOk, /* Non-zero if encoding warnings should be disabled. */
901 const char *zFilename /* The full name of the file being committed. */
902 ){
903 int eType; /* return value of looks_like_utf8/utf16() */
904 int fUnicode; /* 1 if blob starts with UTF-16 BOM */
905 char *zMsg; /* Warning message */
906 Blob fname; /* Relative pathname of the file */
907 static int allOk = 0; /* Set to true to disable this routine */
908
909 if( allOk ) return 0;
910 fUnicode = (starts_with_bom(p) == 2);
911 eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
912 if( eType==0 || eType==-1 || fUnicode ){
913 const char *zWarning;
914 const char *zDisable;
915 const char *zConvert = "c=convert/";
916
+13 -58
--- src/diff.c
+++ src/diff.c
@@ -340,72 +340,27 @@
340340
if( pnByte ) *pnByte = 3;
341341
return bom;
342342
}
343343
344344
/*
345
-** This function returns non-zero if the blob starts with a UTF-8
346
-** byte-order-mark (BOM).
345
+** This function returns detected BOM size if the blob starts with
346
+** a UTF-8, UTF-16le or UTF-16be byte-order-mark (BOM).
347347
*/
348
-int starts_with_utf8_bom(const Blob *pContent, int *pnByte){
348
+int starts_with_bom(const Blob *pContent){
349349
const char *z = blob_buffer(pContent);
350
- int bomSize = 0;
350
+ int c1, bomSize = 0;
351351
const unsigned char *bom = get_utf8_bom(&bomSize);
352352
353
- if( pnByte ) *pnByte = bomSize;
354
- if( blob_size(pContent)<bomSize ) return 0;
355
- return memcmp(z, bom, bomSize)==0;
356
-}
357
-
358
-/*
359
-** This function returns non-zero if the blob starts with a UTF-16le or
360
-** UTF-16be byte-order-mark (BOM).
361
-*/
362
-int starts_with_utf16_bom(const Blob *pContent, int *pnByte){
363
- const char *z = blob_buffer(pContent);
364
- int c1, c2;
365
-
366
- if( pnByte ) *pnByte = 2;
367
- if( blob_size(pContent)<2 ) return 0;
368
- c1 = z[0]; c2 = z[1];
369
- if( (c1==(char)0xff) && (c2==(char)0xfe) ){
370
- return 1;
371
- }else if( (c1==(char)0xfe) && (c2==(char)0xff) ){
372
- return 1;
373
- }
374
- return 0;
375
-}
376
-
377
-/*
378
-** This function returns non-zero if the blob starts with a UTF-16le
379
-** byte-order-mark (BOM).
380
-*/
381
-int starts_with_utf16le_bom(const Blob *pContent, int *pnByte){
382
- const char *z = blob_buffer(pContent);
383
- int c1, c2;
384
-
385
- if( pnByte ) *pnByte = 2;
386
- if( blob_size(pContent)<2 ) return 0;
387
- c1 = z[0]; c2 = z[1];
388
- if( (c1==(char)0xff) && (c2==(char)0xfe) ){
389
- return 1;
390
- }
391
- return 0;
392
-}
393
-
394
-/*
395
-** This function returns non-zero if the blob starts with a UTF-16be
396
-** byte-order-mark (BOM).
397
-*/
398
-int starts_with_utf16be_bom(const Blob *pContent, int *pnByte){
399
- const char *z = blob_buffer(pContent);
400
- int c1, c2;
401
-
402
- if( pnByte ) *pnByte = 2;
403
- if( blob_size(pContent)<2 ) return 0;
404
- c1 = z[0]; c2 = z[1];
405
- if( (c1==(char)0xfe) && (c2==(char)0xff) ){
406
- return 1;
353
+ if( (blob_size(pContent)>=bomSize)
354
+ && (memcmp(z, bom, bomSize)==0) ){
355
+ return bomSize;
356
+ }
357
+ /* Only accept UTF-16 BOM if the blob has an even number of bytes */
358
+ if( (blob_size(pContent)<2) || (blob_size(pContent)&1) ) return 0;
359
+ c1 = *((unsigned short *)z);
360
+ if( (c1==0xfffe) || (c1==0xfeff) ){
361
+ return 2;
407362
}
408363
return 0;
409364
}
410365
411366
/*
412367
--- src/diff.c
+++ src/diff.c
@@ -340,72 +340,27 @@
340 if( pnByte ) *pnByte = 3;
341 return bom;
342 }
343
344 /*
345 ** This function returns non-zero if the blob starts with a UTF-8
346 ** byte-order-mark (BOM).
347 */
348 int starts_with_utf8_bom(const Blob *pContent, int *pnByte){
349 const char *z = blob_buffer(pContent);
350 int bomSize = 0;
351 const unsigned char *bom = get_utf8_bom(&bomSize);
352
353 if( pnByte ) *pnByte = bomSize;
354 if( blob_size(pContent)<bomSize ) return 0;
355 return memcmp(z, bom, bomSize)==0;
356 }
357
358 /*
359 ** This function returns non-zero if the blob starts with a UTF-16le or
360 ** UTF-16be byte-order-mark (BOM).
361 */
362 int starts_with_utf16_bom(const Blob *pContent, int *pnByte){
363 const char *z = blob_buffer(pContent);
364 int c1, c2;
365
366 if( pnByte ) *pnByte = 2;
367 if( blob_size(pContent)<2 ) return 0;
368 c1 = z[0]; c2 = z[1];
369 if( (c1==(char)0xff) && (c2==(char)0xfe) ){
370 return 1;
371 }else if( (c1==(char)0xfe) && (c2==(char)0xff) ){
372 return 1;
373 }
374 return 0;
375 }
376
377 /*
378 ** This function returns non-zero if the blob starts with a UTF-16le
379 ** byte-order-mark (BOM).
380 */
381 int starts_with_utf16le_bom(const Blob *pContent, int *pnByte){
382 const char *z = blob_buffer(pContent);
383 int c1, c2;
384
385 if( pnByte ) *pnByte = 2;
386 if( blob_size(pContent)<2 ) return 0;
387 c1 = z[0]; c2 = z[1];
388 if( (c1==(char)0xff) && (c2==(char)0xfe) ){
389 return 1;
390 }
391 return 0;
392 }
393
394 /*
395 ** This function returns non-zero if the blob starts with a UTF-16be
396 ** byte-order-mark (BOM).
397 */
398 int starts_with_utf16be_bom(const Blob *pContent, int *pnByte){
399 const char *z = blob_buffer(pContent);
400 int c1, c2;
401
402 if( pnByte ) *pnByte = 2;
403 if( blob_size(pContent)<2 ) return 0;
404 c1 = z[0]; c2 = z[1];
405 if( (c1==(char)0xfe) && (c2==(char)0xff) ){
406 return 1;
407 }
408 return 0;
409 }
410
411 /*
412
--- src/diff.c
+++ src/diff.c
@@ -340,72 +340,27 @@
340 if( pnByte ) *pnByte = 3;
341 return bom;
342 }
343
344 /*
345 ** This function returns detected BOM size if the blob starts with
346 ** a UTF-8, UTF-16le or UTF-16be byte-order-mark (BOM).
347 */
348 int starts_with_bom(const Blob *pContent){
349 const char *z = blob_buffer(pContent);
350 int c1, bomSize = 0;
351 const unsigned char *bom = get_utf8_bom(&bomSize);
352
353 if( (blob_size(pContent)>=bomSize)
354 && (memcmp(z, bom, bomSize)==0) ){
355 return bomSize;
356 }
357 /* Only accept UTF-16 BOM if the blob has an even number of bytes */
358 if( (blob_size(pContent)<2) || (blob_size(pContent)&1) ) return 0;
359 c1 = *((unsigned short *)z);
360 if( (c1==0xfffe) || (c1==0xfeff) ){
361 return 2;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362 }
363 return 0;
364 }
365
366 /*
367

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button