Fossil SCM

Improvements to UTF-16 byte-order-mark detection.

mistachkin 2013-02-15 19:57 trunk merge
Commit 3d988df67062c5e099a609bb9c6e61a1e78c6fa5
+4 -3
--- src/blob.c
+++ src/blob.c
@@ -1096,24 +1096,25 @@
10961096
** to be UTF-8 already, so no conversion is done.
10971097
*/
10981098
void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){
10991099
char *zUtf8;
11001100
int bomSize = 0;
1101
+ int bomReverse = 0;
11011102
if( starts_with_utf8_bom(pBlob, &bomSize) ){
11021103
struct Blob temp;
11031104
zUtf8 = blob_str(pBlob) + bomSize;
11041105
blob_zero(&temp);
11051106
blob_append(&temp, zUtf8, -1);
11061107
blob_swap(pBlob, &temp);
11071108
blob_reset(&temp);
11081109
#ifdef _WIN32
1109
- }else if( starts_with_utf16_bom(pBlob, &bomSize) ){
1110
+ }else if( starts_with_utf16_bom(pBlob, &bomSize, &bomReverse) ){
11101111
zUtf8 = blob_buffer(pBlob);
1111
- if (*((unsigned short *)zUtf8) == 0xfffe) {
1112
+ if( bomReverse ){
11121113
/* Found BOM, but with reversed bytes */
11131114
unsigned int i = blob_size(pBlob);
1114
- while( i > 0 ){
1115
+ while( i>0 ){
11151116
/* swap bytes of unicode representation */
11161117
char zTemp = zUtf8[--i];
11171118
zUtf8[i] = zUtf8[i-1];
11181119
zUtf8[--i] = zTemp;
11191120
}
11201121
--- src/blob.c
+++ src/blob.c
@@ -1096,24 +1096,25 @@
1096 ** to be UTF-8 already, so no conversion is done.
1097 */
1098 void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){
1099 char *zUtf8;
1100 int bomSize = 0;
 
1101 if( starts_with_utf8_bom(pBlob, &bomSize) ){
1102 struct Blob temp;
1103 zUtf8 = blob_str(pBlob) + bomSize;
1104 blob_zero(&temp);
1105 blob_append(&temp, zUtf8, -1);
1106 blob_swap(pBlob, &temp);
1107 blob_reset(&temp);
1108 #ifdef _WIN32
1109 }else if( starts_with_utf16_bom(pBlob, &bomSize) ){
1110 zUtf8 = blob_buffer(pBlob);
1111 if (*((unsigned short *)zUtf8) == 0xfffe) {
1112 /* Found BOM, but with reversed bytes */
1113 unsigned int i = blob_size(pBlob);
1114 while( i > 0 ){
1115 /* swap bytes of unicode representation */
1116 char zTemp = zUtf8[--i];
1117 zUtf8[i] = zUtf8[i-1];
1118 zUtf8[--i] = zTemp;
1119 }
1120
--- src/blob.c
+++ src/blob.c
@@ -1096,24 +1096,25 @@
1096 ** to be UTF-8 already, so no conversion is done.
1097 */
1098 void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){
1099 char *zUtf8;
1100 int bomSize = 0;
1101 int bomReverse = 0;
1102 if( starts_with_utf8_bom(pBlob, &bomSize) ){
1103 struct Blob temp;
1104 zUtf8 = blob_str(pBlob) + bomSize;
1105 blob_zero(&temp);
1106 blob_append(&temp, zUtf8, -1);
1107 blob_swap(pBlob, &temp);
1108 blob_reset(&temp);
1109 #ifdef _WIN32
1110 }else if( starts_with_utf16_bom(pBlob, &bomSize, &bomReverse) ){
1111 zUtf8 = blob_buffer(pBlob);
1112 if( bomReverse ){
1113 /* Found BOM, but with reversed bytes */
1114 unsigned int i = blob_size(pBlob);
1115 while( i>0 ){
1116 /* swap bytes of unicode representation */
1117 char zTemp = zUtf8[--i];
1118 zUtf8[i] = zUtf8[i-1];
1119 zUtf8[--i] = zTemp;
1120 }
1121
+1 -1
--- src/checkin.c
+++ src/checkin.c
@@ -906,11 +906,11 @@
906906
char *zMsg; /* Warning message */
907907
Blob fname; /* Relative pathname of the file */
908908
static int allOk = 0; /* Set to true to disable this routine */
909909
910910
if( allOk ) return 0;
911
- fUnicode = starts_with_utf16_bom(p, 0);
911
+ fUnicode = starts_with_utf16_bom(p, 0, 0);
912912
eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
913913
if( eType==0 || eType==-1 || fUnicode ){
914914
const char *zWarning;
915915
const char *zDisable;
916916
const char *zConvert = "c=convert/";
917917
--- src/checkin.c
+++ src/checkin.c
@@ -906,11 +906,11 @@
906 char *zMsg; /* Warning message */
907 Blob fname; /* Relative pathname of the file */
908 static int allOk = 0; /* Set to true to disable this routine */
909
910 if( allOk ) return 0;
911 fUnicode = starts_with_utf16_bom(p, 0);
912 eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
913 if( eType==0 || eType==-1 || fUnicode ){
914 const char *zWarning;
915 const char *zDisable;
916 const char *zConvert = "c=convert/";
917
--- src/checkin.c
+++ src/checkin.c
@@ -906,11 +906,11 @@
906 char *zMsg; /* Warning message */
907 Blob fname; /* Relative pathname of the file */
908 static int allOk = 0; /* Set to true to disable this routine */
909
910 if( allOk ) return 0;
911 fUnicode = starts_with_utf16_bom(p, 0, 0);
912 eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
913 if( eType==0 || eType==-1 || fUnicode ){
914 const char *zWarning;
915 const char *zDisable;
916 const char *zConvert = "c=convert/";
917
+1 -1
--- src/checkin.c
+++ src/checkin.c
@@ -906,11 +906,11 @@
906906
char *zMsg; /* Warning message */
907907
Blob fname; /* Relative pathname of the file */
908908
static int allOk = 0; /* Set to true to disable this routine */
909909
910910
if( allOk ) return 0;
911
- fUnicode = starts_with_utf16_bom(p, 0);
911
+ fUnicode = starts_with_utf16_bom(p, 0, 0);
912912
eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
913913
if( eType==0 || eType==-1 || fUnicode ){
914914
const char *zWarning;
915915
const char *zDisable;
916916
const char *zConvert = "c=convert/";
917917
--- src/checkin.c
+++ src/checkin.c
@@ -906,11 +906,11 @@
906 char *zMsg; /* Warning message */
907 Blob fname; /* Relative pathname of the file */
908 static int allOk = 0; /* Set to true to disable this routine */
909
910 if( allOk ) return 0;
911 fUnicode = starts_with_utf16_bom(p, 0);
912 eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
913 if( eType==0 || eType==-1 || fUnicode ){
914 const char *zWarning;
915 const char *zDisable;
916 const char *zConvert = "c=convert/";
917
--- src/checkin.c
+++ src/checkin.c
@@ -906,11 +906,11 @@
906 char *zMsg; /* Warning message */
907 Blob fname; /* Relative pathname of the file */
908 static int allOk = 0; /* Set to true to disable this routine */
909
910 if( allOk ) return 0;
911 fUnicode = starts_with_utf16_bom(p, 0, 0);
912 eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p);
913 if( eType==0 || eType==-1 || fUnicode ){
914 const char *zWarning;
915 const char *zDisable;
916 const char *zConvert = "c=convert/";
917
+26 -12
--- src/diff.c
+++ src/diff.c
@@ -354,24 +354,38 @@
354354
if( blob_size(pContent)<bomSize ) return 0;
355355
return memcmp(z, bom, bomSize)==0;
356356
}
357357
358358
/*
359
-** This function returns non-zero if the blob starts with a UTF-16le or
360
-** UTF-16be byte-order-mark (BOM).
359
+** This function returns non-zero if the blob starts with a UTF-16
360
+** byte-order-mark (BOM), either in the endianness of the machine
361
+** or in reversed byte order.
361362
*/
362
-int starts_with_utf16_bom(const Blob *pContent, int *pnByte){
363
+int starts_with_utf16_bom(
364
+ const Blob *pContent, /* IN: Blob content to perform BOM detection on. */
365
+ int *pnByte, /* OUT: The number of bytes used for the BOM. */
366
+ int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */
367
+){
363368
const char *z = blob_buffer(pContent);
364
- int c1;
365
-
366
- if( pnByte ) *pnByte = 2;
367
- if( (blob_size(pContent)<2) || (blob_size(pContent)&1)) return 0;
368
- c1 = ((unsigned short *)z)[0];
369
- if( (c1==0xfeff) || (c1==0xfffe) ){
370
- if( blob_size(pContent) < 4 ) return 1;
371
- c1 = ((unsigned short *)z)[1];
372
- if( c1 != 0 ) return 1;
369
+ int bomSize = 2;
370
+ static const unsigned short bom = 0xfeff;
371
+ static const unsigned short bom_reversed = 0xfffe;
372
+ static const unsigned short null = 0;
373
+ int size;
374
+
375
+ if( pnByte ) *pnByte = bomSize;
376
+ if( pbReverse ) *pbReverse = -1; /* Unknown. */
377
+ size = blob_size(pContent);
378
+ if( (size<bomSize) || (size%2) ) return 0;
379
+ if( memcmp(z, &bom_reversed, bomSize)==0 ){
380
+ if( pbReverse ) *pbReverse = 1;
381
+ if( size<(2*bomSize) ) return 1;
382
+ if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1;
383
+ }else if( memcmp(z, &bom, bomSize)==0 ){
384
+ if( pbReverse ) *pbReverse = 0;
385
+ if( size<(2*bomSize) ) return 1;
386
+ if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1;
373387
}
374388
return 0;
375389
}
376390
377391
/*
378392
--- src/diff.c
+++ src/diff.c
@@ -354,24 +354,38 @@
354 if( blob_size(pContent)<bomSize ) return 0;
355 return memcmp(z, bom, bomSize)==0;
356 }
357
358 /*
359 ** This function returns non-zero if the blob starts with a UTF-16le or
360 ** UTF-16be byte-order-mark (BOM).
 
361 */
362 int starts_with_utf16_bom(const Blob *pContent, int *pnByte){
 
 
 
 
363 const char *z = blob_buffer(pContent);
364 int c1;
365
366 if( pnByte ) *pnByte = 2;
367 if( (blob_size(pContent)<2) || (blob_size(pContent)&1)) return 0;
368 c1 = ((unsigned short *)z)[0];
369 if( (c1==0xfeff) || (c1==0xfffe) ){
370 if( blob_size(pContent) < 4 ) return 1;
371 c1 = ((unsigned short *)z)[1];
372 if( c1 != 0 ) return 1;
 
 
 
 
 
 
 
 
 
373 }
374 return 0;
375 }
376
377 /*
378
--- src/diff.c
+++ src/diff.c
@@ -354,24 +354,38 @@
354 if( blob_size(pContent)<bomSize ) return 0;
355 return memcmp(z, bom, bomSize)==0;
356 }
357
358 /*
359 ** This function returns non-zero if the blob starts with a UTF-16
360 ** byte-order-mark (BOM), either in the endianness of the machine
361 ** or in reversed byte order.
362 */
363 int starts_with_utf16_bom(
364 const Blob *pContent, /* IN: Blob content to perform BOM detection on. */
365 int *pnByte, /* OUT: The number of bytes used for the BOM. */
366 int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */
367 ){
368 const char *z = blob_buffer(pContent);
369 int bomSize = 2;
370 static const unsigned short bom = 0xfeff;
371 static const unsigned short bom_reversed = 0xfffe;
372 static const unsigned short null = 0;
373 int size;
374
375 if( pnByte ) *pnByte = bomSize;
376 if( pbReverse ) *pbReverse = -1; /* Unknown. */
377 size = blob_size(pContent);
378 if( (size<bomSize) || (size%2) ) return 0;
379 if( memcmp(z, &bom_reversed, bomSize)==0 ){
380 if( pbReverse ) *pbReverse = 1;
381 if( size<(2*bomSize) ) return 1;
382 if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1;
383 }else if( memcmp(z, &bom, bomSize)==0 ){
384 if( pbReverse ) *pbReverse = 0;
385 if( size<(2*bomSize) ) return 1;
386 if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1;
387 }
388 return 0;
389 }
390
391 /*
392
+26 -12
--- src/diff.c
+++ src/diff.c
@@ -354,24 +354,38 @@
354354
if( blob_size(pContent)<bomSize ) return 0;
355355
return memcmp(z, bom, bomSize)==0;
356356
}
357357
358358
/*
359
-** This function returns non-zero if the blob starts with a UTF-16le or
360
-** UTF-16be byte-order-mark (BOM).
359
+** This function returns non-zero if the blob starts with a UTF-16
360
+** byte-order-mark (BOM), either in the endianness of the machine
361
+** or in reversed byte order.
361362
*/
362
-int starts_with_utf16_bom(const Blob *pContent, int *pnByte){
363
+int starts_with_utf16_bom(
364
+ const Blob *pContent, /* IN: Blob content to perform BOM detection on. */
365
+ int *pnByte, /* OUT: The number of bytes used for the BOM. */
366
+ int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */
367
+){
363368
const char *z = blob_buffer(pContent);
364
- int c1;
365
-
366
- if( pnByte ) *pnByte = 2;
367
- if( (blob_size(pContent)<2) || (blob_size(pContent)&1)) return 0;
368
- c1 = ((unsigned short *)z)[0];
369
- if( (c1==0xfeff) || (c1==0xfffe) ){
370
- if( blob_size(pContent) < 4 ) return 1;
371
- c1 = ((unsigned short *)z)[1];
372
- if( c1 != 0 ) return 1;
369
+ int bomSize = 2;
370
+ static const unsigned short bom = 0xfeff;
371
+ static const unsigned short bom_reversed = 0xfffe;
372
+ static const unsigned short null = 0;
373
+ int size;
374
+
375
+ if( pnByte ) *pnByte = bomSize;
376
+ if( pbReverse ) *pbReverse = -1; /* Unknown. */
377
+ size = blob_size(pContent);
378
+ if( (size<bomSize) || (size%2) ) return 0;
379
+ if( memcmp(z, &bom_reversed, bomSize)==0 ){
380
+ if( pbReverse ) *pbReverse = 1;
381
+ if( size<(2*bomSize) ) return 1;
382
+ if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1;
383
+ }else if( memcmp(z, &bom, bomSize)==0 ){
384
+ if( pbReverse ) *pbReverse = 0;
385
+ if( size<(2*bomSize) ) return 1;
386
+ if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1;
373387
}
374388
return 0;
375389
}
376390
377391
/*
378392
--- src/diff.c
+++ src/diff.c
@@ -354,24 +354,38 @@
354 if( blob_size(pContent)<bomSize ) return 0;
355 return memcmp(z, bom, bomSize)==0;
356 }
357
358 /*
359 ** This function returns non-zero if the blob starts with a UTF-16le or
360 ** UTF-16be byte-order-mark (BOM).
 
361 */
362 int starts_with_utf16_bom(const Blob *pContent, int *pnByte){
 
 
 
 
363 const char *z = blob_buffer(pContent);
364 int c1;
365
366 if( pnByte ) *pnByte = 2;
367 if( (blob_size(pContent)<2) || (blob_size(pContent)&1)) return 0;
368 c1 = ((unsigned short *)z)[0];
369 if( (c1==0xfeff) || (c1==0xfffe) ){
370 if( blob_size(pContent) < 4 ) return 1;
371 c1 = ((unsigned short *)z)[1];
372 if( c1 != 0 ) return 1;
 
 
 
 
 
 
 
 
 
373 }
374 return 0;
375 }
376
377 /*
378
--- src/diff.c
+++ src/diff.c
@@ -354,24 +354,38 @@
354 if( blob_size(pContent)<bomSize ) return 0;
355 return memcmp(z, bom, bomSize)==0;
356 }
357
358 /*
359 ** This function returns non-zero if the blob starts with a UTF-16
360 ** byte-order-mark (BOM), either in the endianness of the machine
361 ** or in reversed byte order.
362 */
363 int starts_with_utf16_bom(
364 const Blob *pContent, /* IN: Blob content to perform BOM detection on. */
365 int *pnByte, /* OUT: The number of bytes used for the BOM. */
366 int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */
367 ){
368 const char *z = blob_buffer(pContent);
369 int bomSize = 2;
370 static const unsigned short bom = 0xfeff;
371 static const unsigned short bom_reversed = 0xfffe;
372 static const unsigned short null = 0;
373 int size;
374
375 if( pnByte ) *pnByte = bomSize;
376 if( pbReverse ) *pbReverse = -1; /* Unknown. */
377 size = blob_size(pContent);
378 if( (size<bomSize) || (size%2) ) return 0;
379 if( memcmp(z, &bom_reversed, bomSize)==0 ){
380 if( pbReverse ) *pbReverse = 1;
381 if( size<(2*bomSize) ) return 1;
382 if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1;
383 }else if( memcmp(z, &bom, bomSize)==0 ){
384 if( pbReverse ) *pbReverse = 0;
385 if( size<(2*bomSize) ) return 1;
386 if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1;
387 }
388 return 0;
389 }
390
391 /*
392

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button