Fossil SCM
Improvements to UTF-16 byte-order-mark detection.
Commit
3d988df67062c5e099a609bb9c6e61a1e78c6fa5
Parent
8522e0ab02507f2…
5 files changed
+4
-3
+1
-1
+1
-1
+26
-12
+26
-12
+4
-3
| --- src/blob.c | ||
| +++ src/blob.c | ||
| @@ -1096,24 +1096,25 @@ | ||
| 1096 | 1096 | ** to be UTF-8 already, so no conversion is done. |
| 1097 | 1097 | */ |
| 1098 | 1098 | void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){ |
| 1099 | 1099 | char *zUtf8; |
| 1100 | 1100 | int bomSize = 0; |
| 1101 | + int bomReverse = 0; | |
| 1101 | 1102 | if( starts_with_utf8_bom(pBlob, &bomSize) ){ |
| 1102 | 1103 | struct Blob temp; |
| 1103 | 1104 | zUtf8 = blob_str(pBlob) + bomSize; |
| 1104 | 1105 | blob_zero(&temp); |
| 1105 | 1106 | blob_append(&temp, zUtf8, -1); |
| 1106 | 1107 | blob_swap(pBlob, &temp); |
| 1107 | 1108 | blob_reset(&temp); |
| 1108 | 1109 | #ifdef _WIN32 |
| 1109 | - }else if( starts_with_utf16_bom(pBlob, &bomSize) ){ | |
| 1110 | + }else if( starts_with_utf16_bom(pBlob, &bomSize, &bomReverse) ){ | |
| 1110 | 1111 | zUtf8 = blob_buffer(pBlob); |
| 1111 | - if (*((unsigned short *)zUtf8) == 0xfffe) { | |
| 1112 | + if( bomReverse ){ | |
| 1112 | 1113 | /* Found BOM, but with reversed bytes */ |
| 1113 | 1114 | unsigned int i = blob_size(pBlob); |
| 1114 | - while( i > 0 ){ | |
| 1115 | + while( i>0 ){ | |
| 1115 | 1116 | /* swap bytes of unicode representation */ |
| 1116 | 1117 | char zTemp = zUtf8[--i]; |
| 1117 | 1118 | zUtf8[i] = zUtf8[i-1]; |
| 1118 | 1119 | zUtf8[--i] = zTemp; |
| 1119 | 1120 | } |
| 1120 | 1121 |
| --- src/blob.c | |
| +++ src/blob.c | |
| @@ -1096,24 +1096,25 @@ | |
| 1096 | ** to be UTF-8 already, so no conversion is done. |
| 1097 | */ |
| 1098 | void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){ |
| 1099 | char *zUtf8; |
| 1100 | int bomSize = 0; |
| 1101 | if( starts_with_utf8_bom(pBlob, &bomSize) ){ |
| 1102 | struct Blob temp; |
| 1103 | zUtf8 = blob_str(pBlob) + bomSize; |
| 1104 | blob_zero(&temp); |
| 1105 | blob_append(&temp, zUtf8, -1); |
| 1106 | blob_swap(pBlob, &temp); |
| 1107 | blob_reset(&temp); |
| 1108 | #ifdef _WIN32 |
| 1109 | }else if( starts_with_utf16_bom(pBlob, &bomSize) ){ |
| 1110 | zUtf8 = blob_buffer(pBlob); |
| 1111 | if (*((unsigned short *)zUtf8) == 0xfffe) { |
| 1112 | /* Found BOM, but with reversed bytes */ |
| 1113 | unsigned int i = blob_size(pBlob); |
| 1114 | while( i > 0 ){ |
| 1115 | /* swap bytes of unicode representation */ |
| 1116 | char zTemp = zUtf8[--i]; |
| 1117 | zUtf8[i] = zUtf8[i-1]; |
| 1118 | zUtf8[--i] = zTemp; |
| 1119 | } |
| 1120 |
| --- src/blob.c | |
| +++ src/blob.c | |
| @@ -1096,24 +1096,25 @@ | |
| 1096 | ** to be UTF-8 already, so no conversion is done. |
| 1097 | */ |
| 1098 | void blob_to_utf8_no_bom(Blob *pBlob, int useMbcs){ |
| 1099 | char *zUtf8; |
| 1100 | int bomSize = 0; |
| 1101 | int bomReverse = 0; |
| 1102 | if( starts_with_utf8_bom(pBlob, &bomSize) ){ |
| 1103 | struct Blob temp; |
| 1104 | zUtf8 = blob_str(pBlob) + bomSize; |
| 1105 | blob_zero(&temp); |
| 1106 | blob_append(&temp, zUtf8, -1); |
| 1107 | blob_swap(pBlob, &temp); |
| 1108 | blob_reset(&temp); |
| 1109 | #ifdef _WIN32 |
| 1110 | }else if( starts_with_utf16_bom(pBlob, &bomSize, &bomReverse) ){ |
| 1111 | zUtf8 = blob_buffer(pBlob); |
| 1112 | if( bomReverse ){ |
| 1113 | /* Found BOM, but with reversed bytes */ |
| 1114 | unsigned int i = blob_size(pBlob); |
| 1115 | while( i>0 ){ |
| 1116 | /* swap bytes of unicode representation */ |
| 1117 | char zTemp = zUtf8[--i]; |
| 1118 | zUtf8[i] = zUtf8[i-1]; |
| 1119 | zUtf8[--i] = zTemp; |
| 1120 | } |
| 1121 |
+1
-1
| --- src/checkin.c | ||
| +++ src/checkin.c | ||
| @@ -906,11 +906,11 @@ | ||
| 906 | 906 | char *zMsg; /* Warning message */ |
| 907 | 907 | Blob fname; /* Relative pathname of the file */ |
| 908 | 908 | static int allOk = 0; /* Set to true to disable this routine */ |
| 909 | 909 | |
| 910 | 910 | if( allOk ) return 0; |
| 911 | - fUnicode = starts_with_utf16_bom(p, 0); | |
| 911 | + fUnicode = starts_with_utf16_bom(p, 0, 0); | |
| 912 | 912 | eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p); |
| 913 | 913 | if( eType==0 || eType==-1 || fUnicode ){ |
| 914 | 914 | const char *zWarning; |
| 915 | 915 | const char *zDisable; |
| 916 | 916 | const char *zConvert = "c=convert/"; |
| 917 | 917 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -906,11 +906,11 @@ | |
| 906 | char *zMsg; /* Warning message */ |
| 907 | Blob fname; /* Relative pathname of the file */ |
| 908 | static int allOk = 0; /* Set to true to disable this routine */ |
| 909 | |
| 910 | if( allOk ) return 0; |
| 911 | fUnicode = starts_with_utf16_bom(p, 0); |
| 912 | eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p); |
| 913 | if( eType==0 || eType==-1 || fUnicode ){ |
| 914 | const char *zWarning; |
| 915 | const char *zDisable; |
| 916 | const char *zConvert = "c=convert/"; |
| 917 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -906,11 +906,11 @@ | |
| 906 | char *zMsg; /* Warning message */ |
| 907 | Blob fname; /* Relative pathname of the file */ |
| 908 | static int allOk = 0; /* Set to true to disable this routine */ |
| 909 | |
| 910 | if( allOk ) return 0; |
| 911 | fUnicode = starts_with_utf16_bom(p, 0, 0); |
| 912 | eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p); |
| 913 | if( eType==0 || eType==-1 || fUnicode ){ |
| 914 | const char *zWarning; |
| 915 | const char *zDisable; |
| 916 | const char *zConvert = "c=convert/"; |
| 917 |
+1
-1
| --- src/checkin.c | ||
| +++ src/checkin.c | ||
| @@ -906,11 +906,11 @@ | ||
| 906 | 906 | char *zMsg; /* Warning message */ |
| 907 | 907 | Blob fname; /* Relative pathname of the file */ |
| 908 | 908 | static int allOk = 0; /* Set to true to disable this routine */ |
| 909 | 909 | |
| 910 | 910 | if( allOk ) return 0; |
| 911 | - fUnicode = starts_with_utf16_bom(p, 0); | |
| 911 | + fUnicode = starts_with_utf16_bom(p, 0, 0); | |
| 912 | 912 | eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p); |
| 913 | 913 | if( eType==0 || eType==-1 || fUnicode ){ |
| 914 | 914 | const char *zWarning; |
| 915 | 915 | const char *zDisable; |
| 916 | 916 | const char *zConvert = "c=convert/"; |
| 917 | 917 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -906,11 +906,11 @@ | |
| 906 | char *zMsg; /* Warning message */ |
| 907 | Blob fname; /* Relative pathname of the file */ |
| 908 | static int allOk = 0; /* Set to true to disable this routine */ |
| 909 | |
| 910 | if( allOk ) return 0; |
| 911 | fUnicode = starts_with_utf16_bom(p, 0); |
| 912 | eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p); |
| 913 | if( eType==0 || eType==-1 || fUnicode ){ |
| 914 | const char *zWarning; |
| 915 | const char *zDisable; |
| 916 | const char *zConvert = "c=convert/"; |
| 917 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -906,11 +906,11 @@ | |
| 906 | char *zMsg; /* Warning message */ |
| 907 | Blob fname; /* Relative pathname of the file */ |
| 908 | static int allOk = 0; /* Set to true to disable this routine */ |
| 909 | |
| 910 | if( allOk ) return 0; |
| 911 | fUnicode = starts_with_utf16_bom(p, 0, 0); |
| 912 | eType = fUnicode ? looks_like_utf16(p) : looks_like_utf8(p); |
| 913 | if( eType==0 || eType==-1 || fUnicode ){ |
| 914 | const char *zWarning; |
| 915 | const char *zDisable; |
| 916 | const char *zConvert = "c=convert/"; |
| 917 |
+26
-12
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -354,24 +354,38 @@ | ||
| 354 | 354 | if( blob_size(pContent)<bomSize ) return 0; |
| 355 | 355 | return memcmp(z, bom, bomSize)==0; |
| 356 | 356 | } |
| 357 | 357 | |
| 358 | 358 | /* |
| 359 | -** This function returns non-zero if the blob starts with a UTF-16le or | |
| 360 | -** UTF-16be byte-order-mark (BOM). | |
| 359 | +** This function returns non-zero if the blob starts with a UTF-16 | |
| 360 | +** byte-order-mark (BOM), either in the endianness of the machine | |
| 361 | +** or in reversed byte order. | |
| 361 | 362 | */ |
| 362 | -int starts_with_utf16_bom(const Blob *pContent, int *pnByte){ | |
| 363 | +int starts_with_utf16_bom( | |
| 364 | + const Blob *pContent, /* IN: Blob content to perform BOM detection on. */ | |
| 365 | + int *pnByte, /* OUT: The number of bytes used for the BOM. */ | |
| 366 | + int *pbReverse /* OUT: 1 for reversed BOM, 0 for native BOM, -1 if no BOM. */ | |
| 367 | +){ | |
| 363 | 368 | const char *z = blob_buffer(pContent); |
| 364 | - int c1; | |
| 365 | - | |
| 366 | - if( pnByte ) *pnByte = 2; | |
| 367 | - if( (blob_size(pContent)<2) || (blob_size(pContent)&1)) return 0; | |
| 368 | - c1 = ((unsigned short *)z)[0]; | |
| 369 | - if( (c1==0xfeff) || (c1==0xfffe) ){ | |
| 370 | - if( blob_size(pContent) < 4 ) return 1; | |
| 371 | - c1 = ((unsigned short *)z)[1]; | |
| 372 | - if( c1 != 0 ) return 1; | |
| 369 | + int bomSize = 2; | |
| 370 | + static const unsigned short bom = 0xfeff; | |
| 371 | + static const unsigned short bom_reversed = 0xfffe; | |
| 372 | + static const unsigned short null = 0; | |
| 373 | + int size; | |
| 374 | + | |
| 375 | + if( pnByte ) *pnByte = bomSize; | |
| 376 | + if( pbReverse ) *pbReverse = -1; /* Unknown. */ | |
| 377 | + size = blob_size(pContent); | |
| 378 | + if( (size<bomSize) || (size%2) ) return 0; | |
| 379 | + if( memcmp(z, &bom_reversed, bomSize)==0 ){ | |
| 380 | + if( pbReverse ) *pbReverse = 1; | |
| 381 | + if( size<(2*bomSize) ) return 1; | |
| 382 | + if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1; | |
| 383 | + }else if( memcmp(z, &bom, bomSize)==0 ){ | |
| 384 | + if( pbReverse ) *pbReverse = 0; | |
| 385 | + if( size<(2*bomSize) ) return 1; | |
| 386 | + if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1; | |
| 373 | 387 | } |
| 374 | 388 | return 0; |
| 375 | 389 | } |
| 376 | 390 | |
| 377 | 391 | /* |
| 378 | 392 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -354,24 +354,38 @@ | |
| 354 | if( blob_size(pContent)<bomSize ) return 0; |
| 355 | return memcmp(z, bom, bomSize)==0; |
| 356 | } |
| 357 | |
| 358 | /* |
| 359 | ** This function returns non-zero if the blob starts with a UTF-16le or |
| 360 | ** UTF-16be byte-order-mark (BOM). |
| 361 | */ |
| 362 | int starts_with_utf16_bom(const Blob *pContent, int *pnByte){ |
| 363 | const char *z = blob_buffer(pContent); |
| 364 | int c1; |
| 365 | |
| 366 | if( pnByte ) *pnByte = 2; |
| 367 | if( (blob_size(pContent)<2) || (blob_size(pContent)&1)) return 0; |
| 368 | c1 = ((unsigned short *)z)[0]; |
| 369 | if( (c1==0xfeff) || (c1==0xfffe) ){ |
| 370 | if( blob_size(pContent) < 4 ) return 1; |
| 371 | c1 = ((unsigned short *)z)[1]; |
| 372 | if( c1 != 0 ) return 1; |
| 373 | } |
| 374 | return 0; |
| 375 | } |
| 376 | |
| 377 | /* |
| 378 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -354,24 +354,38 @@ | |
| 354 | if( blob_size(pContent)<bomSize ) return 0; |
| 355 | return memcmp(z, bom, bomSize)==0; |
| 356 | } |
| 357 | |
| 358 | /* |
| 359 | ** This function returns non-zero if the blob starts with a UTF-16 |
| 360 | ** byte-order-mark (BOM), either in the endianness of the machine |
| 361 | ** or in reversed byte order. |
| 362 | */ |
| 363 | int starts_with_utf16_bom( |
| 364 | const Blob *pContent, /* IN: Blob content to perform BOM detection on. */ |
| 365 | int *pnByte, /* OUT: The number of bytes used for the BOM. */ |
| 366 | int *pbReverse /* OUT: 1 for reversed BOM, 0 for native BOM, -1 if no BOM. */ |
| 367 | ){ |
| 368 | const char *z = blob_buffer(pContent); |
| 369 | int bomSize = 2; |
| 370 | static const unsigned short bom = 0xfeff; |
| 371 | static const unsigned short bom_reversed = 0xfffe; |
| 372 | static const unsigned short null = 0; |
| 373 | int size; |
| 374 | |
| 375 | if( pnByte ) *pnByte = bomSize; |
| 376 | if( pbReverse ) *pbReverse = -1; /* Unknown. */ |
| 377 | size = blob_size(pContent); |
| 378 | if( (size<bomSize) || (size%2) ) return 0; |
| 379 | if( memcmp(z, &bom_reversed, bomSize)==0 ){ |
| 380 | if( pbReverse ) *pbReverse = 1; |
| 381 | if( size<(2*bomSize) ) return 1; |
| 382 | if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1; |
| 383 | }else if( memcmp(z, &bom, bomSize)==0 ){ |
| 384 | if( pbReverse ) *pbReverse = 0; |
| 385 | if( size<(2*bomSize) ) return 1; |
| 386 | if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1; |
| 387 | } |
| 388 | return 0; |
| 389 | } |
| 390 | |
| 391 | /* |
| 392 |
+26
-12
| --- src/diff.c | ||
| +++ src/diff.c | ||
| @@ -354,24 +354,38 @@ | ||
| 354 | 354 | if( blob_size(pContent)<bomSize ) return 0; |
| 355 | 355 | return memcmp(z, bom, bomSize)==0; |
| 356 | 356 | } |
| 357 | 357 | |
| 358 | 358 | /* |
| 359 | -** This function returns non-zero if the blob starts with a UTF-16le or | |
| 360 | -** UTF-16be byte-order-mark (BOM). | |
| 359 | +** This function returns non-zero if the blob starts with a UTF-16 | |
| 360 | +** byte-order-mark (BOM), either in the endianness of the machine | |
| 361 | +** or in reversed byte order. | |
| 361 | 362 | */ |
| 362 | -int starts_with_utf16_bom(const Blob *pContent, int *pnByte){ | |
| 363 | +int starts_with_utf16_bom( | |
| 364 | + const Blob *pContent, /* IN: Blob content to perform BOM detection on. */ | |
| 365 | + int *pnByte, /* OUT: The number of bytes used for the BOM. */ | |
| 366 | + int *pbReverse /* OUT: 1 for reversed BOM, 0 for native BOM, -1 if no BOM. */ | |
| 367 | +){ | |
| 363 | 368 | const char *z = blob_buffer(pContent); |
| 364 | - int c1; | |
| 365 | - | |
| 366 | - if( pnByte ) *pnByte = 2; | |
| 367 | - if( (blob_size(pContent)<2) || (blob_size(pContent)&1)) return 0; | |
| 368 | - c1 = ((unsigned short *)z)[0]; | |
| 369 | - if( (c1==0xfeff) || (c1==0xfffe) ){ | |
| 370 | - if( blob_size(pContent) < 4 ) return 1; | |
| 371 | - c1 = ((unsigned short *)z)[1]; | |
| 372 | - if( c1 != 0 ) return 1; | |
| 369 | + int bomSize = 2; | |
| 370 | + static const unsigned short bom = 0xfeff; | |
| 371 | + static const unsigned short bom_reversed = 0xfffe; | |
| 372 | + static const unsigned short null = 0; | |
| 373 | + int size; | |
| 374 | + | |
| 375 | + if( pnByte ) *pnByte = bomSize; | |
| 376 | + if( pbReverse ) *pbReverse = -1; /* Unknown. */ | |
| 377 | + size = blob_size(pContent); | |
| 378 | + if( (size<bomSize) || (size%2) ) return 0; | |
| 379 | + if( memcmp(z, &bom_reversed, bomSize)==0 ){ | |
| 380 | + if( pbReverse ) *pbReverse = 1; | |
| 381 | + if( size<(2*bomSize) ) return 1; | |
| 382 | + if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1; | |
| 383 | + }else if( memcmp(z, &bom, bomSize)==0 ){ | |
| 384 | + if( pbReverse ) *pbReverse = 0; | |
| 385 | + if( size<(2*bomSize) ) return 1; | |
| 386 | + if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1; | |
| 373 | 387 | } |
| 374 | 388 | return 0; |
| 375 | 389 | } |
| 376 | 390 | |
| 377 | 391 | /* |
| 378 | 392 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -354,24 +354,38 @@ | |
| 354 | if( blob_size(pContent)<bomSize ) return 0; |
| 355 | return memcmp(z, bom, bomSize)==0; |
| 356 | } |
| 357 | |
| 358 | /* |
| 359 | ** This function returns non-zero if the blob starts with a UTF-16le or |
| 360 | ** UTF-16be byte-order-mark (BOM). |
| 361 | */ |
| 362 | int starts_with_utf16_bom(const Blob *pContent, int *pnByte){ |
| 363 | const char *z = blob_buffer(pContent); |
| 364 | int c1; |
| 365 | |
| 366 | if( pnByte ) *pnByte = 2; |
| 367 | if( (blob_size(pContent)<2) || (blob_size(pContent)&1)) return 0; |
| 368 | c1 = ((unsigned short *)z)[0]; |
| 369 | if( (c1==0xfeff) || (c1==0xfffe) ){ |
| 370 | if( blob_size(pContent) < 4 ) return 1; |
| 371 | c1 = ((unsigned short *)z)[1]; |
| 372 | if( c1 != 0 ) return 1; |
| 373 | } |
| 374 | return 0; |
| 375 | } |
| 376 | |
| 377 | /* |
| 378 |
| --- src/diff.c | |
| +++ src/diff.c | |
| @@ -354,24 +354,38 @@ | |
| 354 | if( blob_size(pContent)<bomSize ) return 0; |
| 355 | return memcmp(z, bom, bomSize)==0; |
| 356 | } |
| 357 | |
| 358 | /* |
| 359 | ** This function returns non-zero if the blob starts with a UTF-16 |
| 360 | ** byte-order-mark (BOM), either in the endianness of the machine |
| 361 | ** or in reversed byte order. |
| 362 | */ |
| 363 | int starts_with_utf16_bom( |
| 364 | const Blob *pContent, /* IN: Blob content to perform BOM detection on. */ |
| 365 | int *pnByte, /* OUT: The number of bytes used for the BOM. */ |
| 366 | int *pbReverse /* OUT: 1 for reversed BOM, 0 for native BOM, -1 if no BOM. */ |
| 367 | ){ |
| 368 | const char *z = blob_buffer(pContent); |
| 369 | int bomSize = 2; |
| 370 | static const unsigned short bom = 0xfeff; |
| 371 | static const unsigned short bom_reversed = 0xfffe; |
| 372 | static const unsigned short null = 0; |
| 373 | int size; |
| 374 | |
| 375 | if( pnByte ) *pnByte = bomSize; |
| 376 | if( pbReverse ) *pbReverse = -1; /* Unknown. */ |
| 377 | size = blob_size(pContent); |
| 378 | if( (size<bomSize) || (size%2) ) return 0; |
| 379 | if( memcmp(z, &bom_reversed, bomSize)==0 ){ |
| 380 | if( pbReverse ) *pbReverse = 1; |
| 381 | if( size<(2*bomSize) ) return 1; |
| 382 | if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1; |
| 383 | }else if( memcmp(z, &bom, bomSize)==0 ){ |
| 384 | if( pbReverse ) *pbReverse = 0; |
| 385 | if( size<(2*bomSize) ) return 1; |
| 386 | if( memcmp(z+bomSize, &null, bomSize)!=0 ) return 1; |
| 387 | } |
| 388 | return 0; |
| 389 | } |
| 390 | |
| 391 | /* |
| 392 |