Fossil SCM

Next step in "invalid-utf8" handling: If a source file contains invalid UTF-8 byte sequences, most likely the real encoding is either ISO-8859-1 or CP1252 (note that CP1252 is a superset of ISO-8859-1). Therefore, after providing a warning, we can now offer the option ('a') to convert it to valid UTF-8, just as we already provide such an option for UTF-16 and EOL handling.

jan.nijtmans 2014-07-04 10:11 UTC trunk
Commit 81eeb6f5535ad358183c51bf79b2c17b034de629
2 files changed +55 +2 -1
+55
--- src/blob.c
+++ src/blob.c
@@ -1004,10 +1004,65 @@
10041004
else if( z[i+1]!='\n' ) z[j++] = '\n';
10051005
}
10061006
z[j] = 0;
10071007
p->nUsed = j;
10081008
}
1009
+
1010
+/*
1011
+** Convert blob from cp1252 to utf-8. As cp1252 is a superset
1012
+** of iso8859-1, this is useful on UNIX as well.
1013
+**
1014
+** This table contains the character translations for 0x80..0xA0.
1015
+*/
1016
+
1017
+static const unsigned short cp1252[32] = {
1018
+ 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
1019
+ 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F,
1020
+ 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
1021
+ 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178
1022
+};
1023
+
1024
+void blob_cp1252_to_utf8(Blob *p){
1025
+ unsigned char *z = (unsigned char *)p->aData;
1026
+ int j = p->nUsed;
1027
+ int i, n;
1028
+ for(i=n=0; i<j; i++){
1029
+ if( z[i]>=0x80 ){
1030
+ if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){
1031
+ n++;
1032
+ }
1033
+ n++;
1034
+ }
1035
+ }
1036
+ j += n;
1037
+ if( j>=p->nAlloc ){
1038
+ blob_resize(p, j);
1039
+ z = (unsigned char *)p->aData;
1040
+ }
1041
+ p->nUsed = j;
1042
+ z[j] = 0;
1043
+ while( j>i ){
1044
+ if( z[--i]>=0x80 ){
1045
+ if( z[i]<0xa0 ){
1046
+ unsigned short sym = cp1252[z[i]&0x1f];
1047
+ if( sym>=0x800 ){
1048
+ z[--j] = 0x80 | (sym&0x3f);
1049
+ z[--j] = 0x80 | ((sym>>6)&0x3f);
1050
+ z[--j] = 0xe0 | (sym>>12);
1051
+ }else{
1052
+ z[--j] = 0x80 | (sym&0x3f);
1053
+ z[--j] = 0xc0 | (sym>>6);
1054
+ }
1055
+ }else{
1056
+ z[--j] = 0x80 | (z[i]&0x3f);
1057
+ z[--j] = 0xC0 | (z[i]>>6);
1058
+ }
1059
+ }else{
1060
+ z[--j] = z[i];
1061
+ }
1062
+ }
1063
+}
10091064
10101065
/*
10111066
** Shell-escape the given string. Append the result to a blob.
10121067
*/
10131068
void shell_escape(Blob *pBlob, const char *zIn){
10141069
--- src/blob.c
+++ src/blob.c
@@ -1004,10 +1004,65 @@
1004 else if( z[i+1]!='\n' ) z[j++] = '\n';
1005 }
1006 z[j] = 0;
1007 p->nUsed = j;
1008 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1009
1010 /*
1011 ** Shell-escape the given string. Append the result to a blob.
1012 */
1013 void shell_escape(Blob *pBlob, const char *zIn){
1014
--- src/blob.c
+++ src/blob.c
@@ -1004,10 +1004,65 @@
1004 else if( z[i+1]!='\n' ) z[j++] = '\n';
1005 }
1006 z[j] = 0;
1007 p->nUsed = j;
1008 }
1009
1010 /*
1011 ** Convert blob from cp1252 to utf-8. As cp1252 is a superset
1012 ** of iso8859-1, this is useful on UNIX as well.
1013 **
1014 ** This table contains the character translations for 0x80..0xA0.
1015 */
1016
1017 static const unsigned short cp1252[32] = {
1018 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
1019 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F,
1020 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
1021 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178
1022 };
1023
1024 void blob_cp1252_to_utf8(Blob *p){
1025 unsigned char *z = (unsigned char *)p->aData;
1026 int j = p->nUsed;
1027 int i, n;
1028 for(i=n=0; i<j; i++){
1029 if( z[i]>=0x80 ){
1030 if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){
1031 n++;
1032 }
1033 n++;
1034 }
1035 }
1036 j += n;
1037 if( j>=p->nAlloc ){
1038 blob_resize(p, j);
1039 z = (unsigned char *)p->aData;
1040 }
1041 p->nUsed = j;
1042 z[j] = 0;
1043 while( j>i ){
1044 if( z[--i]>=0x80 ){
1045 if( z[i]<0xa0 ){
1046 unsigned short sym = cp1252[z[i]&0x1f];
1047 if( sym>=0x800 ){
1048 z[--j] = 0x80 | (sym&0x3f);
1049 z[--j] = 0x80 | ((sym>>6)&0x3f);
1050 z[--j] = 0xe0 | (sym>>12);
1051 }else{
1052 z[--j] = 0x80 | (sym&0x3f);
1053 z[--j] = 0xc0 | (sym>>6);
1054 }
1055 }else{
1056 z[--j] = 0x80 | (z[i]&0x3f);
1057 z[--j] = 0xC0 | (z[i]>>6);
1058 }
1059 }else{
1060 z[--j] = z[i];
1061 }
1062 }
1063 }
1064
1065 /*
1066 ** Shell-escape the given string. Append the result to a blob.
1067 */
1068 void shell_escape(Blob *pBlob, const char *zIn){
1069
+2 -1
--- src/checkin.c
+++ src/checkin.c
@@ -1296,11 +1296,10 @@
12961296
}else if( fHasInvalidUtf8 ){
12971297
if( encodingOk ){
12981298
return 0; /* We don't want encoding warnings for this file. */
12991299
}
13001300
zWarning = "invalid UTF-8";
1301
- zConvert = ""; /* Possible conversion to UTF-8 not yet implemented. */
13021301
zDisable = "\"encoding-glob\" setting";
13031302
}else if( fHasAnyCr ){
13041303
if( crnlOk ){
13051304
return 0; /* We don't want CR/NL warnings for this file. */
13061305
}
@@ -1341,10 +1340,12 @@
13411340
if( fUnicode ) {
13421341
int bomSize;
13431342
const unsigned char *bom = get_utf8_bom(&bomSize);
13441343
fwrite(bom, 1, bomSize, f);
13451344
blob_to_utf8_no_bom(p, 0);
1345
+ }else if( fHasInvalidUtf8 ){
1346
+ blob_cp1252_to_utf8(p);
13461347
}
13471348
if( fHasAnyCr ){
13481349
blob_to_lf_only(p);
13491350
}
13501351
fwrite(blob_buffer(p), 1, blob_size(p), f);
13511352
--- src/checkin.c
+++ src/checkin.c
@@ -1296,11 +1296,10 @@
1296 }else if( fHasInvalidUtf8 ){
1297 if( encodingOk ){
1298 return 0; /* We don't want encoding warnings for this file. */
1299 }
1300 zWarning = "invalid UTF-8";
1301 zConvert = ""; /* Possible conversion to UTF-8 not yet implemented. */
1302 zDisable = "\"encoding-glob\" setting";
1303 }else if( fHasAnyCr ){
1304 if( crnlOk ){
1305 return 0; /* We don't want CR/NL warnings for this file. */
1306 }
@@ -1341,10 +1340,12 @@
1341 if( fUnicode ) {
1342 int bomSize;
1343 const unsigned char *bom = get_utf8_bom(&bomSize);
1344 fwrite(bom, 1, bomSize, f);
1345 blob_to_utf8_no_bom(p, 0);
 
 
1346 }
1347 if( fHasAnyCr ){
1348 blob_to_lf_only(p);
1349 }
1350 fwrite(blob_buffer(p), 1, blob_size(p), f);
1351
--- src/checkin.c
+++ src/checkin.c
@@ -1296,11 +1296,10 @@
1296 }else if( fHasInvalidUtf8 ){
1297 if( encodingOk ){
1298 return 0; /* We don't want encoding warnings for this file. */
1299 }
1300 zWarning = "invalid UTF-8";
 
1301 zDisable = "\"encoding-glob\" setting";
1302 }else if( fHasAnyCr ){
1303 if( crnlOk ){
1304 return 0; /* We don't want CR/NL warnings for this file. */
1305 }
@@ -1341,10 +1340,12 @@
1340 if( fUnicode ) {
1341 int bomSize;
1342 const unsigned char *bom = get_utf8_bom(&bomSize);
1343 fwrite(bom, 1, bomSize, f);
1344 blob_to_utf8_no_bom(p, 0);
1345 }else if( fHasInvalidUtf8 ){
1346 blob_cp1252_to_utf8(p);
1347 }
1348 if( fHasAnyCr ){
1349 blob_to_lf_only(p);
1350 }
1351 fwrite(blob_buffer(p), 1, blob_size(p), f);
1352

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button