Fossil SCM
Next step in "invalid-utf8" handling: If a source file contains invalid UTF-8 byte sequences, most likely the real encoding is either ISO-8859-1 or CP1252 (note that CP1252 is a superset of ISO-8859-1). Therefore, after providing a warning, we can now offer the option ('a') to convert it to valid UTF-8, just as we already offer such an option for UTF-16 and end-of-line handling.
Commit
81eeb6f5535ad358183c51bf79b2c17b034de629
Parent
6728a8bd0898123…
2 files changed
+55
+2
-1
+55
| --- src/blob.c | ||
| +++ src/blob.c | ||
| @@ -1004,10 +1004,65 @@ | ||
| 1004 | 1004 | else if( z[i+1]!='\n' ) z[j++] = '\n'; |
| 1005 | 1005 | } |
| 1006 | 1006 | z[j] = 0; |
| 1007 | 1007 | p->nUsed = j; |
| 1008 | 1008 | } |
| 1009 | + | |
| 1010 | +/* | |
| 1011 | +** Convert blob from cp1252 to utf-8. As cp1252 is a superset | |
| 1012 | +** of iso8859-1, this is useful on UNIX as well. | |
| 1013 | +** | |
| 1014 | +** This table contains the character translations for 0x80..0x9F. | |
| 1015 | +*/ | |
| 1016 | + | |
| 1017 | +static const unsigned short cp1252[32] = { | |
| 1018 | + 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, | |
| 1019 | + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F, | |
| 1020 | + 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, | |
| 1021 | + 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178 | |
| 1022 | +}; | |
| 1023 | + | |
| 1024 | +void blob_cp1252_to_utf8(Blob *p){ | |
| 1025 | + unsigned char *z = (unsigned char *)p->aData; | |
| 1026 | + int j = p->nUsed; | |
| 1027 | + int i, n; | |
| 1028 | + for(i=n=0; i<j; i++){ | |
| 1029 | + if( z[i]>=0x80 ){ | |
| 1030 | + if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){ | |
| 1031 | + n++; | |
| 1032 | + } | |
| 1033 | + n++; | |
| 1034 | + } | |
| 1035 | + } | |
| 1036 | + j += n; | |
| 1037 | + if( j>=p->nAlloc ){ | |
| 1038 | + blob_resize(p, j); | |
| 1039 | + z = (unsigned char *)p->aData; | |
| 1040 | + } | |
| 1041 | + p->nUsed = j; | |
| 1042 | + z[j] = 0; | |
| 1043 | + while( j>i ){ | |
| 1044 | + if( z[--i]>=0x80 ){ | |
| 1045 | + if( z[i]<0xa0 ){ | |
| 1046 | + unsigned short sym = cp1252[z[i]&0x1f]; | |
| 1047 | + if( sym>=0x800 ){ | |
| 1048 | + z[--j] = 0x80 | (sym&0x3f); | |
| 1049 | + z[--j] = 0x80 | ((sym>>6)&0x3f); | |
| 1050 | + z[--j] = 0xe0 | (sym>>12); | |
| 1051 | + }else{ | |
| 1052 | + z[--j] = 0x80 | (sym&0x3f); | |
| 1053 | + z[--j] = 0xc0 | (sym>>6); | |
| 1054 | + } | |
| 1055 | + }else{ | |
| 1056 | + z[--j] = 0x80 | (z[i]&0x3f); | |
| 1057 | + z[--j] = 0xC0 | (z[i]>>6); | |
| 1058 | + } | |
| 1059 | + }else{ | |
| 1060 | + z[--j] = z[i]; | |
| 1061 | + } | |
| 1062 | + } | |
| 1063 | +} | |
| 1009 | 1064 | |
| 1010 | 1065 | /* |
| 1011 | 1066 | ** Shell-escape the given string. Append the result to a blob. |
| 1012 | 1067 | */ |
| 1013 | 1068 | void shell_escape(Blob *pBlob, const char *zIn){ |
| 1014 | 1069 |
| --- src/blob.c | |
| +++ src/blob.c | |
| @@ -1004,10 +1004,65 @@ | |
| 1004 | else if( z[i+1]!='\n' ) z[j++] = '\n'; |
| 1005 | } |
| 1006 | z[j] = 0; |
| 1007 | p->nUsed = j; |
| 1008 | } |
| 1009 | |
| 1010 | /* |
| 1011 | ** Shell-escape the given string. Append the result to a blob. |
| 1012 | */ |
| 1013 | void shell_escape(Blob *pBlob, const char *zIn){ |
| 1014 |
| --- src/blob.c | |
| +++ src/blob.c | |
| @@ -1004,10 +1004,65 @@ | |
| 1004 | else if( z[i+1]!='\n' ) z[j++] = '\n'; |
| 1005 | } |
| 1006 | z[j] = 0; |
| 1007 | p->nUsed = j; |
| 1008 | } |
| 1009 | |
| 1010 | /* |
| 1011 | ** Convert blob from cp1252 to utf-8. As cp1252 is a superset |
| 1012 | ** of iso8859-1, this is useful on UNIX as well. |
| 1013 | ** |
| 1014 | ** This table contains the character translations for 0x80..0x9F. |
| 1015 | */ |
| 1016 | |
| 1017 | static const unsigned short cp1252[32] = { |
| 1018 | 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, |
| 1019 | 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F, |
| 1020 | 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, |
| 1021 | 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178 |
| 1022 | }; |
| 1023 | |
| 1024 | void blob_cp1252_to_utf8(Blob *p){ |
| 1025 | unsigned char *z = (unsigned char *)p->aData; |
| 1026 | int j = p->nUsed; |
| 1027 | int i, n; |
| 1028 | for(i=n=0; i<j; i++){ |
| 1029 | if( z[i]>=0x80 ){ |
| 1030 | if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){ |
| 1031 | n++; |
| 1032 | } |
| 1033 | n++; |
| 1034 | } |
| 1035 | } |
| 1036 | j += n; |
| 1037 | if( j>=p->nAlloc ){ |
| 1038 | blob_resize(p, j); |
| 1039 | z = (unsigned char *)p->aData; |
| 1040 | } |
| 1041 | p->nUsed = j; |
| 1042 | z[j] = 0; |
| 1043 | while( j>i ){ |
| 1044 | if( z[--i]>=0x80 ){ |
| 1045 | if( z[i]<0xa0 ){ |
| 1046 | unsigned short sym = cp1252[z[i]&0x1f]; |
| 1047 | if( sym>=0x800 ){ |
| 1048 | z[--j] = 0x80 | (sym&0x3f); |
| 1049 | z[--j] = 0x80 | ((sym>>6)&0x3f); |
| 1050 | z[--j] = 0xe0 | (sym>>12); |
| 1051 | }else{ |
| 1052 | z[--j] = 0x80 | (sym&0x3f); |
| 1053 | z[--j] = 0xc0 | (sym>>6); |
| 1054 | } |
| 1055 | }else{ |
| 1056 | z[--j] = 0x80 | (z[i]&0x3f); |
| 1057 | z[--j] = 0xC0 | (z[i]>>6); |
| 1058 | } |
| 1059 | }else{ |
| 1060 | z[--j] = z[i]; |
| 1061 | } |
| 1062 | } |
| 1063 | } |
| 1064 | |
| 1065 | /* |
| 1066 | ** Shell-escape the given string. Append the result to a blob. |
| 1067 | */ |
| 1068 | void shell_escape(Blob *pBlob, const char *zIn){ |
| 1069 |
+2
-1
| --- src/checkin.c | ||
| +++ src/checkin.c | ||
| @@ -1296,11 +1296,10 @@ | ||
| 1296 | 1296 | }else if( fHasInvalidUtf8 ){ |
| 1297 | 1297 | if( encodingOk ){ |
| 1298 | 1298 | return 0; /* We don't want encoding warnings for this file. */ |
| 1299 | 1299 | } |
| 1300 | 1300 | zWarning = "invalid UTF-8"; |
| 1301 | - zConvert = ""; /* Possible conversion to UTF-8 not yet implemented. */ | |
| 1302 | 1301 | zDisable = "\"encoding-glob\" setting"; |
| 1303 | 1302 | }else if( fHasAnyCr ){ |
| 1304 | 1303 | if( crnlOk ){ |
| 1305 | 1304 | return 0; /* We don't want CR/NL warnings for this file. */ |
| 1306 | 1305 | } |
| @@ -1341,10 +1340,12 @@ | ||
| 1341 | 1340 | if( fUnicode ) { |
| 1342 | 1341 | int bomSize; |
| 1343 | 1342 | const unsigned char *bom = get_utf8_bom(&bomSize); |
| 1344 | 1343 | fwrite(bom, 1, bomSize, f); |
| 1345 | 1344 | blob_to_utf8_no_bom(p, 0); |
| 1345 | + }else if( fHasInvalidUtf8 ){ | |
| 1346 | + blob_cp1252_to_utf8(p); | |
| 1346 | 1347 | } |
| 1347 | 1348 | if( fHasAnyCr ){ |
| 1348 | 1349 | blob_to_lf_only(p); |
| 1349 | 1350 | } |
| 1350 | 1351 | fwrite(blob_buffer(p), 1, blob_size(p), f); |
| 1351 | 1352 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -1296,11 +1296,10 @@ | |
| 1296 | }else if( fHasInvalidUtf8 ){ |
| 1297 | if( encodingOk ){ |
| 1298 | return 0; /* We don't want encoding warnings for this file. */ |
| 1299 | } |
| 1300 | zWarning = "invalid UTF-8"; |
| 1301 | zConvert = ""; /* Possible conversion to UTF-8 not yet implemented. */ |
| 1302 | zDisable = "\"encoding-glob\" setting"; |
| 1303 | }else if( fHasAnyCr ){ |
| 1304 | if( crnlOk ){ |
| 1305 | return 0; /* We don't want CR/NL warnings for this file. */ |
| 1306 | } |
| @@ -1341,10 +1340,12 @@ | |
| 1341 | if( fUnicode ) { |
| 1342 | int bomSize; |
| 1343 | const unsigned char *bom = get_utf8_bom(&bomSize); |
| 1344 | fwrite(bom, 1, bomSize, f); |
| 1345 | blob_to_utf8_no_bom(p, 0); |
| 1346 | } |
| 1347 | if( fHasAnyCr ){ |
| 1348 | blob_to_lf_only(p); |
| 1349 | } |
| 1350 | fwrite(blob_buffer(p), 1, blob_size(p), f); |
| 1351 |
| --- src/checkin.c | |
| +++ src/checkin.c | |
| @@ -1296,11 +1296,10 @@ | |
| 1296 | }else if( fHasInvalidUtf8 ){ |
| 1297 | if( encodingOk ){ |
| 1298 | return 0; /* We don't want encoding warnings for this file. */ |
| 1299 | } |
| 1300 | zWarning = "invalid UTF-8"; |
| 1301 | zDisable = "\"encoding-glob\" setting"; |
| 1302 | }else if( fHasAnyCr ){ |
| 1303 | if( crnlOk ){ |
| 1304 | return 0; /* We don't want CR/NL warnings for this file. */ |
| 1305 | } |
| @@ -1341,10 +1340,12 @@ | |
| 1340 | if( fUnicode ) { |
| 1341 | int bomSize; |
| 1342 | const unsigned char *bom = get_utf8_bom(&bomSize); |
| 1343 | fwrite(bom, 1, bomSize, f); |
| 1344 | blob_to_utf8_no_bom(p, 0); |
| 1345 | }else if( fHasInvalidUtf8 ){ |
| 1346 | blob_cp1252_to_utf8(p); |
| 1347 | } |
| 1348 | if( fHasAnyCr ){ |
| 1349 | blob_to_lf_only(p); |
| 1350 | } |
| 1351 | fwrite(blob_buffer(p), 1, blob_size(p), f); |
| 1352 |