Fossil SCM

When committing a (non-binary) file that contains bytes forming an invalid UTF-8 stream, offer the option ('c') to convert it to a valid UTF-8 stream.

jan.nijtmans 2014-07-10 07:36 trunk merge
Commit 45f5184e2a9734a9fd5f4d1ae9c10cec797a1a94
+55
--- src/blob.c
+++ src/blob.c
@@ -1004,10 +1004,65 @@
10041004
else if( z[i+1]!='\n' ) z[j++] = '\n';
10051005
}
10061006
z[j] = 0;
10071007
p->nUsed = j;
10081008
}
1009
+
1010
+/*
1011
+** Convert blob from cp1252 to UTF-8. As cp1252 is a superset
1012
+** of iso8859-1, this is useful on UNIX as well.
1013
+**
1014
+** This table contains the character translations for 0x80..0x9F.
1015
+*/
1016
+
1017
+static const unsigned short cp1252[32] = {
1018
+ 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
1019
+ 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F,
1020
+ 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
1021
+ 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178
1022
+};
1023
+
1024
+void blob_cp1252_to_utf8(Blob *p){
1025
+ unsigned char *z = (unsigned char *)p->aData;
1026
+ int j = p->nUsed;
1027
+ int i, n;
1028
+ for(i=n=0; i<j; i++){
1029
+ if( z[i]>=0x80 ){
1030
+ if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800) ){
1031
+ n++;
1032
+ }
1033
+ n++;
1034
+ }
1035
+ }
1036
+ j += n;
1037
+ if( j>=p->nAlloc ){
1038
+ blob_resize(p, j);
1039
+ z = (unsigned char *)p->aData;
1040
+ }
1041
+ p->nUsed = j;
1042
+ z[j] = 0;
1043
+ while( j>i ){
1044
+ if( z[--i]>=0x80 ){
1045
+ if( z[i]<0xa0 ){
1046
+ unsigned short sym = cp1252[z[i]&0x1f];
1047
+ if( sym>=0x800 ){
1048
+ z[--j] = 0x80 | (sym&0x3f);
1049
+ z[--j] = 0x80 | ((sym>>6)&0x3f);
1050
+ z[--j] = 0xe0 | (sym>>12);
1051
+ }else{
1052
+ z[--j] = 0x80 | (sym&0x3f);
1053
+ z[--j] = 0xc0 | (sym>>6);
1054
+ }
1055
+ }else{
1056
+ z[--j] = 0x80 | (z[i]&0x3f);
1057
+ z[--j] = 0xC0 | (z[i]>>6);
1058
+ }
1059
+ }else{
1060
+ z[--j] = z[i];
1061
+ }
1062
+ }
1063
+}
10091064
10101065
/*
10111066
** Shell-escape the given string. Append the result to a blob.
10121067
*/
10131068
void shell_escape(Blob *pBlob, const char *zIn){
10141069
--- src/blob.c
+++ src/blob.c
@@ -1004,10 +1004,65 @@
1004 else if( z[i+1]!='\n' ) z[j++] = '\n';
1005 }
1006 z[j] = 0;
1007 p->nUsed = j;
1008 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1009
1010 /*
1011 ** Shell-escape the given string. Append the result to a blob.
1012 */
1013 void shell_escape(Blob *pBlob, const char *zIn){
1014
--- src/blob.c
+++ src/blob.c
@@ -1004,10 +1004,65 @@
1004 else if( z[i+1]!='\n' ) z[j++] = '\n';
1005 }
1006 z[j] = 0;
1007 p->nUsed = j;
1008 }
1009
1010 /*
1011 ** Convert blob from cp1252 to UTF-8. As cp1252 is a superset
1012 ** of iso8859-1, this is useful on UNIX as well.
1013 **
1014 ** This table contains the character translations for 0x80..0x9F.
1015 */
1016
1017 static const unsigned short cp1252[32] = {
1018 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
1019 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F,
1020 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
1021 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178
1022 };
1023
1024 void blob_cp1252_to_utf8(Blob *p){
1025 unsigned char *z = (unsigned char *)p->aData;
1026 int j = p->nUsed;
1027 int i, n;
1028 for(i=n=0; i<j; i++){
1029 if( z[i]>=0x80 ){
1030 if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800) ){
1031 n++;
1032 }
1033 n++;
1034 }
1035 }
1036 j += n;
1037 if( j>=p->nAlloc ){
1038 blob_resize(p, j);
1039 z = (unsigned char *)p->aData;
1040 }
1041 p->nUsed = j;
1042 z[j] = 0;
1043 while( j>i ){
1044 if( z[--i]>=0x80 ){
1045 if( z[i]<0xa0 ){
1046 unsigned short sym = cp1252[z[i]&0x1f];
1047 if( sym>=0x800 ){
1048 z[--j] = 0x80 | (sym&0x3f);
1049 z[--j] = 0x80 | ((sym>>6)&0x3f);
1050 z[--j] = 0xe0 | (sym>>12);
1051 }else{
1052 z[--j] = 0x80 | (sym&0x3f);
1053 z[--j] = 0xc0 | (sym>>6);
1054 }
1055 }else{
1056 z[--j] = 0x80 | (z[i]&0x3f);
1057 z[--j] = 0xC0 | (z[i]>>6);
1058 }
1059 }else{
1060 z[--j] = z[i];
1061 }
1062 }
1063 }
1064
1065 /*
1066 ** Shell-escape the given string. Append the result to a blob.
1067 */
1068 void shell_escape(Blob *pBlob, const char *zIn){
1069
+55
--- src/blob.c
+++ src/blob.c
@@ -1004,10 +1004,65 @@
10041004
else if( z[i+1]!='\n' ) z[j++] = '\n';
10051005
}
10061006
z[j] = 0;
10071007
p->nUsed = j;
10081008
}
1009
+
1010
+/*
1011
+** Convert blob from cp1252 to UTF-8. As cp1252 is a superset
1012
+** of iso8859-1, this is useful on UNIX as well.
1013
+**
1014
+** This table contains the character translations for 0x80..0x9F.
1015
+*/
1016
+
1017
+static const unsigned short cp1252[32] = {
1018
+ 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
1019
+ 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F,
1020
+ 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
1021
+ 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178
1022
+};
1023
+
1024
+void blob_cp1252_to_utf8(Blob *p){
1025
+ unsigned char *z = (unsigned char *)p->aData;
1026
+ int j = p->nUsed;
1027
+ int i, n;
1028
+ for(i=n=0; i<j; i++){
1029
+ if( z[i]>=0x80 ){
1030
+ if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800) ){
1031
+ n++;
1032
+ }
1033
+ n++;
1034
+ }
1035
+ }
1036
+ j += n;
1037
+ if( j>=p->nAlloc ){
1038
+ blob_resize(p, j);
1039
+ z = (unsigned char *)p->aData;
1040
+ }
1041
+ p->nUsed = j;
1042
+ z[j] = 0;
1043
+ while( j>i ){
1044
+ if( z[--i]>=0x80 ){
1045
+ if( z[i]<0xa0 ){
1046
+ unsigned short sym = cp1252[z[i]&0x1f];
1047
+ if( sym>=0x800 ){
1048
+ z[--j] = 0x80 | (sym&0x3f);
1049
+ z[--j] = 0x80 | ((sym>>6)&0x3f);
1050
+ z[--j] = 0xe0 | (sym>>12);
1051
+ }else{
1052
+ z[--j] = 0x80 | (sym&0x3f);
1053
+ z[--j] = 0xc0 | (sym>>6);
1054
+ }
1055
+ }else{
1056
+ z[--j] = 0x80 | (z[i]&0x3f);
1057
+ z[--j] = 0xC0 | (z[i]>>6);
1058
+ }
1059
+ }else{
1060
+ z[--j] = z[i];
1061
+ }
1062
+ }
1063
+}
10091064
10101065
/*
10111066
** Shell-escape the given string. Append the result to a blob.
10121067
*/
10131068
void shell_escape(Blob *pBlob, const char *zIn){
10141069
--- src/blob.c
+++ src/blob.c
@@ -1004,10 +1004,65 @@
1004 else if( z[i+1]!='\n' ) z[j++] = '\n';
1005 }
1006 z[j] = 0;
1007 p->nUsed = j;
1008 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1009
1010 /*
1011 ** Shell-escape the given string. Append the result to a blob.
1012 */
1013 void shell_escape(Blob *pBlob, const char *zIn){
1014
--- src/blob.c
+++ src/blob.c
@@ -1004,10 +1004,65 @@
1004 else if( z[i+1]!='\n' ) z[j++] = '\n';
1005 }
1006 z[j] = 0;
1007 p->nUsed = j;
1008 }
1009
1010 /*
1011 ** Convert blob from cp1252 to UTF-8. As cp1252 is a superset
1012 ** of iso8859-1, this is useful on UNIX as well.
1013 **
1014 ** This table contains the character translations for 0x80..0x9F.
1015 */
1016
1017 static const unsigned short cp1252[32] = {
1018 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
1019 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F,
1020 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
1021 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178
1022 };
1023
1024 void blob_cp1252_to_utf8(Blob *p){
1025 unsigned char *z = (unsigned char *)p->aData;
1026 int j = p->nUsed;
1027 int i, n;
1028 for(i=n=0; i<j; i++){
1029 if( z[i]>=0x80 ){
1030 if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800) ){
1031 n++;
1032 }
1033 n++;
1034 }
1035 }
1036 j += n;
1037 if( j>=p->nAlloc ){
1038 blob_resize(p, j);
1039 z = (unsigned char *)p->aData;
1040 }
1041 p->nUsed = j;
1042 z[j] = 0;
1043 while( j>i ){
1044 if( z[--i]>=0x80 ){
1045 if( z[i]<0xa0 ){
1046 unsigned short sym = cp1252[z[i]&0x1f];
1047 if( sym>=0x800 ){
1048 z[--j] = 0x80 | (sym&0x3f);
1049 z[--j] = 0x80 | ((sym>>6)&0x3f);
1050 z[--j] = 0xe0 | (sym>>12);
1051 }else{
1052 z[--j] = 0x80 | (sym&0x3f);
1053 z[--j] = 0xc0 | (sym>>6);
1054 }
1055 }else{
1056 z[--j] = 0x80 | (z[i]&0x3f);
1057 z[--j] = 0xC0 | (z[i]>>6);
1058 }
1059 }else{
1060 z[--j] = z[i];
1061 }
1062 }
1063 }
1064
1065 /*
1066 ** Shell-escape the given string. Append the result to a blob.
1067 */
1068 void shell_escape(Blob *pBlob, const char *zIn){
1069
+2 -1
--- src/checkin.c
+++ src/checkin.c
@@ -1296,11 +1296,10 @@
12961296
}else if( fHasInvalidUtf8 ){
12971297
if( encodingOk ){
12981298
return 0; /* We don't want encoding warnings for this file. */
12991299
}
13001300
zWarning = "invalid UTF-8";
1301
- zConvert = ""; /* Possible conversion to UTF-8 not yet implemented. */
13021301
zDisable = "\"encoding-glob\" setting";
13031302
}else if( fHasAnyCr ){
13041303
if( crnlOk ){
13051304
return 0; /* We don't want CR/NL warnings for this file. */
13061305
}
@@ -1341,10 +1340,12 @@
13411340
if( fUnicode ) {
13421341
int bomSize;
13431342
const unsigned char *bom = get_utf8_bom(&bomSize);
13441343
fwrite(bom, 1, bomSize, f);
13451344
blob_to_utf8_no_bom(p, 0);
1345
+ }else if( fHasInvalidUtf8 ){
1346
+ blob_cp1252_to_utf8(p);
13461347
}
13471348
if( fHasAnyCr ){
13481349
blob_to_lf_only(p);
13491350
}
13501351
fwrite(blob_buffer(p), 1, blob_size(p), f);
13511352
--- src/checkin.c
+++ src/checkin.c
@@ -1296,11 +1296,10 @@
1296 }else if( fHasInvalidUtf8 ){
1297 if( encodingOk ){
1298 return 0; /* We don't want encoding warnings for this file. */
1299 }
1300 zWarning = "invalid UTF-8";
1301 zConvert = ""; /* Possible conversion to UTF-8 not yet implemented. */
1302 zDisable = "\"encoding-glob\" setting";
1303 }else if( fHasAnyCr ){
1304 if( crnlOk ){
1305 return 0; /* We don't want CR/NL warnings for this file. */
1306 }
@@ -1341,10 +1340,12 @@
1341 if( fUnicode ) {
1342 int bomSize;
1343 const unsigned char *bom = get_utf8_bom(&bomSize);
1344 fwrite(bom, 1, bomSize, f);
1345 blob_to_utf8_no_bom(p, 0);
 
 
1346 }
1347 if( fHasAnyCr ){
1348 blob_to_lf_only(p);
1349 }
1350 fwrite(blob_buffer(p), 1, blob_size(p), f);
1351
--- src/checkin.c
+++ src/checkin.c
@@ -1296,11 +1296,10 @@
1296 }else if( fHasInvalidUtf8 ){
1297 if( encodingOk ){
1298 return 0; /* We don't want encoding warnings for this file. */
1299 }
1300 zWarning = "invalid UTF-8";
 
1301 zDisable = "\"encoding-glob\" setting";
1302 }else if( fHasAnyCr ){
1303 if( crnlOk ){
1304 return 0; /* We don't want CR/NL warnings for this file. */
1305 }
@@ -1341,10 +1340,12 @@
1340 if( fUnicode ) {
1341 int bomSize;
1342 const unsigned char *bom = get_utf8_bom(&bomSize);
1343 fwrite(bom, 1, bomSize, f);
1344 blob_to_utf8_no_bom(p, 0);
1345 }else if( fHasInvalidUtf8 ){
1346 blob_cp1252_to_utf8(p);
1347 }
1348 if( fHasAnyCr ){
1349 blob_to_lf_only(p);
1350 }
1351 fwrite(blob_buffer(p), 1, blob_size(p), f);
1352
--- www/changes.wiki
+++ www/changes.wiki
@@ -12,10 +12,13 @@
1212
via a compile-time option.
1313
* Add the <nowiki>[checkout], [render], [styleHeader], [styleFooter],
1414
[trace], [getParameter], [setParameter], and [artifact]</nowiki> commands
1515
to TH1, primarily for use by TH1 hooks.
1616
* Bring in the latest version of autosetup from upstream.
17
+ * When committing a (non-binary) file which contains bytes forming an
18
+ invalid UTF-8 stream, fossil now adds the possibility to convert it
19
+ to a valid UTF-8 stream ('c') if you like.
1720
1821
<h2>Changes For Version 1.29 (2014-06-12)</h2>
1922
* Add the ability to display content, diffs and annotations for UTF16
2023
text files in the web interface.
2124
* Add the "SaveAs..." and "Invert" buttons
2225
--- www/changes.wiki
+++ www/changes.wiki
@@ -12,10 +12,13 @@
12 via a compile-time option.
13 * Add the <nowiki>[checkout], [render], [styleHeader], [styleFooter],
14 [trace], [getParameter], [setParameter], and [artifact]</nowiki> commands
15 to TH1, primarily for use by TH1 hooks.
16 * Bring in the latest version of autosetup from upstream.
 
 
 
17
18 <h2>Changes For Version 1.29 (2014-06-12)</h2>
19 * Add the ability to display content, diffs and annotations for UTF16
20 text files in the web interface.
21 * Add the "SaveAs..." and "Invert" buttons
22
--- www/changes.wiki
+++ www/changes.wiki
@@ -12,10 +12,13 @@
12 via a compile-time option.
13 * Add the <nowiki>[checkout], [render], [styleHeader], [styleFooter],
14 [trace], [getParameter], [setParameter], and [artifact]</nowiki> commands
15 to TH1, primarily for use by TH1 hooks.
16 * Bring in the latest version of autosetup from upstream.
17 * When committing a (non-binary) file which contains bytes forming an
18 invalid UTF-8 stream, fossil now adds the possibility to convert it
19 to a valid UTF-8 stream ('c') if you like.
20
21 <h2>Changes For Version 1.29 (2014-06-12)</h2>
22 * Add the ability to display content, diffs and annotations for UTF16
23 text files in the web interface.
24 * Add the "SaveAs..." and "Invert" buttons
25
--- www/changes.wiki
+++ www/changes.wiki
@@ -12,10 +12,13 @@
1212
via a compile-time option.
1313
* Add the <nowiki>[checkout], [render], [styleHeader], [styleFooter],
1414
[trace], [getParameter], [setParameter], and [artifact]</nowiki> commands
1515
to TH1, primarily for use by TH1 hooks.
1616
* Bring in the latest version of autosetup from upstream.
17
+ * When committing a (non-binary) file which contains bytes forming an
18
+ invalid UTF-8 stream, fossil now adds the possibility to convert it
19
+ to a valid UTF-8 stream ('c') if you like.
1720
1821
<h2>Changes For Version 1.29 (2014-06-12)</h2>
1922
* Add the ability to display content, diffs and annotations for UTF16
2023
text files in the web interface.
2124
* Add the "SaveAs..." and "Invert" buttons
2225
--- www/changes.wiki
+++ www/changes.wiki
@@ -12,10 +12,13 @@
12 via a compile-time option.
13 * Add the <nowiki>[checkout], [render], [styleHeader], [styleFooter],
14 [trace], [getParameter], [setParameter], and [artifact]</nowiki> commands
15 to TH1, primarily for use by TH1 hooks.
16 * Bring in the latest version of autosetup from upstream.
 
 
 
17
18 <h2>Changes For Version 1.29 (2014-06-12)</h2>
19 * Add the ability to display content, diffs and annotations for UTF16
20 text files in the web interface.
21 * Add the "SaveAs..." and "Invert" buttons
22
--- www/changes.wiki
+++ www/changes.wiki
@@ -12,10 +12,13 @@
12 via a compile-time option.
13 * Add the <nowiki>[checkout], [render], [styleHeader], [styleFooter],
14 [trace], [getParameter], [setParameter], and [artifact]</nowiki> commands
15 to TH1, primarily for use by TH1 hooks.
16 * Bring in the latest version of autosetup from upstream.
17 * When committing a (non-binary) file which contains bytes forming an
18 invalid UTF-8 stream, fossil now adds the possibility to convert it
19 to a valid UTF-8 stream ('c') if you like.
20
21 <h2>Changes For Version 1.29 (2014-06-12)</h2>
22 * Add the ability to display content, diffs and annotations for UTF16
23 text files in the web interface.
24 * Add the "SaveAs..." and "Invert" buttons
25

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button