Fossil SCM
completed cp1252 table and conversion
Commit
bab2f28b601c0ab6ddb28463f68f1015f33dab3a
Parent
4f060f6afb7d7a1…
1 file changed
+29
-4
+29
-4
| --- src/blob.c | ||
| +++ src/blob.c | ||
| @@ -1032,18 +1032,31 @@ | ||
| 1032 | 1032 | |
| 1033 | 1033 | /* |
| 1034 | 1034 | ** Convert blob from cp1252 to utf-8. As cp1252 is a superset |
| 1035 | 1035 | ** of iso8859-1, this is useful on UNIX as well. |
| 1036 | 1036 | ** |
| 1037 | -** TODO: the bytes 0x80..0xBF need a special table, iso8859-1 works. | |
| 1037 | +** This table contains the character translations for 0x80..0x9F. | |
| 1038 | 1038 | */ |
| 1039 | + | |
| 1040 | +static const unsigned short cp1252[32] = { | |
| 1041 | + 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, | |
| 1042 | + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F, | |
| 1043 | + 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, | |
| 1044 | + 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178 | |
| 1045 | +}; | |
| 1046 | + | |
| 1039 | 1047 | void blob_cp1252_to_utf8(Blob *p){ |
| 1040 | 1048 | unsigned char *z = (unsigned char *)p->aData; |
| 1041 | 1049 | int j = p->nUsed; |
| 1042 | 1050 | int i, n; |
| 1043 | 1051 | for(i=n=0; i<j; i++){ |
| 1044 | - if( z[i]>=0x80 ) n++; | |
| 1052 | + if( z[i]>=0x80 ){ | |
| 1053 | + if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){ | |
| 1054 | + n++; | |
| 1055 | + } | |
| 1056 | + n++; | |
| 1057 | + } | |
| 1045 | 1058 | } |
| 1046 | 1059 | j += n; |
| 1047 | 1060 | if( j>=p->nAlloc ){ |
| 1048 | 1061 | blob_resize(p, j); |
| 1049 | 1062 | z = (unsigned char *)p->aData; |
| @@ -1050,12 +1063,24 @@ | ||
| 1050 | 1063 | } |
| 1051 | 1064 | p->nUsed = j; |
| 1052 | 1065 | z[j] = 0; |
| 1053 | 1066 | while( j>i ){ |
| 1054 | 1067 | if( z[--i]>=0x80 ){ |
| 1055 | - z[--j] = 0x80 | (z[i]&0x3F); | |
| 1056 | - z[--j] = 0xC0 | (z[i]>>6); | |
| 1068 | + if( z[i]<0xa0 ){ | |
| 1069 | + unsigned short sym = cp1252[z[i]&0x1f]; | |
| 1070 | + if( sym>=0x800 ){ | |
| 1071 | + z[--j] = 0x80 | (sym&0x3f); | |
| 1072 | + z[--j] = 0x80 | ((sym>>6)&0x3f); | |
| 1073 | + z[--j] = 0xe0 | (sym>>12); | |
| 1074 | + }else{ | |
| 1075 | + z[--j] = 0x80 | (sym&0x3f); | |
| 1076 | + z[--j] = 0xc0 | (sym>>6); | |
| 1077 | + } | |
| 1078 | + }else{ | |
| 1079 | + z[--j] = 0x80 | (z[i]&0x3F); | |
| 1080 | + z[--j] = 0xC0 | (z[i]>>6); | |
| 1081 | + } | |
| 1057 | 1082 | }else{ |
| 1058 | 1083 | z[--j] = z[i]; |
| 1059 | 1084 | } |
| 1060 | 1085 | } |
| 1061 | 1086 | } |
| 1062 | 1087 |
| --- src/blob.c | |
| +++ src/blob.c | |
| @@ -1032,18 +1032,31 @@ | |
| 1032 | |
| 1033 | /* |
| 1034 | ** Convert blob from cp1252 to utf-8. As cp1252 is a superset |
| 1035 | ** of iso8859-1, this is useful on UNIX as well. |
| 1036 | ** |
| 1037 | ** TODO: the bytes 0x80..0xBF need a special table, iso8859-1 works. |
| 1038 | */ |
| 1039 | void blob_cp1252_to_utf8(Blob *p){ |
| 1040 | unsigned char *z = (unsigned char *)p->aData; |
| 1041 | int j = p->nUsed; |
| 1042 | int i, n; |
| 1043 | for(i=n=0; i<j; i++){ |
| 1044 | if( z[i]>=0x80 ) n++; |
| 1045 | } |
| 1046 | j += n; |
| 1047 | if( j>=p->nAlloc ){ |
| 1048 | blob_resize(p, j); |
| 1049 | z = (unsigned char *)p->aData; |
| @@ -1050,12 +1063,24 @@ | |
| 1050 | } |
| 1051 | p->nUsed = j; |
| 1052 | z[j] = 0; |
| 1053 | while( j>i ){ |
| 1054 | if( z[--i]>=0x80 ){ |
| 1055 | z[--j] = 0x80 | (z[i]&0x3F); |
| 1056 | z[--j] = 0xC0 | (z[i]>>6); |
| 1057 | }else{ |
| 1058 | z[--j] = z[i]; |
| 1059 | } |
| 1060 | } |
| 1061 | } |
| 1062 |
| --- src/blob.c | |
| +++ src/blob.c | |
| @@ -1032,18 +1032,31 @@ | |
| 1032 | |
| 1033 | /* |
| 1034 | ** Convert blob from cp1252 to utf-8. As cp1252 is a superset |
| 1035 | ** of iso8859-1, this is useful on UNIX as well. |
| 1036 | ** |
| 1037 | ** This table contains the character translations for 0x80..0x9F. |
| 1038 | */ |
| 1039 | |
| 1040 | static const unsigned short cp1252[32] = { |
| 1041 | 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, |
| 1042 | 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F, |
| 1043 | 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, |
| 1044 | 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178 |
| 1045 | }; |
| 1046 | |
| 1047 | void blob_cp1252_to_utf8(Blob *p){ |
| 1048 | unsigned char *z = (unsigned char *)p->aData; |
| 1049 | int j = p->nUsed; |
| 1050 | int i, n; |
| 1051 | for(i=n=0; i<j; i++){ |
| 1052 | if( z[i]>=0x80 ){ |
| 1053 | if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){ |
| 1054 | n++; |
| 1055 | } |
| 1056 | n++; |
| 1057 | } |
| 1058 | } |
| 1059 | j += n; |
| 1060 | if( j>=p->nAlloc ){ |
| 1061 | blob_resize(p, j); |
| 1062 | z = (unsigned char *)p->aData; |
| @@ -1050,12 +1063,24 @@ | |
| 1063 | } |
| 1064 | p->nUsed = j; |
| 1065 | z[j] = 0; |
| 1066 | while( j>i ){ |
| 1067 | if( z[--i]>=0x80 ){ |
| 1068 | if( z[i]<0xa0 ){ |
| 1069 | unsigned short sym = cp1252[z[i]&0x1f]; |
| 1070 | if( sym>=0x800 ){ |
| 1071 | z[--j] = 0x80 | (sym&0x3f); |
| 1072 | z[--j] = 0x80 | ((sym>>6)&0x3f); |
| 1073 | z[--j] = 0xe0 | (sym>>12); |
| 1074 | }else{ |
| 1075 | z[--j] = 0x80 | (sym&0x3f); |
| 1076 | z[--j] = 0xc0 | (sym>>6); |
| 1077 | } |
| 1078 | }else{ |
| 1079 | z[--j] = 0x80 | (z[i]&0x3F); |
| 1080 | z[--j] = 0xC0 | (z[i]>>6); |
| 1081 | } |
| 1082 | }else{ |
| 1083 | z[--j] = z[i]; |
| 1084 | } |
| 1085 | } |
| 1086 | } |
| 1087 |