Fossil SCM

completed cp1252 table and conversion

jan.nijtmans 2012-12-12 15:35 improve_commit_warning
Commit bab2f28b601c0ab6ddb28463f68f1015f33dab3a
1 file changed +29 -4
+29 -4
--- src/blob.c
+++ src/blob.c
@@ -1032,18 +1032,31 @@
10321032
10331033
/*
10341034
** Convert blob from cp1252 to utf-8. As cp1252 is a superset
10351035
** of iso8859-1, this is useful on UNIX as well.
10361036
**
1037
-** TODO: the bytes 0x80..0xBF need a special table, iso8895-1 works.
1037
+** This table contains the character translations for 0x80..0x9F.
10381038
*/
1039
+
1040
+static const unsigned short cp1252[32] = {
1041
+ 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
1042
+ 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F,
1043
+ 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
1044
+ 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178
1045
+};
1046
+
10391047
void blob_cp1252_to_utf8(Blob *p){
10401048
unsigned char *z = (unsigned char *)p->aData;
10411049
int j = p->nUsed;
10421050
int i, n;
10431051
for(i=n=0; i<j; i++){
1044
- if( z[i]>=0x80 ) n++;
1052
+ if( z[i]>=0x80 ){
1053
+ if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){
1054
+ n++;
1055
+ }
1056
+ n++;
1057
+ }
10451058
}
10461059
j += n;
10471060
if( j>=p->nAlloc ){
10481061
blob_resize(p, j);
10491062
z = (unsigned char *)p->aData;
@@ -1050,12 +1063,24 @@
10501063
}
10511064
p->nUsed = j;
10521065
z[j] = 0;
10531066
while( j>i ){
10541067
if( z[--i]>=0x80 ){
1055
- z[--j] = 0x80 | (z[i]&0x3F);
1056
- z[--j] = 0xC0 | (z[i]>>6);
1068
+ if( z[i]<0xa0 ){
1069
+ unsigned short sym = cp1252[z[i]&0x1f];
1070
+ if( sym>=0x800 ){
1071
+ z[--j] = 0x80 | (sym&0x3f);
1072
+ z[--j] = 0x80 | ((sym>>6)&0x3f);
1073
+ z[--j] = 0xe0 | (sym>>12);
1074
+ }else{
1075
+ z[--j] = 0x80 | (sym&0x3f);
1076
+ z[--j] = 0xc0 | (sym>>6);
1077
+ }
1078
+ }else{
1079
+ z[--j] = 0x80 | (z[i]&0x3F);
1080
+ z[--j] = 0xC0 | (z[i]>>6);
1081
+ }
10571082
}else{
10581083
z[--j] = z[i];
10591084
}
10601085
}
10611086
}
10621087
--- src/blob.c
+++ src/blob.c
@@ -1032,18 +1032,31 @@
1032
1033 /*
1034 ** Convert blob from cp1252 to utf-8. As cp1252 is a superset
1035 ** of iso8859-1, this is useful on UNIX as well.
1036 **
1037 ** TODO: the bytes 0x80..0xBF need a special table, iso8895-1 works.
1038 */
 
 
 
 
 
 
 
 
1039 void blob_cp1252_to_utf8(Blob *p){
1040 unsigned char *z = (unsigned char *)p->aData;
1041 int j = p->nUsed;
1042 int i, n;
1043 for(i=n=0; i<j; i++){
1044 if( z[i]>=0x80 ) n++;
 
 
 
 
 
1045 }
1046 j += n;
1047 if( j>=p->nAlloc ){
1048 blob_resize(p, j);
1049 z = (unsigned char *)p->aData;
@@ -1050,12 +1063,24 @@
1050 }
1051 p->nUsed = j;
1052 z[j] = 0;
1053 while( j>i ){
1054 if( z[--i]>=0x80 ){
1055 z[--j] = 0x80 | (z[i]&0x3F);
1056 z[--j] = 0xC0 | (z[i]>>6);
 
 
 
 
 
 
 
 
 
 
 
 
1057 }else{
1058 z[--j] = z[i];
1059 }
1060 }
1061 }
1062
--- src/blob.c
+++ src/blob.c
@@ -1032,18 +1032,31 @@
1032
1033 /*
1034 ** Convert blob from cp1252 to utf-8. As cp1252 is a superset
1035 ** of iso8859-1, this is useful on UNIX as well.
1036 **
1037 ** This table contains the character translations for 0x80..0x9F.
1038 */
1039
1040 static const unsigned short cp1252[32] = {
1041 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
1042 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F,
1043 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
1044 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178
1045 };
1046
1047 void blob_cp1252_to_utf8(Blob *p){
1048 unsigned char *z = (unsigned char *)p->aData;
1049 int j = p->nUsed;
1050 int i, n;
1051 for(i=n=0; i<j; i++){
1052 if( z[i]>=0x80 ){
1053 if( (z[i]<0xa0) && (cp1252[z[i]&0x1f]>=0x800)){
1054 n++;
1055 }
1056 n++;
1057 }
1058 }
1059 j += n;
1060 if( j>=p->nAlloc ){
1061 blob_resize(p, j);
1062 z = (unsigned char *)p->aData;
@@ -1050,12 +1063,24 @@
1063 }
1064 p->nUsed = j;
1065 z[j] = 0;
1066 while( j>i ){
1067 if( z[--i]>=0x80 ){
1068 if( z[i]<0xa0 ){
1069 unsigned short sym = cp1252[z[i]&0x1f];
1070 if( sym>=0x800 ){
1071 z[--j] = 0x80 | (sym&0x3f);
1072 z[--j] = 0x80 | ((sym>>6)&0x3f);
1073 z[--j] = 0xe0 | (sym>>12);
1074 }else{
1075 z[--j] = 0x80 | (sym&0x3f);
1076 z[--j] = 0xc0 | (sym>>6);
1077 }
1078 }else{
1079 z[--j] = 0x80 | (z[i]&0x3F);
1080 z[--j] = 0xC0 | (z[i]>>6);
1081 }
1082 }else{
1083 z[--j] = z[i];
1084 }
1085 }
1086 }
1087

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button