Fossil SCM
Quick test of whether the `cli_wcwidth()' function [https://sqlite.org/src/vdiff?branch=variable-width-char | recently added to the SQLite shell] can be used by the comment formatter to take character widths into account when calculating word-break positions. TODOs: (0) Fix the "modern" (i.e. non-legacy) comment formatter being off by one if a fullwidth character only partially fits. (1) Add tests for the comment formatters with non-ASCII input. (2) Implement a modified `decodeUtf8()' function (the original in shell.c is static, anyway) that also accepts single-byte UTF-8 characters, which may allow for some simplifications to the comment formatter algorithms.
Commit
b2dbdc8afbff1c162400696cae7e8a80e444cd6129b01c496fda15ad59e8f68c
Parent
c20aa86727773e3…
2 files changed
+1
-1
+10
+1
-1
| --- extsrc/shell.c | ||
| +++ extsrc/shell.c | ||
| @@ -1024,11 +1024,11 @@ | ||
| 1024 | 1024 | ** Compute the value and length of a multi-byte UTF-8 character that |
| 1025 | 1025 | ** begins at z[0]. Return the length. Write the Unicode value into *pU. |
| 1026 | 1026 | ** |
| 1027 | 1027 | ** This routine only works for *multi-byte* UTF-8 characters. |
| 1028 | 1028 | */ |
| 1029 | -static int decodeUtf8(const unsigned char *z, int *pU){ | |
| 1029 | +int decodeUtf8(const unsigned char *z, int *pU){ | |
| 1030 | 1030 | if( (z[0] & 0xe0)==0xc0 && (z[1] & 0xc0)==0x80 ){ |
| 1031 | 1031 | *pU = ((z[0] & 0x1f)<<6) | (z[1] & 0x3f); |
| 1032 | 1032 | return 2; |
| 1033 | 1033 | } |
| 1034 | 1034 | if( (z[0] & 0xf0)==0xe0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80 ){ |
| 1035 | 1035 |
| --- extsrc/shell.c | |
| +++ extsrc/shell.c | |
| @@ -1024,11 +1024,11 @@ | |
| 1024 | ** Compute the value and length of a multi-byte UTF-8 character that |
| 1025 | ** begins at z[0]. Return the length. Write the Unicode value into *pU. |
| 1026 | ** |
| 1027 | ** This routine only works for *multi-byte* UTF-8 characters. |
| 1028 | */ |
| 1029 | static int decodeUtf8(const unsigned char *z, int *pU){ |
| 1030 | if( (z[0] & 0xe0)==0xc0 && (z[1] & 0xc0)==0x80 ){ |
| 1031 | *pU = ((z[0] & 0x1f)<<6) | (z[1] & 0x3f); |
| 1032 | return 2; |
| 1033 | } |
| 1034 | if( (z[0] & 0xf0)==0xe0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80 ){ |
| 1035 |
| --- extsrc/shell.c | |
| +++ extsrc/shell.c | |
| @@ -1024,11 +1024,11 @@ | |
| 1024 | ** Compute the value and length of a multi-byte UTF-8 character that |
| 1025 | ** begins at z[0]. Return the length. Write the Unicode value into *pU. |
| 1026 | ** |
| 1027 | ** This routine only works for *multi-byte* UTF-8 characters. |
| 1028 | */ |
| 1029 | int decodeUtf8(const unsigned char *z, int *pU){ |
| 1030 | if( (z[0] & 0xe0)==0xc0 && (z[1] & 0xc0)==0x80 ){ |
| 1031 | *pU = ((z[0] & 0x1f)<<6) | (z[1] & 0x3f); |
| 1032 | return 2; |
| 1033 | } |
| 1034 | if( (z[0] & 0xf0)==0xe0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80 ){ |
| 1035 |
+10
| --- src/comformat.c | ||
| +++ src/comformat.c | ||
| @@ -294,10 +294,15 @@ | ||
| 294 | 294 | while( cchUTF8<maxUTF8 && |
| 295 | 295 | (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 296 | 296 | cchUTF8++; |
| 297 | 297 | zBuf[iBuf++] = zLine[index++]; |
| 298 | 298 | } |
| 299 | + if( cchUTF8>1 ){ | |
| 300 | + int utf32; | |
| 301 | + decodeUtf8(&zLine[index-cchUTF8],&utf32); | |
| 302 | + useChars += cli_wcwidth(utf32) - 1; | |
| 303 | + } | |
| 299 | 304 | maxChars -= useChars; |
| 300 | 305 | if( maxChars<=0 ) break; |
| 301 | 306 | if( c=='\n' ) break; |
| 302 | 307 | } |
| 303 | 308 | if( charCnt>0 ){ |
| @@ -380,10 +385,15 @@ | ||
| 380 | 385 | (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 381 | 386 | cchUTF8++; |
| 382 | 387 | zBuf[k++] = zText[++i]; |
| 383 | 388 | } |
| 384 | 389 | } |
| 390 | + if( cchUTF8>1 ){ | |
| 391 | + int utf32; | |
| 392 | + decodeUtf8(&zText[k-cchUTF8],&utf32); | |
| 393 | + kc += cli_wcwidth(utf32) - 1; | |
| 394 | + } | |
| 385 | 395 | else if( fossil_isspace(c) ){ |
| 386 | 396 | si = i; |
| 387 | 397 | sk = k; |
| 388 | 398 | if( k==0 || zBuf[k-1]!=' ' ){ |
| 389 | 399 | zBuf[k++] = ' '; |
| 390 | 400 |
| --- src/comformat.c | |
| +++ src/comformat.c | |
| @@ -294,10 +294,15 @@ | |
| 294 | while( cchUTF8<maxUTF8 && |
| 295 | (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 296 | cchUTF8++; |
| 297 | zBuf[iBuf++] = zLine[index++]; |
| 298 | } |
| 299 | maxChars -= useChars; |
| 300 | if( maxChars<=0 ) break; |
| 301 | if( c=='\n' ) break; |
| 302 | } |
| 303 | if( charCnt>0 ){ |
| @@ -380,10 +385,15 @@ | |
| 380 | (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 381 | cchUTF8++; |
| 382 | zBuf[k++] = zText[++i]; |
| 383 | } |
| 384 | } |
| 385 | else if( fossil_isspace(c) ){ |
| 386 | si = i; |
| 387 | sk = k; |
| 388 | if( k==0 || zBuf[k-1]!=' ' ){ |
| 389 | zBuf[k++] = ' '; |
| 390 |
| --- src/comformat.c | |
| +++ src/comformat.c | |
| @@ -294,10 +294,15 @@ | |
| 294 | while( cchUTF8<maxUTF8 && |
| 295 | (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 296 | cchUTF8++; |
| 297 | zBuf[iBuf++] = zLine[index++]; |
| 298 | } |
| 299 | if( cchUTF8>1 ){ |
| 300 | int utf32; |
| 301 | decodeUtf8(&zLine[index-cchUTF8],&utf32); |
| 302 | useChars += cli_wcwidth(utf32) - 1; |
| 303 | } |
| 304 | maxChars -= useChars; |
| 305 | if( maxChars<=0 ) break; |
| 306 | if( c=='\n' ) break; |
| 307 | } |
| 308 | if( charCnt>0 ){ |
| @@ -380,10 +385,15 @@ | |
| 385 | (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 386 | cchUTF8++; |
| 387 | zBuf[k++] = zText[++i]; |
| 388 | } |
| 389 | } |
| 390 | if( cchUTF8>1 ){ |
| 391 | int utf32; |
| 392 | decodeUtf8(&zText[k-cchUTF8],&utf32); |
| 393 | kc += cli_wcwidth(utf32) - 1; |
| 394 | } |
| 395 | else if( fossil_isspace(c) ){ |
| 396 | si = i; |
| 397 | sk = k; |
| 398 | if( k==0 || zBuf[k-1]!=' ' ){ |
| 399 | zBuf[k++] = ' '; |
| 400 |