Fossil SCM

Quick test of whether the `cli_wcwidth()` function [https://sqlite.org/src/vdiff?branch=variable-width-char | recently added to the SQLite shell] can be used by the comment formatter to take character widths into account when calculating word-break positions. TODOs: (0) Fix the "modern" (i.e. non-legacy) comment formatter being off by one if a fullwidth character only fits partially. (1) Add tests for the comment formatters with non-ASCII input. (2) Implement a modified `decodeUtf8()` function (the original is static, anyway) that also accepts single-byte UTF-8 characters and may allow for some simplifications to the comment formatter algorithms.

florian 2024-09-27 04:52 trunk
Commit b2dbdc8afbff1c162400696cae7e8a80e444cd6129b01c496fda15ad59e8f68c
2 files changed: extsrc/shell.c (+1 -1), src/comformat.c (+10)
+1 -1
--- extsrc/shell.c
+++ extsrc/shell.c
@@ -1024,11 +1024,11 @@
10241024
** Compute the value and length of a multi-byte UTF-8 character that
10251025
** begins at z[0]. Return the length. Write the Unicode value into *pU.
10261026
**
10271027
** This routine only works for *multi-byte* UTF-8 characters.
10281028
*/
1029
-static int decodeUtf8(const unsigned char *z, int *pU){
1029
+int decodeUtf8(const unsigned char *z, int *pU){
10301030
if( (z[0] & 0xe0)==0xc0 && (z[1] & 0xc0)==0x80 ){
10311031
*pU = ((z[0] & 0x1f)<<6) | (z[1] & 0x3f);
10321032
return 2;
10331033
}
10341034
if( (z[0] & 0xf0)==0xe0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80 ){
10351035
--- extsrc/shell.c
+++ extsrc/shell.c
@@ -1024,11 +1024,11 @@
1024 ** Compute the value and length of a multi-byte UTF-8 character that
1025 ** begins at z[0]. Return the length. Write the Unicode value into *pU.
1026 **
1027 ** This routine only works for *multi-byte* UTF-8 characters.
1028 */
1029 static int decodeUtf8(const unsigned char *z, int *pU){
1030 if( (z[0] & 0xe0)==0xc0 && (z[1] & 0xc0)==0x80 ){
1031 *pU = ((z[0] & 0x1f)<<6) | (z[1] & 0x3f);
1032 return 2;
1033 }
1034 if( (z[0] & 0xf0)==0xe0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80 ){
1035
--- extsrc/shell.c
+++ extsrc/shell.c
@@ -1024,11 +1024,11 @@
1024 ** Compute the value and length of a multi-byte UTF-8 character that
1025 ** begins at z[0]. Return the length. Write the Unicode value into *pU.
1026 **
1027 ** This routine only works for *multi-byte* UTF-8 characters.
1028 */
1029 int decodeUtf8(const unsigned char *z, int *pU){
1030 if( (z[0] & 0xe0)==0xc0 && (z[1] & 0xc0)==0x80 ){
1031 *pU = ((z[0] & 0x1f)<<6) | (z[1] & 0x3f);
1032 return 2;
1033 }
1034 if( (z[0] & 0xf0)==0xe0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80 ){
1035
--- src/comformat.c
+++ src/comformat.c
@@ -294,10 +294,15 @@
294294
while( cchUTF8<maxUTF8 &&
295295
(zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
296296
cchUTF8++;
297297
zBuf[iBuf++] = zLine[index++];
298298
}
299
+ if( cchUTF8>1 ){
300
+ int utf32;
301
+ decodeUtf8(&zLine[index-cchUTF8],&utf32);
302
+ useChars += cli_wcwidth(utf32) - 1;
303
+ }
299304
maxChars -= useChars;
300305
if( maxChars<=0 ) break;
301306
if( c=='\n' ) break;
302307
}
303308
if( charCnt>0 ){
@@ -380,10 +385,15 @@
380385
(zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
381386
cchUTF8++;
382387
zBuf[k++] = zText[++i];
383388
}
384389
}
390
+ if( cchUTF8>1 ){
391
+ int utf32;
392
+ decodeUtf8(&zText[k-cchUTF8],&utf32);
393
+ kc += cli_wcwidth(utf32) - 1;
394
+ }
385395
else if( fossil_isspace(c) ){
386396
si = i;
387397
sk = k;
388398
if( k==0 || zBuf[k-1]!=' ' ){
389399
zBuf[k++] = ' ';
390400
--- src/comformat.c
+++ src/comformat.c
@@ -294,10 +294,15 @@
294 while( cchUTF8<maxUTF8 &&
295 (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
296 cchUTF8++;
297 zBuf[iBuf++] = zLine[index++];
298 }
 
 
 
 
 
299 maxChars -= useChars;
300 if( maxChars<=0 ) break;
301 if( c=='\n' ) break;
302 }
303 if( charCnt>0 ){
@@ -380,10 +385,15 @@
380 (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
381 cchUTF8++;
382 zBuf[k++] = zText[++i];
383 }
384 }
 
 
 
 
 
385 else if( fossil_isspace(c) ){
386 si = i;
387 sk = k;
388 if( k==0 || zBuf[k-1]!=' ' ){
389 zBuf[k++] = ' ';
390
--- src/comformat.c
+++ src/comformat.c
@@ -294,10 +294,15 @@
294 while( cchUTF8<maxUTF8 &&
295 (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
296 cchUTF8++;
297 zBuf[iBuf++] = zLine[index++];
298 }
299 if( cchUTF8>1 ){
300 int utf32;
301 decodeUtf8(&zLine[index-cchUTF8],&utf32);
302 useChars += cli_wcwidth(utf32) - 1;
303 }
304 maxChars -= useChars;
305 if( maxChars<=0 ) break;
306 if( c=='\n' ) break;
307 }
308 if( charCnt>0 ){
@@ -380,10 +385,15 @@
385 (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
386 cchUTF8++;
387 zBuf[k++] = zText[++i];
388 }
389 }
390 if( cchUTF8>1 ){
391 int utf32;
392 decodeUtf8(&zText[k-cchUTF8],&utf32);
393 kc += cli_wcwidth(utf32) - 1;
394 }
395 else if( fossil_isspace(c) ){
396 si = i;
397 sk = k;
398 if( k==0 || zBuf[k-1]!=' ' ){
399 zBuf[k++] = ' ';
400

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button