Fossil SCM

Merge updates to the character width measurements of the comment formatter. Note that multi-byte and wide characters are not handled in the comment prefix, which is entirely controlled by the application and only contains ASCII text.

florian 2024-10-05 13:29 trunk merge
Commit 725af9479166f68a76b2dc564329d35ddf1278404dbca97b63ba3730576eb1af
+108 -106
--- src/comformat.c
+++ src/comformat.c
@@ -31,11 +31,11 @@
3131
#define COMMENT_PRINT_ORIG_BREAK ((u32)0x00000010) /* Break before original. */
3232
#define COMMENT_PRINT_DEFAULT (COMMENT_PRINT_LEGACY) /* Defaults. */
3333
#define COMMENT_PRINT_UNSET (-1) /* Not initialized. */
3434
#endif
3535
36
-/********* Code copied from SQLite src/shell.c.in on 2024-09-28 **********/
36
+/********* Code copied from SQLite src/shell.c.in on 2024-09-30 **********/
3737
/* Lookup table to estimate the number of columns consumed by a Unicode
3838
** character.
3939
*/
4040
static const struct {
4141
unsigned char w; /* Width of the character in columns */
@@ -136,36 +136,10 @@
136136
}
137137
}
138138
if( aUWidth[iLast].iFirst > c ) return aUWidth[iFirst].w;
139139
return aUWidth[iLast].w;
140140
}
141
-
142
-/*
143
-** Compute the value and length of a multi-byte UTF-8 character that
144
-** begins at z[0]. Return the length. Write the Unicode value into *pU.
145
-**
146
-** This routine only works for *multi-byte* UTF-8 characters.
147
-*/
148
-static int decodeUtf8(const unsigned char *z, int *pU){
149
- if( (z[0] & 0xe0)==0xc0 && (z[1] & 0xc0)==0x80 ){
150
- *pU = ((z[0] & 0x1f)<<6) | (z[1] & 0x3f);
151
- return 2;
152
- }
153
- if( (z[0] & 0xf0)==0xe0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80 ){
154
- *pU = ((z[0] & 0x0f)<<12) | ((z[1] & 0x3f)<<6) | (z[2] & 0x3f);
155
- return 3;
156
- }
157
- if( (z[0] & 0xf8)==0xf0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80
158
- && (z[3] & 0xc0)==0x80
159
- ){
160
- *pU = ((z[0] & 0x0f)<<18) | ((z[1] & 0x3f)<<12) | ((z[2] & 0x3f))<<6
161
- | (z[4] & 0x3f);
162
- return 4;
163
- }
164
- *pU = 0;
165
- return 1;
166
-}
167141
/******* End of code copied from SQLite *************************************/
168142
169143
/*
170144
** This is the previous value used by most external callers when they
171145
** needed to specify a default maximum line length to be used with the
@@ -241,62 +215,96 @@
241215
** algorithm, the NUL character is treated the same as a spacing character.
242216
*/
243217
static int comment_next_space(
244218
const char *zLine, /* [in] The comment line being printed. */
245219
int index, /* [in] The current character index being handled. */
246
- int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */
220
+ int maxChars, /* [in] Optimization hint to abort before space found. */
221
+ int *sumWidth /* [out] Summated width of all characters to next space. */
247222
){
248
- int nextIndex = index + 1;
249
- int fNonASCII=0;
223
+ int cchUTF8, utf32, wcwidth = 0;
224
+ int nextIndex = index;
250225
for(;;){
251
- char c = zLine[nextIndex];
252
- if( (c&0x80)==0x80 ) fNonASCII=1;
253
- if( c==0 || fossil_isspace(c) ){
254
- if( distUTF8 ){
255
- if( fNonASCII!=0 ){
256
- *distUTF8 = strlen_utf8(&zLine[index], nextIndex-index);
257
- }else{
258
- *distUTF8 = nextIndex-index;
259
- }
260
- }
226
+ char_info_utf8(&zLine[nextIndex],&cchUTF8,&utf32);
227
+ nextIndex += cchUTF8;
228
+ wcwidth += cli_wcwidth(utf32);
229
+ if( zLine[nextIndex]==0 || fossil_isspace(zLine[nextIndex]) ||
230
+ wcwidth>maxChars ){
231
+ *sumWidth = wcwidth;
261232
return nextIndex;
262233
}
263
- nextIndex++;
264234
}
265235
return 0; /* NOT REACHED */
266236
}
267237
268238
/*
269
-** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and
270
-** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0
271
-** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte
272
-** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are
273
-** treated as invalid 1-byte sequences (as lone trail bytes).
274
-** Combining characters and East Asian Wide and Fullwidth characters are counted
275
-** as one, so this function does not calculate the effective "display width".
239
+** Return information about the next (single- or multi-byte) character in the
240
+** specified UTF-8 string: The number of UTF-8 code units (in this case: bytes)
241
+** and the decoded UTF-32 code point. Incomplete, ill-formed and overlong
242
+** sequences are consumed together as one invalid code point. The invalid lead
243
+** bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2-
244
+** and 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF
245
+** are treated as invalid 1-byte sequences (as lone trail bytes), all resulting
246
+** in one invalid code point. Invalid UTF-8 sequences encoding a non-scalar code
247
+** point (UTF-16 surrogates U+D800 to U+DFFF) are allowed.
276248
*/
277
-int strlen_utf8(const char *zString, int lengthBytes){
278
- int i; /* Counted bytes. */
279
- int lengthUTF8; /* Counted UTF-8 sequences. */
280
-#if 0
281
- assert( lengthBytes>=0 );
249
+void char_info_utf8(
250
+ const char *z,
251
+ int *pCchUTF8,
252
+ int *pUtf32
253
+){
254
+ int i = 0; /* Counted bytes. */
255
+ int cchUTF8 = 1; /* Code units consumed. */
256
+ int maxUTF8 = 1; /* Expected sequence length. */
257
+ char c = z[i++];
258
+ if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */
259
+ *pCchUTF8 = 1;
260
+ *pUtf32 = (int)z[0];
261
+ return;
262
+ }
263
+ else if( (c&0xe0)==0xc0 ) maxUTF8 = 2; /* UTF-8 lead byte 110vvvvv */
264
+ else if( (c&0xf0)==0xe0 ) maxUTF8 = 3; /* UTF-8 lead byte 1110vvvv */
265
+ else if( (c&0xf8)==0xf0 ) maxUTF8 = 4; /* UTF-8 lead byte 11110vvv */
266
+ while( cchUTF8<maxUTF8 &&
267
+ (z[i]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
268
+ cchUTF8++;
269
+ i++;
270
+ }
271
+ *pCchUTF8 = cchUTF8;
272
+ if( cchUTF8!=maxUTF8 || /* Incomplete UTF-8 sequence. */
273
+ ( cchUTF8==1 && (c&0x80)==0x80 )){ /* Lone UTF-8 trail byte. */
274
+ *pUtf32 = 0xfffd; /* U+FFFD Replacement Character */
275
+#ifdef FOSSIL_DEBUG
276
+ assert( *pUtf32!=0xfffd ); /* Invalid UTF-8 sequence. */
282277
#endif
283
- for(i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++){
284
- char c = zString[i];
285
- int cchUTF8=1; /* Code units consumed. */
286
- int maxUTF8=1; /* Expected sequence length. */
287
- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
288
- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
289
- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
290
- while( cchUTF8<maxUTF8 &&
291
- i<lengthBytes-1 &&
292
- (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
293
- cchUTF8++;
294
- i++;
295
- }
296
- }
297
- return lengthUTF8;
278
+ return;
279
+ }
280
+ switch( cchUTF8 ){
281
+ case 4:
282
+ *pUtf32 =
283
+ ( (z[0] & 0x0f)<<18 ) |
284
+ ( (z[1] & 0x3f)<<12 ) |
285
+ ( (z[2] & 0x3f)<< 6 ) |
286
+ ( (z[4] & 0x3f)<< 0 ) ;
287
+ break;
288
+ case 3:
289
+ *pUtf32 =
290
+ ( (z[0] & 0x0f)<<12 ) |
291
+ ( (z[1] & 0x3f)<< 6 ) |
292
+ ( (z[2] & 0x3f)<< 0 ) ;
293
+ break;
294
+ case 2:
295
+ *pUtf32 =
296
+ ( (z[0] & 0x1f)<< 6 ) |
297
+ ( (z[1] & 0x3f)<< 0 ) ;
298
+ break;
299
+ }
300
+#ifdef FOSSIL_DEBUG
301
+ assert(
302
+ *pUtf32>=0 && *pUtf32<=0x10ffff && /* Valid range U+0000 to U+10FFFF. */
303
+ *pUtf32<0xd800 && *pUtf32>0xdfff /* Non-scalar (UTF-16 surrogates). */
304
+ );
305
+#endif
298306
}
299307
300308
/*
301309
** This function is called when printing a logical comment line to calculate
302310
** the necessary indenting. The caller needs to emit the indenting spaces.
@@ -339,11 +347,10 @@
339347
int *pLineCnt, /* [in/out] Pointer to the total line count. */
340348
const char **pzLine /* [out] Pointer to the end of the logical line. */
341349
){
342350
int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
343351
char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
344
- int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
345352
if( !zLine ) return;
346353
if( lineChars<=0 ) return;
347354
#if 0
348355
assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
349356
assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
@@ -362,10 +369,11 @@
362369
/* Limit line indent to fit output buffer. */
363370
origIndent = sizeof(zBuf)-6;
364371
}
365372
maxChars = lineChars;
366373
for(;;){
374
+ int cchUTF8, utf32;
367375
int useChars = 1;
368376
char c = zLine[index];
369377
/* Flush the output buffer if there's no space left for at least one more
370378
** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces,
371379
** a new line, and a terminating NULL. */
@@ -393,48 +401,47 @@
393401
if( c=='\n' ){
394402
lineCnt++;
395403
charCnt = 0;
396404
useChars = 0;
397405
}else if( c=='\t' ){
398
- int distUTF8;
399
- int nextIndex = comment_next_space(zLine, index, &distUTF8);
400
- if( nextIndex<=0 || distUTF8>maxChars ){
406
+ int sumWidth;
407
+ int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
408
+ if( nextIndex<=0 || sumWidth>maxChars ){
401409
break;
402410
}
403411
charCnt++;
404412
useChars = COMMENT_TAB_WIDTH;
405413
if( maxChars<useChars ){
406414
zBuf[iBuf++] = ' ';
407415
break;
408416
}
409417
}else if( wordBreak && fossil_isspace(c) ){
410
- int distUTF8;
411
- int nextIndex = comment_next_space(zLine, index, &distUTF8);
412
- if( nextIndex<=0 || distUTF8>=maxChars ){
418
+ int sumWidth;
419
+ int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
420
+ if( nextIndex<=0 || sumWidth>=maxChars ){
413421
break;
414422
}
415423
charCnt++;
416424
}else{
417425
charCnt++;
418426
}
419427
assert( c!='\n' || charCnt==0 );
420428
zBuf[iBuf++] = c;
421
- /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
422
- cchUTF8=1; /* Code units consumed. */
423
- maxUTF8=1; /* Expected sequence length. */
424
- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
425
- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
426
- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
427
- while( cchUTF8<maxUTF8 &&
428
- (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
429
- cchUTF8++;
430
- zBuf[iBuf++] = zLine[index++];
431
- }
429
+ char_info_utf8(&zLine[index-1],&cchUTF8,&utf32);
432430
if( cchUTF8>1 ){
433
- int utf32;
434
- decodeUtf8((const unsigned char*)&zLine[index-cchUTF8],&utf32);
435
- useChars += cli_wcwidth(utf32) - 1;
431
+ int wcwidth;
432
+ wcwidth = cli_wcwidth(utf32);
433
+ if( wcwidth>maxChars && lineChars>=wcwidth ){ /* rollback */
434
+ index--;
435
+ iBuf--;
436
+ zBuf[iBuf] = 0;
437
+ break;
438
+ }
439
+ for( ; cchUTF8>1; cchUTF8-- ){
440
+ zBuf[iBuf++] = zLine[index++];
441
+ }
442
+ useChars += wcwidth - 1;
436443
}
437444
maxChars -= useChars;
438445
if( maxChars<=0 ) break;
439446
if( c=='\n' ) break;
440447
}
@@ -476,11 +483,10 @@
476483
int si, sk, i, k, kc;
477484
int doIndent = 0;
478485
char *zBuf;
479486
char zBuffer[400];
480487
int lineCnt = 0;
481
- int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
482488
483489
if( width<0 ){
484490
comment_set_maxchars(indent, &maxChars);
485491
}
486492
if( zText==0 ) zText = "(NULL)";
@@ -502,30 +508,25 @@
502508
}
503509
if( zBuf!=zBuffer) fossil_free(zBuf);
504510
return lineCnt;
505511
}
506512
for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
513
+ int cchUTF8, utf32;
507514
char c = zText[i];
508515
kc++; /* Count complete UTF-8 sequences. */
509
- /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
510
- cchUTF8=1; /* Code units consumed. */
511
- maxUTF8=1; /* Expected sequence length. */
512
- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
513
- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
514
- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
515
- if( maxUTF8>1 ){
516
- zBuf[k++] = c;
517
- while( cchUTF8<maxUTF8 &&
518
- (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
519
- cchUTF8++;
516
+ char_info_utf8(&zText[i],&cchUTF8,&utf32);
517
+ if( cchUTF8>1 ){
518
+ int wcwidth;
519
+ wcwidth = cli_wcwidth(utf32);
520
+ if( kc+wcwidth-1>maxChars && maxChars>=wcwidth ){ /* rollback */
521
+ kc--;
522
+ break;
523
+ }
524
+ for( i--; cchUTF8>0; cchUTF8-- ){
520525
zBuf[k++] = zText[++i];
521526
}
522
- }
523
- if( cchUTF8>1 ){
524
- int utf32;
525
- decodeUtf8((const unsigned char*)&zText[k-cchUTF8],&utf32);
526
- kc += cli_wcwidth(utf32) - 1;
527
+ kc += wcwidth - 1;
527528
}
528529
else if( fossil_isspace(c) ){
529530
si = i;
530531
sk = k;
531532
if( k==0 || zBuf[k-1]!=' ' ){
@@ -742,10 +743,11 @@
742743
if( zIndent ){
743744
indent = atoi(zIndent);
744745
}else{
745746
indent = -1; /* automatic */
746747
}
748
+ verify_all_options();
747749
if( g.argc!=4 && g.argc!=5 ){
748750
usage("?OPTIONS? PREFIX TEXT ?ORIGTEXT?");
749751
}
750752
zPrefix = g.argv[2];
751753
zText = g.argv[3];
752754
--- src/comformat.c
+++ src/comformat.c
@@ -31,11 +31,11 @@
31 #define COMMENT_PRINT_ORIG_BREAK ((u32)0x00000010) /* Break before original. */
32 #define COMMENT_PRINT_DEFAULT (COMMENT_PRINT_LEGACY) /* Defaults. */
33 #define COMMENT_PRINT_UNSET (-1) /* Not initialized. */
34 #endif
35
36 /********* Code copied from SQLite src/shell.c.in on 2024-09-28 **********/
37 /* Lookup table to estimate the number of columns consumed by a Unicode
38 ** character.
39 */
40 static const struct {
41 unsigned char w; /* Width of the character in columns */
@@ -136,36 +136,10 @@
136 }
137 }
138 if( aUWidth[iLast].iFirst > c ) return aUWidth[iFirst].w;
139 return aUWidth[iLast].w;
140 }
141
/*
** Decode the multi-byte UTF-8 character that begins at z[0].  Write the
** Unicode code point into *pU and return the number of bytes consumed
** (2, 3 or 4).
**
** This routine only works for *multi-byte* UTF-8 characters.  If z[0] does
** not start a complete 2-, 3- or 4-byte sequence (this includes plain ASCII
** bytes and truncated sequences), *pU is set to 0 and 1 is returned.
**
** The trail-byte tests are evaluated left to right, so a NUL terminator
** inside a truncated sequence stops the scan before any byte beyond the
** terminator is read.
*/
static int decodeUtf8(const unsigned char *z, int *pU){
  /* 110vvvvv 10vvvvvv */
  if( (z[0] & 0xe0)==0xc0 && (z[1] & 0xc0)==0x80 ){
    *pU = ((z[0] & 0x1f)<<6) | (z[1] & 0x3f);
    return 2;
  }
  /* 1110vvvv 10vvvvvv 10vvvvvv */
  if( (z[0] & 0xf0)==0xe0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80 ){
    *pU = ((z[0] & 0x0f)<<12) | ((z[1] & 0x3f)<<6) | (z[2] & 0x3f);
    return 3;
  }
  /* 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv */
  if( (z[0] & 0xf8)==0xf0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80
   && (z[3] & 0xc0)==0x80
  ){
    /* The last payload byte is z[3]; z[4] already belongs to whatever
    ** follows this character (or lies past the NUL terminator). */
    *pU = ((z[0] & 0x07)<<18) | ((z[1] & 0x3f)<<12) | ((z[2] & 0x3f)<<6)
          | (z[3] & 0x3f);
    return 4;
  }
  *pU = 0;
  return 1;
}
167 /******* End of code copied from SQLite *************************************/
168
169 /*
170 ** This is the previous value used by most external callers when they
171 ** needed to specify a default maximum line length to be used with the
@@ -241,62 +215,96 @@
241 ** algorithm, the NUL character is treated the same as a spacing character.
242 */
static int comment_next_space(
  const char *zLine, /* [in] The comment line being printed. */
  int index,         /* [in] The current character index being handled. */
  int *distUTF8      /* [out] Distance to next space in UTF-8 sequences. */
){
  int sawHighBit = 0;  /* Set once any byte with bit 7 set has been scanned. */
  int pos;             /* Byte index currently being examined. */

  /* Scan forward from the byte after `index` until a NUL or a spacing
  ** character is found; that byte's index is the return value. */
  for(pos=index+1; ; pos++){
    char ch = zLine[pos];
    if( (ch&0x80)!=0 ) sawHighBit = 1;
    if( ch!=0 && !fossil_isspace(ch) ) continue;
    if( distUTF8 ){
      /* Pure ASCII spans need no sequence counting: bytes == sequences. */
      *distUTF8 = sawHighBit ? strlen_utf8(&zLine[index], pos-index)
                             : pos-index;
    }
    return pos;
  }
  return 0; /* NOT REACHED */
}
267
/*
** Return the number of UTF-8 sequences found in the first lengthBytes bytes
** of zString.  A lead byte together with up to as many trail bytes as it
** announces is counted as one sequence; incomplete, ill-formed and overlong
** sequences are also counted as one.  The invalid lead bytes 0xC0 to 0xC1 and
** 0xF5 to 0xF7 may start (ill-formed) 2- and 4-byte sequences, respectively,
** while the remaining invalid lead bytes 0xF8 to 0xFF are counted as 1-byte
** sequences, exactly like lone trail bytes.
** Combining characters and East Asian Wide and Fullwidth characters count as
** one each, so the result is not the effective "display width".
** A lengthBytes of zero or less yields 0.
*/
int strlen_utf8(const char *zString, int lengthBytes){
  int nSeq = 0;   /* Sequences counted so far. */
  int pos = 0;    /* Index of the next unread byte. */
  while( pos<lengthBytes ){
    char lead = zString[pos++];
    int nTrail;   /* Trail bytes still acceptable for this sequence. */
    if( (lead&0xe0)==0xc0 ){
      nTrail = 1;                    /* UTF-8 lead byte 110vvvvv */
    }else if( (lead&0xf0)==0xe0 ){
      nTrail = 2;                    /* UTF-8 lead byte 1110vvvv */
    }else if( (lead&0xf8)==0xf0 ){
      nTrail = 3;                    /* UTF-8 lead byte 11110vvv */
    }else{
      nTrail = 0;                    /* ASCII, lone trail or invalid lead */
    }
    /* Swallow the trail bytes (10vvvvvv) that belong to this lead byte,
    ** never reading at or beyond zString[lengthBytes]. */
    while( nTrail>0 && pos<lengthBytes && (zString[pos]&0xc0)==0x80 ){
      pos++;
      nTrail--;
    }
    nSeq++;
  }
  return nSeq;
}
299
300 /*
301 ** This function is called when printing a logical comment line to calculate
302 ** the necessary indenting. The caller needs to emit the indenting spaces.
@@ -339,11 +347,10 @@
339 int *pLineCnt, /* [in/out] Pointer to the total line count. */
340 const char **pzLine /* [out] Pointer to the end of the logical line. */
341 ){
342 int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
343 char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
344 int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
345 if( !zLine ) return;
346 if( lineChars<=0 ) return;
347 #if 0
348 assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
349 assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
@@ -362,10 +369,11 @@
362 /* Limit line indent to fit output buffer. */
363 origIndent = sizeof(zBuf)-6;
364 }
365 maxChars = lineChars;
366 for(;;){
 
367 int useChars = 1;
368 char c = zLine[index];
369 /* Flush the output buffer if there's no space left for at least one more
370 ** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces,
371 ** a new line, and a terminating NULL. */
@@ -393,48 +401,47 @@
393 if( c=='\n' ){
394 lineCnt++;
395 charCnt = 0;
396 useChars = 0;
397 }else if( c=='\t' ){
398 int distUTF8;
399 int nextIndex = comment_next_space(zLine, index, &distUTF8);
400 if( nextIndex<=0 || distUTF8>maxChars ){
401 break;
402 }
403 charCnt++;
404 useChars = COMMENT_TAB_WIDTH;
405 if( maxChars<useChars ){
406 zBuf[iBuf++] = ' ';
407 break;
408 }
409 }else if( wordBreak && fossil_isspace(c) ){
410 int distUTF8;
411 int nextIndex = comment_next_space(zLine, index, &distUTF8);
412 if( nextIndex<=0 || distUTF8>=maxChars ){
413 break;
414 }
415 charCnt++;
416 }else{
417 charCnt++;
418 }
419 assert( c!='\n' || charCnt==0 );
420 zBuf[iBuf++] = c;
421 /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
422 cchUTF8=1; /* Code units consumed. */
423 maxUTF8=1; /* Expected sequence length. */
424 if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
425 else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
426 else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
427 while( cchUTF8<maxUTF8 &&
428 (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
429 cchUTF8++;
430 zBuf[iBuf++] = zLine[index++];
431 }
432 if( cchUTF8>1 ){
433 int utf32;
434 decodeUtf8((const unsigned char*)&zLine[index-cchUTF8],&utf32);
435 useChars += cli_wcwidth(utf32) - 1;
 
 
 
 
 
 
 
 
 
436 }
437 maxChars -= useChars;
438 if( maxChars<=0 ) break;
439 if( c=='\n' ) break;
440 }
@@ -476,11 +483,10 @@
476 int si, sk, i, k, kc;
477 int doIndent = 0;
478 char *zBuf;
479 char zBuffer[400];
480 int lineCnt = 0;
481 int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
482
483 if( width<0 ){
484 comment_set_maxchars(indent, &maxChars);
485 }
486 if( zText==0 ) zText = "(NULL)";
@@ -502,30 +508,25 @@
502 }
503 if( zBuf!=zBuffer) fossil_free(zBuf);
504 return lineCnt;
505 }
506 for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
 
507 char c = zText[i];
508 kc++; /* Count complete UTF-8 sequences. */
509 /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
510 cchUTF8=1; /* Code units consumed. */
511 maxUTF8=1; /* Expected sequence length. */
512 if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
513 else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
514 else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
515 if( maxUTF8>1 ){
516 zBuf[k++] = c;
517 while( cchUTF8<maxUTF8 &&
518 (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
519 cchUTF8++;
520 zBuf[k++] = zText[++i];
521 }
522 }
523 if( cchUTF8>1 ){
524 int utf32;
525 decodeUtf8((const unsigned char*)&zText[k-cchUTF8],&utf32);
526 kc += cli_wcwidth(utf32) - 1;
527 }
528 else if( fossil_isspace(c) ){
529 si = i;
530 sk = k;
531 if( k==0 || zBuf[k-1]!=' ' ){
@@ -742,10 +743,11 @@
742 if( zIndent ){
743 indent = atoi(zIndent);
744 }else{
745 indent = -1; /* automatic */
746 }
 
747 if( g.argc!=4 && g.argc!=5 ){
748 usage("?OPTIONS? PREFIX TEXT ?ORIGTEXT?");
749 }
750 zPrefix = g.argv[2];
751 zText = g.argv[3];
752
--- src/comformat.c
+++ src/comformat.c
@@ -31,11 +31,11 @@
31 #define COMMENT_PRINT_ORIG_BREAK ((u32)0x00000010) /* Break before original. */
32 #define COMMENT_PRINT_DEFAULT (COMMENT_PRINT_LEGACY) /* Defaults. */
33 #define COMMENT_PRINT_UNSET (-1) /* Not initialized. */
34 #endif
35
36 /********* Code copied from SQLite src/shell.c.in on 2024-09-30 **********/
37 /* Lookup table to estimate the number of columns consumed by a Unicode
38 ** character.
39 */
40 static const struct {
41 unsigned char w; /* Width of the character in columns */
@@ -136,36 +136,10 @@
136 }
137 }
138 if( aUWidth[iLast].iFirst > c ) return aUWidth[iFirst].w;
139 return aUWidth[iLast].w;
140 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141 /******* End of code copied from SQLite *************************************/
142
143 /*
144 ** This is the previous value used by most external callers when they
145 ** needed to specify a default maximum line length to be used with the
@@ -241,62 +215,96 @@
215 ** algorithm, the NUL character is treated the same as a spacing character.
216 */
static int comment_next_space(
  const char *zLine, /* [in] The comment line being printed. */
  int index,         /* [in] The current character index being handled. */
  int maxChars,      /* [in] Optimization hint to abort before space found. */
  int *sumWidth      /* [out] Summed width of all characters to next space. */
){
  int pos = index;   /* Byte index of the next undecoded character. */
  int width = 0;     /* Accumulated column width of the decoded characters. */
  char next;         /* First byte following the characters decoded so far. */

  /* The character at `index` itself is always decoded and measured; after
  ** that, keep going until the following byte is NUL or a spacing character,
  ** or the accumulated width already exceeds maxChars. */
  do{
    int nByte, codePoint;
    char_info_utf8(&zLine[pos], &nByte, &codePoint);
    pos += nByte;
    width += cli_wcwidth(codePoint);
    next = zLine[pos];
  }while( next!=0 && !fossil_isspace(next) && width<=maxChars );

  *sumWidth = width;
  return pos;
}
237
/*
** Return information about the next (single- or multi-byte) character in the
** specified UTF-8 string: The number of UTF-8 code units (in this case: bytes)
** is written to *pCchUTF8 and the decoded UTF-32 code point to *pUtf32.
**
** Incomplete, ill-formed and overlong sequences are consumed together as one
** invalid code point. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are
** allowed to initiate (ill-formed) 2- and 4-byte sequences, respectively, the
** other invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte
** sequences (as lone trail bytes). Incomplete sequences and lone trail bytes
** are reported as U+FFFD. Sequences encoding a non-scalar code point (UTF-16
** surrogates U+D800 to U+DFFF) are decoded as-is.
**
** When FOSSIL_DEBUG is defined, invalid sequences and decoded values that are
** surrogates or lie outside U+0000 to U+10FFFF trigger an assert().
*/
void char_info_utf8(
  const char *z,
  int *pCchUTF8,
  int *pUtf32
){
  int i = 0;                              /* Counted bytes. */
  int cchUTF8 = 1;                        /* Code units consumed. */
  int maxUTF8 = 1;                        /* Expected sequence length. */
  char c = z[i++];
  if( (c&0x80)==0x00 ){                   /* 7-bit ASCII character. */
    *pCchUTF8 = 1;
    *pUtf32 = (int)z[0];
    return;
  }
  else if( (c&0xe0)==0xc0 ) maxUTF8 = 2;  /* UTF-8 lead byte 110vvvvv */
  else if( (c&0xf0)==0xe0 ) maxUTF8 = 3;  /* UTF-8 lead byte 1110vvvv */
  else if( (c&0xf8)==0xf0 ) maxUTF8 = 4;  /* UTF-8 lead byte 11110vvv */
  /* Consume trail bytes; a NUL terminator is not a trail byte, so the scan
  ** never runs past the end of the string. */
  while( cchUTF8<maxUTF8 &&
         (z[i]&0xc0)==0x80 ){             /* UTF-8 trail byte 10vvvvvv */
    cchUTF8++;
    i++;
  }
  *pCchUTF8 = cchUTF8;
  if( cchUTF8!=maxUTF8 ||                 /* Incomplete UTF-8 sequence. */
      ( cchUTF8==1 && (c&0x80)==0x80 )){  /* Lone UTF-8 trail byte. */
    *pUtf32 = 0xfffd;                     /* U+FFFD Replacement Character */
#ifdef FOSSIL_DEBUG
    assert( 0 );                          /* Invalid UTF-8 sequence. */
#endif
    return;
  }
  switch( cchUTF8 ){
    case 4:
      /* The fourth byte of the sequence is z[3]; z[4] is already the start
      ** of the next character (or beyond the NUL terminator). */
      *pUtf32 =
        ( (z[0] & 0x07)<<18 ) |
        ( (z[1] & 0x3f)<<12 ) |
        ( (z[2] & 0x3f)<< 6 ) |
        ( (z[3] & 0x3f)<< 0 ) ;
      break;
    case 3:
      *pUtf32 =
        ( (z[0] & 0x0f)<<12 ) |
        ( (z[1] & 0x3f)<< 6 ) |
        ( (z[2] & 0x3f)<< 0 ) ;
      break;
    case 2:
      *pUtf32 =
        ( (z[0] & 0x1f)<< 6 ) |
        ( (z[1] & 0x3f)<< 0 ) ;
      break;
    default:
      /* Not reachable: ASCII and 1-byte invalid input returned above. */
      *pUtf32 = 0xfffd;
      break;
  }
#ifdef FOSSIL_DEBUG
  assert(
    *pUtf32>=0 && *pUtf32<=0x10ffff &&    /* Valid range U+0000 to U+10FFFF. */
    ( *pUtf32<0xd800 || *pUtf32>0xdfff )  /* Not a UTF-16 surrogate. */
  );
#endif
}
307
308 /*
309 ** This function is called when printing a logical comment line to calculate
310 ** the necessary indenting. The caller needs to emit the indenting spaces.
@@ -339,11 +347,10 @@
347 int *pLineCnt, /* [in/out] Pointer to the total line count. */
348 const char **pzLine /* [out] Pointer to the end of the logical line. */
349 ){
350 int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
351 char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
 
352 if( !zLine ) return;
353 if( lineChars<=0 ) return;
354 #if 0
355 assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
356 assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
@@ -362,10 +369,11 @@
369 /* Limit line indent to fit output buffer. */
370 origIndent = sizeof(zBuf)-6;
371 }
372 maxChars = lineChars;
373 for(;;){
374 int cchUTF8, utf32;
375 int useChars = 1;
376 char c = zLine[index];
377 /* Flush the output buffer if there's no space left for at least one more
378 ** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces,
379 ** a new line, and a terminating NULL. */
@@ -393,48 +401,47 @@
401 if( c=='\n' ){
402 lineCnt++;
403 charCnt = 0;
404 useChars = 0;
405 }else if( c=='\t' ){
406 int sumWidth;
407 int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
408 if( nextIndex<=0 || sumWidth>maxChars ){
409 break;
410 }
411 charCnt++;
412 useChars = COMMENT_TAB_WIDTH;
413 if( maxChars<useChars ){
414 zBuf[iBuf++] = ' ';
415 break;
416 }
417 }else if( wordBreak && fossil_isspace(c) ){
418 int sumWidth;
419 int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
420 if( nextIndex<=0 || sumWidth>=maxChars ){
421 break;
422 }
423 charCnt++;
424 }else{
425 charCnt++;
426 }
427 assert( c!='\n' || charCnt==0 );
428 zBuf[iBuf++] = c;
429 char_info_utf8(&zLine[index-1],&cchUTF8,&utf32);
 
 
 
 
 
 
 
 
 
 
430 if( cchUTF8>1 ){
431 int wcwidth;
432 wcwidth = cli_wcwidth(utf32);
433 if( wcwidth>maxChars && lineChars>=wcwidth ){ /* rollback */
434 index--;
435 iBuf--;
436 zBuf[iBuf] = 0;
437 break;
438 }
439 for( ; cchUTF8>1; cchUTF8-- ){
440 zBuf[iBuf++] = zLine[index++];
441 }
442 useChars += wcwidth - 1;
443 }
444 maxChars -= useChars;
445 if( maxChars<=0 ) break;
446 if( c=='\n' ) break;
447 }
@@ -476,11 +483,10 @@
483 int si, sk, i, k, kc;
484 int doIndent = 0;
485 char *zBuf;
486 char zBuffer[400];
487 int lineCnt = 0;
 
488
489 if( width<0 ){
490 comment_set_maxchars(indent, &maxChars);
491 }
492 if( zText==0 ) zText = "(NULL)";
@@ -502,30 +508,25 @@
508 }
509 if( zBuf!=zBuffer) fossil_free(zBuf);
510 return lineCnt;
511 }
512 for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
513 int cchUTF8, utf32;
514 char c = zText[i];
515 kc++; /* Count complete UTF-8 sequences. */
516 char_info_utf8(&zText[i],&cchUTF8,&utf32);
517 if( cchUTF8>1 ){
518 int wcwidth;
519 wcwidth = cli_wcwidth(utf32);
520 if( kc+wcwidth-1>maxChars && maxChars>=wcwidth ){ /* rollback */
521 kc--;
522 break;
523 }
524 for( i--; cchUTF8>0; cchUTF8-- ){
 
 
525 zBuf[k++] = zText[++i];
526 }
527 kc += wcwidth - 1;
 
 
 
 
528 }
529 else if( fossil_isspace(c) ){
530 si = i;
531 sk = k;
532 if( k==0 || zBuf[k-1]!=' ' ){
@@ -742,10 +743,11 @@
743 if( zIndent ){
744 indent = atoi(zIndent);
745 }else{
746 indent = -1; /* automatic */
747 }
748 verify_all_options();
749 if( g.argc!=4 && g.argc!=5 ){
750 usage("?OPTIONS? PREFIX TEXT ?ORIGTEXT?");
751 }
752 zPrefix = g.argv[2];
753 zText = g.argv[3];
754
--- test/comment.test
+++ test/comment.test
@@ -319,8 +319,28 @@
319319
###############################################################################
320320
321321
fossil test-comment-format --width 81 --indent 9 --decode --trimcrlf --origbreak "00:00:00 " "\[0000000000\] *CURRENT* $orig" $orig
322322
test comment-60 {$RESULT eq "00:00:00 \[0000000000\] *CURRENT* \n xxxx xx xxxxxxx xxxx xxxxxx xxxxxxx, xxxxxxx, x xxxx xxxxxx xx xxxx xxxx\n xxxxxxx xxxxx xxxx xxxx xx xxxxxxx xxxxxxx (xxxxxx xxxxxxxxx x xxxxx).\n xxx'x xxx xxx xx xxxxx xxxx xxx xxx --xxxxxxxxxxx xxxxxx xx xx xxxx. x\n xxxxx x xxxxxx xxxx xxxx xxxx xxxx xxxx x xxxxx xx xxx x xxxxxxxx\n xxxxxxx.\n(6 lines output)"}
323323
324
+###############################################################################
325
+
326
+fossil test-comment-format --width 72 --file "" [file join $testdir "utf8-comment.txt"]
327
+test comment-61 {$RESULT eq "The comment formatter handles fullwidth and multi-byte \[äöü\] an\nd symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
328
+
329
+###############################################################################
330
+
331
+fossil test-comment-format --width 72 --wordbreak --file "" [file join $testdir "utf8-comment.txt"]
332
+test comment-62 {$RESULT eq "The comment formatter handles fullwidth and multi-byte \[äöü\]\nand symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
333
+
334
+###############################################################################
335
+
336
+fossil test-comment-format --width 72 --legacy --file "" [file join $testdir "utf8-comment.txt"]
337
+test comment-63 {$RESULT eq "The comment formatter handles fullwidth and multi-byte \[äöü\]\nand symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
338
+
339
+###############################################################################
340
+
341
+fossil test-comment-format --width 72 --legacy --wordbreak --file "" [file join $testdir "utf8-comment.txt"]
342
+test comment-64 {$RESULT eq "The comment formatter handles fullwidth and multi-byte \[äöü\]\nand symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
343
+
324344
###############################################################################
325345
326346
test_cleanup
327347
328348
ADDED test/utf8-comment.txt
--- test/comment.test
+++ test/comment.test
@@ -319,8 +319,28 @@
319 ###############################################################################
320
321 fossil test-comment-format --width 81 --indent 9 --decode --trimcrlf --origbreak "00:00:00 " "\[0000000000\] *CURRENT* $orig" $orig
322 test comment-60 {$RESULT eq "00:00:00 \[0000000000\] *CURRENT* \n xxxx xx xxxxxxx xxxx xxxxxx xxxxxxx, xxxxxxx, x xxxx xxxxxx xx xxxx xxxx\n xxxxxxx xxxxx xxxx xxxx xx xxxxxxx xxxxxxx (xxxxxx xxxxxxxxx x xxxxx).\n xxx'x xxx xxx xx xxxxx xxxx xxx xxx --xxxxxxxxxxx xxxxxx xx xx xxxx. x\n xxxxx x xxxxxx xxxx xxxx xxxx xxxx xxxx x xxxxx xx xxx x xxxxxxxx\n xxxxxxx.\n(6 lines output)"}
323
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324 ###############################################################################
325
326 test_cleanup
327
328 ADDED test/utf8-comment.txt
--- test/comment.test
+++ test/comment.test
@@ -319,8 +319,28 @@
319 ###############################################################################
320
321 fossil test-comment-format --width 81 --indent 9 --decode --trimcrlf --origbreak "00:00:00 " "\[0000000000\] *CURRENT* $orig" $orig
322 test comment-60 {$RESULT eq "00:00:00 \[0000000000\] *CURRENT* \n xxxx xx xxxxxxx xxxx xxxxxx xxxxxxx, xxxxxxx, x xxxx xxxxxx xx xxxx xxxx\n xxxxxxx xxxxx xxxx xxxx xx xxxxxxx xxxxxxx (xxxxxx xxxxxxxxx x xxxxx).\n xxx'x xxx xxx xx xxxxx xxxx xxx xxx --xxxxxxxxxxx xxxxxx xx xx xxxx. x\n xxxxx x xxxxxx xxxx xxxx xxxx xxxx xxxx x xxxxx xx xxx x xxxxxxxx\n xxxxxxx.\n(6 lines output)"}
323
324 ###############################################################################
325
326 fossil test-comment-format --width 72 --file "" [file join $testdir "utf8-comment.txt"]
327 test comment-61 {$RESULT eq "The comment formatter handles fullwidth and multi-byte \[äöü\] an\nd symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
328
329 ###############################################################################
330
331 fossil test-comment-format --width 72 --wordbreak --file "" [file join $testdir "utf8-comment.txt"]
332 test comment-62 {$RESULT eq "The comment formatter handles fullwidth and multi-byte \[äöü\]\nand symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
333
334 ###############################################################################
335
336 fossil test-comment-format --width 72 --legacy --file "" [file join $testdir "utf8-comment.txt"]
337 test comment-63 {$RESULT eq "The comment formatter handles fullwidth and multi-byte \[äöü\]\nand symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
338
339 ###############################################################################
340
341 fossil test-comment-format --width 72 --legacy --wordbreak --file "" [file join $testdir "utf8-comment.txt"]
342 test comment-64 {$RESULT eq "The comment formatter handles fullwidth and multi-byte \[äöü\]\nand symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
343
344 ###############################################################################
345
346 test_cleanup
347
348 ADDED test/utf8-comment.txt
--- a/test/utf8-comment.txt
+++ b/test/utf8-comment.txt
@@ -0,0 +1 @@
1
+The comment formatter handles fullwidth and multi-byte [äöü] and symbols [☃] and emoji [💾] characters!
--- a/test/utf8-comment.txt
+++ b/test/utf8-comment.txt
@@ -0,0 +1 @@
 
--- a/test/utf8-comment.txt
+++ b/test/utf8-comment.txt
@@ -0,0 +1 @@
1 The comment formatter handles fullwidth and multi-byte [äöü] and symbols [☃] and emoji [💾] characters!

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button