Fossil SCM

Fix the off-by-one errors if a fullwidth character only fits partially, and take into account character widths when scanning forward to find the distance to the next space.

florian 2024-10-02 14:43 trunk
Commit d5479ba7c66e74561ee43db687b3ce19304be0baffddaf0a79a37c146e9b0354
1 file changed +102 -79
+102 -79
--- src/comformat.c
+++ src/comformat.c
@@ -241,62 +241,92 @@
241241
** algorithm, the NUL character is treated the same as a spacing character.
242242
*/
243243
static int comment_next_space(
244244
const char *zLine, /* [in] The comment line being printed. */
245245
int index, /* [in] The current character index being handled. */
246
- int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */
246
+ int *sumWidth /* [out] Summated width of all characters to next space. */
247247
){
248
- int nextIndex = index + 1;
249
- int fNonASCII=0;
248
+ int cchUTF8, utf32, wcwidth = 0;
249
+ int nextIndex = index;
250250
for(;;){
251
- char c = zLine[nextIndex];
252
- if( (c&0x80)==0x80 ) fNonASCII=1;
253
- if( c==0 || fossil_isspace(c) ){
254
- if( distUTF8 ){
255
- if( fNonASCII!=0 ){
256
- *distUTF8 = strlen_utf8(&zLine[index], nextIndex-index);
257
- }else{
258
- *distUTF8 = nextIndex-index;
259
- }
260
- }
251
+ char_info_utf8(&zLine[nextIndex],&cchUTF8,&utf32);
252
+ nextIndex += cchUTF8;
253
+ wcwidth += cli_wcwidth(utf32);
254
+ if( zLine[nextIndex]==0 || fossil_isspace(zLine[nextIndex]) ){
255
+ *sumWidth = wcwidth;
261256
return nextIndex;
262257
}
263
- nextIndex++;
264258
}
265259
return 0; /* NOT REACHED */
266260
}
267261
268262
/*
269
-** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and
270
-** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0
271
-** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte
272
-** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are
273
-** treated as invalid 1-byte sequences (as lone trail bytes).
274
-** Combining characters and East Asian Wide and Fullwidth characters are counted
275
-** as one, so this function does not calculate the effective "display width".
263
+** Return information about the next (single- or multi-byte) character in the
264
+** specified UTF-8 string: The number of UTF-8 code units (in this case: bytes)
265
+** and the decoded UTF-32 code point. Incomplete, ill-formed and overlong
266
+** sequences are consumed together as one invalid code point. The invalid lead
267
+** bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2-
268
+** and 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF
269
+** are treated as invalid 1-byte sequences (as lone trail bytes), all resulting
270
+** in one invalid code point. Invalid UTF-8 sequences encoding a non-scalar code
271
+** point (UTF-16 surrogates U+D800 to U+DFFF) are allowed.
276272
*/
277
-int strlen_utf8(const char *zString, int lengthBytes){
278
- int i; /* Counted bytes. */
279
- int lengthUTF8; /* Counted UTF-8 sequences. */
280
-#if 0
281
- assert( lengthBytes>=0 );
273
+void char_info_utf8(
274
+ const unsigned char *z,
275
+ int *pCchUTF8,
276
+ int *pUtf32
277
+){
278
+ int i = 0; /* Counted bytes. */
279
+ int cchUTF8 = 1; /* Code units consumed. */
280
+ int maxUTF8 = 1; /* Expected sequence length. */
281
+ char c = z[i++];
282
+ if( (c&0xe0)==0xc0 ) maxUTF8 = 2; /* UTF-8 lead byte 110vvvvv */
283
+ else if( (c&0xf0)==0xe0 ) maxUTF8 = 3; /* UTF-8 lead byte 1110vvvv */
284
+ else if( (c&0xf8)==0xf0 ) maxUTF8 = 4; /* UTF-8 lead byte 11110vvv */
285
+ while( cchUTF8<maxUTF8 &&
286
+ (z[i]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
287
+ cchUTF8++;
288
+ i++;
289
+ }
290
+ *pCchUTF8 = cchUTF8;
291
+ if( cchUTF8!=maxUTF8 || /* Incomplete UTF-8 sequence. */
292
+ cchUTF8==1 && (c&0x80)==0x80 ){ /* Lone UTF-8 trail byte. */
293
+ *pUtf32 = 0xfffd; /* U+FFFD Replacement Character */
294
+#ifdef FOSSIL_DEBUG
295
+ assert( *pUtf32!=0xfffd ); /* Invalid UTF-8 sequence. */
282296
#endif
283
- for(i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++){
284
- char c = zString[i];
285
- int cchUTF8=1; /* Code units consumed. */
286
- int maxUTF8=1; /* Expected sequence length. */
287
- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
288
- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
289
- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
290
- while( cchUTF8<maxUTF8 &&
291
- i<lengthBytes-1 &&
292
- (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
293
- cchUTF8++;
294
- i++;
295
- }
296
- }
297
- return lengthUTF8;
297
+ return;
298
+ }
299
+ switch( cchUTF8 ){
300
+ case 4:
301
+ *pUtf32 =
302
+ ( (z[0] & 0x0f)<<18 ) |
303
+ ( (z[1] & 0x3f)<<12 ) |
304
+ ( (z[2] & 0x3f)<< 6 ) |
305
+ ( (z[4] & 0x3f)<< 0 ) ;
306
+ break;
307
+ case 3:
308
+ *pUtf32 =
309
+ ( (z[0] & 0x0f)<<12 ) |
310
+ ( (z[1] & 0x3f)<< 6 ) |
311
+ ( (z[2] & 0x3f)<< 0 ) ;
312
+ break;
313
+ case 2:
314
+ *pUtf32 =
315
+ ( (z[0] & 0x1f)<< 6 ) |
316
+ ( (z[1] & 0x3f)<< 0 ) ;
317
+ break;
318
+ case 1:
319
+ *pUtf32 = (int)z[0];
320
+ break;
321
+ }
322
+#ifdef FOSSIL_DEBUG
323
+ assert(
324
+ *pUtf32>=0 && *pUtf32<=0x10ffff && /* Valid range U+0000 to U+10FFFF. */
325
+ *pUtf32<0xd800 && *pUtf32>0xdfff /* Non-scalar (UTF-16 surrogates). */
326
+ );
327
+#endif
298328
}
299329
300330
/*
301331
** This function is called when printing a logical comment line to calculate
302332
** the necessary indenting. The caller needs to emit the indenting spaces.
@@ -339,11 +369,10 @@
339369
int *pLineCnt, /* [in/out] Pointer to the total line count. */
340370
const char **pzLine /* [out] Pointer to the end of the logical line. */
341371
){
342372
int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
343373
char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
344
- int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
345374
if( !zLine ) return;
346375
if( lineChars<=0 ) return;
347376
#if 0
348377
assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
349378
assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
@@ -362,10 +391,11 @@
362391
/* Limit line indent to fit output buffer. */
363392
origIndent = sizeof(zBuf)-6;
364393
}
365394
maxChars = lineChars;
366395
for(;;){
396
+ int cchUTF8, utf32;
367397
int useChars = 1;
368398
char c = zLine[index];
369399
/* Flush the output buffer if there's no space left for at least one more
370400
** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces,
371401
** a new line, and a terminating NULL. */
@@ -393,48 +423,47 @@
393423
if( c=='\n' ){
394424
lineCnt++;
395425
charCnt = 0;
396426
useChars = 0;
397427
}else if( c=='\t' ){
398
- int distUTF8;
399
- int nextIndex = comment_next_space(zLine, index, &distUTF8);
400
- if( nextIndex<=0 || distUTF8>maxChars ){
428
+ int sumWidth;
429
+ int nextIndex = comment_next_space(zLine, index, &sumWidth);
430
+ if( nextIndex<=0 || sumWidth>maxChars ){
401431
break;
402432
}
403433
charCnt++;
404434
useChars = COMMENT_TAB_WIDTH;
405435
if( maxChars<useChars ){
406436
zBuf[iBuf++] = ' ';
407437
break;
408438
}
409439
}else if( wordBreak && fossil_isspace(c) ){
410
- int distUTF8;
411
- int nextIndex = comment_next_space(zLine, index, &distUTF8);
412
- if( nextIndex<=0 || distUTF8>=maxChars ){
440
+ int sumWidth;
441
+ int nextIndex = comment_next_space(zLine, index, &sumWidth);
442
+ if( nextIndex<=0 || sumWidth>=maxChars ){
413443
break;
414444
}
415445
charCnt++;
416446
}else{
417447
charCnt++;
418448
}
419449
assert( c!='\n' || charCnt==0 );
420450
zBuf[iBuf++] = c;
421
- /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
422
- cchUTF8=1; /* Code units consumed. */
423
- maxUTF8=1; /* Expected sequence length. */
424
- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
425
- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
426
- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
427
- while( cchUTF8<maxUTF8 &&
428
- (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
429
- cchUTF8++;
430
- zBuf[iBuf++] = zLine[index++];
431
- }
451
+ char_info_utf8(&zLine[index-1],&cchUTF8,&utf32);
432452
if( cchUTF8>1 ){
433
- int utf32;
434
- decodeUtf8((const unsigned char*)&zLine[index-cchUTF8],&utf32);
435
- useChars += cli_wcwidth(utf32) - 1;
453
+ int wcwidth;
454
+ wcwidth = cli_wcwidth(utf32);
455
+ if( wcwidth>maxChars && lineChars>=wcwidth ){ /* rollback */
456
+ index--;
457
+ iBuf--;
458
+ zBuf[iBuf] = 0;
459
+ break;
460
+ }
461
+ for( ; cchUTF8>1; cchUTF8-- ){
462
+ zBuf[iBuf++] = zLine[index++];
463
+ }
464
+ useChars += wcwidth - 1;
436465
}
437466
maxChars -= useChars;
438467
if( maxChars<=0 ) break;
439468
if( c=='\n' ) break;
440469
}
@@ -476,11 +505,10 @@
476505
int si, sk, i, k, kc;
477506
int doIndent = 0;
478507
char *zBuf;
479508
char zBuffer[400];
480509
int lineCnt = 0;
481
- int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
482510
483511
if( width<0 ){
484512
comment_set_maxchars(indent, &maxChars);
485513
}
486514
if( zText==0 ) zText = "(NULL)";
@@ -502,30 +530,25 @@
502530
}
503531
if( zBuf!=zBuffer) fossil_free(zBuf);
504532
return lineCnt;
505533
}
506534
for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
535
+ int cchUTF8, utf32;
507536
char c = zText[i];
508537
kc++; /* Count complete UTF-8 sequences. */
509
- /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
510
- cchUTF8=1; /* Code units consumed. */
511
- maxUTF8=1; /* Expected sequence length. */
512
- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
513
- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
514
- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
515
- if( maxUTF8>1 ){
516
- zBuf[k++] = c;
517
- while( cchUTF8<maxUTF8 &&
518
- (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
519
- cchUTF8++;
538
+ char_info_utf8(&zText[i],&cchUTF8,&utf32);
539
+ if( cchUTF8>1 ){
540
+ int wcwidth;
541
+ wcwidth = cli_wcwidth(utf32);
542
+ if( kc+wcwidth-1>maxChars && maxChars>=wcwidth ){ /* rollback */
543
+ kc--;
544
+ break;
545
+ }
546
+ for( i--; cchUTF8>0; cchUTF8-- ){
520547
zBuf[k++] = zText[++i];
521548
}
522
- }
523
- if( cchUTF8>1 ){
524
- int utf32;
525
- decodeUtf8((const unsigned char*)&zText[k-cchUTF8],&utf32);
526
- kc += cli_wcwidth(utf32) - 1;
549
+ kc += wcwidth - 1;
527550
}
528551
else if( fossil_isspace(c) ){
529552
si = i;
530553
sk = k;
531554
if( k==0 || zBuf[k-1]!=' ' ){
532555
--- src/comformat.c
+++ src/comformat.c
@@ -241,62 +241,92 @@
241 ** algorithm, the NUL character is treated the same as a spacing character.
242 */
243 static int comment_next_space(
244 const char *zLine, /* [in] The comment line being printed. */
245 int index, /* [in] The current character index being handled. */
246 int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */
247 ){
248 int nextIndex = index + 1;
249 int fNonASCII=0;
250 for(;;){
251 char c = zLine[nextIndex];
252 if( (c&0x80)==0x80 ) fNonASCII=1;
253 if( c==0 || fossil_isspace(c) ){
254 if( distUTF8 ){
255 if( fNonASCII!=0 ){
256 *distUTF8 = strlen_utf8(&zLine[index], nextIndex-index);
257 }else{
258 *distUTF8 = nextIndex-index;
259 }
260 }
261 return nextIndex;
262 }
263 nextIndex++;
264 }
265 return 0; /* NOT REACHED */
266 }
267
268 /*
269 ** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and
270 ** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0
271 ** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte
272 ** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are
273 ** treated as invalid 1-byte sequences (as lone trail bytes).
274 ** Combining characters and East Asian Wide and Fullwidth characters are counted
275 ** as one, so this function does not calculate the effective "display width".
 
 
276 */
277 int strlen_utf8(const char *zString, int lengthBytes){
278 int i; /* Counted bytes. */
279 int lengthUTF8; /* Counted UTF-8 sequences. */
280 #if 0
281 assert( lengthBytes>=0 );
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282 #endif
283 for(i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++){
284 char c = zString[i];
285 int cchUTF8=1; /* Code units consumed. */
286 int maxUTF8=1; /* Expected sequence length. */
287 if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
288 else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
289 else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
290 while( cchUTF8<maxUTF8 &&
291 i<lengthBytes-1 &&
292 (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
293 cchUTF8++;
294 i++;
295 }
296 }
297 return lengthUTF8;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298 }
299
300 /*
301 ** This function is called when printing a logical comment line to calculate
302 ** the necessary indenting. The caller needs to emit the indenting spaces.
@@ -339,11 +369,10 @@
339 int *pLineCnt, /* [in/out] Pointer to the total line count. */
340 const char **pzLine /* [out] Pointer to the end of the logical line. */
341 ){
342 int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
343 char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
344 int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
345 if( !zLine ) return;
346 if( lineChars<=0 ) return;
347 #if 0
348 assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
349 assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
@@ -362,10 +391,11 @@
362 /* Limit line indent to fit output buffer. */
363 origIndent = sizeof(zBuf)-6;
364 }
365 maxChars = lineChars;
366 for(;;){
 
367 int useChars = 1;
368 char c = zLine[index];
369 /* Flush the output buffer if there's no space left for at least one more
370 ** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces,
371 ** a new line, and a terminating NULL. */
@@ -393,48 +423,47 @@
393 if( c=='\n' ){
394 lineCnt++;
395 charCnt = 0;
396 useChars = 0;
397 }else if( c=='\t' ){
398 int distUTF8;
399 int nextIndex = comment_next_space(zLine, index, &distUTF8);
400 if( nextIndex<=0 || distUTF8>maxChars ){
401 break;
402 }
403 charCnt++;
404 useChars = COMMENT_TAB_WIDTH;
405 if( maxChars<useChars ){
406 zBuf[iBuf++] = ' ';
407 break;
408 }
409 }else if( wordBreak && fossil_isspace(c) ){
410 int distUTF8;
411 int nextIndex = comment_next_space(zLine, index, &distUTF8);
412 if( nextIndex<=0 || distUTF8>=maxChars ){
413 break;
414 }
415 charCnt++;
416 }else{
417 charCnt++;
418 }
419 assert( c!='\n' || charCnt==0 );
420 zBuf[iBuf++] = c;
421 /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
422 cchUTF8=1; /* Code units consumed. */
423 maxUTF8=1; /* Expected sequence length. */
424 if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
425 else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
426 else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
427 while( cchUTF8<maxUTF8 &&
428 (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
429 cchUTF8++;
430 zBuf[iBuf++] = zLine[index++];
431 }
432 if( cchUTF8>1 ){
433 int utf32;
434 decodeUtf8((const unsigned char*)&zLine[index-cchUTF8],&utf32);
435 useChars += cli_wcwidth(utf32) - 1;
 
 
 
 
 
 
 
 
 
436 }
437 maxChars -= useChars;
438 if( maxChars<=0 ) break;
439 if( c=='\n' ) break;
440 }
@@ -476,11 +505,10 @@
476 int si, sk, i, k, kc;
477 int doIndent = 0;
478 char *zBuf;
479 char zBuffer[400];
480 int lineCnt = 0;
481 int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
482
483 if( width<0 ){
484 comment_set_maxchars(indent, &maxChars);
485 }
486 if( zText==0 ) zText = "(NULL)";
@@ -502,30 +530,25 @@
502 }
503 if( zBuf!=zBuffer) fossil_free(zBuf);
504 return lineCnt;
505 }
506 for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
 
507 char c = zText[i];
508 kc++; /* Count complete UTF-8 sequences. */
509 /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
510 cchUTF8=1; /* Code units consumed. */
511 maxUTF8=1; /* Expected sequence length. */
512 if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
513 else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
514 else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
515 if( maxUTF8>1 ){
516 zBuf[k++] = c;
517 while( cchUTF8<maxUTF8 &&
518 (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
519 cchUTF8++;
520 zBuf[k++] = zText[++i];
521 }
522 }
523 if( cchUTF8>1 ){
524 int utf32;
525 decodeUtf8((const unsigned char*)&zText[k-cchUTF8],&utf32);
526 kc += cli_wcwidth(utf32) - 1;
527 }
528 else if( fossil_isspace(c) ){
529 si = i;
530 sk = k;
531 if( k==0 || zBuf[k-1]!=' ' ){
532
--- src/comformat.c
+++ src/comformat.c
@@ -241,62 +241,92 @@
241 ** algorithm, the NUL character is treated the same as a spacing character.
242 */
static int comment_next_space(
  const char *zLine, /* [in] The comment line being printed. */
  int index,         /* [in] The current character index being handled. */
  int *sumWidth      /* [out] Summated width of all characters to next space. */
){
  int cchUTF8;       /* Bytes used by the character being examined. */
  int utf32;         /* Code point of the character being examined. */
  int width = 0;     /* Accumulated display width (not named "wcwidth", to
                     ** avoid shadowing the standard library function). */
  int nextIndex = index;

  /* The character at `index` itself is always consumed and counted before
  ** the first end-of-word test, so zLine[index] must not be the NUL
  ** terminator.  The scan then advances one whole UTF-8 sequence at a time
  ** until the byte following the consumed sequence is NUL or a space. */
  for(;;){
    /* zLine is plain `char`, the decoder takes `unsigned char`: cast
    ** explicitly instead of relying on an implicit pointer conversion. */
    char_info_utf8((const unsigned char*)&zLine[nextIndex],&cchUTF8,&utf32);
    nextIndex += cchUTF8;
    width += cli_wcwidth(utf32);
    if( zLine[nextIndex]==0 || fossil_isspace(zLine[nextIndex]) ){
      /* The out-parameter is optional: callers only interested in the
      ** index may pass NULL. */
      if( sumWidth ) *sumWidth = width;
      return nextIndex;
    }
  }
  return 0; /* NOT REACHED */
}
261
/*
** Return information about the next (single- or multi-byte) character in the
** specified UTF-8 string:
**
**    *pCchUTF8   The number of UTF-8 code units (in this case: bytes) that
**                make up the character, always 1 to 4.
**    *pUtf32     The decoded UTF-32 code point, or U+FFFD (Replacement
**                Character) if the bytes do not form a complete sequence.
**
** The lead byte selects the expected sequence length.  Trail bytes are only
** consumed while they have the form 10vvvvvv, so an incomplete sequence is
** consumed up to (not including) the first non-trail byte and reported as
** one U+FFFD.  Lone trail bytes and the invalid lead bytes 0xF8 to 0xFF are
** treated as invalid 1-byte sequences and also yield U+FFFD.
**
** No further validation is done on complete sequences: the invalid lead
** bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate 2- and 4-byte
** sequences, and overlong forms, UTF-16 surrogates (U+D800 to U+DFFF) and
** values above U+10FFFF are decoded as-is.  Builds with FOSSIL_DEBUG assert
** on incomplete sequences, on surrogates and on out-of-range values.
*/
void char_info_utf8(
  const unsigned char *z,
  int *pCchUTF8,
  int *pUtf32
){
  const unsigned char c = z[0];           /* Lead (or only) byte. */
  int cchUTF8 = 1;                        /* Code units consumed. */
  int maxUTF8 = 1;                        /* Expected sequence length. */
  if( (c&0xe0)==0xc0 ) maxUTF8 = 2;       /* UTF-8 lead byte 110vvvvv */
  else if( (c&0xf0)==0xe0 ) maxUTF8 = 3;  /* UTF-8 lead byte 1110vvvv */
  else if( (c&0xf8)==0xf0 ) maxUTF8 = 4;  /* UTF-8 lead byte 11110vvv */
  /* A NUL terminator is not a trail byte, so this never reads past the end
  ** of a NUL-terminated string. */
  while( cchUTF8<maxUTF8 &&
          (z[cchUTF8]&0xc0)==0x80 ){      /* UTF-8 trail byte 10vvvvvv */
    cchUTF8++;
  }
  *pCchUTF8 = cchUTF8;
  if( cchUTF8!=maxUTF8 ||                 /* Incomplete UTF-8 sequence. */
      (cchUTF8==1 && (c&0x80)==0x80) ){   /* Lone UTF-8 trail byte. */
    *pUtf32 = 0xfffd;                     /* U+FFFD Replacement Character */
#ifdef FOSSIL_DEBUG
    /* Deliberate trap: this condition is always false at this point, so
    ** debug builds stop on any invalid UTF-8 sequence. */
    assert( *pUtf32!=0xfffd );            /* Invalid UTF-8 sequence. */
#endif
    return;
  }
  /* Here cchUTF8==maxUTF8, so exactly cchUTF8 bytes z[0]..z[cchUTF8-1]
  ** belong to the sequence. */
  switch( cchUTF8 ){
    case 4:
      *pUtf32 =
        ( (z[0] & 0x0f)<<18 ) |
        ( (z[1] & 0x3f)<<12 ) |
        ( (z[2] & 0x3f)<< 6 ) |
        ( (z[3] & 0x3f)<< 0 ) ;           /* 4th byte is z[3], not z[4]. */
      break;
    case 3:
      *pUtf32 =
        ( (z[0] & 0x0f)<<12 ) |
        ( (z[1] & 0x3f)<< 6 ) |
        ( (z[2] & 0x3f)<< 0 ) ;
      break;
    case 2:
      *pUtf32 =
        ( (z[0] & 0x1f)<< 6 ) |
        ( (z[1] & 0x3f)<< 0 ) ;
      break;
    default:                              /* Only 1 remains (7-bit ASCII). */
      *pUtf32 = (int)z[0];
      break;
  }
#ifdef FOSSIL_DEBUG
  assert(
    *pUtf32>=0 && *pUtf32<=0x10ffff &&    /* Valid range U+0000 to U+10FFFF. */
    ( *pUtf32<0xd800 || *pUtf32>0xdfff )  /* Not a UTF-16 surrogate. */
  );
#endif
}
329
330 /*
331 ** This function is called when printing a logical comment line to calculate
332 ** the necessary indenting. The caller needs to emit the indenting spaces.
@@ -339,11 +369,10 @@
369 int *pLineCnt, /* [in/out] Pointer to the total line count. */
370 const char **pzLine /* [out] Pointer to the end of the logical line. */
371 ){
372 int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
373 char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
 
374 if( !zLine ) return;
375 if( lineChars<=0 ) return;
376 #if 0
377 assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
378 assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
@@ -362,10 +391,11 @@
391 /* Limit line indent to fit output buffer. */
392 origIndent = sizeof(zBuf)-6;
393 }
394 maxChars = lineChars;
395 for(;;){
396 int cchUTF8, utf32;
397 int useChars = 1;
398 char c = zLine[index];
399 /* Flush the output buffer if there's no space left for at least one more
400 ** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces,
401 ** a new line, and a terminating NULL. */
@@ -393,48 +423,47 @@
423 if( c=='\n' ){
424 lineCnt++;
425 charCnt = 0;
426 useChars = 0;
427 }else if( c=='\t' ){
428 int sumWidth;
429 int nextIndex = comment_next_space(zLine, index, &sumWidth);
430 if( nextIndex<=0 || sumWidth>maxChars ){
431 break;
432 }
433 charCnt++;
434 useChars = COMMENT_TAB_WIDTH;
435 if( maxChars<useChars ){
436 zBuf[iBuf++] = ' ';
437 break;
438 }
439 }else if( wordBreak && fossil_isspace(c) ){
440 int sumWidth;
441 int nextIndex = comment_next_space(zLine, index, &sumWidth);
442 if( nextIndex<=0 || sumWidth>=maxChars ){
443 break;
444 }
445 charCnt++;
446 }else{
447 charCnt++;
448 }
449 assert( c!='\n' || charCnt==0 );
450 zBuf[iBuf++] = c;
451 char_info_utf8(&zLine[index-1],&cchUTF8,&utf32);
 
 
 
 
 
 
 
 
 
 
452 if( cchUTF8>1 ){
453 int wcwidth;
454 wcwidth = cli_wcwidth(utf32);
455 if( wcwidth>maxChars && lineChars>=wcwidth ){ /* rollback */
456 index--;
457 iBuf--;
458 zBuf[iBuf] = 0;
459 break;
460 }
461 for( ; cchUTF8>1; cchUTF8-- ){
462 zBuf[iBuf++] = zLine[index++];
463 }
464 useChars += wcwidth - 1;
465 }
466 maxChars -= useChars;
467 if( maxChars<=0 ) break;
468 if( c=='\n' ) break;
469 }
@@ -476,11 +505,10 @@
505 int si, sk, i, k, kc;
506 int doIndent = 0;
507 char *zBuf;
508 char zBuffer[400];
509 int lineCnt = 0;
 
510
511 if( width<0 ){
512 comment_set_maxchars(indent, &maxChars);
513 }
514 if( zText==0 ) zText = "(NULL)";
@@ -502,30 +530,25 @@
530 }
531 if( zBuf!=zBuffer) fossil_free(zBuf);
532 return lineCnt;
533 }
534 for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
535 int cchUTF8, utf32;
536 char c = zText[i];
537 kc++; /* Count complete UTF-8 sequences. */
538 char_info_utf8(&zText[i],&cchUTF8,&utf32);
539 if( cchUTF8>1 ){
540 int wcwidth;
541 wcwidth = cli_wcwidth(utf32);
542 if( kc+wcwidth-1>maxChars && maxChars>=wcwidth ){ /* rollback */
543 kc--;
544 break;
545 }
546 for( i--; cchUTF8>0; cchUTF8-- ){
 
 
547 zBuf[k++] = zText[++i];
548 }
549 kc += wcwidth - 1;
 
 
 
 
550 }
551 else if( fossil_isspace(c) ){
552 si = i;
553 sk = k;
554 if( k==0 || zBuf[k-1]!=' ' ){
555

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button