Fossil SCM

Try two optimizations (to be reverted after further measurements if they don't pay off): (0) Abort the forward scan for space characters as soon as the scanned characters no longer fit on the current line. (1) Add a shortcut for ASCII characters in the UTF-8 string analysis.

florian 2024-10-05 13:06 comment-formatter-wcwidth
Commit 4e8dd7df46cf1acb50c14ab66d68d2b7328241c87eb200d78c216acce7a5de6a
1 file changed +11 -7
+11 -7
--- src/comformat.c
+++ src/comformat.c
@@ -241,19 +241,21 @@
241241
** algorithm, the NUL character is treated the same as a spacing character.
242242
*/
243243
static int comment_next_space(
244244
const char *zLine, /* [in] The comment line being printed. */
245245
int index, /* [in] The current character index being handled. */
246
+ int maxChars, /* [in] Optimization hint to abort before space found. */
246247
int *sumWidth /* [out] Summated width of all characters to next space. */
247248
){
248249
int cchUTF8, utf32, wcwidth = 0;
249250
int nextIndex = index;
250251
for(;;){
251252
char_info_utf8(&zLine[nextIndex],&cchUTF8,&utf32);
252253
nextIndex += cchUTF8;
253254
wcwidth += cli_wcwidth(utf32);
254
- if( zLine[nextIndex]==0 || fossil_isspace(zLine[nextIndex]) ){
255
+ if( zLine[nextIndex]==0 || fossil_isspace(zLine[nextIndex]) ||
256
+ wcwidth>maxChars ){
255257
*sumWidth = wcwidth;
256258
return nextIndex;
257259
}
258260
}
259261
return 0; /* NOT REACHED */
@@ -277,11 +279,16 @@
277279
){
278280
int i = 0; /* Counted bytes. */
279281
int cchUTF8 = 1; /* Code units consumed. */
280282
int maxUTF8 = 1; /* Expected sequence length. */
281283
char c = z[i++];
282
- if( (c&0xe0)==0xc0 ) maxUTF8 = 2; /* UTF-8 lead byte 110vvvvv */
284
+ if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */
285
+ *pCchUTF8 = 1;
286
+ *pUtf32 = (int)z[0];
287
+ return;
288
+ }
289
+ else if( (c&0xe0)==0xc0 ) maxUTF8 = 2; /* UTF-8 lead byte 110vvvvv */
283290
else if( (c&0xf0)==0xe0 ) maxUTF8 = 3; /* UTF-8 lead byte 1110vvvv */
284291
else if( (c&0xf8)==0xf0 ) maxUTF8 = 4; /* UTF-8 lead byte 11110vvv */
285292
while( cchUTF8<maxUTF8 &&
286293
(z[i]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
287294
cchUTF8++;
@@ -313,13 +320,10 @@
313320
case 2:
314321
*pUtf32 =
315322
( (z[0] & 0x1f)<< 6 ) |
316323
( (z[1] & 0x3f)<< 0 ) ;
317324
break;
318
- case 1:
319
- *pUtf32 = (int)z[0];
320
- break;
321325
}
322326
#ifdef FOSSIL_DEBUG
323327
assert(
324328
*pUtf32>=0 && *pUtf32<=0x10ffff && /* Valid range U+0000 to U+10FFFF. */
325329
*pUtf32<0xd800 && *pUtf32>0xdfff /* Non-scalar (UTF-16 surrogates). */
@@ -424,11 +428,11 @@
424428
lineCnt++;
425429
charCnt = 0;
426430
useChars = 0;
427431
}else if( c=='\t' ){
428432
int sumWidth;
429
- int nextIndex = comment_next_space(zLine, index, &sumWidth);
433
+ int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
430434
if( nextIndex<=0 || sumWidth>maxChars ){
431435
break;
432436
}
433437
charCnt++;
434438
useChars = COMMENT_TAB_WIDTH;
@@ -436,11 +440,11 @@
436440
zBuf[iBuf++] = ' ';
437441
break;
438442
}
439443
}else if( wordBreak && fossil_isspace(c) ){
440444
int sumWidth;
441
- int nextIndex = comment_next_space(zLine, index, &sumWidth);
445
+ int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
442446
if( nextIndex<=0 || sumWidth>=maxChars ){
443447
break;
444448
}
445449
charCnt++;
446450
}else{
447451
--- src/comformat.c
+++ src/comformat.c
@@ -241,19 +241,21 @@
241 ** algorithm, the NUL character is treated the same as a spacing character.
242 */
243 static int comment_next_space(
244 const char *zLine, /* [in] The comment line being printed. */
245 int index, /* [in] The current character index being handled. */
 
246 int *sumWidth /* [out] Summated width of all characters to next space. */
247 ){
248 int cchUTF8, utf32, wcwidth = 0;
249 int nextIndex = index;
250 for(;;){
251 char_info_utf8(&zLine[nextIndex],&cchUTF8,&utf32);
252 nextIndex += cchUTF8;
253 wcwidth += cli_wcwidth(utf32);
254 if( zLine[nextIndex]==0 || fossil_isspace(zLine[nextIndex]) ){
 
255 *sumWidth = wcwidth;
256 return nextIndex;
257 }
258 }
259 return 0; /* NOT REACHED */
@@ -277,11 +279,16 @@
277 ){
278 int i = 0; /* Counted bytes. */
279 int cchUTF8 = 1; /* Code units consumed. */
280 int maxUTF8 = 1; /* Expected sequence length. */
281 char c = z[i++];
282 if( (c&0xe0)==0xc0 ) maxUTF8 = 2; /* UTF-8 lead byte 110vvvvv */
 
 
 
 
 
283 else if( (c&0xf0)==0xe0 ) maxUTF8 = 3; /* UTF-8 lead byte 1110vvvv */
284 else if( (c&0xf8)==0xf0 ) maxUTF8 = 4; /* UTF-8 lead byte 11110vvv */
285 while( cchUTF8<maxUTF8 &&
286 (z[i]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
287 cchUTF8++;
@@ -313,13 +320,10 @@
313 case 2:
314 *pUtf32 =
315 ( (z[0] & 0x1f)<< 6 ) |
316 ( (z[1] & 0x3f)<< 0 ) ;
317 break;
318 case 1:
319 *pUtf32 = (int)z[0];
320 break;
321 }
322 #ifdef FOSSIL_DEBUG
323 assert(
324 *pUtf32>=0 && *pUtf32<=0x10ffff && /* Valid range U+0000 to U+10FFFF. */
325 *pUtf32<0xd800 && *pUtf32>0xdfff /* Non-scalar (UTF-16 surrogates). */
@@ -424,11 +428,11 @@
424 lineCnt++;
425 charCnt = 0;
426 useChars = 0;
427 }else if( c=='\t' ){
428 int sumWidth;
429 int nextIndex = comment_next_space(zLine, index, &sumWidth);
430 if( nextIndex<=0 || sumWidth>maxChars ){
431 break;
432 }
433 charCnt++;
434 useChars = COMMENT_TAB_WIDTH;
@@ -436,11 +440,11 @@
436 zBuf[iBuf++] = ' ';
437 break;
438 }
439 }else if( wordBreak && fossil_isspace(c) ){
440 int sumWidth;
441 int nextIndex = comment_next_space(zLine, index, &sumWidth);
442 if( nextIndex<=0 || sumWidth>=maxChars ){
443 break;
444 }
445 charCnt++;
446 }else{
447
--- src/comformat.c
+++ src/comformat.c
@@ -241,19 +241,21 @@
241 ** algorithm, the NUL character is treated the same as a spacing character.
242 */
243 static int comment_next_space(
244 const char *zLine, /* [in] The comment line being printed. */
245 int index, /* [in] The current character index being handled. */
246 int maxChars, /* [in] Optimization hint to abort before space found. */
247 int *sumWidth /* [out] Summated width of all characters to next space. */
248 ){
249 int cchUTF8, utf32, wcwidth = 0;
250 int nextIndex = index;
251 for(;;){
252 char_info_utf8(&zLine[nextIndex],&cchUTF8,&utf32);
253 nextIndex += cchUTF8;
254 wcwidth += cli_wcwidth(utf32);
255 if( zLine[nextIndex]==0 || fossil_isspace(zLine[nextIndex]) ||
256 wcwidth>maxChars ){
257 *sumWidth = wcwidth;
258 return nextIndex;
259 }
260 }
261 return 0; /* NOT REACHED */
@@ -277,11 +279,16 @@
279 ){
280 int i = 0; /* Counted bytes. */
281 int cchUTF8 = 1; /* Code units consumed. */
282 int maxUTF8 = 1; /* Expected sequence length. */
283 char c = z[i++];
284 if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */
285 *pCchUTF8 = 1;
286 *pUtf32 = (int)z[0];
287 return;
288 }
289 else if( (c&0xe0)==0xc0 ) maxUTF8 = 2; /* UTF-8 lead byte 110vvvvv */
290 else if( (c&0xf0)==0xe0 ) maxUTF8 = 3; /* UTF-8 lead byte 1110vvvv */
291 else if( (c&0xf8)==0xf0 ) maxUTF8 = 4; /* UTF-8 lead byte 11110vvv */
292 while( cchUTF8<maxUTF8 &&
293 (z[i]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
294 cchUTF8++;
@@ -313,13 +320,10 @@
320 case 2:
321 *pUtf32 =
322 ( (z[0] & 0x1f)<< 6 ) |
323 ( (z[1] & 0x3f)<< 0 ) ;
324 break;
 
 
 
325 }
326 #ifdef FOSSIL_DEBUG
327 assert(
328 *pUtf32>=0 && *pUtf32<=0x10ffff && /* Valid range U+0000 to U+10FFFF. */
329 *pUtf32<0xd800 && *pUtf32>0xdfff /* Non-scalar (UTF-16 surrogates). */
@@ -424,11 +428,11 @@
428 lineCnt++;
429 charCnt = 0;
430 useChars = 0;
431 }else if( c=='\t' ){
432 int sumWidth;
433 int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
434 if( nextIndex<=0 || sumWidth>maxChars ){
435 break;
436 }
437 charCnt++;
438 useChars = COMMENT_TAB_WIDTH;
@@ -436,11 +440,11 @@
440 zBuf[iBuf++] = ' ';
441 break;
442 }
443 }else if( wordBreak && fossil_isspace(c) ){
444 int sumWidth;
445 int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
446 if( nextIndex<=0 || sumWidth>=maxChars ){
447 break;
448 }
449 charCnt++;
450 }else{
451

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button