Fossil SCM

Try two optimizations (to be reverted after further measurements, if they don't pay off): (0) Abort forward scanning for space characters as soon as the scanned characters don't fit on the current line. (1) Take a shortcut for ASCII characters during UTF-8 string analysis.

florian 2024-10-05 13:06 comment-formatter-wcwidth

Commit 4e8dd7df46cf1acb50c14ab66d68d2b7328241c87eb200d78c216acce7a5de6a

Parent 1cc31c309d4e7d1…

1 file changed +11 -7

~ src/comformat.c

M src/comformat.c

+11 -7

		--- src/comformat.c
		+++ src/comformat.c
		@@ -241,19 +241,21 @@
241	241	** algorithm, the NUL character is treated the same as a spacing character.
242	242	*/
243	243	static int comment_next_space(
244	244	const char *zLine, /* [in] The comment line being printed. */
245	245	int index, /* [in] The current character index being handled. */
	246	+ int maxChars, /* [in] Optimization hint to abort before space found. */
246	247	int *sumWidth /* [out] Summated width of all characters to next space. */
247	248	){
248	249	int cchUTF8, utf32, wcwidth = 0;
249	250	int nextIndex = index;
250	251	for(;;){
251	252	char_info_utf8(&zLine[nextIndex],&cchUTF8,&utf32);
252	253	nextIndex += cchUTF8;
253	254	wcwidth += cli_wcwidth(utf32);
254		- if( zLine[nextIndex]==0 || fossil_isspace(zLine[nextIndex]) ){
	255	+ if( zLine[nextIndex]==0 || fossil_isspace(zLine[nextIndex]) ||
	256	+ wcwidth>maxChars ){
255	257	*sumWidth = wcwidth;
256	258	return nextIndex;
257	259	}
258	260	}
259	261	return 0; /* NOT REACHED */
		@@ -277,11 +279,16 @@
277	279	){
278	280	int i = 0; /* Counted bytes. */
279	281	int cchUTF8 = 1; /* Code units consumed. */
280	282	int maxUTF8 = 1; /* Expected sequence length. */
281	283	char c = z[i++];
282		- if( (c&0xe0)==0xc0 ) maxUTF8 = 2; /* UTF-8 lead byte 110vvvvv */
	284	+ if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */
	285	+ *pCchUTF8 = 1;
	286	+ *pUtf32 = (int)z[0];
	287	+ return;
	288	+ }
	289	+ else if( (c&0xe0)==0xc0 ) maxUTF8 = 2; /* UTF-8 lead byte 110vvvvv */
283	290	else if( (c&0xf0)==0xe0 ) maxUTF8 = 3; /* UTF-8 lead byte 1110vvvv */
284	291	else if( (c&0xf8)==0xf0 ) maxUTF8 = 4; /* UTF-8 lead byte 11110vvv */
285	292	while( cchUTF8<maxUTF8 &&
286	293	(z[i]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
287	294	cchUTF8++;
		@@ -313,13 +320,10 @@
313	320	case 2:
314	321	*pUtf32 =
315	323	( (z[0] & 0x1f)<< 6 ) |
316	323	( (z[1] & 0x3f)<< 0 ) ;
317	324	break;
318		- case 1:
319		- *pUtf32 = (int)z[0];
320		- break;
321	325	}
322	326	#ifdef FOSSIL_DEBUG
323	327	assert(
324	328	*pUtf32>=0 && *pUtf32<=0x10ffff && /* Valid range U+0000 to U+10FFFF. */
325	329	*pUtf32<0xd800 && *pUtf32>0xdfff /* Non-scalar (UTF-16 surrogates). */
		@@ -424,11 +428,11 @@
424	428	lineCnt++;
425	429	charCnt = 0;
426	430	useChars = 0;
427	431	}else if( c=='\t' ){
428	432	int sumWidth;
429		- int nextIndex = comment_next_space(zLine, index, &sumWidth);
	433	+ int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
430	434	if( nextIndex<=0 || sumWidth>maxChars ){
431	435	break;
432	436	}
433	437	charCnt++;
434	438	useChars = COMMENT_TAB_WIDTH;
		@@ -436,11 +440,11 @@
436	440	zBuf[iBuf++] = ' ';
437	441	break;
438	442	}
439	443	}else if( wordBreak && fossil_isspace(c) ){
440	444	int sumWidth;
441		- int nextIndex = comment_next_space(zLine, index, &sumWidth);
	445	+ int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
442	446	if( nextIndex<=0 || sumWidth>=maxChars ){
443	447	break;
444	448	}
445	449	charCnt++;
446	450	}else{
447	451

	--- src/comformat.c
	+++ src/comformat.c
	@@ -241,19 +241,21 @@
241	** algorithm, the NUL character is treated the same as a spacing character.
242	*/
243	static int comment_next_space(
244	const char *zLine, /* [in] The comment line being printed. */
245	int index, /* [in] The current character index being handled. */

246	int *sumWidth /* [out] Summated width of all characters to next space. */
247	){
248	int cchUTF8, utf32, wcwidth = 0;
249	int nextIndex = index;
250	for(;;){
251	char_info_utf8(&zLine[nextIndex],&cchUTF8,&utf32);
252	nextIndex += cchUTF8;
253	wcwidth += cli_wcwidth(utf32);
254	if( zLine[nextIndex]==0 || fossil_isspace(zLine[nextIndex]) ){

255	*sumWidth = wcwidth;
256	return nextIndex;
257	}
258	}
259	return 0; /* NOT REACHED */
	@@ -277,11 +279,16 @@
277	){
278	int i = 0; /* Counted bytes. */
279	int cchUTF8 = 1; /* Code units consumed. */
280	int maxUTF8 = 1; /* Expected sequence length. */
281	char c = z[i++];
282	if( (c&0xe0)==0xc0 ) maxUTF8 = 2; /* UTF-8 lead byte 110vvvvv */





283	else if( (c&0xf0)==0xe0 ) maxUTF8 = 3; /* UTF-8 lead byte 1110vvvv */
284	else if( (c&0xf8)==0xf0 ) maxUTF8 = 4; /* UTF-8 lead byte 11110vvv */
285	while( cchUTF8<maxUTF8 &&
286	(z[i]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
287	cchUTF8++;
	@@ -313,13 +320,10 @@
313	case 2:
314	*pUtf32 =
315	( (z[0] & 0x1f)<< 6 ) |
316	( (z[1] & 0x3f)<< 0 ) ;
317	break;
318	case 1:
319	*pUtf32 = (int)z[0];
320	break;
321	}
322	#ifdef FOSSIL_DEBUG
323	assert(
324	*pUtf32>=0 && *pUtf32<=0x10ffff && /* Valid range U+0000 to U+10FFFF. */
325	*pUtf32<0xd800 && *pUtf32>0xdfff /* Non-scalar (UTF-16 surrogates). */
	@@ -424,11 +428,11 @@
424	lineCnt++;
425	charCnt = 0;
426	useChars = 0;
427	}else if( c=='\t' ){
428	int sumWidth;
429	int nextIndex = comment_next_space(zLine, index, &sumWidth);
430	if( nextIndex<=0 || sumWidth>maxChars ){
431	break;
432	}
433	charCnt++;
434	useChars = COMMENT_TAB_WIDTH;
	@@ -436,11 +440,11 @@
436	zBuf[iBuf++] = ' ';
437	break;
438	}
439	}else if( wordBreak && fossil_isspace(c) ){
440	int sumWidth;
441	int nextIndex = comment_next_space(zLine, index, &sumWidth);
442	if( nextIndex<=0 || sumWidth>=maxChars ){
443	break;
444	}
445	charCnt++;
446	}else{
447

	--- src/comformat.c
	+++ src/comformat.c
	@@ -241,19 +241,21 @@
241	** algorithm, the NUL character is treated the same as a spacing character.
242	*/
243	static int comment_next_space(
244	const char *zLine, /* [in] The comment line being printed. */
245	int index, /* [in] The current character index being handled. */
246	int maxChars, /* [in] Optimization hint to abort before space found. */
247	int *sumWidth /* [out] Summated width of all characters to next space. */
248	){
249	int cchUTF8, utf32, wcwidth = 0;
250	int nextIndex = index;
251	for(;;){
252	char_info_utf8(&zLine[nextIndex],&cchUTF8,&utf32);
253	nextIndex += cchUTF8;
254	wcwidth += cli_wcwidth(utf32);
255	if( zLine[nextIndex]==0 || fossil_isspace(zLine[nextIndex]) ||
256	wcwidth>maxChars ){
257	*sumWidth = wcwidth;
258	return nextIndex;
259	}
260	}
261	return 0; /* NOT REACHED */
	@@ -277,11 +279,16 @@
279	){
280	int i = 0; /* Counted bytes. */
281	int cchUTF8 = 1; /* Code units consumed. */
282	int maxUTF8 = 1; /* Expected sequence length. */
283	char c = z[i++];
284	if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */
285	*pCchUTF8 = 1;
286	*pUtf32 = (int)z[0];
287	return;
288	}
289	else if( (c&0xe0)==0xc0 ) maxUTF8 = 2; /* UTF-8 lead byte 110vvvvv */
290	else if( (c&0xf0)==0xe0 ) maxUTF8 = 3; /* UTF-8 lead byte 1110vvvv */
291	else if( (c&0xf8)==0xf0 ) maxUTF8 = 4; /* UTF-8 lead byte 11110vvv */
292	while( cchUTF8<maxUTF8 &&
293	(z[i]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
294	cchUTF8++;
	@@ -313,13 +320,10 @@
320	case 2:
321	*pUtf32 =
322	( (z[0] & 0x1f)<< 6 ) |
323	( (z[1] & 0x3f)<< 0 ) ;
324	break;



325	}
326	#ifdef FOSSIL_DEBUG
327	assert(
328	*pUtf32>=0 && *pUtf32<=0x10ffff && /* Valid range U+0000 to U+10FFFF. */
329	*pUtf32<0xd800 && *pUtf32>0xdfff /* Non-scalar (UTF-16 surrogates). */
	@@ -424,11 +428,11 @@
428	lineCnt++;
429	charCnt = 0;
430	useChars = 0;
431	}else if( c=='\t' ){
432	int sumWidth;
433	int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
434	if( nextIndex<=0 || sumWidth>maxChars ){
435	break;
436	}
437	charCnt++;
438	useChars = COMMENT_TAB_WIDTH;
	@@ -436,11 +440,11 @@
440	zBuf[iBuf++] = ' ';
441	break;
442	}
443	}else if( wordBreak && fossil_isspace(c) ){
444	int sumWidth;
445	int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
446	if( nextIndex<=0 || sumWidth>=maxChars ){
447	break;
448	}
449	charCnt++;
450	}else{
451

Fossil SCM

Keyboard Shortcuts