Fossil SCM

Merge updates to the character width measurements of the comment formatter. Note that multi-byte and wide characters are not handled in the comment prefix, which is entirely controlled by the application and only contains ASCII text.

florian 2024-10-05 13:29 trunk merge

Commit 725af9479166f68a76b2dc564329d35ddf1278404dbca97b63ba3730576eb1af

Parent e483b3b15fad08e…

3 files changed +108 -106 +20 +1

~ src/comformat.c ~ test/comment.test ~ test/utf8-comment.txt

M src/comformat.c

+108 -106

		--- src/comformat.c
		+++ src/comformat.c
		@@ -31,11 +31,11 @@
31	31	#define COMMENT_PRINT_ORIG_BREAK ((u32)0x00000010) /* Break before original. */
32	32	#define COMMENT_PRINT_DEFAULT (COMMENT_PRINT_LEGACY) /* Defaults. */
33	33	#define COMMENT_PRINT_UNSET (-1) /* Not initialized. */
34	34	#endif
35	35
36		-/******* Code copied from SQLite src/shell.c.in on 2024-09-28 ********/
	36	+/******* Code copied from SQLite src/shell.c.in on 2024-09-30 ********/
37	37	/* Lookup table to estimate the number of columns consumed by a Unicode
38	38	** character.
39	39	*/
40	40	static const struct {
41	41	unsigned char w; /* Width of the character in columns */
		@@ -136,36 +136,10 @@
136	136	}
137	137	}
138	138	if( aUWidth[iLast].iFirst > c ) return aUWidth[iFirst].w;
139	139	return aUWidth[iLast].w;
140	140	}
141		-
142		-/*
143		-** Compute the value and length of a multi-byte UTF-8 character that
144		-** begins at z[0]. Return the length. Write the Unicode value into *pU.
145		-**
146		-** This routine only works for multi-byte UTF-8 characters.
147		-*/
148		-static int decodeUtf8(const unsigned char *z, int *pU){
149		- if( (z[0] & 0xe0)==0xc0 && (z[1] & 0xc0)==0x80 ){
150		- *pU = ((z[0] & 0x1f)<<6) \| (z[1] & 0x3f);
151		- return 2;
152		- }
153		- if( (z[0] & 0xf0)==0xe0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80 ){
154		- *pU = ((z[0] & 0x0f)<<12) \| ((z[1] & 0x3f)<<6) \| (z[2] & 0x3f);
155		- return 3;
156		- }
157		- if( (z[0] & 0xf8)==0xf0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80
158		- && (z[3] & 0xc0)==0x80
159		- ){
160		- *pU = ((z[0] & 0x0f)<<18) \| ((z[1] & 0x3f)<<12) \| ((z[2] & 0x3f))<<6
161		- \| (z[4] & 0x3f);
162		- return 4;
163		- }
164		- *pU = 0;
165		- return 1;
166		-}
167	141	/***** End of code copied from SQLite ***********************************/
168	142
169	143	/*
170	144	** This is the previous value used by most external callers when they
171	145	** needed to specify a default maximum line length to be used with the
		@@ -241,62 +215,96 @@
241	215	** algorithm, the NUL character is treated the same as a spacing character.
242	216	*/
243	217	static int comment_next_space(
244	218	const char *zLine, /* [in] The comment line being printed. */
245	219	int index, /* [in] The current character index being handled. */
246		- int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */
	220	+ int maxChars, /* [in] Optimization hint to abort before space found. */
	221	+ int *sumWidth /* [out] Summated width of all characters to next space. */
247	222	){
248		- int nextIndex = index + 1;
249		- int fNonASCII=0;
	223	+ int cchUTF8, utf32, wcwidth = 0;
	224	+ int nextIndex = index;
250	225	for(;;){
251		- char c = zLine[nextIndex];
252		- if( (c&0x80)==0x80 ) fNonASCII=1;
253		- if( c==0 \|\| fossil_isspace(c) ){
254		- if( distUTF8 ){
255		- if( fNonASCII!=0 ){
256		- *distUTF8 = strlen_utf8(&zLine[index], nextIndex-index);
257		- }else{
258		- *distUTF8 = nextIndex-index;
259		- }
260		- }
	226	+ char_info_utf8(&zLine[nextIndex],&cchUTF8,&utf32);
	227	+ nextIndex += cchUTF8;
	228	+ wcwidth += cli_wcwidth(utf32);
	229	+ if( zLine[nextIndex]==0 \|\| fossil_isspace(zLine[nextIndex]) \|\|
	230	+ wcwidth>maxChars ){
	231	+ *sumWidth = wcwidth;
261	232	return nextIndex;
262	233	}
263		- nextIndex++;
264	234	}
265	235	return 0; /* NOT REACHED */
266	236	}
267	237
268	238	/*
269		-** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and
270		-** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0
271		-** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte
272		-** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are
273		-** treated as invalid 1-byte sequences (as lone trail bytes).
274		-** Combining characters and East Asian Wide and Fullwidth characters are counted
275		-** as one, so this function does not calculate the effective "display width".
	239	+** Return information about the next (single- or multi-byte) character in the
	240	+** specified UTF-8 string: The number of UTF-8 code units (in this case: bytes)
	241	+** and the decoded UTF-32 code point. Incomplete, ill-formed and overlong
	242	+** sequences are consumed together as one invalid code point. The invalid lead
	243	+** bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2-
	244	+** and 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF
	245	+** are treated as invalid 1-byte sequences (as lone trail bytes), all resulting
	246	+** in one invalid code point. Invalid UTF-8 sequences encoding a non-scalar code
	247	+** point (UTF-16 surrogates U+D800 to U+DFFF) are allowed.
276	248	*/
277		-int strlen_utf8(const char *zString, int lengthBytes){
278		- int i; /* Counted bytes. */
279		- int lengthUTF8; /* Counted UTF-8 sequences. */
280		-#if 0
281		- assert( lengthBytes>=0 );
	249	+void char_info_utf8(
	250	+ const char *z,
	251	+ int *pCchUTF8,
	252	+ int *pUtf32
	253	+){
	254	+ int i = 0; /* Counted bytes. */
	255	+ int cchUTF8 = 1; /* Code units consumed. */
	256	+ int maxUTF8 = 1; /* Expected sequence length. */
	257	+ char c = z[i++];
	258	+ if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */
	259	+ *pCchUTF8 = 1;
	260	+ *pUtf32 = (int)z[0];
	261	+ return;
	262	+ }
	263	+ else if( (c&0xe0)==0xc0 ) maxUTF8 = 2; /* UTF-8 lead byte 110vvvvv */
	264	+ else if( (c&0xf0)==0xe0 ) maxUTF8 = 3; /* UTF-8 lead byte 1110vvvv */
	265	+ else if( (c&0xf8)==0xf0 ) maxUTF8 = 4; /* UTF-8 lead byte 11110vvv */
	266	+ while( cchUTF8<maxUTF8 &&
	267	+ (z[i]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
	268	+ cchUTF8++;
	269	+ i++;
	270	+ }
	271	+ *pCchUTF8 = cchUTF8;
	272	+ if( cchUTF8!=maxUTF8 \|\| /* Incomplete UTF-8 sequence. */
	273	+ ( cchUTF8==1 && (c&0x80)==0x80 )){ /* Lone UTF-8 trail byte. */
	274	+ *pUtf32 = 0xfffd; /* U+FFFD Replacement Character */
	275	+#ifdef FOSSIL_DEBUG
	276	+ assert( *pUtf32!=0xfffd ); /* Invalid UTF-8 sequence. */
282	277	#endif
283		- for(i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++){
284		- char c = zString[i];
285		- int cchUTF8=1; /* Code units consumed. */
286		- int maxUTF8=1; /* Expected sequence length. */
287		- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
288		- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
289		- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
290		- while( cchUTF8<maxUTF8 &&
291		- i<lengthBytes-1 &&
292		- (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
293		- cchUTF8++;
294		- i++;
295		- }
296		- }
297		- return lengthUTF8;
	278	+ return;
	279	+ }
	280	+ switch( cchUTF8 ){
	281	+ case 4:
	282	+ *pUtf32 =
	283	+ ( (z[0] & 0x0f)<<18 ) \|
	284	+ ( (z[1] & 0x3f)<<12 ) \|
	285	+ ( (z[2] & 0x3f)<< 6 ) \|
	286	+ ( (z[4] & 0x3f)<< 0 ) ;
	287	+ break;
	288	+ case 3:
	289	+ *pUtf32 =
	290	+ ( (z[0] & 0x0f)<<12 ) \|
	291	+ ( (z[1] & 0x3f)<< 6 ) \|
	292	+ ( (z[2] & 0x3f)<< 0 ) ;
	293	+ break;
	294	+ case 2:
	295	+ *pUtf32 =
	296	+ ( (z[0] & 0x1f)<< 6 ) \|
	297	+ ( (z[1] & 0x3f)<< 0 ) ;
	298	+ break;
	299	+ }
	300	+#ifdef FOSSIL_DEBUG
	301	+ assert(
	302	+ pUtf32>=0 && pUtf32<=0x10ffff && /* Valid range U+0000 to U+10FFFF. */
	303	+ pUtf32<0xd800 && pUtf32>0xdfff /* Non-scalar (UTF-16 surrogates). */
	304	+ );
	305	+#endif
298	306	}
299	307
300	308	/*
301	309	** This function is called when printing a logical comment line to calculate
302	310	** the necessary indenting. The caller needs to emit the indenting spaces.
		@@ -339,11 +347,10 @@
339	347	int *pLineCnt, /* [in/out] Pointer to the total line count. */
340	348	const char **pzLine /* [out] Pointer to the end of the logical line. */
341	349	){
342	350	int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
343	351	char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
344		- int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
345	352	if( !zLine ) return;
346	353	if( lineChars<=0 ) return;
347	354	#if 0
348	355	assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
349	356	assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
		@@ -362,10 +369,11 @@
362	369	/* Limit line indent to fit output buffer. */
363	370	origIndent = sizeof(zBuf)-6;
364	371	}
365	372	maxChars = lineChars;
366	373	for(;;){
	374	+ int cchUTF8, utf32;
367	375	int useChars = 1;
368	376	char c = zLine[index];
369	377	/* Flush the output buffer if there's no space left for at least one more
370	378	** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces,
371	379	** a new line, and a terminating NULL. */
		@@ -393,48 +401,47 @@
393	401	if( c=='\n' ){
394	402	lineCnt++;
395	403	charCnt = 0;
396	404	useChars = 0;
397	405	}else if( c=='\t' ){
398		- int distUTF8;
399		- int nextIndex = comment_next_space(zLine, index, &distUTF8);
400		- if( nextIndex<=0 \|\| distUTF8>maxChars ){
	406	+ int sumWidth;
	407	+ int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
	408	+ if( nextIndex<=0 \|\| sumWidth>maxChars ){
401	409	break;
402	410	}
403	411	charCnt++;
404	412	useChars = COMMENT_TAB_WIDTH;
405	413	if( maxChars<useChars ){
406	414	zBuf[iBuf++] = ' ';
407	415	break;
408	416	}
409	417	}else if( wordBreak && fossil_isspace(c) ){
410		- int distUTF8;
411		- int nextIndex = comment_next_space(zLine, index, &distUTF8);
412		- if( nextIndex<=0 \|\| distUTF8>=maxChars ){
	418	+ int sumWidth;
	419	+ int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
	420	+ if( nextIndex<=0 \|\| sumWidth>=maxChars ){
413	421	break;
414	422	}
415	423	charCnt++;
416	424	}else{
417	425	charCnt++;
418	426	}
419	427	assert( c!='\n' \|\| charCnt==0 );
420	428	zBuf[iBuf++] = c;
421		- /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
422		- cchUTF8=1; /* Code units consumed. */
423		- maxUTF8=1; /* Expected sequence length. */
424		- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
425		- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
426		- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
427		- while( cchUTF8<maxUTF8 &&
428		- (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
429		- cchUTF8++;
430		- zBuf[iBuf++] = zLine[index++];
431		- }
	429	+ char_info_utf8(&zLine[index-1],&cchUTF8,&utf32);
432	430	if( cchUTF8>1 ){
433		- int utf32;
434		- decodeUtf8((const unsigned char*)&zLine[index-cchUTF8],&utf32);
435		- useChars += cli_wcwidth(utf32) - 1;
	431	+ int wcwidth;
	432	+ wcwidth = cli_wcwidth(utf32);
	433	+ if( wcwidth>maxChars && lineChars>=wcwidth ){ /* rollback */
	434	+ index--;
	435	+ iBuf--;
	436	+ zBuf[iBuf] = 0;
	437	+ break;
	438	+ }
	439	+ for( ; cchUTF8>1; cchUTF8-- ){
	440	+ zBuf[iBuf++] = zLine[index++];
	441	+ }
	442	+ useChars += wcwidth - 1;
436	443	}
437	444	maxChars -= useChars;
438	445	if( maxChars<=0 ) break;
439	446	if( c=='\n' ) break;
440	447	}
		@@ -476,11 +483,10 @@
476	483	int si, sk, i, k, kc;
477	484	int doIndent = 0;
478	485	char *zBuf;
479	486	char zBuffer[400];
480	487	int lineCnt = 0;
481		- int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
482	488
483	489	if( width<0 ){
484	490	comment_set_maxchars(indent, &maxChars);
485	491	}
486	492	if( zText==0 ) zText = "(NULL)";
		@@ -502,30 +508,25 @@
502	508	}
503	509	if( zBuf!=zBuffer) fossil_free(zBuf);
504	510	return lineCnt;
505	511	}
506	512	for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
	513	+ int cchUTF8, utf32;
507	514	char c = zText[i];
508	515	kc++; /* Count complete UTF-8 sequences. */
509		- /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
510		- cchUTF8=1; /* Code units consumed. */
511		- maxUTF8=1; /* Expected sequence length. */
512		- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
513		- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
514		- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
515		- if( maxUTF8>1 ){
516		- zBuf[k++] = c;
517		- while( cchUTF8<maxUTF8 &&
518		- (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
519		- cchUTF8++;
	516	+ char_info_utf8(&zText[i],&cchUTF8,&utf32);
	517	+ if( cchUTF8>1 ){
	518	+ int wcwidth;
	519	+ wcwidth = cli_wcwidth(utf32);
	520	+ if( kc+wcwidth-1>maxChars && maxChars>=wcwidth ){ /* rollback */
	521	+ kc--;
	522	+ break;
	523	+ }
	524	+ for( i--; cchUTF8>0; cchUTF8-- ){
520	525	zBuf[k++] = zText[++i];
521	526	}
522		- }
523		- if( cchUTF8>1 ){
524		- int utf32;
525		- decodeUtf8((const unsigned char*)&zText[k-cchUTF8],&utf32);
526		- kc += cli_wcwidth(utf32) - 1;
	527	+ kc += wcwidth - 1;
527	528	}
528	529	else if( fossil_isspace(c) ){
529	530	si = i;
530	531	sk = k;
531	532	if( k==0 \|\| zBuf[k-1]!=' ' ){
		@@ -742,10 +743,11 @@
742	743	if( zIndent ){
743	744	indent = atoi(zIndent);
744	745	}else{
745	746	indent = -1; /* automatic */
746	747	}
	748	+ verify_all_options();
747	749	if( g.argc!=4 && g.argc!=5 ){
748	750	usage("?OPTIONS? PREFIX TEXT ?ORIGTEXT?");
749	751	}
750	752	zPrefix = g.argv[2];
751	753	zText = g.argv[3];
752	754

	--- src/comformat.c
	+++ src/comformat.c
	@@ -31,11 +31,11 @@
31	#define COMMENT_PRINT_ORIG_BREAK ((u32)0x00000010) /* Break before original. */
32	#define COMMENT_PRINT_DEFAULT (COMMENT_PRINT_LEGACY) /* Defaults. */
33	#define COMMENT_PRINT_UNSET (-1) /* Not initialized. */
34	#endif
35
36	/******* Code copied from SQLite src/shell.c.in on 2024-09-28 ********/
37	/* Lookup table to estimate the number of columns consumed by a Unicode
38	** character.
39	*/
40	static const struct {
41	unsigned char w; /* Width of the character in columns */
	@@ -136,36 +136,10 @@
136	}
137	}
138	if( aUWidth[iLast].iFirst > c ) return aUWidth[iFirst].w;
139	return aUWidth[iLast].w;
140	}
141
142	/*
143	** Compute the value and length of a multi-byte UTF-8 character that
144	** begins at z[0]. Return the length. Write the Unicode value into *pU.
145	**
146	** This routine only works for multi-byte UTF-8 characters.
147	*/
148	static int decodeUtf8(const unsigned char *z, int *pU){
149	if( (z[0] & 0xe0)==0xc0 && (z[1] & 0xc0)==0x80 ){
150	*pU = ((z[0] & 0x1f)<<6) \| (z[1] & 0x3f);
151	return 2;
152	}
153	if( (z[0] & 0xf0)==0xe0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80 ){
154	*pU = ((z[0] & 0x0f)<<12) \| ((z[1] & 0x3f)<<6) \| (z[2] & 0x3f);
155	return 3;
156	}
157	if( (z[0] & 0xf8)==0xf0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80
158	&& (z[3] & 0xc0)==0x80
159	){
160	*pU = ((z[0] & 0x0f)<<18) \| ((z[1] & 0x3f)<<12) \| ((z[2] & 0x3f))<<6
161	\| (z[4] & 0x3f);
162	return 4;
163	}
164	*pU = 0;
165	return 1;
166	}
167	/***** End of code copied from SQLite ***********************************/
168
169	/*
170	** This is the previous value used by most external callers when they
171	** needed to specify a default maximum line length to be used with the
	@@ -241,62 +215,96 @@
241	** algorithm, the NUL character is treated the same as a spacing character.
242	*/
243	static int comment_next_space(
244	const char *zLine, /* [in] The comment line being printed. */
245	int index, /* [in] The current character index being handled. */
246	int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */

247	){
248	int nextIndex = index + 1;
249	int fNonASCII=0;
250	for(;;){
251	char c = zLine[nextIndex];
252	if( (c&0x80)==0x80 ) fNonASCII=1;
253	if( c==0 \|\| fossil_isspace(c) ){
254	if( distUTF8 ){
255	if( fNonASCII!=0 ){
256	*distUTF8 = strlen_utf8(&zLine[index], nextIndex-index);
257	}else{
258	*distUTF8 = nextIndex-index;
259	}
260	}
261	return nextIndex;
262	}
263	nextIndex++;
264	}
265	return 0; /* NOT REACHED */
266	}
267
268	/*
269	** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and
270	** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0
271	** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte
272	** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are
273	** treated as invalid 1-byte sequences (as lone trail bytes).
274	** Combining characters and East Asian Wide and Fullwidth characters are counted
275	** as one, so this function does not calculate the effective "display width".


276	*/
277	int strlen_utf8(const char *zString, int lengthBytes){
278	int i; /* Counted bytes. */
279	int lengthUTF8; /* Counted UTF-8 sequences. */
280	#if 0
281	assert( lengthBytes>=0 );























282	#endif
283	for(i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++){
284	char c = zString[i];
285	int cchUTF8=1; /* Code units consumed. */
286	int maxUTF8=1; /* Expected sequence length. */
287	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
288	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
289	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
290	while( cchUTF8<maxUTF8 &&
291	i<lengthBytes-1 &&
292	(zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
293	cchUTF8++;
294	i++;
295	}
296	}
297	return lengthUTF8;













298	}
299
300	/*
301	** This function is called when printing a logical comment line to calculate
302	** the necessary indenting. The caller needs to emit the indenting spaces.
	@@ -339,11 +347,10 @@
339	int *pLineCnt, /* [in/out] Pointer to the total line count. */
340	const char **pzLine /* [out] Pointer to the end of the logical line. */
341	){
342	int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
343	char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
344	int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
345	if( !zLine ) return;
346	if( lineChars<=0 ) return;
347	#if 0
348	assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
349	assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
	@@ -362,10 +369,11 @@
362	/* Limit line indent to fit output buffer. */
363	origIndent = sizeof(zBuf)-6;
364	}
365	maxChars = lineChars;
366	for(;;){

367	int useChars = 1;
368	char c = zLine[index];
369	/* Flush the output buffer if there's no space left for at least one more
370	** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces,
371	** a new line, and a terminating NULL. */
	@@ -393,48 +401,47 @@
393	if( c=='\n' ){
394	lineCnt++;
395	charCnt = 0;
396	useChars = 0;
397	}else if( c=='\t' ){
398	int distUTF8;
399	int nextIndex = comment_next_space(zLine, index, &distUTF8);
400	if( nextIndex<=0 \|\| distUTF8>maxChars ){
401	break;
402	}
403	charCnt++;
404	useChars = COMMENT_TAB_WIDTH;
405	if( maxChars<useChars ){
406	zBuf[iBuf++] = ' ';
407	break;
408	}
409	}else if( wordBreak && fossil_isspace(c) ){
410	int distUTF8;
411	int nextIndex = comment_next_space(zLine, index, &distUTF8);
412	if( nextIndex<=0 \|\| distUTF8>=maxChars ){
413	break;
414	}
415	charCnt++;
416	}else{
417	charCnt++;
418	}
419	assert( c!='\n' \|\| charCnt==0 );
420	zBuf[iBuf++] = c;
421	/* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
422	cchUTF8=1; /* Code units consumed. */
423	maxUTF8=1; /* Expected sequence length. */
424	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
425	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
426	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
427	while( cchUTF8<maxUTF8 &&
428	(zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
429	cchUTF8++;
430	zBuf[iBuf++] = zLine[index++];
431	}
432	if( cchUTF8>1 ){
433	int utf32;
434	decodeUtf8((const unsigned char*)&zLine[index-cchUTF8],&utf32);
435	useChars += cli_wcwidth(utf32) - 1;









436	}
437	maxChars -= useChars;
438	if( maxChars<=0 ) break;
439	if( c=='\n' ) break;
440	}
	@@ -476,11 +483,10 @@
476	int si, sk, i, k, kc;
477	int doIndent = 0;
478	char *zBuf;
479	char zBuffer[400];
480	int lineCnt = 0;
481	int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
482
483	if( width<0 ){
484	comment_set_maxchars(indent, &maxChars);
485	}
486	if( zText==0 ) zText = "(NULL)";
	@@ -502,30 +508,25 @@
502	}
503	if( zBuf!=zBuffer) fossil_free(zBuf);
504	return lineCnt;
505	}
506	for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){

507	char c = zText[i];
508	kc++; /* Count complete UTF-8 sequences. */
509	/* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
510	cchUTF8=1; /* Code units consumed. */
511	maxUTF8=1; /* Expected sequence length. */
512	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
513	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
514	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
515	if( maxUTF8>1 ){
516	zBuf[k++] = c;
517	while( cchUTF8<maxUTF8 &&
518	(zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
519	cchUTF8++;
520	zBuf[k++] = zText[++i];
521	}
522	}
523	if( cchUTF8>1 ){
524	int utf32;
525	decodeUtf8((const unsigned char*)&zText[k-cchUTF8],&utf32);
526	kc += cli_wcwidth(utf32) - 1;
527	}
528	else if( fossil_isspace(c) ){
529	si = i;
530	sk = k;
531	if( k==0 \|\| zBuf[k-1]!=' ' ){
	@@ -742,10 +743,11 @@
742	if( zIndent ){
743	indent = atoi(zIndent);
744	}else{
745	indent = -1; /* automatic */
746	}

747	if( g.argc!=4 && g.argc!=5 ){
748	usage("?OPTIONS? PREFIX TEXT ?ORIGTEXT?");
749	}
750	zPrefix = g.argv[2];
751	zText = g.argv[3];
752

	--- src/comformat.c
	+++ src/comformat.c
	@@ -31,11 +31,11 @@
31	#define COMMENT_PRINT_ORIG_BREAK ((u32)0x00000010) /* Break before original. */
32	#define COMMENT_PRINT_DEFAULT (COMMENT_PRINT_LEGACY) /* Defaults. */
33	#define COMMENT_PRINT_UNSET (-1) /* Not initialized. */
34	#endif
35
36	/******* Code copied from SQLite src/shell.c.in on 2024-09-30 ********/
37	/* Lookup table to estimate the number of columns consumed by a Unicode
38	** character.
39	*/
40	static const struct {
41	unsigned char w; /* Width of the character in columns */
	@@ -136,36 +136,10 @@
136	}
137	}
138	if( aUWidth[iLast].iFirst > c ) return aUWidth[iFirst].w;
139	return aUWidth[iLast].w;
140	}


























141	/***** End of code copied from SQLite ***********************************/
142
143	/*
144	** This is the previous value used by most external callers when they
145	** needed to specify a default maximum line length to be used with the
	@@ -241,62 +215,96 @@
215	** algorithm, the NUL character is treated the same as a spacing character.
216	*/
217	static int comment_next_space(
218	const char *zLine, /* [in] The comment line being printed. */
219	int index, /* [in] The current character index being handled. */
220	int maxChars, /* [in] Optimization hint to abort before space found. */
221	int *sumWidth /* [out] Summated width of all characters to next space. */
222	){
223	int cchUTF8, utf32, wcwidth = 0;
224	int nextIndex = index;
225	for(;;){
226	char_info_utf8(&zLine[nextIndex],&cchUTF8,&utf32);
227	nextIndex += cchUTF8;
228	wcwidth += cli_wcwidth(utf32);
229	if( zLine[nextIndex]==0 \|\| fossil_isspace(zLine[nextIndex]) \|\|
230	wcwidth>maxChars ){
231	*sumWidth = wcwidth;




232	return nextIndex;
233	}

234	}
235	return 0; /* NOT REACHED */
236	}
237
238	/*
239	** Return information about the next (single- or multi-byte) character in the
240	** specified UTF-8 string: The number of UTF-8 code units (in this case: bytes)
241	** and the decoded UTF-32 code point. Incomplete, ill-formed and overlong
242	** sequences are consumed together as one invalid code point. The invalid lead
243	** bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2-
244	** and 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF
245	** are treated as invalid 1-byte sequences (as lone trail bytes), all resulting
246	** in one invalid code point. Invalid UTF-8 sequences encoding a non-scalar code
247	** point (UTF-16 surrogates U+D800 to U+DFFF) are allowed.
248	*/
249	void char_info_utf8(
250	const char *z,
251	int *pCchUTF8,
252	int *pUtf32
253	){
254	int i = 0; /* Counted bytes. */
255	int cchUTF8 = 1; /* Code units consumed. */
256	int maxUTF8 = 1; /* Expected sequence length. */
257	char c = z[i++];
258	if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */
259	*pCchUTF8 = 1;
260	*pUtf32 = (int)z[0];
261	return;
262	}
263	else if( (c&0xe0)==0xc0 ) maxUTF8 = 2; /* UTF-8 lead byte 110vvvvv */
264	else if( (c&0xf0)==0xe0 ) maxUTF8 = 3; /* UTF-8 lead byte 1110vvvv */
265	else if( (c&0xf8)==0xf0 ) maxUTF8 = 4; /* UTF-8 lead byte 11110vvv */
266	while( cchUTF8<maxUTF8 &&
267	(z[i]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
268	cchUTF8++;
269	i++;
270	}
271	*pCchUTF8 = cchUTF8;
272	if( cchUTF8!=maxUTF8 \|\| /* Incomplete UTF-8 sequence. */
273	( cchUTF8==1 && (c&0x80)==0x80 )){ /* Lone UTF-8 trail byte. */
274	*pUtf32 = 0xfffd; /* U+FFFD Replacement Character */
275	#ifdef FOSSIL_DEBUG
276	assert( *pUtf32!=0xfffd ); /* Invalid UTF-8 sequence. */
277	#endif
278	return;
279	}
280	switch( cchUTF8 ){
281	case 4:
282	*pUtf32 =
283	( (z[0] & 0x0f)<<18 ) \|
284	( (z[1] & 0x3f)<<12 ) \|
285	( (z[2] & 0x3f)<< 6 ) \|
286	( (z[4] & 0x3f)<< 0 ) ;
287	break;
288	case 3:
289	*pUtf32 =
290	( (z[0] & 0x0f)<<12 ) \|
291	( (z[1] & 0x3f)<< 6 ) \|
292	( (z[2] & 0x3f)<< 0 ) ;
293	break;
294	case 2:
295	*pUtf32 =
296	( (z[0] & 0x1f)<< 6 ) \|
297	( (z[1] & 0x3f)<< 0 ) ;
298	break;
299	}
300	#ifdef FOSSIL_DEBUG
301	assert(
302	pUtf32>=0 && pUtf32<=0x10ffff && /* Valid range U+0000 to U+10FFFF. */
303	pUtf32<0xd800 && pUtf32>0xdfff /* Non-scalar (UTF-16 surrogates). */
304	);
305	#endif
306	}
307
308	/*
309	** This function is called when printing a logical comment line to calculate
310	** the necessary indenting. The caller needs to emit the indenting spaces.
	@@ -339,11 +347,10 @@
347	int *pLineCnt, /* [in/out] Pointer to the total line count. */
348	const char **pzLine /* [out] Pointer to the end of the logical line. */
349	){
350	int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
351	char zBuf[400]; int iBuf=0; /* Output buffer and counter. */

352	if( !zLine ) return;
353	if( lineChars<=0 ) return;
354	#if 0
355	assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
356	assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
	@@ -362,10 +369,11 @@
369	/* Limit line indent to fit output buffer. */
370	origIndent = sizeof(zBuf)-6;
371	}
372	maxChars = lineChars;
373	for(;;){
374	int cchUTF8, utf32;
375	int useChars = 1;
376	char c = zLine[index];
377	/* Flush the output buffer if there's no space left for at least one more
378	** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces,
379	** a new line, and a terminating NULL. */
	@@ -393,48 +401,47 @@
401	if( c=='\n' ){
402	lineCnt++;
403	charCnt = 0;
404	useChars = 0;
405	}else if( c=='\t' ){
406	int sumWidth;
407	int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
408	if( nextIndex<=0 \|\| sumWidth>maxChars ){
409	break;
410	}
411	charCnt++;
412	useChars = COMMENT_TAB_WIDTH;
413	if( maxChars<useChars ){
414	zBuf[iBuf++] = ' ';
415	break;
416	}
417	}else if( wordBreak && fossil_isspace(c) ){
418	int sumWidth;
419	int nextIndex = comment_next_space(zLine, index, maxChars, &sumWidth);
420	if( nextIndex<=0 \|\| sumWidth>=maxChars ){
421	break;
422	}
423	charCnt++;
424	}else{
425	charCnt++;
426	}
427	assert( c!='\n' \|\| charCnt==0 );
428	zBuf[iBuf++] = c;
429	char_info_utf8(&zLine[index-1],&cchUTF8,&utf32);










430	if( cchUTF8>1 ){
431	int wcwidth;
432	wcwidth = cli_wcwidth(utf32);
433	if( wcwidth>maxChars && lineChars>=wcwidth ){ /* rollback */
434	index--;
435	iBuf--;
436	zBuf[iBuf] = 0;
437	break;
438	}
439	for( ; cchUTF8>1; cchUTF8-- ){
440	zBuf[iBuf++] = zLine[index++];
441	}
442	useChars += wcwidth - 1;
443	}
444	maxChars -= useChars;
445	if( maxChars<=0 ) break;
446	if( c=='\n' ) break;
447	}
	@@ -476,11 +483,10 @@
483	int si, sk, i, k, kc;
484	int doIndent = 0;
485	char *zBuf;
486	char zBuffer[400];
487	int lineCnt = 0;

488
489	if( width<0 ){
490	comment_set_maxchars(indent, &maxChars);
491	}
492	if( zText==0 ) zText = "(NULL)";
	@@ -502,30 +508,25 @@
508	}
509	if( zBuf!=zBuffer) fossil_free(zBuf);
510	return lineCnt;
511	}
512	for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
513	int cchUTF8, utf32;
514	char c = zText[i];
515	kc++; /* Count complete UTF-8 sequences. */
516	char_info_utf8(&zText[i],&cchUTF8,&utf32);
517	if( cchUTF8>1 ){
518	int wcwidth;
519	wcwidth = cli_wcwidth(utf32);
520	if( kc+wcwidth-1>maxChars && maxChars>=wcwidth ){ /* rollback */
521	kc--;
522	break;
523	}
524	for( i--; cchUTF8>0; cchUTF8-- ){


525	zBuf[k++] = zText[++i];
526	}
527	kc += wcwidth - 1;




528	}
529	else if( fossil_isspace(c) ){
530	si = i;
531	sk = k;
532	if( k==0 \|\| zBuf[k-1]!=' ' ){
	@@ -742,10 +743,11 @@
743	if( zIndent ){
744	indent = atoi(zIndent);
745	}else{
746	indent = -1; /* automatic */
747	}
748	verify_all_options();
749	if( g.argc!=4 && g.argc!=5 ){
750	usage("?OPTIONS? PREFIX TEXT ?ORIGTEXT?");
751	}
752	zPrefix = g.argv[2];
753	zText = g.argv[3];
754

M test/comment.test

+20

		--- test/comment.test
		+++ test/comment.test
		@@ -319,8 +319,28 @@
319	319	###############################################################################
320	320
321	321	fossil test-comment-format --width 81 --indent 9 --decode --trimcrlf --origbreak "00:00:00 " "\[0000000000\] CURRENT $orig" $orig
322	322	test comment-60 {$RESULT eq "00:00:00 \[0000000000\] CURRENT \n xxxx xx xxxxxxx xxxx xxxxxx xxxxxxx, xxxxxxx, x xxxx xxxxxx xx xxxx xxxx\n xxxxxxx xxxxx xxxx xxxx xx xxxxxxx xxxxxxx (xxxxxx xxxxxxxxx x xxxxx).\n xxx'x xxx xxx xx xxxxx xxxx xxx xxx --xxxxxxxxxxx xxxxxx xx xx xxxx. x\n xxxxx x xxxxxx xxxx xxxx xxxx xxxx xxxx x xxxxx xx xxx x xxxxxxxx\n xxxxxxx.\n(6 lines output)"}
323	323
	324	+###############################################################################
	325	+
	326	+fossil test-comment-format --width 72 --file "" [file join $testdir "utf8-comment.txt"]
	327	+test comment-61 {$RESULT eq "The comment formatter handles ｆｕｌｌｗｉｄｔｈ and multi-byte \[äöü\] an\nd symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
	328	+
	329	+###############################################################################
	330	+
	331	+fossil test-comment-format --width 72 --wordbreak --file "" [file join $testdir "utf8-comment.txt"]
	332	+test comment-62 {$RESULT eq "The comment formatter handles ｆｕｌｌｗｉｄｔｈ and multi-byte \[äöü\]\nand symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
	333	+
	334	+###############################################################################
	335	+
	336	+fossil test-comment-format --width 72 --legacy --file "" [file join $testdir "utf8-comment.txt"]
	337	+test comment-63 {$RESULT eq "The comment formatter handles ｆｕｌｌｗｉｄｔｈ and multi-byte \[äöü\]\nand symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
	338	+
	339	+###############################################################################
	340	+
	341	+fossil test-comment-format --width 72 --legacy --wordbreak --file "" [file join $testdir "utf8-comment.txt"]
	342	+test comment-64 {$RESULT eq "The comment formatter handles ｆｕｌｌｗｉｄｔｈ and multi-byte \[äöü\]\nand symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
	343	+
324	344	###############################################################################
325	345
326	346	test_cleanup
327	347
328	348	ADDED test/utf8-comment.txt

	--- test/comment.test
	+++ test/comment.test
	@@ -319,8 +319,28 @@
319	###############################################################################
320
321	fossil test-comment-format --width 81 --indent 9 --decode --trimcrlf --origbreak "00:00:00 " "\[0000000000\] CURRENT $orig" $orig
322	test comment-60 {$RESULT eq "00:00:00 \[0000000000\] CURRENT \n xxxx xx xxxxxxx xxxx xxxxxx xxxxxxx, xxxxxxx, x xxxx xxxxxx xx xxxx xxxx\n xxxxxxx xxxxx xxxx xxxx xx xxxxxxx xxxxxxx (xxxxxx xxxxxxxxx x xxxxx).\n xxx'x xxx xxx xx xxxxx xxxx xxx xxx --xxxxxxxxxxx xxxxxx xx xx xxxx. x\n xxxxx x xxxxxx xxxx xxxx xxxx xxxx xxxx x xxxxx xx xxx x xxxxxxxx\n xxxxxxx.\n(6 lines output)"}
323




















324	###############################################################################
325
326	test_cleanup
327
328	ADDED test/utf8-comment.txt

	--- test/comment.test
	+++ test/comment.test
	@@ -319,8 +319,28 @@
319	###############################################################################
320
321	fossil test-comment-format --width 81 --indent 9 --decode --trimcrlf --origbreak "00:00:00 " "\[0000000000\] CURRENT $orig" $orig
322	test comment-60 {$RESULT eq "00:00:00 \[0000000000\] CURRENT \n xxxx xx xxxxxxx xxxx xxxxxx xxxxxxx, xxxxxxx, x xxxx xxxxxx xx xxxx xxxx\n xxxxxxx xxxxx xxxx xxxx xx xxxxxxx xxxxxxx (xxxxxx xxxxxxxxx x xxxxx).\n xxx'x xxx xxx xx xxxxx xxxx xxx xxx --xxxxxxxxxxx xxxxxx xx xx xxxx. x\n xxxxx x xxxxxx xxxx xxxx xxxx xxxx xxxx x xxxxx xx xxx x xxxxxxxx\n xxxxxxx.\n(6 lines output)"}
323
324	###############################################################################
325
326	fossil test-comment-format --width 72 --file "" [file join $testdir "utf8-comment.txt"]
327	test comment-61 {$RESULT eq "The comment formatter handles ｆｕｌｌｗｉｄｔｈ and multi-byte \[äöü\] an\nd symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
328
329	###############################################################################
330
331	fossil test-comment-format --width 72 --wordbreak --file "" [file join $testdir "utf8-comment.txt"]
332	test comment-62 {$RESULT eq "The comment formatter handles ｆｕｌｌｗｉｄｔｈ and multi-byte \[äöü\]\nand symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
333
334	###############################################################################
335
336	fossil test-comment-format --width 72 --legacy --file "" [file join $testdir "utf8-comment.txt"]
337	test comment-63 {$RESULT eq "The comment formatter handles ｆｕｌｌｗｉｄｔｈ and multi-byte \[äöü\]\nand symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
338
339	###############################################################################
340
341	fossil test-comment-format --width 72 --legacy --wordbreak --file "" [file join $testdir "utf8-comment.txt"]
342	test comment-64 {$RESULT eq "The comment formatter handles ｆｕｌｌｗｉｄｔｈ and multi-byte \[äöü\]\nand symbols \[☃\] and emoji \[💾\] characters!\n(2 lines output)"}
343
344	###############################################################################
345
346	test_cleanup
347
348	ADDED test/utf8-comment.txt

M test/utf8-comment.txt

		--- a/test/utf8-comment.txt
		+++ b/test/utf8-comment.txt
		@@ -0,0 +1 @@
	1	+The comment formatter handles ｆｕｌｌｗｉｄｔｈ and multi-byte [äöü] and symbols [☃] and emoji [💾] characters!

	--- a/test/utf8-comment.txt
	+++ b/test/utf8-comment.txt
	@@ -0,0 +1 @@

	--- a/test/utf8-comment.txt
	+++ b/test/utf8-comment.txt
	@@ -0,0 +1 @@
1	The comment formatter handles ｆｕｌｌｗｉｄｔｈ and multi-byte [äöü] and symbols [☃] and emoji [💾] characters!

Fossil SCM

Keyboard Shortcuts