Fossil SCM

Fix the off-by-one errors if a fullwidth character only fits partially, and take into account character widths when scanning forward to find the distance to the next space.

florian 2024-10-02 14:43 trunk

Commit d5479ba7c66e74561ee43db687b3ce19304be0baffddaf0a79a37c146e9b0354

Parent e483b3b15fad08e…

1 file changed +102 -79

~ src/comformat.c

M src/comformat.c

+102 -79

		--- src/comformat.c
		+++ src/comformat.c
		@@ -241,62 +241,92 @@
241	241	** algorithm, the NUL character is treated the same as a spacing character.
242	242	*/
243	243	static int comment_next_space(
244	244	const char *zLine, /* [in] The comment line being printed. */
245	245	int index, /* [in] The current character index being handled. */
246		- int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */
	246	+ int *sumWidth /* [out] Summated width of all characters to next space. */
247	247	){
248		- int nextIndex = index + 1;
249		- int fNonASCII=0;
	248	+ int cchUTF8, utf32, wcwidth = 0;
	249	+ int nextIndex = index;
250	250	for(;;){
251		- char c = zLine[nextIndex];
252		- if( (c&0x80)==0x80 ) fNonASCII=1;
253		- if( c==0 || fossil_isspace(c) ){
254		- if( distUTF8 ){
255		- if( fNonASCII!=0 ){
256		- *distUTF8 = strlen_utf8(&zLine[index], nextIndex-index);
257		- }else{
258		- *distUTF8 = nextIndex-index;
259		- }
260		- }
	251	+ char_info_utf8(&zLine[nextIndex],&cchUTF8,&utf32);
	252	+ nextIndex += cchUTF8;
	253	+ wcwidth += cli_wcwidth(utf32);
	254	+ if( zLine[nextIndex]==0 || fossil_isspace(zLine[nextIndex]) ){
	255	+ *sumWidth = wcwidth;
261	256	return nextIndex;
262	257	}
263		- nextIndex++;
264	258	}
265	259	return 0; /* NOT REACHED */
266	260	}
267	261
268	262	/*
269		-** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and
270		-** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0
271		-** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte
272		-** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are
273		-** treated as invalid 1-byte sequences (as lone trail bytes).
274		-** Combining characters and East Asian Wide and Fullwidth characters are counted
275		-** as one, so this function does not calculate the effective "display width".
	263	+** Return information about the next (single- or multi-byte) character in the
	264	+** specified UTF-8 string: The number of UTF-8 code units (in this case: bytes)
	265	+** and the decoded UTF-32 code point. Incomplete, ill-formed and overlong
	266	+** sequences are consumed together as one invalid code point. The invalid lead
	267	+** bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2-
	268	+** and 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF
	269	+** are treated as invalid 1-byte sequences (as lone trail bytes), all resulting
	270	+** in one invalid code point. Invalid UTF-8 sequences encoding a non-scalar code
	271	+** point (UTF-16 surrogates U+D800 to U+DFFF) are allowed.
276	272	*/
277		-int strlen_utf8(const char *zString, int lengthBytes){
278		- int i; /* Counted bytes. */
279		- int lengthUTF8; /* Counted UTF-8 sequences. */
280		-#if 0
281		- assert( lengthBytes>=0 );
	273	+void char_info_utf8(
	274	+ const unsigned char *z,
	275	+ int *pCchUTF8,
	276	+ int *pUtf32
	277	+){
	278	+ int i = 0; /* Counted bytes. */
	279	+ int cchUTF8 = 1; /* Code units consumed. */
	280	+ int maxUTF8 = 1; /* Expected sequence length. */
	281	+ char c = z[i++];
	282	+ if( (c&0xe0)==0xc0 ) maxUTF8 = 2; /* UTF-8 lead byte 110vvvvv */
	283	+ else if( (c&0xf0)==0xe0 ) maxUTF8 = 3; /* UTF-8 lead byte 1110vvvv */
	284	+ else if( (c&0xf8)==0xf0 ) maxUTF8 = 4; /* UTF-8 lead byte 11110vvv */
	285	+ while( cchUTF8<maxUTF8 &&
	286	+ (z[i]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
	287	+ cchUTF8++;
	288	+ i++;
	289	+ }
	290	+ *pCchUTF8 = cchUTF8;
	291	+ if( cchUTF8!=maxUTF8 || /* Incomplete UTF-8 sequence. */
	292	+ cchUTF8==1 && (c&0x80)==0x80 ){ /* Lone UTF-8 trail byte. */
	293	+ *pUtf32 = 0xfffd; /* U+FFFD Replacement Character */
	294	+#ifdef FOSSIL_DEBUG
	295	+ assert( *pUtf32!=0xfffd ); /* Invalid UTF-8 sequence. */
282	296	#endif
283		- for(i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++){
284		- char c = zString[i];
285		- int cchUTF8=1; /* Code units consumed. */
286		- int maxUTF8=1; /* Expected sequence length. */
287		- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
288		- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
289		- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
290		- while( cchUTF8<maxUTF8 &&
291		- i<lengthBytes-1 &&
292		- (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
293		- cchUTF8++;
294		- i++;
295		- }
296		- }
297		- return lengthUTF8;
	297	+ return;
	298	+ }
	299	+ switch( cchUTF8 ){
	300	+ case 4:
	301	+ *pUtf32 =
	302	+ ( (z[0] & 0x0f)<<18 ) |
	303	+ ( (z[1] & 0x3f)<<12 ) |
	304	+ ( (z[2] & 0x3f)<< 6 ) |
	305	+ ( (z[4] & 0x3f)<< 0 ) ;
	306	+ break;
	307	+ case 3:
	308	+ *pUtf32 =
	309	+ ( (z[0] & 0x0f)<<12 ) |
	310	+ ( (z[1] & 0x3f)<< 6 ) |
	311	+ ( (z[2] & 0x3f)<< 0 ) ;
	312	+ break;
	313	+ case 2:
	314	+ *pUtf32 =
	315	+ ( (z[0] & 0x1f)<< 6 ) |
	316	+ ( (z[1] & 0x3f)<< 0 ) ;
	317	+ break;
	318	+ case 1:
	319	+ *pUtf32 = (int)z[0];
	320	+ break;
	321	+ }
	322	+#ifdef FOSSIL_DEBUG
	323	+ assert(
	324	+ *pUtf32>=0 && *pUtf32<=0x10ffff && /* Valid range U+0000 to U+10FFFF. */
	325	+ *pUtf32<0xd800 && *pUtf32>0xdfff /* Non-scalar (UTF-16 surrogates). */
	326	+ );
	327	+#endif
298	328	}
299	329
300	330	/*
301	331	** This function is called when printing a logical comment line to calculate
302	332	** the necessary indenting. The caller needs to emit the indenting spaces.
		@@ -339,11 +369,10 @@
339	369	int *pLineCnt, /* [in/out] Pointer to the total line count. */
340	370	const char **pzLine /* [out] Pointer to the end of the logical line. */
341	371	){
342	372	int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
343	373	char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
344		- int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
345	374	if( !zLine ) return;
346	375	if( lineChars<=0 ) return;
347	376	#if 0
348	377	assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
349	378	assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
		@@ -362,10 +391,11 @@
362	391	/* Limit line indent to fit output buffer. */
363	392	origIndent = sizeof(zBuf)-6;
364	393	}
365	394	maxChars = lineChars;
366	395	for(;;){
	396	+ int cchUTF8, utf32;
367	397	int useChars = 1;
368	398	char c = zLine[index];
369	399	/* Flush the output buffer if there's no space left for at least one more
370	400	** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces,
371	401	** a new line, and a terminating NULL. */
		@@ -393,48 +423,47 @@
393	423	if( c=='\n' ){
394	424	lineCnt++;
395	425	charCnt = 0;
396	426	useChars = 0;
397	427	}else if( c=='\t' ){
398		- int distUTF8;
399		- int nextIndex = comment_next_space(zLine, index, &distUTF8);
400		- if( nextIndex<=0 || distUTF8>maxChars ){
	428	+ int sumWidth;
	429	+ int nextIndex = comment_next_space(zLine, index, &sumWidth);
	430	+ if( nextIndex<=0 || sumWidth>maxChars ){
401	431	break;
402	432	}
403	433	charCnt++;
404	434	useChars = COMMENT_TAB_WIDTH;
405	435	if( maxChars<useChars ){
406	436	zBuf[iBuf++] = ' ';
407	437	break;
408	438	}
409	439	}else if( wordBreak && fossil_isspace(c) ){
410		- int distUTF8;
411		- int nextIndex = comment_next_space(zLine, index, &distUTF8);
412		- if( nextIndex<=0 || distUTF8>=maxChars ){
	440	+ int sumWidth;
	441	+ int nextIndex = comment_next_space(zLine, index, &sumWidth);
	442	+ if( nextIndex<=0 || sumWidth>=maxChars ){
413	443	break;
414	444	}
415	445	charCnt++;
416	446	}else{
417	447	charCnt++;
418	448	}
419	449	assert( c!='\n' || charCnt==0 );
420	450	zBuf[iBuf++] = c;
421		- /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
422		- cchUTF8=1; /* Code units consumed. */
423		- maxUTF8=1; /* Expected sequence length. */
424		- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
425		- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
426		- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
427		- while( cchUTF8<maxUTF8 &&
428		- (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
429		- cchUTF8++;
430		- zBuf[iBuf++] = zLine[index++];
431		- }
	451	+ char_info_utf8(&zLine[index-1],&cchUTF8,&utf32);
432	452	if( cchUTF8>1 ){
433		- int utf32;
434		- decodeUtf8((const unsigned char*)&zLine[index-cchUTF8],&utf32);
435		- useChars += cli_wcwidth(utf32) - 1;
	453	+ int wcwidth;
	454	+ wcwidth = cli_wcwidth(utf32);
	455	+ if( wcwidth>maxChars && lineChars>=wcwidth ){ /* rollback */
	456	+ index--;
	457	+ iBuf--;
	458	+ zBuf[iBuf] = 0;
	459	+ break;
	460	+ }
	461	+ for( ; cchUTF8>1; cchUTF8-- ){
	462	+ zBuf[iBuf++] = zLine[index++];
	463	+ }
	464	+ useChars += wcwidth - 1;
436	465	}
437	466	maxChars -= useChars;
438	467	if( maxChars<=0 ) break;
439	468	if( c=='\n' ) break;
440	469	}
		@@ -476,11 +505,10 @@
476	505	int si, sk, i, k, kc;
477	506	int doIndent = 0;
478	507	char *zBuf;
479	508	char zBuffer[400];
480	509	int lineCnt = 0;
481		- int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
482	510
483	511	if( width<0 ){
484	512	comment_set_maxchars(indent, &maxChars);
485	513	}
486	514	if( zText==0 ) zText = "(NULL)";
		@@ -502,30 +530,25 @@
502	530	}
503	531	if( zBuf!=zBuffer) fossil_free(zBuf);
504	532	return lineCnt;
505	533	}
506	534	for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
	535	+ int cchUTF8, utf32;
507	536	char c = zText[i];
508	537	kc++; /* Count complete UTF-8 sequences. */
509		- /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
510		- cchUTF8=1; /* Code units consumed. */
511		- maxUTF8=1; /* Expected sequence length. */
512		- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
513		- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
514		- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
515		- if( maxUTF8>1 ){
516		- zBuf[k++] = c;
517		- while( cchUTF8<maxUTF8 &&
518		- (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
519		- cchUTF8++;
	538	+ char_info_utf8(&zText[i],&cchUTF8,&utf32);
	539	+ if( cchUTF8>1 ){
	540	+ int wcwidth;
	541	+ wcwidth = cli_wcwidth(utf32);
	542	+ if( kc+wcwidth-1>maxChars && maxChars>=wcwidth ){ /* rollback */
	543	+ kc--;
	544	+ break;
	545	+ }
	546	+ for( i--; cchUTF8>0; cchUTF8-- ){
520	547	zBuf[k++] = zText[++i];
521	548	}
522		- }
523		- if( cchUTF8>1 ){
524		- int utf32;
525		- decodeUtf8((const unsigned char*)&zText[k-cchUTF8],&utf32);
526		- kc += cli_wcwidth(utf32) - 1;
	549	+ kc += wcwidth - 1;
527	550	}
528	551	else if( fossil_isspace(c) ){
529	552	si = i;
530	553	sk = k;
531	554	if( k==0 || zBuf[k-1]!=' ' ){
532	555

	--- src/comformat.c
	+++ src/comformat.c
	@@ -241,62 +241,92 @@
241	** algorithm, the NUL character is treated the same as a spacing character.
242	*/
243	static int comment_next_space(
244	const char *zLine, /* [in] The comment line being printed. */
245	int index, /* [in] The current character index being handled. */
246	int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */
247	){
248	int nextIndex = index + 1;
249	int fNonASCII=0;
250	for(;;){
251	char c = zLine[nextIndex];
252	if( (c&0x80)==0x80 ) fNonASCII=1;
253	if( c==0 || fossil_isspace(c) ){
254	if( distUTF8 ){
255	if( fNonASCII!=0 ){
256	*distUTF8 = strlen_utf8(&zLine[index], nextIndex-index);
257	}else{
258	*distUTF8 = nextIndex-index;
259	}
260	}
261	return nextIndex;
262	}
263	nextIndex++;
264	}
265	return 0; /* NOT REACHED */
266	}
267
268	/*
269	** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and
270	** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0
271	** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte
272	** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are
273	** treated as invalid 1-byte sequences (as lone trail bytes).
274	** Combining characters and East Asian Wide and Fullwidth characters are counted
275	** as one, so this function does not calculate the effective "display width".


276	*/
277	int strlen_utf8(const char *zString, int lengthBytes){
278	int i; /* Counted bytes. */
279	int lengthUTF8; /* Counted UTF-8 sequences. */
280	#if 0
281	assert( lengthBytes>=0 );


















282	#endif
283	for(i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++){
284	char c = zString[i];
285	int cchUTF8=1; /* Code units consumed. */
286	int maxUTF8=1; /* Expected sequence length. */
287	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
288	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
289	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
290	while( cchUTF8<maxUTF8 &&
291	i<lengthBytes-1 &&
292	(zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
293	cchUTF8++;
294	i++;
295	}
296	}
297	return lengthUTF8;
















298	}
299
300	/*
301	** This function is called when printing a logical comment line to calculate
302	** the necessary indenting. The caller needs to emit the indenting spaces.
	@@ -339,11 +369,10 @@
339	int *pLineCnt, /* [in/out] Pointer to the total line count. */
340	const char **pzLine /* [out] Pointer to the end of the logical line. */
341	){
342	int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
343	char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
344	int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
345	if( !zLine ) return;
346	if( lineChars<=0 ) return;
347	#if 0
348	assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
349	assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
	@@ -362,10 +391,11 @@
362	/* Limit line indent to fit output buffer. */
363	origIndent = sizeof(zBuf)-6;
364	}
365	maxChars = lineChars;
366	for(;;){

367	int useChars = 1;
368	char c = zLine[index];
369	/* Flush the output buffer if there's no space left for at least one more
370	** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces,
371	** a new line, and a terminating NULL. */
	@@ -393,48 +423,47 @@
393	if( c=='\n' ){
394	lineCnt++;
395	charCnt = 0;
396	useChars = 0;
397	}else if( c=='\t' ){
398	int distUTF8;
399	int nextIndex = comment_next_space(zLine, index, &distUTF8);
400	if( nextIndex<=0 || distUTF8>maxChars ){
401	break;
402	}
403	charCnt++;
404	useChars = COMMENT_TAB_WIDTH;
405	if( maxChars<useChars ){
406	zBuf[iBuf++] = ' ';
407	break;
408	}
409	}else if( wordBreak && fossil_isspace(c) ){
410	int distUTF8;
411	int nextIndex = comment_next_space(zLine, index, &distUTF8);
412	if( nextIndex<=0 || distUTF8>=maxChars ){
413	break;
414	}
415	charCnt++;
416	}else{
417	charCnt++;
418	}
419	assert( c!='\n' || charCnt==0 );
420	zBuf[iBuf++] = c;
421	/* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
422	cchUTF8=1; /* Code units consumed. */
423	maxUTF8=1; /* Expected sequence length. */
424	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
425	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
426	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
427	while( cchUTF8<maxUTF8 &&
428	(zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
429	cchUTF8++;
430	zBuf[iBuf++] = zLine[index++];
431	}
432	if( cchUTF8>1 ){
433	int utf32;
434	decodeUtf8((const unsigned char*)&zLine[index-cchUTF8],&utf32);
435	useChars += cli_wcwidth(utf32) - 1;









436	}
437	maxChars -= useChars;
438	if( maxChars<=0 ) break;
439	if( c=='\n' ) break;
440	}
	@@ -476,11 +505,10 @@
476	int si, sk, i, k, kc;
477	int doIndent = 0;
478	char *zBuf;
479	char zBuffer[400];
480	int lineCnt = 0;
481	int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
482
483	if( width<0 ){
484	comment_set_maxchars(indent, &maxChars);
485	}
486	if( zText==0 ) zText = "(NULL)";
	@@ -502,30 +530,25 @@
502	}
503	if( zBuf!=zBuffer) fossil_free(zBuf);
504	return lineCnt;
505	}
506	for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){

507	char c = zText[i];
508	kc++; /* Count complete UTF-8 sequences. */
509	/* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
510	cchUTF8=1; /* Code units consumed. */
511	maxUTF8=1; /* Expected sequence length. */
512	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
513	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
514	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
515	if( maxUTF8>1 ){
516	zBuf[k++] = c;
517	while( cchUTF8<maxUTF8 &&
518	(zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
519	cchUTF8++;
520	zBuf[k++] = zText[++i];
521	}
522	}
523	if( cchUTF8>1 ){
524	int utf32;
525	decodeUtf8((const unsigned char*)&zText[k-cchUTF8],&utf32);
526	kc += cli_wcwidth(utf32) - 1;
527	}
528	else if( fossil_isspace(c) ){
529	si = i;
530	sk = k;
531	if( k==0 || zBuf[k-1]!=' ' ){
532

	--- src/comformat.c
	+++ src/comformat.c
	@@ -241,62 +241,92 @@
241	** algorithm, the NUL character is treated the same as a spacing character.
242	*/
243	static int comment_next_space(
244	const char *zLine, /* [in] The comment line being printed. */
245	int index, /* [in] The current character index being handled. */
246	int *sumWidth /* [out] Summated width of all characters to next space. */
247	){
248	int cchUTF8, utf32, wcwidth = 0;
249	int nextIndex = index;
250	for(;;){
251	char_info_utf8(&zLine[nextIndex],&cchUTF8,&utf32);
252	nextIndex += cchUTF8;
253	wcwidth += cli_wcwidth(utf32);
254	if( zLine[nextIndex]==0 || fossil_isspace(zLine[nextIndex]) ){
255	*sumWidth = wcwidth;





256	return nextIndex;
257	}

258	}
259	return 0; /* NOT REACHED */
260	}
261
262	/*
263	** Return information about the next (single- or multi-byte) character in the
264	** specified UTF-8 string: The number of UTF-8 code units (in this case: bytes)
265	** and the decoded UTF-32 code point. Incomplete, ill-formed and overlong
266	** sequences are consumed together as one invalid code point. The invalid lead
267	** bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2-
268	** and 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF
269	** are treated as invalid 1-byte sequences (as lone trail bytes), all resulting
270	** in one invalid code point. Invalid UTF-8 sequences encoding a non-scalar code
271	** point (UTF-16 surrogates U+D800 to U+DFFF) are allowed.
272	*/
273	void char_info_utf8(
274	const unsigned char *z,
275	int *pCchUTF8,
276	int *pUtf32
277	){
278	int i = 0; /* Counted bytes. */
279	int cchUTF8 = 1; /* Code units consumed. */
280	int maxUTF8 = 1; /* Expected sequence length. */
281	char c = z[i++];
282	if( (c&0xe0)==0xc0 ) maxUTF8 = 2; /* UTF-8 lead byte 110vvvvv */
283	else if( (c&0xf0)==0xe0 ) maxUTF8 = 3; /* UTF-8 lead byte 1110vvvv */
284	else if( (c&0xf8)==0xf0 ) maxUTF8 = 4; /* UTF-8 lead byte 11110vvv */
285	while( cchUTF8<maxUTF8 &&
286	(z[i]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
287	cchUTF8++;
288	i++;
289	}
290	*pCchUTF8 = cchUTF8;
291	if( cchUTF8!=maxUTF8 || /* Incomplete UTF-8 sequence. */
292	cchUTF8==1 && (c&0x80)==0x80 ){ /* Lone UTF-8 trail byte. */
293	*pUtf32 = 0xfffd; /* U+FFFD Replacement Character */
294	#ifdef FOSSIL_DEBUG
295	assert( *pUtf32!=0xfffd ); /* Invalid UTF-8 sequence. */
296	#endif
297	return;
298	}
299	switch( cchUTF8 ){
300	case 4:
301	*pUtf32 =
302	( (z[0] & 0x0f)<<18 ) |
303	( (z[1] & 0x3f)<<12 ) |
304	( (z[2] & 0x3f)<< 6 ) |
305	( (z[4] & 0x3f)<< 0 ) ;
306	break;
307	case 3:
308	*pUtf32 =
309	( (z[0] & 0x0f)<<12 ) |
310	( (z[1] & 0x3f)<< 6 ) |
311	( (z[2] & 0x3f)<< 0 ) ;
312	break;
313	case 2:
314	*pUtf32 =
315	( (z[0] & 0x1f)<< 6 ) |
316	( (z[1] & 0x3f)<< 0 ) ;
317	break;
318	case 1:
319	*pUtf32 = (int)z[0];
320	break;
321	}
322	#ifdef FOSSIL_DEBUG
323	assert(
324	*pUtf32>=0 && *pUtf32<=0x10ffff && /* Valid range U+0000 to U+10FFFF. */
325	*pUtf32<0xd800 && *pUtf32>0xdfff /* Non-scalar (UTF-16 surrogates). */
326	);
327	#endif
328	}
329
330	/*
331	** This function is called when printing a logical comment line to calculate
332	** the necessary indenting. The caller needs to emit the indenting spaces.
	@@ -339,11 +369,10 @@
369	int *pLineCnt, /* [in/out] Pointer to the total line count. */
370	const char **pzLine /* [out] Pointer to the end of the logical line. */
371	){
372	int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
373	char zBuf[400]; int iBuf=0; /* Output buffer and counter. */

374	if( !zLine ) return;
375	if( lineChars<=0 ) return;
376	#if 0
377	assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
378	assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
	@@ -362,10 +391,11 @@
391	/* Limit line indent to fit output buffer. */
392	origIndent = sizeof(zBuf)-6;
393	}
394	maxChars = lineChars;
395	for(;;){
396	int cchUTF8, utf32;
397	int useChars = 1;
398	char c = zLine[index];
399	/* Flush the output buffer if there's no space left for at least one more
400	** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces,
401	** a new line, and a terminating NULL. */
	@@ -393,48 +423,47 @@
423	if( c=='\n' ){
424	lineCnt++;
425	charCnt = 0;
426	useChars = 0;
427	}else if( c=='\t' ){
428	int sumWidth;
429	int nextIndex = comment_next_space(zLine, index, &sumWidth);
430	if( nextIndex<=0 || sumWidth>maxChars ){
431	break;
432	}
433	charCnt++;
434	useChars = COMMENT_TAB_WIDTH;
435	if( maxChars<useChars ){
436	zBuf[iBuf++] = ' ';
437	break;
438	}
439	}else if( wordBreak && fossil_isspace(c) ){
440	int sumWidth;
441	int nextIndex = comment_next_space(zLine, index, &sumWidth);
442	if( nextIndex<=0 || sumWidth>=maxChars ){
443	break;
444	}
445	charCnt++;
446	}else{
447	charCnt++;
448	}
449	assert( c!='\n' || charCnt==0 );
450	zBuf[iBuf++] = c;
451	char_info_utf8(&zLine[index-1],&cchUTF8,&utf32);










452	if( cchUTF8>1 ){
453	int wcwidth;
454	wcwidth = cli_wcwidth(utf32);
455	if( wcwidth>maxChars && lineChars>=wcwidth ){ /* rollback */
456	index--;
457	iBuf--;
458	zBuf[iBuf] = 0;
459	break;
460	}
461	for( ; cchUTF8>1; cchUTF8-- ){
462	zBuf[iBuf++] = zLine[index++];
463	}
464	useChars += wcwidth - 1;
465	}
466	maxChars -= useChars;
467	if( maxChars<=0 ) break;
468	if( c=='\n' ) break;
469	}
	@@ -476,11 +505,10 @@
505	int si, sk, i, k, kc;
506	int doIndent = 0;
507	char *zBuf;
508	char zBuffer[400];
509	int lineCnt = 0;

510
511	if( width<0 ){
512	comment_set_maxchars(indent, &maxChars);
513	}
514	if( zText==0 ) zText = "(NULL)";
	@@ -502,30 +530,25 @@
530	}
531	if( zBuf!=zBuffer) fossil_free(zBuf);
532	return lineCnt;
533	}
534	for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
535	int cchUTF8, utf32;
536	char c = zText[i];
537	kc++; /* Count complete UTF-8 sequences. */
538	char_info_utf8(&zText[i],&cchUTF8,&utf32);
539	if( cchUTF8>1 ){
540	int wcwidth;
541	wcwidth = cli_wcwidth(utf32);
542	if( kc+wcwidth-1>maxChars && maxChars>=wcwidth ){ /* rollback */
543	kc--;
544	break;
545	}
546	for( i--; cchUTF8>0; cchUTF8-- ){


547	zBuf[k++] = zText[++i];
548	}
549	kc += wcwidth - 1;




550	}
551	else if( fossil_isspace(c) ){
552	si = i;
553	sk = k;
554	if( k==0 || zBuf[k-1]!=' ' ){
555

Fossil SCM

Keyboard Shortcuts