Fossil SCM

For better word breaking results with the (non-legacy) comment printing algorithm, make sure the lookahead to the next space character is UTF-8-aware. Also make sure the per-line remaining character count is decremented properly for UTF-8 sequences. The neuralgic points now handle UTF-8 sequences correctly, and they could be enhanced to work with the effective display width, if required (to handle combining characters, and East Asian Wide and Fullwidth characters).

florian 2018-11-16 11:14 UTC comment-formatter-utf8

Commit c9ec3d1886367b546a37e674df1bff9913d8664a

Parent 29d3a2ed4ee03c7…

1 file changed +56 -7

~ src/comformat.c

M src/comformat.c

+56 -7

		--- src/comformat.c
		+++ src/comformat.c
		@@ -120,22 +120,67 @@
120	120	** zero if such a character cannot be found. For the purposes of this
121	121	** algorithm, the NUL character is treated the same as a spacing character.
122	122	*/
123	123	static int comment_next_space(
124	124	const char *zLine, /* [in] The comment line being printed. */
125		- int index /* [in] The current character index being handled. */
	125	+ int index, /* [in] The current character index being handled. */
	126	+ int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */
126	127	){
127	128	int nextIndex = index + 1;
	129	+ int fNonASCII=0;
128	130	for(;;){
129	131	char c = zLine[nextIndex];
	132	+ if ( (c&0x80)==0x80 ) fNonASCII=1;
130	133	if( c==0 || fossil_isspace(c) ){
	134	+ if ( distUTF8 ){
	135	+ if ( fNonASCII!=0 ){
	136	+ *distUTF8 = strlen_utf8(&zLine[index], nextIndex-index);
	137	+ }else{
	138	+ *distUTF8 = nextIndex-index;
	139	+ }
	140	+ }
131	141	return nextIndex;
132	142	}
133	143	nextIndex++;
134	144	}
135	145	return 0; /* NOT REACHED */
136	146	}
	147	+
	148	+/*
	149	+** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and
	150	+** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0
	151	+** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte
	152	+** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are
	153	+** treated as invalid 1-byte sequences (as lone trail bytes).
	154	+** Combining characters and East Asian Wide and Fullwidth characters are counted
	155	+** as one, so this function does not calculate the effective "display width".
	156	+*/
	157	+int strlen_utf8(const char *zString, int lengthBytes)
	158	+{
	159	+#if 0
	160	+ assert( lengthBytes>=0 );
	161	+#endif
	162	+ int lengthUTF8=0; /* Counted UTF-8 sequences. */
	163	+ int i;
	164	+ for( i=0; i<lengthBytes; i++ ){
	165	+ char c = zString[i];
	166	+ lengthUTF8++;
	167	+ if ( (c&0xc0)==0xc0 ){ /* Any UTF-8 lead byte 11xxxxxx */
	168	+ int cchUTF8=1; /* Code units consumed. */
	169	+ int maxUTF8=1; /* Expected sequence length. */
	170	+ if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
	171	+ else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
	172	+ else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
	173	+ while( i<lengthBytes-1 &&
	174	+ cchUTF8<maxUTF8 &&
	175	+ (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
	176	+ i++;
	177	+ }
	178	+ }
	179	+ }
	180	+ return lengthUTF8;
	181	+}
137	182
138	183	/*
139	184	** This function is called when printing a logical comment line to calculate
140	185	** the necessary indenting. The caller needs to emit the indenting spaces.
141	186	*/
		@@ -227,23 +272,25 @@
227	272	if( c=='\n' ){
228	273	lineCnt++;
229	274	charCnt = 0;
230	275	useChars = 0;
231	276	}else if( c=='\t' ){
232		- int nextIndex = comment_next_space(zLine, index);
233		- if( nextIndex<=0 || (nextIndex-index)>maxChars ){
	277	+ int distUTF8;
	278	+ int nextIndex = comment_next_space(zLine, index, &distUTF8);
	279	+ if( nextIndex<=0 || distUTF8>maxChars ){
234	280	break;
235	281	}
236	282	charCnt++;
237	283	useChars = COMMENT_TAB_WIDTH;
238	284	if( maxChars<useChars ){
239	285	zBuf[iBuf++] = ' ';
240	286	break;
241	287	}
242	288	}else if( wordBreak && fossil_isspace(c) ){
243		- int nextIndex = comment_next_space(zLine, index);
244		- if( nextIndex<=0 || (nextIndex-index)>maxChars ){
	289	+ int distUTF8;
	290	+ int nextIndex = comment_next_space(zLine, index, &distUTF8);
	291	+ if( nextIndex<=0 || distUTF8>maxChars ){
245	292	break;
246	293	}
247	294	charCnt++;
248	295	}else{
249	296	charCnt++;
		@@ -267,14 +314,16 @@
267	314	while( cchUTF8<maxUTF8 &&
268	315	(zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
269	316	cchUTF8++;
270	317	zBuf[iBuf++] = zLine[index++];
271	318	}
	319	+ maxChars--;
272	320	}
273		- else
	321	+ else {
274	322	zBuf[iBuf++] = c;
275		- if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars;
	323	+ maxChars -= useChars;
	324	+ }
276	325	if( maxChars<=0 ) break;
277	326	if( c=='\n' ) break;
278	327	}
279	328	if( charCnt>0 ){
280	329	zBuf[iBuf++] = '\n';
281	330

	--- src/comformat.c
	+++ src/comformat.c
	@@ -120,22 +120,67 @@
120	** zero if such a character cannot be found. For the purposes of this
121	** algorithm, the NUL character is treated the same as a spacing character.
122	*/
123	static int comment_next_space(
124	const char *zLine, /* [in] The comment line being printed. */
125	int index /* [in] The current character index being handled. */

126	){
127	int nextIndex = index + 1;

128	for(;;){
129	char c = zLine[nextIndex];

130	if( c==0 || fossil_isspace(c) ){







131	return nextIndex;
132	}
133	nextIndex++;
134	}
135	return 0; /* NOT REACHED */
136	}



































137
138	/*
139	** This function is called when printing a logical comment line to calculate
140	** the necessary indenting. The caller needs to emit the indenting spaces.
141	*/
	@@ -227,23 +272,25 @@
227	if( c=='\n' ){
228	lineCnt++;
229	charCnt = 0;
230	useChars = 0;
231	}else if( c=='\t' ){
232	int nextIndex = comment_next_space(zLine, index);
233	if( nextIndex<=0 || (nextIndex-index)>maxChars ){

234	break;
235	}
236	charCnt++;
237	useChars = COMMENT_TAB_WIDTH;
238	if( maxChars<useChars ){
239	zBuf[iBuf++] = ' ';
240	break;
241	}
242	}else if( wordBreak && fossil_isspace(c) ){
243	int nextIndex = comment_next_space(zLine, index);
244	if( nextIndex<=0 || (nextIndex-index)>maxChars ){

245	break;
246	}
247	charCnt++;
248	}else{
249	charCnt++;
	@@ -267,14 +314,16 @@
267	while( cchUTF8<maxUTF8 &&
268	(zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
269	cchUTF8++;
270	zBuf[iBuf++] = zLine[index++];
271	}

272	}
273	else
274	zBuf[iBuf++] = c;
275	if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars;

276	if( maxChars<=0 ) break;
277	if( c=='\n' ) break;
278	}
279	if( charCnt>0 ){
280	zBuf[iBuf++] = '\n';
281

	--- src/comformat.c
	+++ src/comformat.c
	@@ -120,22 +120,67 @@
120	** zero if such a character cannot be found. For the purposes of this
121	** algorithm, the NUL character is treated the same as a spacing character.
122	*/
123	static int comment_next_space(
124	const char *zLine, /* [in] The comment line being printed. */
125	int index, /* [in] The current character index being handled. */
126	int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */
127	){
128	int nextIndex = index + 1;
129	int fNonASCII=0;
130	for(;;){
131	char c = zLine[nextIndex];
132	if ( (c&0x80)==0x80 ) fNonASCII=1;
133	if( c==0 || fossil_isspace(c) ){
134	if ( distUTF8 ){
135	if ( fNonASCII!=0 ){
136	*distUTF8 = strlen_utf8(&zLine[index], nextIndex-index);
137	}else{
138	*distUTF8 = nextIndex-index;
139	}
140	}
141	return nextIndex;
142	}
143	nextIndex++;
144	}
145	return 0; /* NOT REACHED */
146	}
147
148	/*
149	** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and
150	** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0
151	** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte
152	** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are
153	** treated as invalid 1-byte sequences (as lone trail bytes).
154	** Combining characters and East Asian Wide and Fullwidth characters are counted
155	** as one, so this function does not calculate the effective "display width".
156	*/
157	int strlen_utf8(const char *zString, int lengthBytes)
158	{
159	#if 0
160	assert( lengthBytes>=0 );
161	#endif
162	int lengthUTF8=0; /* Counted UTF-8 sequences. */
163	int i;
164	for( i=0; i<lengthBytes; i++ ){
165	char c = zString[i];
166	lengthUTF8++;
167	if ( (c&0xc0)==0xc0 ){ /* Any UTF-8 lead byte 11xxxxxx */
168	int cchUTF8=1; /* Code units consumed. */
169	int maxUTF8=1; /* Expected sequence length. */
170	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
171	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
172	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
173	while( i<lengthBytes-1 &&
174	cchUTF8<maxUTF8 &&
175	(zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
176	i++;
177	}
178	}
179	}
180	return lengthUTF8;
181	}
182
183	/*
184	** This function is called when printing a logical comment line to calculate
185	** the necessary indenting. The caller needs to emit the indenting spaces.
186	*/
	@@ -227,23 +272,25 @@
272	if( c=='\n' ){
273	lineCnt++;
274	charCnt = 0;
275	useChars = 0;
276	}else if( c=='\t' ){
277	int distUTF8;
278	int nextIndex = comment_next_space(zLine, index, &distUTF8);
279	if( nextIndex<=0 || distUTF8>maxChars ){
280	break;
281	}
282	charCnt++;
283	useChars = COMMENT_TAB_WIDTH;
284	if( maxChars<useChars ){
285	zBuf[iBuf++] = ' ';
286	break;
287	}
288	}else if( wordBreak && fossil_isspace(c) ){
289	int distUTF8;
290	int nextIndex = comment_next_space(zLine, index, &distUTF8);
291	if( nextIndex<=0 || distUTF8>maxChars ){
292	break;
293	}
294	charCnt++;
295	}else{
296	charCnt++;
	@@ -267,14 +314,16 @@
314	while( cchUTF8<maxUTF8 &&
315	(zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
316	cchUTF8++;
317	zBuf[iBuf++] = zLine[index++];
318	}
319	maxChars--;
320	}
321	else {
322	zBuf[iBuf++] = c;
323	maxChars -= useChars;
324	}
325	if( maxChars<=0 ) break;
326	if( c=='\n' ) break;
327	}
328	if( charCnt>0 ){
329	zBuf[iBuf++] = '\n';
330

Fossil SCM

Keyboard Shortcuts