Fossil SCM

Modify the comment formatter to avoid output of incomplete UTF-8 sequences, and to avoid line breaks inside UTF-8 sequences. See [https://fossil-scm.org/forum/forumpost/1247e4a3c4] for detailed information and tests.

florian 2018-10-17 14:16 UTC trunk

Commit 1bbca2c3f89b826d3350ca34a0e1a69a31180b72dcbece58f2714c87f7a8267e

Parent 35563f3db308ca3…

1 file changed +48 -4

~ src/comformat.c

M src/comformat.c

+48 -4

		--- src/comformat.c
		+++ src/comformat.c
		@@ -225,11 +225,35 @@
225	225	charCnt++;
226	226	}else{
227	227	charCnt++;
228	228	}
229	229	assert( c!='\n' || charCnt==0 );
230		- fossil_print("%c", c);
	230	+ /*
	231	+ ** Avoid output of incomplete UTF-8 sequences, and also avoid line breaks
	232	+ ** inside UTF-8 sequences. Incomplete, ill-formed and overlong sequences are
	233	+ ** kept together. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are
	234	+ ** allowed to initiate (ill-formed) 2- and 4-byte sequences, respectively,
	235	+ ** the other invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte
	236	+ ** sequences (as lone trail bytes).
	237	+ */
	238	+ if( (c&0xc0)==0xc0 && zLine[index]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
	239	+ char zUTF8[5]; /* Buffer to hold a UTF-8 sequence. */
	240	+ int cchUTF8=1; /* Code units consumed. */
	241	+ int maxUTF8=1; /* Expected sequence length. */
	242	+ zUTF8[0]=c;
	243	+ if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
	244	+ else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
	245	+ else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
	246	+ while( cchUTF8<maxUTF8 &&
	247	+ (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
	248	+ zUTF8[cchUTF8++] = zLine[index++];
	249	+ }
	250	+ zUTF8[cchUTF8]=0;
	251	+ fossil_print("%s", zUTF8);
	252	+ }
	253	+ else
	254	+ fossil_print("%c", c);
231	255	if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars;
232	256	if( maxChars<=0 ) break;
233	257	if( c=='\n' ) break;
234	258	}
235	259	if( charCnt>0 ){
		@@ -259,11 +283,11 @@
259	283	const char *zText, /* The comment text to be printed. */
260	284	int indent, /* Number of spaces to indent each non-initial line. */
261	285	int width /* Maximum number of characters per line. */
262	286	){
263	287	int maxChars = width - indent;
264		- int si, sk, i, k;
	288	+ int si, sk, i, k, kc;
265	289	int doIndent = 0;
266	290	char *zBuf;
267	291	char zBuffer[400];
268	292	int lineCnt = 0;
269	293
		@@ -287,13 +311,33 @@
287	311	lineCnt = 1;
288	312	}
289	313	if( zBuf!=zBuffer) fossil_free(zBuf);
290	314	return lineCnt;
291	315	}
292		- for(sk=si=i=k=0; zText[i] && k<maxChars; i++){
	316	+ for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
293	317	char c = zText[i];
294		- if( fossil_isspace(c) ){
	318	+ kc++; /* Count complete UTF-8 sequences. */
	319	+ /*
	320	+ ** Avoid line breaks inside UTF-8 sequences. Incomplete, ill-formed and
	321	+ ** overlong sequences are kept together. The invalid lead bytes 0xC0 to
	322	+ ** 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and
	323	+ ** 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to
	324	+ ** 0xFF are treated as invalid 1-byte sequences (as lone trail bytes).
	325	+ */
	326	+ if( (c&0xc0)==0xc0 && zText[i+1]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
	327	+ int cchUTF8=1; /* Code units consumed. */
	328	+ int maxUTF8=1; /* Expected sequence length. */
	329	+ if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
	330	+ else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
	331	+ else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
	332	+ zBuf[k++] = c;
	333	+ while( cchUTF8<maxUTF8 &&
	334	+ (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
	335	+ zBuf[k++] = zText[++i];
	336	+ }
	337	+ }
	338	+ else if( fossil_isspace(c) ){
295	339	si = i;
296	340	sk = k;
297	341	if( k==0 || zBuf[k-1]!=' ' ){
298	342	zBuf[k++] = ' ';
299	343	}
300	344

	--- src/comformat.c
	+++ src/comformat.c
	@@ -225,11 +225,35 @@
225	charCnt++;
226	}else{
227	charCnt++;
228	}
229	assert( c!='\n' || charCnt==0 );
230	fossil_print("%c", c);
























231	if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars;
232	if( maxChars<=0 ) break;
233	if( c=='\n' ) break;
234	}
235	if( charCnt>0 ){
	@@ -259,11 +283,11 @@
259	const char *zText, /* The comment text to be printed. */
260	int indent, /* Number of spaces to indent each non-initial line. */
261	int width /* Maximum number of characters per line. */
262	){
263	int maxChars = width - indent;
264	int si, sk, i, k;
265	int doIndent = 0;
266	char *zBuf;
267	char zBuffer[400];
268	int lineCnt = 0;
269
	@@ -287,13 +311,33 @@
287	lineCnt = 1;
288	}
289	if( zBuf!=zBuffer) fossil_free(zBuf);
290	return lineCnt;
291	}
292	for(sk=si=i=k=0; zText[i] && k<maxChars; i++){
293	char c = zText[i];
294	if( fossil_isspace(c) ){




















295	si = i;
296	sk = k;
297	if( k==0 || zBuf[k-1]!=' ' ){
298	zBuf[k++] = ' ';
299	}
300

	--- src/comformat.c
	+++ src/comformat.c
	@@ -225,11 +225,35 @@
225	charCnt++;
226	}else{
227	charCnt++;
228	}
229	assert( c!='\n' || charCnt==0 );
230	/*
231	** Avoid output of incomplete UTF-8 sequences, and also avoid line breaks
232	** inside UTF-8 sequences. Incomplete, ill-formed and overlong sequences are
233	** kept together. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are
234	** allowed to initiate (ill-formed) 2- and 4-byte sequences, respectively,
235	** the other invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte
236	** sequences (as lone trail bytes).
237	*/
238	if( (c&0xc0)==0xc0 && zLine[index]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
239	char zUTF8[5]; /* Buffer to hold a UTF-8 sequence. */
240	int cchUTF8=1; /* Code units consumed. */
241	int maxUTF8=1; /* Expected sequence length. */
242	zUTF8[0]=c;
243	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
244	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
245	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
246	while( cchUTF8<maxUTF8 &&
247	(zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
248	zUTF8[cchUTF8++] = zLine[index++];
249	}
250	zUTF8[cchUTF8]=0;
251	fossil_print("%s", zUTF8);
252	}
253	else
254	fossil_print("%c", c);
255	if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars;
256	if( maxChars<=0 ) break;
257	if( c=='\n' ) break;
258	}
259	if( charCnt>0 ){
	@@ -259,11 +283,11 @@
283	const char *zText, /* The comment text to be printed. */
284	int indent, /* Number of spaces to indent each non-initial line. */
285	int width /* Maximum number of characters per line. */
286	){
287	int maxChars = width - indent;
288	int si, sk, i, k, kc;
289	int doIndent = 0;
290	char *zBuf;
291	char zBuffer[400];
292	int lineCnt = 0;
293
	@@ -287,13 +311,33 @@
311	lineCnt = 1;
312	}
313	if( zBuf!=zBuffer) fossil_free(zBuf);
314	return lineCnt;
315	}
316	for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
317	char c = zText[i];
318	kc++; /* Count complete UTF-8 sequences. */
319	/*
320	** Avoid line breaks inside UTF-8 sequences. Incomplete, ill-formed and
321	** overlong sequences are kept together. The invalid lead bytes 0xC0 to
322	** 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and
323	** 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to
324	** 0xFF are treated as invalid 1-byte sequences (as lone trail bytes).
325	*/
326	if( (c&0xc0)==0xc0 && zText[i+1]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
327	int cchUTF8=1; /* Code units consumed. */
328	int maxUTF8=1; /* Expected sequence length. */
329	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
330	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
331	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
332	zBuf[k++] = c;
333	while( cchUTF8<maxUTF8 &&
334	(zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
335	zBuf[k++] = zText[++i];
336	}
337	}
338	else if( fossil_isspace(c) ){
339	si = i;
340	sk = k;
341	if( k==0 || zBuf[k-1]!=' ' ){
342	zBuf[k++] = ' ';
343	}
344

Fossil SCM

Keyboard Shortcuts