Fossil SCM

Minor optimizations: drop a few redundant comparisons and calculations, and take advantage of the logical AND short-circuit by testing the least expensive and most unlikely condition first. Also fold away the iterative comments into cross references.

florian 2018-11-24 07:49 UTC comment-formatter-utf8

Commit 490d38ff2e0079e76054bf207cb94de8ab1ecdb8

Parent b86a2fc7eb20968…

1 file changed +36 -55

~ src/comformat.c

M src/comformat.c

+36 -55

		--- src/comformat.c
		+++ src/comformat.c
		@@ -157,27 +157,24 @@
157	157	int strlen_utf8(const char *zString, int lengthBytes)
158	158	{
159	159	#if 0
160	160	assert( lengthBytes>=0 );
161	161	#endif
162		- int lengthUTF8=0; /* Counted UTF-8 sequences. */
163		- int i;
164		- for( i=0; i<lengthBytes; i++ ){
	162	+ int i; /* Counted bytes. */
	163	+ int lengthUTF8; /* Counted UTF-8 sequences. */
	164	+ for( i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++ ){
165	165	char c = zString[i];
166		- lengthUTF8++;
167		- if( (c&0xc0)==0xc0 ){ /* Any UTF-8 lead byte 11xxxxxx */
168		- int cchUTF8=1; /* Code units consumed. */
169		- int maxUTF8=1; /* Expected sequence length. */
170		- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
171		- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
172		- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
173		- while( i<lengthBytes-1 &&
174		- cchUTF8<maxUTF8 &&
175		- (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
176		- cchUTF8++;
177		- i++;
178		- }
	166	+ int cchUTF8=1; /* Code units consumed. */
	167	+ int maxUTF8=1; /* Expected sequence length. */
	168	+ if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
	169	+ else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
	170	+ else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
	171	+ while( cchUTF8<maxUTF8 &&
	172	+ i<lengthBytes-1 &&
	173	+ (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
	174	+ cchUTF8++;
	175	+ i++;
179	176	}
180	177	}
181	178	return lengthUTF8;
182	179	}
183	180
		@@ -223,10 +220,11 @@
223	220	int *pLineCnt, /* [in/out] Pointer to the total line count. */
224	221	const char **pzLine /* [out] Pointer to the end of the logical line. */
225	222	){
226	223	int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
227	224	char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
	225	+ int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
228	226	if( !zLine ) return;
229	227	if( lineChars<=0 ) return;
230	228	#if 0
231	229	assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
232	230	assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
		@@ -294,35 +292,23 @@
294	292	charCnt++;
295	293	}else{
296	294	charCnt++;
297	295	}
298	296	assert( c!='\n' || charCnt==0 );
299		- /*
300		- ** Avoid output of incomplete UTF-8 sequences, and also avoid line breaks
301		- ** inside UTF-8 sequences. Incomplete, ill-formed and overlong sequences are
302		- ** kept together. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are
303		- ** allowed to initiate (ill-formed) 2- and 4-byte sequences, respectively,
304		- ** the other invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte
305		- ** sequences (as lone trail bytes).
306		- */
307		- if( (c&0xc0)==0xc0 && zLine[index]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
308		- int cchUTF8=1; /* Code units consumed. */
309		- int maxUTF8=1; /* Expected sequence length. */
310		- zBuf[iBuf++]=c;
311		- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
312		- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
313		- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
314		- while( cchUTF8<maxUTF8 &&
315		- (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
316		- cchUTF8++;
317		- zBuf[iBuf++] = zLine[index++];
318		- }
319		- maxChars--;
320		- }else{
321		- zBuf[iBuf++] = c;
322		- maxChars -= useChars;
323		- }
	297	+ zBuf[iBuf++] = c;
	298	+ /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
	299	+ cchUTF8=1; /* Code units consumed. */
	300	+ maxUTF8=1; /* Expected sequence length. */
	301	+ if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
	302	+ else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
	303	+ else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
	304	+ while( cchUTF8<maxUTF8 &&
	305	+ (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
	306	+ cchUTF8++;
	307	+ zBuf[iBuf++] = zLine[index++];
	308	+ }
	309	+ maxChars -= useChars;
324	310	if( maxChars<=0 ) break;
325	311	if( c=='\n' ) break;
326	312	}
327	313	if( charCnt>0 ){
328	314	zBuf[iBuf++] = '\n';
		@@ -362,10 +348,11 @@
362	348	int si, sk, i, k, kc;
363	349	int doIndent = 0;
364	350	char *zBuf;
365	351	char zBuffer[400];
366	352	int lineCnt = 0;
	353	+ int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
367	354
368	355	if( width<0 ){
369	356	comment_set_maxchars(indent, &maxChars);
370	357	}
371	358	if( zText==0 ) zText = "(NULL)";
		@@ -389,26 +376,20 @@
389	376	return lineCnt;
390	377	}
391	378	for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
392	379	char c = zText[i];
393	380	kc++; /* Count complete UTF-8 sequences. */
394		- /*
395		- ** Avoid line breaks inside UTF-8 sequences. Incomplete, ill-formed and
396		- ** overlong sequences are kept together. The invalid lead bytes 0xC0 to
397		- ** 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and
398		- ** 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to
399		- ** 0xFF are treated as invalid 1-byte sequences (as lone trail bytes).
400		- */
401		- if( (c&0xc0)==0xc0 && zText[i+1]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
402		- int cchUTF8=1; /* Code units consumed. */
403		- int maxUTF8=1; /* Expected sequence length. */
404		- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
405		- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
406		- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
	381	+ /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
	382	+ cchUTF8=1; /* Code units consumed. */
	383	+ maxUTF8=1; /* Expected sequence length. */
	384	+ if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
	385	+ else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
	386	+ else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
	387	+ if( maxUTF8>1 ){
407	388	zBuf[k++] = c;
408	389	while( cchUTF8<maxUTF8 &&
409		- (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
	390	+ (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
410	391	cchUTF8++;
411	392	zBuf[k++] = zText[++i];
412	393	}
413	394	}
414	395	else if( fossil_isspace(c) ){
415	396

	--- src/comformat.c
	+++ src/comformat.c
	@@ -157,27 +157,24 @@
157	int strlen_utf8(const char *zString, int lengthBytes)
158	{
159	#if 0
160	assert( lengthBytes>=0 );
161	#endif
162	int lengthUTF8=0; /* Counted UTF-8 sequences. */
163	int i;
164	for( i=0; i<lengthBytes; i++ ){
165	char c = zString[i];
166	lengthUTF8++;
167	if( (c&0xc0)==0xc0 ){ /* Any UTF-8 lead byte 11xxxxxx */
168	int cchUTF8=1; /* Code units consumed. */
169	int maxUTF8=1; /* Expected sequence length. */
170	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
171	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
172	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
173	while( i<lengthBytes-1 &&
174	cchUTF8<maxUTF8 &&
175	(zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
176	cchUTF8++;
177	i++;
178	}
179	}
180	}
181	return lengthUTF8;
182	}
183
	@@ -223,10 +220,11 @@
223	int *pLineCnt, /* [in/out] Pointer to the total line count. */
224	const char **pzLine /* [out] Pointer to the end of the logical line. */
225	){
226	int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
227	char zBuf[400]; int iBuf=0; /* Output buffer and counter. */

228	if( !zLine ) return;
229	if( lineChars<=0 ) return;
230	#if 0
231	assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
232	assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
	@@ -294,35 +292,23 @@
294	charCnt++;
295	}else{
296	charCnt++;
297	}
298	assert( c!='\n' || charCnt==0 );
299	/*
300	** Avoid output of incomplete UTF-8 sequences, and also avoid line breaks
301	** inside UTF-8 sequences. Incomplete, ill-formed and overlong sequences are
302	** kept together. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are
303	** allowed to initiate (ill-formed) 2- and 4-byte sequences, respectively,
304	** the other invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte
305	** sequences (as lone trail bytes).
306	*/
307	if( (c&0xc0)==0xc0 && zLine[index]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
308	int cchUTF8=1; /* Code units consumed. */
309	int maxUTF8=1; /* Expected sequence length. */
310	zBuf[iBuf++]=c;
311	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
312	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
313	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
314	while( cchUTF8<maxUTF8 &&
315	(zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
316	cchUTF8++;
317	zBuf[iBuf++] = zLine[index++];
318	}
319	maxChars--;
320	}else{
321	zBuf[iBuf++] = c;
322	maxChars -= useChars;
323	}
324	if( maxChars<=0 ) break;
325	if( c=='\n' ) break;
326	}
327	if( charCnt>0 ){
328	zBuf[iBuf++] = '\n';
	@@ -362,10 +348,11 @@
362	int si, sk, i, k, kc;
363	int doIndent = 0;
364	char *zBuf;
365	char zBuffer[400];
366	int lineCnt = 0;

367
368	if( width<0 ){
369	comment_set_maxchars(indent, &maxChars);
370	}
371	if( zText==0 ) zText = "(NULL)";
	@@ -389,26 +376,20 @@
389	return lineCnt;
390	}
391	for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
392	char c = zText[i];
393	kc++; /* Count complete UTF-8 sequences. */
394	/*
395	** Avoid line breaks inside UTF-8 sequences. Incomplete, ill-formed and
396	** overlong sequences are kept together. The invalid lead bytes 0xC0 to
397	** 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and
398	** 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to
399	** 0xFF are treated as invalid 1-byte sequences (as lone trail bytes).
400	*/
401	if( (c&0xc0)==0xc0 && zText[i+1]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
402	int cchUTF8=1; /* Code units consumed. */
403	int maxUTF8=1; /* Expected sequence length. */
404	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
405	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
406	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
407	zBuf[k++] = c;
408	while( cchUTF8<maxUTF8 &&
409	(zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
410	cchUTF8++;
411	zBuf[k++] = zText[++i];
412	}
413	}
414	else if( fossil_isspace(c) ){
415

	--- src/comformat.c
	+++ src/comformat.c
	@@ -157,27 +157,24 @@
157	int strlen_utf8(const char *zString, int lengthBytes)
158	{
159	#if 0
160	assert( lengthBytes>=0 );
161	#endif
162	int i; /* Counted bytes. */
163	int lengthUTF8; /* Counted UTF-8 sequences. */
164	for( i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++ ){
165	char c = zString[i];
166	int cchUTF8=1; /* Code units consumed. */
167	int maxUTF8=1; /* Expected sequence length. */
168	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
169	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
170	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
171	while( cchUTF8<maxUTF8 &&
172	i<lengthBytes-1 &&
173	(zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
174	cchUTF8++;
175	i++;



176	}
177	}
178	return lengthUTF8;
179	}
180
	@@ -223,10 +220,11 @@
220	int *pLineCnt, /* [in/out] Pointer to the total line count. */
221	const char **pzLine /* [out] Pointer to the end of the logical line. */
222	){
223	int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
224	char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
225	int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
226	if( !zLine ) return;
227	if( lineChars<=0 ) return;
228	#if 0
229	assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
230	assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
	@@ -294,35 +292,23 @@
292	charCnt++;
293	}else{
294	charCnt++;
295	}
296	assert( c!='\n' || charCnt==0 );
297	zBuf[iBuf++] = c;
298	/* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
299	cchUTF8=1; /* Code units consumed. */
300	maxUTF8=1; /* Expected sequence length. */
301	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
302	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
303	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
304	while( cchUTF8<maxUTF8 &&
305	(zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
306	cchUTF8++;
307	zBuf[iBuf++] = zLine[index++];
308	}
309	maxChars -= useChars;












310	if( maxChars<=0 ) break;
311	if( c=='\n' ) break;
312	}
313	if( charCnt>0 ){
314	zBuf[iBuf++] = '\n';
	@@ -362,10 +348,11 @@
348	int si, sk, i, k, kc;
349	int doIndent = 0;
350	char *zBuf;
351	char zBuffer[400];
352	int lineCnt = 0;
353	int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
354
355	if( width<0 ){
356	comment_set_maxchars(indent, &maxChars);
357	}
358	if( zText==0 ) zText = "(NULL)";
	@@ -389,26 +376,20 @@
376	return lineCnt;
377	}
378	for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
379	char c = zText[i];
380	kc++; /* Count complete UTF-8 sequences. */
381	/* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
382	cchUTF8=1; /* Code units consumed. */
383	maxUTF8=1; /* Expected sequence length. */
384	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
385	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
386	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
387	if( maxUTF8>1 ){






388	zBuf[k++] = c;
389	while( cchUTF8<maxUTF8 &&
390	(zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
391	cchUTF8++;
392	zBuf[k++] = zText[++i];
393	}
394	}
395	else if( fossil_isspace(c) ){
396

Fossil SCM

Keyboard Shortcuts