Fossil SCM

Minor optimizations: drop a few redundant comparisons and calculations, and take advantage of the logical AND short-circuit by testing the least expensive and least likely condition first. Also fold the repeated explanatory comments into cross-references.

florian 2018-11-24 07:49 UTC comment-formatter-utf8
Commit 490d38ff2e0079e76054bf207cb94de8ab1ecdb8
1 file changed +36 -55
+36 -55
--- src/comformat.c
+++ src/comformat.c
@@ -157,27 +157,24 @@
157157
int strlen_utf8(const char *zString, int lengthBytes)
158158
{
159159
#if 0
160160
assert( lengthBytes>=0 );
161161
#endif
162
- int lengthUTF8=0; /* Counted UTF-8 sequences. */
163
- int i;
164
- for( i=0; i<lengthBytes; i++ ){
162
+ int i; /* Counted bytes. */
163
+ int lengthUTF8; /* Counted UTF-8 sequences. */
164
+ for( i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++ ){
165165
char c = zString[i];
166
- lengthUTF8++;
167
- if( (c&0xc0)==0xc0 ){ /* Any UTF-8 lead byte 11xxxxxx */
168
- int cchUTF8=1; /* Code units consumed. */
169
- int maxUTF8=1; /* Expected sequence length. */
170
- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
171
- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
172
- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
173
- while( i<lengthBytes-1 &&
174
- cchUTF8<maxUTF8 &&
175
- (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
176
- cchUTF8++;
177
- i++;
178
- }
166
+ int cchUTF8=1; /* Code units consumed. */
167
+ int maxUTF8=1; /* Expected sequence length. */
168
+ if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
169
+ else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
170
+ else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
171
+ while( cchUTF8<maxUTF8 &&
172
+ i<lengthBytes-1 &&
173
+ (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
174
+ cchUTF8++;
175
+ i++;
179176
}
180177
}
181178
return lengthUTF8;
182179
}
183180
@@ -223,10 +220,11 @@
223220
int *pLineCnt, /* [in/out] Pointer to the total line count. */
224221
const char **pzLine /* [out] Pointer to the end of the logical line. */
225222
){
226223
int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
227224
char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
225
+ int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
228226
if( !zLine ) return;
229227
if( lineChars<=0 ) return;
230228
#if 0
231229
assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
232230
assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
@@ -294,35 +292,23 @@
294292
charCnt++;
295293
}else{
296294
charCnt++;
297295
}
298296
assert( c!='\n' || charCnt==0 );
299
- /*
300
- ** Avoid output of incomplete UTF-8 sequences, and also avoid line breaks
301
- ** inside UTF-8 sequences. Incomplete, ill-formed and overlong sequences are
302
- ** kept together. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are
303
- ** allowed to initiate (ill-formed) 2- and 4-byte sequences, respectively,
304
- ** the other invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte
305
- ** sequences (as lone trail bytes).
306
- */
307
- if( (c&0xc0)==0xc0 && zLine[index]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
308
- int cchUTF8=1; /* Code units consumed. */
309
- int maxUTF8=1; /* Expected sequence length. */
310
- zBuf[iBuf++]=c;
311
- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
312
- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
313
- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
314
- while( cchUTF8<maxUTF8 &&
315
- (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
316
- cchUTF8++;
317
- zBuf[iBuf++] = zLine[index++];
318
- }
319
- maxChars--;
320
- }else{
321
- zBuf[iBuf++] = c;
322
- maxChars -= useChars;
323
- }
297
+ zBuf[iBuf++] = c;
298
+ /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
299
+ cchUTF8=1; /* Code units consumed. */
300
+ maxUTF8=1; /* Expected sequence length. */
301
+ if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
302
+ else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
303
+ else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
304
+ while( cchUTF8<maxUTF8 &&
305
+ (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
306
+ cchUTF8++;
307
+ zBuf[iBuf++] = zLine[index++];
308
+ }
309
+ maxChars -= useChars;
324310
if( maxChars<=0 ) break;
325311
if( c=='\n' ) break;
326312
}
327313
if( charCnt>0 ){
328314
zBuf[iBuf++] = '\n';
@@ -362,10 +348,11 @@
362348
int si, sk, i, k, kc;
363349
int doIndent = 0;
364350
char *zBuf;
365351
char zBuffer[400];
366352
int lineCnt = 0;
353
+ int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
367354
368355
if( width<0 ){
369356
comment_set_maxchars(indent, &maxChars);
370357
}
371358
if( zText==0 ) zText = "(NULL)";
@@ -389,26 +376,20 @@
389376
return lineCnt;
390377
}
391378
for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
392379
char c = zText[i];
393380
kc++; /* Count complete UTF-8 sequences. */
394
- /*
395
- ** Avoid line breaks inside UTF-8 sequences. Incomplete, ill-formed and
396
- ** overlong sequences are kept together. The invalid lead bytes 0xC0 to
397
- ** 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and
398
- ** 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to
399
- ** 0xFF are treated as invalid 1-byte sequences (as lone trail bytes).
400
- */
401
- if( (c&0xc0)==0xc0 && zText[i+1]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
402
- int cchUTF8=1; /* Code units consumed. */
403
- int maxUTF8=1; /* Expected sequence length. */
404
- if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
405
- else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
406
- else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
381
+ /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
382
+ cchUTF8=1; /* Code units consumed. */
383
+ maxUTF8=1; /* Expected sequence length. */
384
+ if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
385
+ else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
386
+ else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
387
+ if( maxUTF8>1 ){
407388
zBuf[k++] = c;
408389
while( cchUTF8<maxUTF8 &&
409
- (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
390
+ (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
410391
cchUTF8++;
411392
zBuf[k++] = zText[++i];
412393
}
413394
}
414395
else if( fossil_isspace(c) ){
415396
--- src/comformat.c
+++ src/comformat.c
@@ -157,27 +157,24 @@
157 int strlen_utf8(const char *zString, int lengthBytes)
158 {
159 #if 0
160 assert( lengthBytes>=0 );
161 #endif
162 int lengthUTF8=0; /* Counted UTF-8 sequences. */
163 int i;
164 for( i=0; i<lengthBytes; i++ ){
165 char c = zString[i];
166 lengthUTF8++;
167 if( (c&0xc0)==0xc0 ){ /* Any UTF-8 lead byte 11xxxxxx */
168 int cchUTF8=1; /* Code units consumed. */
169 int maxUTF8=1; /* Expected sequence length. */
170 if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
171 else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
172 else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
173 while( i<lengthBytes-1 &&
174 cchUTF8<maxUTF8 &&
175 (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
176 cchUTF8++;
177 i++;
178 }
179 }
180 }
181 return lengthUTF8;
182 }
183
@@ -223,10 +220,11 @@
223 int *pLineCnt, /* [in/out] Pointer to the total line count. */
224 const char **pzLine /* [out] Pointer to the end of the logical line. */
225 ){
226 int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
227 char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
 
228 if( !zLine ) return;
229 if( lineChars<=0 ) return;
230 #if 0
231 assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
232 assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
@@ -294,35 +292,23 @@
294 charCnt++;
295 }else{
296 charCnt++;
297 }
298 assert( c!='\n' || charCnt==0 );
299 /*
300 ** Avoid output of incomplete UTF-8 sequences, and also avoid line breaks
301 ** inside UTF-8 sequences. Incomplete, ill-formed and overlong sequences are
302 ** kept together. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are
303 ** allowed to initiate (ill-formed) 2- and 4-byte sequences, respectively,
304 ** the other invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte
305 ** sequences (as lone trail bytes).
306 */
307 if( (c&0xc0)==0xc0 && zLine[index]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
308 int cchUTF8=1; /* Code units consumed. */
309 int maxUTF8=1; /* Expected sequence length. */
310 zBuf[iBuf++]=c;
311 if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
312 else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
313 else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
314 while( cchUTF8<maxUTF8 &&
315 (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
316 cchUTF8++;
317 zBuf[iBuf++] = zLine[index++];
318 }
319 maxChars--;
320 }else{
321 zBuf[iBuf++] = c;
322 maxChars -= useChars;
323 }
324 if( maxChars<=0 ) break;
325 if( c=='\n' ) break;
326 }
327 if( charCnt>0 ){
328 zBuf[iBuf++] = '\n';
@@ -362,10 +348,11 @@
362 int si, sk, i, k, kc;
363 int doIndent = 0;
364 char *zBuf;
365 char zBuffer[400];
366 int lineCnt = 0;
 
367
368 if( width<0 ){
369 comment_set_maxchars(indent, &maxChars);
370 }
371 if( zText==0 ) zText = "(NULL)";
@@ -389,26 +376,20 @@
389 return lineCnt;
390 }
391 for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
392 char c = zText[i];
393 kc++; /* Count complete UTF-8 sequences. */
394 /*
395 ** Avoid line breaks inside UTF-8 sequences. Incomplete, ill-formed and
396 ** overlong sequences are kept together. The invalid lead bytes 0xC0 to
397 ** 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and
398 ** 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to
399 ** 0xFF are treated as invalid 1-byte sequences (as lone trail bytes).
400 */
401 if( (c&0xc0)==0xc0 && zText[i+1]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
402 int cchUTF8=1; /* Code units consumed. */
403 int maxUTF8=1; /* Expected sequence length. */
404 if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
405 else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
406 else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
407 zBuf[k++] = c;
408 while( cchUTF8<maxUTF8 &&
409 (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
410 cchUTF8++;
411 zBuf[k++] = zText[++i];
412 }
413 }
414 else if( fossil_isspace(c) ){
415
--- src/comformat.c
+++ src/comformat.c
@@ -157,27 +157,24 @@
157 int strlen_utf8(const char *zString, int lengthBytes)
158 {
159 #if 0
160 assert( lengthBytes>=0 );
161 #endif
162 int i; /* Counted bytes. */
163 int lengthUTF8; /* Counted UTF-8 sequences. */
164 for( i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++ ){
165 char c = zString[i];
166 int cchUTF8=1; /* Code units consumed. */
167 int maxUTF8=1; /* Expected sequence length. */
168 if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
169 else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
170 else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
171 while( cchUTF8<maxUTF8 &&
172 i<lengthBytes-1 &&
173 (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
174 cchUTF8++;
175 i++;
 
 
 
176 }
177 }
178 return lengthUTF8;
179 }
180
@@ -223,10 +220,11 @@
220 int *pLineCnt, /* [in/out] Pointer to the total line count. */
221 const char **pzLine /* [out] Pointer to the end of the logical line. */
222 ){
223 int index = 0, charCnt = 0, lineCnt = 0, maxChars, i;
224 char zBuf[400]; int iBuf=0; /* Output buffer and counter. */
225 int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
226 if( !zLine ) return;
227 if( lineChars<=0 ) return;
228 #if 0
229 assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */
230 assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */
@@ -294,35 +292,23 @@
292 charCnt++;
293 }else{
294 charCnt++;
295 }
296 assert( c!='\n' || charCnt==0 );
297 zBuf[iBuf++] = c;
298 /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
299 cchUTF8=1; /* Code units consumed. */
300 maxUTF8=1; /* Expected sequence length. */
301 if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
302 else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
303 else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
304 while( cchUTF8<maxUTF8 &&
305 (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
306 cchUTF8++;
307 zBuf[iBuf++] = zLine[index++];
308 }
309 maxChars -= useChars;
 
 
 
 
 
 
 
 
 
 
 
 
310 if( maxChars<=0 ) break;
311 if( c=='\n' ) break;
312 }
313 if( charCnt>0 ){
314 zBuf[iBuf++] = '\n';
@@ -362,10 +348,11 @@
348 int si, sk, i, k, kc;
349 int doIndent = 0;
350 char *zBuf;
351 char zBuffer[400];
352 int lineCnt = 0;
353 int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */
354
355 if( width<0 ){
356 comment_set_maxchars(indent, &maxChars);
357 }
358 if( zText==0 ) zText = "(NULL)";
@@ -389,26 +376,20 @@
376 return lineCnt;
377 }
378 for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
379 char c = zText[i];
380 kc++; /* Count complete UTF-8 sequences. */
381 /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */
382 cchUTF8=1; /* Code units consumed. */
383 maxUTF8=1; /* Expected sequence length. */
384 if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
385 else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
386 else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
387 if( maxUTF8>1 ){
 
 
 
 
 
 
388 zBuf[k++] = c;
389 while( cchUTF8<maxUTF8 &&
390 (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
391 cchUTF8++;
392 zBuf[k++] = zText[++i];
393 }
394 }
395 else if( fossil_isspace(c) ){
396

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button