Fossil SCM

Modify the comment formatter to avoid output of incomplete UTF-8 sequences, and to avoid line breaks inside UTF-8 sequences. See [https://fossil-scm.org/forum/forumpost/1247e4a3c4] for detailed information and tests.

florian 2018-10-17 14:16 UTC trunk
Commit 1bbca2c3f89b826d3350ca34a0e1a69a31180b72dcbece58f2714c87f7a8267e
1 file changed +48 -4
+48 -4
--- src/comformat.c
+++ src/comformat.c
@@ -225,11 +225,35 @@
225225
charCnt++;
226226
}else{
227227
charCnt++;
228228
}
229229
assert( c!='\n' || charCnt==0 );
230
- fossil_print("%c", c);
230
+ /*
231
+ ** Avoid output of incomplete UTF-8 sequences, and also avoid line breaks
232
+ ** inside UTF-8 sequences. Incomplete, ill-formed and overlong sequences are
233
+ ** kept together. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are
234
+ ** allowed to initiate (ill-formed) 2- and 4-byte sequences, respectively,
235
+ ** the other invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte
236
+ ** sequences (as lone trail bytes).
237
+ */
238
+ if( (c&0xc0)==0xc0 && zLine[index]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
239
+ char zUTF8[5]; /* Buffer to hold a UTF-8 sequence. */
240
+ int cchUTF8=1; /* Code units consumed. */
241
+ int maxUTF8=1; /* Expected sequence length. */
242
+ zUTF8[0]=c;
243
+ if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
244
+ else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
245
+ else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
246
+ while( cchUTF8<maxUTF8 &&
247
+ (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
248
+ zUTF8[cchUTF8++] = zLine[index++];
249
+ }
250
+ zUTF8[cchUTF8]=0;
251
+ fossil_print("%s", zUTF8);
252
+ }
253
+ else
254
+ fossil_print("%c", c);
231255
if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars;
232256
if( maxChars<=0 ) break;
233257
if( c=='\n' ) break;
234258
}
235259
if( charCnt>0 ){
@@ -259,11 +283,11 @@
259283
const char *zText, /* The comment text to be printed. */
260284
int indent, /* Number of spaces to indent each non-initial line. */
261285
int width /* Maximum number of characters per line. */
262286
){
263287
int maxChars = width - indent;
264
- int si, sk, i, k;
288
+ int si, sk, i, k, kc;
265289
int doIndent = 0;
266290
char *zBuf;
267291
char zBuffer[400];
268292
int lineCnt = 0;
269293
@@ -287,13 +311,33 @@
287311
lineCnt = 1;
288312
}
289313
if( zBuf!=zBuffer) fossil_free(zBuf);
290314
return lineCnt;
291315
}
292
- for(sk=si=i=k=0; zText[i] && k<maxChars; i++){
316
+ for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
293317
char c = zText[i];
294
- if( fossil_isspace(c) ){
318
+ kc++; /* Count complete UTF-8 sequences. */
319
+ /*
320
+ ** Avoid line breaks inside UTF-8 sequences. Incomplete, ill-formed and
321
+ ** overlong sequences are kept together. The invalid lead bytes 0xC0 to
322
+ ** 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and
323
+ ** 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to
324
+ ** 0xFF are treated as invalid 1-byte sequences (as lone trail bytes).
325
+ */
326
+ if( (c&0xc0)==0xc0 && zText[i+1]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
327
+ int cchUTF8=1; /* Code units consumed. */
328
+ int maxUTF8=1; /* Expected sequence length. */
329
+ if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
330
+ else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
331
+ else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
332
+ zBuf[k++] = c;
333
+ while( cchUTF8<maxUTF8 &&
334
+ (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
335
+ zBuf[k++] = zText[++i];
336
+ }
337
+ }
338
+ else if( fossil_isspace(c) ){
295339
si = i;
296340
sk = k;
297341
if( k==0 || zBuf[k-1]!=' ' ){
298342
zBuf[k++] = ' ';
299343
}
300344
--- src/comformat.c
+++ src/comformat.c
@@ -225,11 +225,35 @@
225 charCnt++;
226 }else{
227 charCnt++;
228 }
229 assert( c!='\n' || charCnt==0 );
230 fossil_print("%c", c);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231 if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars;
232 if( maxChars<=0 ) break;
233 if( c=='\n' ) break;
234 }
235 if( charCnt>0 ){
@@ -259,11 +283,11 @@
259 const char *zText, /* The comment text to be printed. */
260 int indent, /* Number of spaces to indent each non-initial line. */
261 int width /* Maximum number of characters per line. */
262 ){
263 int maxChars = width - indent;
264 int si, sk, i, k;
265 int doIndent = 0;
266 char *zBuf;
267 char zBuffer[400];
268 int lineCnt = 0;
269
@@ -287,13 +311,33 @@
287 lineCnt = 1;
288 }
289 if( zBuf!=zBuffer) fossil_free(zBuf);
290 return lineCnt;
291 }
292 for(sk=si=i=k=0; zText[i] && k<maxChars; i++){
293 char c = zText[i];
294 if( fossil_isspace(c) ){
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295 si = i;
296 sk = k;
297 if( k==0 || zBuf[k-1]!=' ' ){
298 zBuf[k++] = ' ';
299 }
300
--- src/comformat.c
+++ src/comformat.c
@@ -225,11 +225,35 @@
225 charCnt++;
226 }else{
227 charCnt++;
228 }
229 assert( c!='\n' || charCnt==0 );
230 /*
231 ** Avoid output of incomplete UTF-8 sequences, and also avoid line breaks
232 ** inside UTF-8 sequences. Incomplete, ill-formed and overlong sequences are
233 ** kept together. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are
234 ** allowed to initiate (ill-formed) 2- and 4-byte sequences, respectively,
235 ** the other invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte
236 ** sequences (as lone trail bytes).
237 */
238 if( (c&0xc0)==0xc0 && zLine[index]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
239 char zUTF8[5]; /* Buffer to hold a UTF-8 sequence. */
240 int cchUTF8=1; /* Code units consumed. */
241 int maxUTF8=1; /* Expected sequence length. */
242 zUTF8[0]=c;
243 if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
244 else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
245 else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
246 while( cchUTF8<maxUTF8 &&
247 (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
248 zUTF8[cchUTF8++] = zLine[index++];
249 }
250 zUTF8[cchUTF8]=0;
251 fossil_print("%s", zUTF8);
252 }
253 else
254 fossil_print("%c", c);
255 if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars;
256 if( maxChars<=0 ) break;
257 if( c=='\n' ) break;
258 }
259 if( charCnt>0 ){
@@ -259,11 +283,11 @@
283 const char *zText, /* The comment text to be printed. */
284 int indent, /* Number of spaces to indent each non-initial line. */
285 int width /* Maximum number of characters per line. */
286 ){
287 int maxChars = width - indent;
288 int si, sk, i, k, kc;
289 int doIndent = 0;
290 char *zBuf;
291 char zBuffer[400];
292 int lineCnt = 0;
293
@@ -287,13 +311,33 @@
311 lineCnt = 1;
312 }
313 if( zBuf!=zBuffer) fossil_free(zBuf);
314 return lineCnt;
315 }
316 for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){
317 char c = zText[i];
318 kc++; /* Count complete UTF-8 sequences. */
319 /*
320 ** Avoid line breaks inside UTF-8 sequences. Incomplete, ill-formed and
321 ** overlong sequences are kept together. The invalid lead bytes 0xC0 to
322 ** 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and
323 ** 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to
324 ** 0xFF are treated as invalid 1-byte sequences (as lone trail bytes).
325 */
326 if( (c&0xc0)==0xc0 && zText[i+1]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */
327 int cchUTF8=1; /* Code units consumed. */
328 int maxUTF8=1; /* Expected sequence length. */
329 if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
330 else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
331 else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
332 zBuf[k++] = c;
333 while( cchUTF8<maxUTF8 &&
334 (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
335 zBuf[k++] = zText[++i];
336 }
337 }
338 else if( fossil_isspace(c) ){
339 si = i;
340 sk = k;
341 if( k==0 || zBuf[k-1]!=' ' ){
342 zBuf[k++] = ' ';
343 }
344

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button