Fossil SCM
Modify the comment formatter to avoid output of incomplete UTF-8 sequences, and to avoid line breaks inside UTF-8 sequences. See [https://fossil-scm.org/forum/forumpost/1247e4a3c4] for detailed information and tests.
Commit
1bbca2c3f89b826d3350ca34a0e1a69a31180b72dcbece58f2714c87f7a8267e
Parent
35563f3db308ca3…
1 file changed
+48
-4
+48
-4
| --- src/comformat.c | ||
| +++ src/comformat.c | ||
| @@ -225,11 +225,35 @@ | ||
| 225 | 225 | charCnt++; |
| 226 | 226 | }else{ |
| 227 | 227 | charCnt++; |
| 228 | 228 | } |
| 229 | 229 | assert( c!='\n' || charCnt==0 ); |
| 230 | - fossil_print("%c", c); | |
| 230 | + /* | |
| 231 | + ** Avoid output of incomplete UTF-8 sequences, and also avoid line breaks | |
| 232 | + ** inside UTF-8 sequences. Incomplete, ill-formed and overlong sequences are | |
| 233 | + ** kept together. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are | |
| 234 | + ** allowed to initiate (ill-formed) 2- and 4-byte sequences, respectively, | |
| 235 | + ** the other invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte | |
| 236 | + ** sequences (as lone trail bytes). | |
| 237 | + */ | |
| 238 | + if( (c&0xc0)==0xc0 && zLine[index]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */ | |
| 239 | + char zUTF8[5]; /* Buffer to hold a UTF-8 sequence. */ | |
| 240 | + int cchUTF8=1; /* Code units consumed. */ | |
| 241 | + int maxUTF8=1; /* Expected sequence length. */ | |
| 242 | + zUTF8[0]=c; | |
| 243 | + if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ | |
| 244 | + else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ | |
| 245 | + else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ | |
| 246 | + while( cchUTF8<maxUTF8 && | |
| 247 | + (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ | |
| 248 | + zUTF8[cchUTF8++] = zLine[index++]; | |
| 249 | + } | |
| 250 | + zUTF8[cchUTF8]=0; | |
| 251 | + fossil_print("%s", zUTF8); | |
| 252 | + } | |
| 253 | + else | |
| 254 | + fossil_print("%c", c); | |
| 231 | 255 | if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars; |
| 232 | 256 | if( maxChars<=0 ) break; |
| 233 | 257 | if( c=='\n' ) break; |
| 234 | 258 | } |
| 235 | 259 | if( charCnt>0 ){ |
| @@ -259,11 +283,11 @@ | ||
| 259 | 283 | const char *zText, /* The comment text to be printed. */ |
| 260 | 284 | int indent, /* Number of spaces to indent each non-initial line. */ |
| 261 | 285 | int width /* Maximum number of characters per line. */ |
| 262 | 286 | ){ |
| 263 | 287 | int maxChars = width - indent; |
| 264 | - int si, sk, i, k; | |
| 288 | + int si, sk, i, k, kc; | |
| 265 | 289 | int doIndent = 0; |
| 266 | 290 | char *zBuf; |
| 267 | 291 | char zBuffer[400]; |
| 268 | 292 | int lineCnt = 0; |
| 269 | 293 | |
| @@ -287,13 +311,33 @@ | ||
| 287 | 311 | lineCnt = 1; |
| 288 | 312 | } |
| 289 | 313 | if( zBuf!=zBuffer) fossil_free(zBuf); |
| 290 | 314 | return lineCnt; |
| 291 | 315 | } |
| 292 | - for(sk=si=i=k=0; zText[i] && k<maxChars; i++){ | |
| 316 | + for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){ | |
| 293 | 317 | char c = zText[i]; |
| 294 | - if( fossil_isspace(c) ){ | |
| 318 | + kc++; /* Count complete UTF-8 sequences. */ | |
| 319 | + /* | |
| 320 | + ** Avoid line breaks inside UTF-8 sequences. Incomplete, ill-formed and | |
| 321 | + ** overlong sequences are kept together. The invalid lead bytes 0xC0 to | |
| 322 | + ** 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and | |
| 323 | + ** 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to | |
| 324 | + ** 0xFF are treated as invalid 1-byte sequences (as lone trail bytes). | |
| 325 | + */ | |
| 326 | + if( (c&0xc0)==0xc0 && zText[i+1]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */ | |
| 327 | + int cchUTF8=1; /* Code units consumed. */ | |
| 328 | + int maxUTF8=1; /* Expected sequence length. */ | |
| 329 | + if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ | |
| 330 | + else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ | |
| 331 | + else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ | |
| 332 | + zBuf[k++] = c; | |
| 333 | + while( cchUTF8<maxUTF8 && | |
| 334 | + (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ | |
| 335 | + zBuf[k++] = zText[++i]; | |
| 336 | + } | |
| 337 | + } | |
| 338 | + else if( fossil_isspace(c) ){ | |
| 295 | 339 | si = i; |
| 296 | 340 | sk = k; |
| 297 | 341 | if( k==0 || zBuf[k-1]!=' ' ){ |
| 298 | 342 | zBuf[k++] = ' '; |
| 299 | 343 | } |
| 300 | 344 |
| --- src/comformat.c | |
| +++ src/comformat.c | |
| @@ -225,11 +225,35 @@ | |
| 225 | charCnt++; |
| 226 | }else{ |
| 227 | charCnt++; |
| 228 | } |
| 229 | assert( c!='\n' || charCnt==0 ); |
| 230 | fossil_print("%c", c); |
| 231 | if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars; |
| 232 | if( maxChars<=0 ) break; |
| 233 | if( c=='\n' ) break; |
| 234 | } |
| 235 | if( charCnt>0 ){ |
| @@ -259,11 +283,11 @@ | |
| 259 | const char *zText, /* The comment text to be printed. */ |
| 260 | int indent, /* Number of spaces to indent each non-initial line. */ |
| 261 | int width /* Maximum number of characters per line. */ |
| 262 | ){ |
| 263 | int maxChars = width - indent; |
| 264 | int si, sk, i, k; |
| 265 | int doIndent = 0; |
| 266 | char *zBuf; |
| 267 | char zBuffer[400]; |
| 268 | int lineCnt = 0; |
| 269 | |
| @@ -287,13 +311,33 @@ | |
| 287 | lineCnt = 1; |
| 288 | } |
| 289 | if( zBuf!=zBuffer) fossil_free(zBuf); |
| 290 | return lineCnt; |
| 291 | } |
| 292 | for(sk=si=i=k=0; zText[i] && k<maxChars; i++){ |
| 293 | char c = zText[i]; |
| 294 | if( fossil_isspace(c) ){ |
| 295 | si = i; |
| 296 | sk = k; |
| 297 | if( k==0 || zBuf[k-1]!=' ' ){ |
| 298 | zBuf[k++] = ' '; |
| 299 | } |
| 300 |
| --- src/comformat.c | |
| +++ src/comformat.c | |
| @@ -225,11 +225,35 @@ | |
| 225 | charCnt++; |
| 226 | }else{ |
| 227 | charCnt++; |
| 228 | } |
| 229 | assert( c!='\n' || charCnt==0 ); |
| 230 | /* |
| 231 | ** Avoid output of incomplete UTF-8 sequences, and also avoid line breaks |
| 232 | ** inside UTF-8 sequences. Incomplete, ill-formed and overlong sequences are |
| 233 | ** kept together. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are |
| 234 | ** allowed to initiate (ill-formed) 2- and 4-byte sequences, respectively, |
| 235 | ** the other invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte |
| 236 | ** sequences (as lone trail bytes). |
| 237 | */ |
| 238 | if( (c&0xc0)==0xc0 && zLine[index]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */ |
| 239 | char zUTF8[5]; /* Buffer to hold a UTF-8 sequence. */ |
| 240 | int cchUTF8=1; /* Code units consumed. */ |
| 241 | int maxUTF8=1; /* Expected sequence length. */ |
| 242 | zUTF8[0]=c; |
| 243 | if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ |
| 244 | else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ |
| 245 | else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ |
| 246 | while( cchUTF8<maxUTF8 && |
| 247 | (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 248 | zUTF8[cchUTF8++] = zLine[index++]; |
| 249 | } |
| 250 | zUTF8[cchUTF8]=0; |
| 251 | fossil_print("%s", zUTF8); |
| 252 | } |
| 253 | else |
| 254 | fossil_print("%c", c); |
| 255 | if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars; |
| 256 | if( maxChars<=0 ) break; |
| 257 | if( c=='\n' ) break; |
| 258 | } |
| 259 | if( charCnt>0 ){ |
| @@ -259,11 +283,11 @@ | |
| 283 | const char *zText, /* The comment text to be printed. */ |
| 284 | int indent, /* Number of spaces to indent each non-initial line. */ |
| 285 | int width /* Maximum number of characters per line. */ |
| 286 | ){ |
| 287 | int maxChars = width - indent; |
| 288 | int si, sk, i, k, kc; |
| 289 | int doIndent = 0; |
| 290 | char *zBuf; |
| 291 | char zBuffer[400]; |
| 292 | int lineCnt = 0; |
| 293 | |
| @@ -287,13 +311,33 @@ | |
| 311 | lineCnt = 1; |
| 312 | } |
| 313 | if( zBuf!=zBuffer) fossil_free(zBuf); |
| 314 | return lineCnt; |
| 315 | } |
| 316 | for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){ |
| 317 | char c = zText[i]; |
| 318 | kc++; /* Count complete UTF-8 sequences. */ |
| 319 | /* |
| 320 | ** Avoid line breaks inside UTF-8 sequences. Incomplete, ill-formed and |
| 321 | ** overlong sequences are kept together. The invalid lead bytes 0xC0 to |
| 322 | ** 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and |
| 323 | ** 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to |
| 324 | ** 0xFF are treated as invalid 1-byte sequences (as lone trail bytes). |
| 325 | */ |
| 326 | if( (c&0xc0)==0xc0 && zText[i+1]!=0 ){ /* Any UTF-8 lead byte 11xxxxxx */ |
| 327 | int cchUTF8=1; /* Code units consumed. */ |
| 328 | int maxUTF8=1; /* Expected sequence length. */ |
| 329 | if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ |
| 330 | else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ |
| 331 | else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ |
| 332 | zBuf[k++] = c; |
| 333 | while( cchUTF8<maxUTF8 && |
| 334 | (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 335 | zBuf[k++] = zText[++i]; |
| 336 | } |
| 337 | } |
| 338 | else if( fossil_isspace(c) ){ |
| 339 | si = i; |
| 340 | sk = k; |
| 341 | if( k==0 || zBuf[k-1]!=' ' ){ |
| 342 | zBuf[k++] = ' '; |
| 343 | } |
| 344 |