Fossil SCM
Fix the off-by-one errors if a fullwidth character only fits partially, and take into account character widths when scanning forward to find the distance to the next space.
Commit
d5479ba7c66e74561ee43db687b3ce19304be0baffddaf0a79a37c146e9b0354
Parent
e483b3b15fad08e…
1 file changed
+102
-79
+102
-79
| --- src/comformat.c | ||
| +++ src/comformat.c | ||
| @@ -241,62 +241,92 @@ | ||
| 241 | 241 | ** algorithm, the NUL character is treated the same as a spacing character. |
| 242 | 242 | */ |
| 243 | 243 | static int comment_next_space( |
| 244 | 244 | const char *zLine, /* [in] The comment line being printed. */ |
| 245 | 245 | int index, /* [in] The current character index being handled. */ |
| 246 | - int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */ | |
| 246 | + int *sumWidth /* [out] Summated width of all characters to next space. */ | |
| 247 | 247 | ){ |
| 248 | - int nextIndex = index + 1; | |
| 249 | - int fNonASCII=0; | |
| 248 | + int cchUTF8, utf32, wcwidth = 0; | |
| 249 | + int nextIndex = index; | |
| 250 | 250 | for(;;){ |
| 251 | - char c = zLine[nextIndex]; | |
| 252 | - if( (c&0x80)==0x80 ) fNonASCII=1; | |
| 253 | - if( c==0 || fossil_isspace(c) ){ | |
| 254 | - if( distUTF8 ){ | |
| 255 | - if( fNonASCII!=0 ){ | |
| 256 | - *distUTF8 = strlen_utf8(&zLine[index], nextIndex-index); | |
| 257 | - }else{ | |
| 258 | - *distUTF8 = nextIndex-index; | |
| 259 | - } | |
| 260 | - } | |
| 251 | + char_info_utf8(&zLine[nextIndex],&cchUTF8,&utf32); | |
| 252 | + nextIndex += cchUTF8; | |
| 253 | + wcwidth += cli_wcwidth(utf32); | |
| 254 | + if( zLine[nextIndex]==0 || fossil_isspace(zLine[nextIndex]) ){ | |
| 255 | + *sumWidth = wcwidth; | |
| 261 | 256 | return nextIndex; |
| 262 | 257 | } |
| 263 | - nextIndex++; | |
| 264 | 258 | } |
| 265 | 259 | return 0; /* NOT REACHED */ |
| 266 | 260 | } |
| 267 | 261 | |
| 268 | 262 | /* |
| 269 | -** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and | |
| 270 | -** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0 | |
| 271 | -** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte | |
| 272 | -** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are | |
| 273 | -** treated as invalid 1-byte sequences (as lone trail bytes). | |
| 274 | -** Combining characters and East Asian Wide and Fullwidth characters are counted | |
| 275 | -** as one, so this function does not calculate the effective "display width". | |
| 263 | +** Return information about the next (single- or multi-byte) character in the | |
| 264 | +** specified UTF-8 string: The number of UTF-8 code units (in this case: bytes) | |
| 265 | +** and the decoded UTF-32 code point. Incomplete, ill-formed and overlong | |
| 266 | +** sequences are consumed together as one invalid code point. The invalid lead | |
| 267 | +** bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- | |
| 268 | +** and 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF | |
| 269 | +** are treated as invalid 1-byte sequences (as lone trail bytes), all resulting | |
| 270 | +** in one invalid code point. Invalid UTF-8 sequences encoding a non-scalar code | |
| 271 | +** point (UTF-16 surrogates U+D800 to U+DFFF) are allowed. | |
| 276 | 272 | */ |
| 277 | -int strlen_utf8(const char *zString, int lengthBytes){ | |
| 278 | - int i; /* Counted bytes. */ | |
| 279 | - int lengthUTF8; /* Counted UTF-8 sequences. */ | |
| 280 | -#if 0 | |
| 281 | - assert( lengthBytes>=0 ); | |
| 273 | +void char_info_utf8( | |
| 274 | + const unsigned char *z, | |
| 275 | + int *pCchUTF8, | |
| 276 | + int *pUtf32 | |
| 277 | +){ | |
| 278 | + int i = 0; /* Counted bytes. */ | |
| 279 | + int cchUTF8 = 1; /* Code units consumed. */ | |
| 280 | + int maxUTF8 = 1; /* Expected sequence length. */ | |
| 281 | + char c = z[i++]; | |
| 282 | + if( (c&0xe0)==0xc0 ) maxUTF8 = 2; /* UTF-8 lead byte 110vvvvv */ | |
| 283 | + else if( (c&0xf0)==0xe0 ) maxUTF8 = 3; /* UTF-8 lead byte 1110vvvv */ | |
| 284 | + else if( (c&0xf8)==0xf0 ) maxUTF8 = 4; /* UTF-8 lead byte 11110vvv */ | |
| 285 | + while( cchUTF8<maxUTF8 && | |
| 286 | + (z[i]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ | |
| 287 | + cchUTF8++; | |
| 288 | + i++; | |
| 289 | + } | |
| 290 | + *pCchUTF8 = cchUTF8; | |
| 291 | + if( cchUTF8!=maxUTF8 || /* Incomplete UTF-8 sequence. */ | |
| 292 | + cchUTF8==1 && (c&0x80)==0x80 ){ /* Lone UTF-8 trail byte. */ | |
| 293 | + *pUtf32 = 0xfffd; /* U+FFFD Replacement Character */ | |
| 294 | +#ifdef FOSSIL_DEBUG | |
| 295 | + assert( *pUtf32!=0xfffd ); /* Invalid UTF-8 sequence. */ | |
| 282 | 296 | #endif |
| 283 | - for(i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++){ | |
| 284 | - char c = zString[i]; | |
| 285 | - int cchUTF8=1; /* Code units consumed. */ | |
| 286 | - int maxUTF8=1; /* Expected sequence length. */ | |
| 287 | - if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ | |
| 288 | - else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ | |
| 289 | - else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ | |
| 290 | - while( cchUTF8<maxUTF8 && | |
| 291 | - i<lengthBytes-1 && | |
| 292 | - (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ | |
| 293 | - cchUTF8++; | |
| 294 | - i++; | |
| 295 | - } | |
| 296 | - } | |
| 297 | - return lengthUTF8; | |
| 297 | + return; | |
| 298 | + } | |
| 299 | + switch( cchUTF8 ){ | |
| 300 | + case 4: | |
| 301 | + *pUtf32 = | |
| 302 | + ( (z[0] & 0x0f)<<18 ) | | |
| 303 | + ( (z[1] & 0x3f)<<12 ) | | |
| 304 | + ( (z[2] & 0x3f)<< 6 ) | | |
| 305 | + ( (z[4] & 0x3f)<< 0 ) ; | |
| 306 | + break; | |
| 307 | + case 3: | |
| 308 | + *pUtf32 = | |
| 309 | + ( (z[0] & 0x0f)<<12 ) | | |
| 310 | + ( (z[1] & 0x3f)<< 6 ) | | |
| 311 | + ( (z[2] & 0x3f)<< 0 ) ; | |
| 312 | + break; | |
| 313 | + case 2: | |
| 314 | + *pUtf32 = | |
| 315 | + ( (z[0] & 0x1f)<< 6 ) | | |
| 316 | + ( (z[1] & 0x3f)<< 0 ) ; | |
| 317 | + break; | |
| 318 | + case 1: | |
| 319 | + *pUtf32 = (int)z[0]; | |
| 320 | + break; | |
| 321 | + } | |
| 322 | +#ifdef FOSSIL_DEBUG | |
| 323 | + assert( | |
| 324 | + *pUtf32>=0 && *pUtf32<=0x10ffff && /* Valid range U+0000 to U+10FFFF. */ | |
| 325 | + *pUtf32<0xd800 && *pUtf32>0xdfff /* Non-scalar (UTF-16 surrogates). */ | |
| 326 | + ); | |
| 327 | +#endif | |
| 298 | 328 | } |
| 299 | 329 | |
| 300 | 330 | /* |
| 301 | 331 | ** This function is called when printing a logical comment line to calculate |
| 302 | 332 | ** the necessary indenting. The caller needs to emit the indenting spaces. |
| @@ -339,11 +369,10 @@ | ||
| 339 | 369 | int *pLineCnt, /* [in/out] Pointer to the total line count. */ |
| 340 | 370 | const char **pzLine /* [out] Pointer to the end of the logical line. */ |
| 341 | 371 | ){ |
| 342 | 372 | int index = 0, charCnt = 0, lineCnt = 0, maxChars, i; |
| 343 | 373 | char zBuf[400]; int iBuf=0; /* Output buffer and counter. */ |
| 344 | - int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */ | |
| 345 | 374 | if( !zLine ) return; |
| 346 | 375 | if( lineChars<=0 ) return; |
| 347 | 376 | #if 0 |
| 348 | 377 | assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */ |
| 349 | 378 | assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */ |
| @@ -362,10 +391,11 @@ | ||
| 362 | 391 | /* Limit line indent to fit output buffer. */ |
| 363 | 392 | origIndent = sizeof(zBuf)-6; |
| 364 | 393 | } |
| 365 | 394 | maxChars = lineChars; |
| 366 | 395 | for(;;){ |
| 396 | + int cchUTF8, utf32; | |
| 367 | 397 | int useChars = 1; |
| 368 | 398 | char c = zLine[index]; |
| 369 | 399 | /* Flush the output buffer if there's no space left for at least one more |
| 370 | 400 | ** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces, |
| 371 | 401 | ** a new line, and a terminating NULL. */ |
| @@ -393,48 +423,47 @@ | ||
| 393 | 423 | if( c=='\n' ){ |
| 394 | 424 | lineCnt++; |
| 395 | 425 | charCnt = 0; |
| 396 | 426 | useChars = 0; |
| 397 | 427 | }else if( c=='\t' ){ |
| 398 | - int distUTF8; | |
| 399 | - int nextIndex = comment_next_space(zLine, index, &distUTF8); | |
| 400 | - if( nextIndex<=0 || distUTF8>maxChars ){ | |
| 428 | + int sumWidth; | |
| 429 | + int nextIndex = comment_next_space(zLine, index, &sumWidth); | |
| 430 | + if( nextIndex<=0 || sumWidth>maxChars ){ | |
| 401 | 431 | break; |
| 402 | 432 | } |
| 403 | 433 | charCnt++; |
| 404 | 434 | useChars = COMMENT_TAB_WIDTH; |
| 405 | 435 | if( maxChars<useChars ){ |
| 406 | 436 | zBuf[iBuf++] = ' '; |
| 407 | 437 | break; |
| 408 | 438 | } |
| 409 | 439 | }else if( wordBreak && fossil_isspace(c) ){ |
| 410 | - int distUTF8; | |
| 411 | - int nextIndex = comment_next_space(zLine, index, &distUTF8); | |
| 412 | - if( nextIndex<=0 || distUTF8>=maxChars ){ | |
| 440 | + int sumWidth; | |
| 441 | + int nextIndex = comment_next_space(zLine, index, &sumWidth); | |
| 442 | + if( nextIndex<=0 || sumWidth>=maxChars ){ | |
| 413 | 443 | break; |
| 414 | 444 | } |
| 415 | 445 | charCnt++; |
| 416 | 446 | }else{ |
| 417 | 447 | charCnt++; |
| 418 | 448 | } |
| 419 | 449 | assert( c!='\n' || charCnt==0 ); |
| 420 | 450 | zBuf[iBuf++] = c; |
| 421 | - /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */ | |
| 422 | - cchUTF8=1; /* Code units consumed. */ | |
| 423 | - maxUTF8=1; /* Expected sequence length. */ | |
| 424 | - if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ | |
| 425 | - else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ | |
| 426 | - else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ | |
| 427 | - while( cchUTF8<maxUTF8 && | |
| 428 | - (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ | |
| 429 | - cchUTF8++; | |
| 430 | - zBuf[iBuf++] = zLine[index++]; | |
| 431 | - } | |
| 451 | + char_info_utf8(&zLine[index-1],&cchUTF8,&utf32); | |
| 432 | 452 | if( cchUTF8>1 ){ |
| 433 | - int utf32; | |
| 434 | - decodeUtf8((const unsigned char*)&zLine[index-cchUTF8],&utf32); | |
| 435 | - useChars += cli_wcwidth(utf32) - 1; | |
| 453 | + int wcwidth; | |
| 454 | + wcwidth = cli_wcwidth(utf32); | |
| 455 | + if( wcwidth>maxChars && lineChars>=wcwidth ){ /* rollback */ | |
| 456 | + index--; | |
| 457 | + iBuf--; | |
| 458 | + zBuf[iBuf] = 0; | |
| 459 | + break; | |
| 460 | + } | |
| 461 | + for( ; cchUTF8>1; cchUTF8-- ){ | |
| 462 | + zBuf[iBuf++] = zLine[index++]; | |
| 463 | + } | |
| 464 | + useChars += wcwidth - 1; | |
| 436 | 465 | } |
| 437 | 466 | maxChars -= useChars; |
| 438 | 467 | if( maxChars<=0 ) break; |
| 439 | 468 | if( c=='\n' ) break; |
| 440 | 469 | } |
| @@ -476,11 +505,10 @@ | ||
| 476 | 505 | int si, sk, i, k, kc; |
| 477 | 506 | int doIndent = 0; |
| 478 | 507 | char *zBuf; |
| 479 | 508 | char zBuffer[400]; |
| 480 | 509 | int lineCnt = 0; |
| 481 | - int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */ | |
| 482 | 510 | |
| 483 | 511 | if( width<0 ){ |
| 484 | 512 | comment_set_maxchars(indent, &maxChars); |
| 485 | 513 | } |
| 486 | 514 | if( zText==0 ) zText = "(NULL)"; |
| @@ -502,30 +530,25 @@ | ||
| 502 | 530 | } |
| 503 | 531 | if( zBuf!=zBuffer) fossil_free(zBuf); |
| 504 | 532 | return lineCnt; |
| 505 | 533 | } |
| 506 | 534 | for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){ |
| 535 | + int cchUTF8, utf32; | |
| 507 | 536 | char c = zText[i]; |
| 508 | 537 | kc++; /* Count complete UTF-8 sequences. */ |
| 509 | - /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */ | |
| 510 | - cchUTF8=1; /* Code units consumed. */ | |
| 511 | - maxUTF8=1; /* Expected sequence length. */ | |
| 512 | - if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ | |
| 513 | - else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ | |
| 514 | - else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ | |
| 515 | - if( maxUTF8>1 ){ | |
| 516 | - zBuf[k++] = c; | |
| 517 | - while( cchUTF8<maxUTF8 && | |
| 518 | - (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ | |
| 519 | - cchUTF8++; | |
| 538 | + char_info_utf8(&zText[i],&cchUTF8,&utf32); | |
| 539 | + if( cchUTF8>1 ){ | |
| 540 | + int wcwidth; | |
| 541 | + wcwidth = cli_wcwidth(utf32); | |
| 542 | + if( kc+wcwidth-1>maxChars && maxChars>=wcwidth ){ /* rollback */ | |
| 543 | + kc--; | |
| 544 | + break; | |
| 545 | + } | |
| 546 | + for( i--; cchUTF8>0; cchUTF8-- ){ | |
| 520 | 547 | zBuf[k++] = zText[++i]; |
| 521 | 548 | } |
| 522 | - } | |
| 523 | - if( cchUTF8>1 ){ | |
| 524 | - int utf32; | |
| 525 | - decodeUtf8((const unsigned char*)&zText[k-cchUTF8],&utf32); | |
| 526 | - kc += cli_wcwidth(utf32) - 1; | |
| 549 | + kc += wcwidth - 1; | |
| 527 | 550 | } |
| 528 | 551 | else if( fossil_isspace(c) ){ |
| 529 | 552 | si = i; |
| 530 | 553 | sk = k; |
| 531 | 554 | if( k==0 || zBuf[k-1]!=' ' ){ |
| 532 | 555 |
| --- src/comformat.c | |
| +++ src/comformat.c | |
| @@ -241,62 +241,92 @@ | |
| 241 | ** algorithm, the NUL character is treated the same as a spacing character. |
| 242 | */ |
| 243 | static int comment_next_space( |
| 244 | const char *zLine, /* [in] The comment line being printed. */ |
| 245 | int index, /* [in] The current character index being handled. */ |
| 246 | int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */ |
| 247 | ){ |
| 248 | int nextIndex = index + 1; |
| 249 | int fNonASCII=0; |
| 250 | for(;;){ |
| 251 | char c = zLine[nextIndex]; |
| 252 | if( (c&0x80)==0x80 ) fNonASCII=1; |
| 253 | if( c==0 || fossil_isspace(c) ){ |
| 254 | if( distUTF8 ){ |
| 255 | if( fNonASCII!=0 ){ |
| 256 | *distUTF8 = strlen_utf8(&zLine[index], nextIndex-index); |
| 257 | }else{ |
| 258 | *distUTF8 = nextIndex-index; |
| 259 | } |
| 260 | } |
| 261 | return nextIndex; |
| 262 | } |
| 263 | nextIndex++; |
| 264 | } |
| 265 | return 0; /* NOT REACHED */ |
| 266 | } |
| 267 | |
| 268 | /* |
| 269 | ** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and |
| 270 | ** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0 |
| 271 | ** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte |
| 272 | ** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are |
| 273 | ** treated as invalid 1-byte sequences (as lone trail bytes). |
| 274 | ** Combining characters and East Asian Wide and Fullwidth characters are counted |
| 275 | ** as one, so this function does not calculate the effective "display width". |
| 276 | */ |
| 277 | int strlen_utf8(const char *zString, int lengthBytes){ |
| 278 | int i; /* Counted bytes. */ |
| 279 | int lengthUTF8; /* Counted UTF-8 sequences. */ |
| 280 | #if 0 |
| 281 | assert( lengthBytes>=0 ); |
| 282 | #endif |
| 283 | for(i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++){ |
| 284 | char c = zString[i]; |
| 285 | int cchUTF8=1; /* Code units consumed. */ |
| 286 | int maxUTF8=1; /* Expected sequence length. */ |
| 287 | if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ |
| 288 | else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ |
| 289 | else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ |
| 290 | while( cchUTF8<maxUTF8 && |
| 291 | i<lengthBytes-1 && |
| 292 | (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 293 | cchUTF8++; |
| 294 | i++; |
| 295 | } |
| 296 | } |
| 297 | return lengthUTF8; |
| 298 | } |
| 299 | |
| 300 | /* |
| 301 | ** This function is called when printing a logical comment line to calculate |
| 302 | ** the necessary indenting. The caller needs to emit the indenting spaces. |
| @@ -339,11 +369,10 @@ | |
| 339 | int *pLineCnt, /* [in/out] Pointer to the total line count. */ |
| 340 | const char **pzLine /* [out] Pointer to the end of the logical line. */ |
| 341 | ){ |
| 342 | int index = 0, charCnt = 0, lineCnt = 0, maxChars, i; |
| 343 | char zBuf[400]; int iBuf=0; /* Output buffer and counter. */ |
| 344 | int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */ |
| 345 | if( !zLine ) return; |
| 346 | if( lineChars<=0 ) return; |
| 347 | #if 0 |
| 348 | assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */ |
| 349 | assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */ |
| @@ -362,10 +391,11 @@ | |
| 362 | /* Limit line indent to fit output buffer. */ |
| 363 | origIndent = sizeof(zBuf)-6; |
| 364 | } |
| 365 | maxChars = lineChars; |
| 366 | for(;;){ |
| 367 | int useChars = 1; |
| 368 | char c = zLine[index]; |
| 369 | /* Flush the output buffer if there's no space left for at least one more |
| 370 | ** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces, |
| 371 | ** a new line, and a terminating NULL. */ |
| @@ -393,48 +423,47 @@ | |
| 393 | if( c=='\n' ){ |
| 394 | lineCnt++; |
| 395 | charCnt = 0; |
| 396 | useChars = 0; |
| 397 | }else if( c=='\t' ){ |
| 398 | int distUTF8; |
| 399 | int nextIndex = comment_next_space(zLine, index, &distUTF8); |
| 400 | if( nextIndex<=0 || distUTF8>maxChars ){ |
| 401 | break; |
| 402 | } |
| 403 | charCnt++; |
| 404 | useChars = COMMENT_TAB_WIDTH; |
| 405 | if( maxChars<useChars ){ |
| 406 | zBuf[iBuf++] = ' '; |
| 407 | break; |
| 408 | } |
| 409 | }else if( wordBreak && fossil_isspace(c) ){ |
| 410 | int distUTF8; |
| 411 | int nextIndex = comment_next_space(zLine, index, &distUTF8); |
| 412 | if( nextIndex<=0 || distUTF8>=maxChars ){ |
| 413 | break; |
| 414 | } |
| 415 | charCnt++; |
| 416 | }else{ |
| 417 | charCnt++; |
| 418 | } |
| 419 | assert( c!='\n' || charCnt==0 ); |
| 420 | zBuf[iBuf++] = c; |
| 421 | /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */ |
| 422 | cchUTF8=1; /* Code units consumed. */ |
| 423 | maxUTF8=1; /* Expected sequence length. */ |
| 424 | if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ |
| 425 | else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ |
| 426 | else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ |
| 427 | while( cchUTF8<maxUTF8 && |
| 428 | (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 429 | cchUTF8++; |
| 430 | zBuf[iBuf++] = zLine[index++]; |
| 431 | } |
| 432 | if( cchUTF8>1 ){ |
| 433 | int utf32; |
| 434 | decodeUtf8((const unsigned char*)&zLine[index-cchUTF8],&utf32); |
| 435 | useChars += cli_wcwidth(utf32) - 1; |
| 436 | } |
| 437 | maxChars -= useChars; |
| 438 | if( maxChars<=0 ) break; |
| 439 | if( c=='\n' ) break; |
| 440 | } |
| @@ -476,11 +505,10 @@ | |
| 476 | int si, sk, i, k, kc; |
| 477 | int doIndent = 0; |
| 478 | char *zBuf; |
| 479 | char zBuffer[400]; |
| 480 | int lineCnt = 0; |
| 481 | int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */ |
| 482 | |
| 483 | if( width<0 ){ |
| 484 | comment_set_maxchars(indent, &maxChars); |
| 485 | } |
| 486 | if( zText==0 ) zText = "(NULL)"; |
| @@ -502,30 +530,25 @@ | |
| 502 | } |
| 503 | if( zBuf!=zBuffer) fossil_free(zBuf); |
| 504 | return lineCnt; |
| 505 | } |
| 506 | for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){ |
| 507 | char c = zText[i]; |
| 508 | kc++; /* Count complete UTF-8 sequences. */ |
| 509 | /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */ |
| 510 | cchUTF8=1; /* Code units consumed. */ |
| 511 | maxUTF8=1; /* Expected sequence length. */ |
| 512 | if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ |
| 513 | else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ |
| 514 | else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ |
| 515 | if( maxUTF8>1 ){ |
| 516 | zBuf[k++] = c; |
| 517 | while( cchUTF8<maxUTF8 && |
| 518 | (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 519 | cchUTF8++; |
| 520 | zBuf[k++] = zText[++i]; |
| 521 | } |
| 522 | } |
| 523 | if( cchUTF8>1 ){ |
| 524 | int utf32; |
| 525 | decodeUtf8((const unsigned char*)&zText[k-cchUTF8],&utf32); |
| 526 | kc += cli_wcwidth(utf32) - 1; |
| 527 | } |
| 528 | else if( fossil_isspace(c) ){ |
| 529 | si = i; |
| 530 | sk = k; |
| 531 | if( k==0 || zBuf[k-1]!=' ' ){ |
| 532 |
| --- src/comformat.c | |
| +++ src/comformat.c | |
| @@ -241,62 +241,92 @@ | |
| 241 | ** algorithm, the NUL character is treated the same as a spacing character. |
| 242 | */ |
static int comment_next_space(
  const char *zLine, /* [in] The comment line being printed. */
  int index,         /* [in] The current character index being handled. */
  int *sumWidth      /* [out] Summed width of all characters to next space;
                     **       may be NULL if the caller does not need it. */
){
  int cchUTF8 = 0;     /* Bytes consumed by the character just examined. */
  int utf32 = 0;       /* Code point of the character just examined. */
  int totalWidth = 0;  /* Widths reported by cli_wcwidth(), accumulated. */
  int nextIndex = index;

  /* Starting on the terminator would make the loop below step past the end
  ** of the string, so report "no distance" right away. */
  if( zLine[nextIndex]==0 ){
    if( sumWidth ) *sumWidth = 0;
    return nextIndex;
  }
  for(;;){
    /* Consume one whole character, beginning with the one at `index` itself,
    ** so its width is part of the sum. char_info_utf8() takes unsigned bytes,
    ** hence the explicit cast. */
    char_info_utf8((const unsigned char*)&zLine[nextIndex], &cchUTF8, &utf32);
    nextIndex += cchUTF8;
    totalWidth += cli_wcwidth(utf32);
    /* Stop in front of the next spacing character; NUL counts as one. */
    if( zLine[nextIndex]==0 || fossil_isspace(zLine[nextIndex]) ){
      if( sumWidth ) *sumWidth = totalWidth;
      return nextIndex;
    }
  }
  return 0; /* NOT REACHED */
}
| 261 | |
/*
** Return information about the next (single- or multi-byte) character in the
** specified NUL-terminated UTF-8 string:
**
**   *pCchUTF8   Number of UTF-8 code units (bytes) consumed, always 1 to 4.
**   *pUtf32     The decoded UTF-32 code point, or U+FFFD (Replacement
**               Character) if the consumed bytes are not a valid encoding.
**
** Incomplete, ill-formed and overlong sequences are consumed together and
** reported as one invalid code point (U+FFFD). The invalid lead bytes 0xC0
** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte
** sequences, respectively; the other invalid lead bytes 0xF8 to 0xFF are
** treated as invalid 1-byte sequences (like lone trail bytes). Sequences
** encoding a non-scalar code point (UTF-16 surrogates U+D800 to U+DFFF) are
** allowed and returned unchanged.
**
** Scanning for trail bytes stops at the NUL terminator (which is not a trail
** byte), so no byte beyond the terminator is read.
*/
void char_info_utf8(
  const unsigned char *z,  /* [in] First byte of the character to examine. */
  int *pCchUTF8,           /* [out] Number of bytes consumed. */
  int *pUtf32              /* [out] Decoded code point, or U+FFFD. */
){
  int cchUTF8 = 1;         /* Code units consumed. */
  int maxUTF8 = 1;         /* Expected sequence length. */
  int utf32 = 0xfffd;      /* Result; stays U+FFFD unless decoding succeeds. */
  unsigned char c = z[0];

  if( (c&0xe0)==0xc0 ) maxUTF8 = 2;       /* UTF-8 lead byte 110vvvvv */
  else if( (c&0xf0)==0xe0 ) maxUTF8 = 3;  /* UTF-8 lead byte 1110vvvv */
  else if( (c&0xf8)==0xf0 ) maxUTF8 = 4;  /* UTF-8 lead byte 11110vvv */
  while( cchUTF8<maxUTF8
      && (z[cchUTF8]&0xc0)==0x80 ){       /* UTF-8 trail byte 10vvvvvv */
    cchUTF8++;
  }
  *pCchUTF8 = cchUTF8;

  /* Decode only complete sequences; an incomplete one keeps U+FFFD. */
  if( cchUTF8==maxUTF8 ){
    switch( cchUTF8 ){
      case 4:
        utf32 =
          ( (z[0] & 0x07)<<18 ) |
          ( (z[1] & 0x3f)<<12 ) |
          ( (z[2] & 0x3f)<< 6 ) |
          ( (z[3] & 0x3f)<< 0 ) ;
        /* Overlong form, or beyond U+10FFFF (lead bytes 0xF5 to 0xF7). */
        if( utf32<0x10000 || utf32>0x10ffff ) utf32 = 0xfffd;
        break;
      case 3:
        utf32 =
          ( (z[0] & 0x0f)<<12 ) |
          ( (z[1] & 0x3f)<< 6 ) |
          ( (z[2] & 0x3f)<< 0 ) ;
        if( utf32<0x800 ) utf32 = 0xfffd;   /* Overlong form. */
        break;
      case 2:
        utf32 =
          ( (z[0] & 0x1f)<< 6 ) |
          ( (z[1] & 0x3f)<< 0 ) ;
        if( utf32<0x80 ) utf32 = 0xfffd;    /* Overlong (lead 0xC0, 0xC1). */
        break;
      default:
        /* Single byte: plain ASCII decodes to itself; a lone trail byte or
        ** an invalid lead byte 0xF8 to 0xFF keeps U+FFFD. */
        if( (c&0x80)==0 ) utf32 = (int)c;
        break;
    }
  }
  *pUtf32 = utf32;
#ifdef FOSSIL_DEBUG
  /* Post-condition: the result is always inside the Unicode code space. */
  assert( *pUtf32>=0 && *pUtf32<=0x10ffff );
#endif
}
| 329 | |
| 330 | /* |
| 331 | ** This function is called when printing a logical comment line to calculate |
| 332 | ** the necessary indenting. The caller needs to emit the indenting spaces. |
| @@ -339,11 +369,10 @@ | |
| 369 | int *pLineCnt, /* [in/out] Pointer to the total line count. */ |
| 370 | const char **pzLine /* [out] Pointer to the end of the logical line. */ |
| 371 | ){ |
| 372 | int index = 0, charCnt = 0, lineCnt = 0, maxChars, i; |
| 373 | char zBuf[400]; int iBuf=0; /* Output buffer and counter. */ |
| 374 | if( !zLine ) return; |
| 375 | if( lineChars<=0 ) return; |
| 376 | #if 0 |
| 377 | assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */ |
| 378 | assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */ |
| @@ -362,10 +391,11 @@ | |
| 391 | /* Limit line indent to fit output buffer. */ |
| 392 | origIndent = sizeof(zBuf)-6; |
| 393 | } |
| 394 | maxChars = lineChars; |
| 395 | for(;;){ |
| 396 | int cchUTF8, utf32; |
| 397 | int useChars = 1; |
| 398 | char c = zLine[index]; |
| 399 | /* Flush the output buffer if there's no space left for at least one more |
| 400 | ** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces, |
| 401 | ** a new line, and a terminating NULL. */ |
| @@ -393,48 +423,47 @@ | |
| 423 | if( c=='\n' ){ |
| 424 | lineCnt++; |
| 425 | charCnt = 0; |
| 426 | useChars = 0; |
| 427 | }else if( c=='\t' ){ |
| 428 | int sumWidth; |
| 429 | int nextIndex = comment_next_space(zLine, index, &sumWidth); |
| 430 | if( nextIndex<=0 || sumWidth>maxChars ){ |
| 431 | break; |
| 432 | } |
| 433 | charCnt++; |
| 434 | useChars = COMMENT_TAB_WIDTH; |
| 435 | if( maxChars<useChars ){ |
| 436 | zBuf[iBuf++] = ' '; |
| 437 | break; |
| 438 | } |
| 439 | }else if( wordBreak && fossil_isspace(c) ){ |
| 440 | int sumWidth; |
| 441 | int nextIndex = comment_next_space(zLine, index, &sumWidth); |
| 442 | if( nextIndex<=0 || sumWidth>=maxChars ){ |
| 443 | break; |
| 444 | } |
| 445 | charCnt++; |
| 446 | }else{ |
| 447 | charCnt++; |
| 448 | } |
| 449 | assert( c!='\n' || charCnt==0 ); |
| 450 | zBuf[iBuf++] = c; |
| 451 | char_info_utf8(&zLine[index-1],&cchUTF8,&utf32); |
| 452 | if( cchUTF8>1 ){ |
| 453 | int wcwidth; |
| 454 | wcwidth = cli_wcwidth(utf32); |
| 455 | if( wcwidth>maxChars && lineChars>=wcwidth ){ /* rollback */ |
| 456 | index--; |
| 457 | iBuf--; |
| 458 | zBuf[iBuf] = 0; |
| 459 | break; |
| 460 | } |
| 461 | for( ; cchUTF8>1; cchUTF8-- ){ |
| 462 | zBuf[iBuf++] = zLine[index++]; |
| 463 | } |
| 464 | useChars += wcwidth - 1; |
| 465 | } |
| 466 | maxChars -= useChars; |
| 467 | if( maxChars<=0 ) break; |
| 468 | if( c=='\n' ) break; |
| 469 | } |
| @@ -476,11 +505,10 @@ | |
| 505 | int si, sk, i, k, kc; |
| 506 | int doIndent = 0; |
| 507 | char *zBuf; |
| 508 | char zBuffer[400]; |
| 509 | int lineCnt = 0; |
| 510 | |
| 511 | if( width<0 ){ |
| 512 | comment_set_maxchars(indent, &maxChars); |
| 513 | } |
| 514 | if( zText==0 ) zText = "(NULL)"; |
| @@ -502,30 +530,25 @@ | |
| 530 | } |
| 531 | if( zBuf!=zBuffer) fossil_free(zBuf); |
| 532 | return lineCnt; |
| 533 | } |
| 534 | for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){ |
| 535 | int cchUTF8, utf32; |
| 536 | char c = zText[i]; |
| 537 | kc++; /* Count complete UTF-8 sequences. */ |
| 538 | char_info_utf8(&zText[i],&cchUTF8,&utf32); |
| 539 | if( cchUTF8>1 ){ |
| 540 | int wcwidth; |
| 541 | wcwidth = cli_wcwidth(utf32); |
| 542 | if( kc+wcwidth-1>maxChars && maxChars>=wcwidth ){ /* rollback */ |
| 543 | kc--; |
| 544 | break; |
| 545 | } |
| 546 | for( i--; cchUTF8>0; cchUTF8-- ){ |
| 547 | zBuf[k++] = zText[++i]; |
| 548 | } |
| 549 | kc += wcwidth - 1; |
| 550 | } |
| 551 | else if( fossil_isspace(c) ){ |
| 552 | si = i; |
| 553 | sk = k; |
| 554 | if( k==0 || zBuf[k-1]!=' ' ){ |
| 555 |