Fossil SCM
Improve truncation of UTF-8 encoded title using a function by @florian.balmer per [https://fossil-scm.org/forum/forumpost/6d90d5d99c]
Commit
d076853d10a2f2f7b1812b1db4c45ef522060e0fdcddd6b865d79a164906bea8
Parent
b70a76e35495d50…
2 files changed
+1
-1
+30
+1
-1
| --- src/forum.c | ||
| +++ src/forum.c | ||
| @@ -555,11 +555,11 @@ | ||
| 555 | 555 | * in case of UTF-8 make sure the truncated string remains valid, |
| 556 | 556 | * otherwise (different encoding?) pass as-is |
| 557 | 557 | */ |
| 558 | 558 | if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){ |
| 559 | 559 | int len; |
| 560 | - len = utf8_nearest_codepoint(blob_str(&title), mxForumPostTitleLen); | |
| 560 | + len = utf8_codepoint_index(blob_str(&title), mxForumPostTitleLen); | |
| 561 | 561 | if( len ){ |
| 562 | 562 | blob_truncate(&title, len); |
| 563 | 563 | blob_append(&title, "...", 3); |
| 564 | 564 | } |
| 565 | 565 | } |
| 566 | 566 |
| --- src/forum.c | |
| +++ src/forum.c | |
| @@ -555,11 +555,11 @@ | |
| 555 | * in case of UTF-8 make sure the truncated string remains valid, |
| 556 | * otherwise (different encoding?) pass as-is |
| 557 | */ |
| 558 | if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){ |
| 559 | int len; |
| 560 | len = utf8_nearest_codepoint(blob_str(&title), mxForumPostTitleLen); |
| 561 | if( len ){ |
| 562 | blob_truncate(&title, len); |
| 563 | blob_append(&title, "...", 3); |
| 564 | } |
| 565 | } |
| 566 |
| --- src/forum.c | |
| +++ src/forum.c | |
| @@ -555,11 +555,11 @@ | |
| 555 | * in case of UTF-8 make sure the truncated string remains valid, |
| 556 | * otherwise (different encoding?) pass as-is |
| 557 | */ |
| 558 | if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){ |
| 559 | int len; |
| 560 | len = utf8_codepoint_index(blob_str(&title), mxForumPostTitleLen); |
| 561 | if( len ){ |
| 562 | blob_truncate(&title, len); |
| 563 | blob_append(&title, "...", 3); |
| 564 | } |
| 565 | } |
| 566 |
+30
| --- src/utf8.c | ||
| +++ src/utf8.c | ||
| @@ -321,10 +321,40 @@ | ||
| 321 | 321 | for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){ |
| 322 | 322 | if( !IsUTF8TrailByte(zString[i]) ) return i; |
| 323 | 323 | } |
| 324 | 324 | return maxByteIndex; |
| 325 | 325 | } |
| 326 | + | |
| 327 | +/* | |
| 328 | +** Return the byte index in the UTF-8 string zString at which the code point | |
| 329 | +** with index nCodePoint begins. If the string holds fewer than nCodePoint | |
| 330 | +** sequences, the index of its NUL terminator is returned. A NULL zString or | |
| 331 | +** an nCodePoint of zero or less yields 0. Each incomplete, ill-formed or | |
| 332 | +** overlong sequence is counted as one sequence. The invalid lead bytes 0xC0 | |
| 333 | +** to 0xC1 and 0xF5 to 0xF7 may start (ill-formed) 2- and 4-byte sequences, | |
| 334 | +** respectively; the other invalid lead bytes 0xF8 to 0xFF are counted as | |
| 335 | +** 1-byte sequences, exactly like lone trail bytes. | |
| 336 | +*/ | |
| 337 | +int utf8_codepoint_index(const char *zString, int nCodePoint){ | |
| 338 | + int i; /* Byte index into zString; becomes the result. */ | |
| 339 | + int lenUTF8; /* Number of sequences counted so far. */ | |
| 340 | + if( zString==0 ) return 0; | |
| 341 | + for(i=0, lenUTF8=0; zString[i]!=0 && lenUTF8<nCodePoint; i++, lenUTF8++){ | |
| 342 | + char c = zString[i]; | |
| 343 | + int cchUTF8=1; /* Bytes consumed for this sequence. */ | |
| 344 | + int maxUTF8=1; /* Length implied by the lead byte. */ | |
| 345 | + if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ | |
| 346 | + else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ | |
| 347 | + else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ | |
| 348 | + while( cchUTF8<maxUTF8 && | |
| 349 | + (zString[i+1]&0xc0)==0x80 ){ /* Trail byte 10vvvvvv; NUL never matches */ | |
| 350 | + cchUTF8++; | |
| 351 | + i++; | |
| 352 | + } | |
| 353 | + } | |
| 354 | + return i; | |
| 355 | +} | |
| 326 | 356 | |
| 327 | 357 | /* |
| 328 | 358 | ** Display UTF-8 on the console. Return the number of |
| 329 | 359 | ** Characters written. If stdout or stderr is redirected |
| 330 | 360 | ** to a file, -1 is returned and nothing is written |
| 331 | 361 |
| --- src/utf8.c | |
| +++ src/utf8.c | |
| @@ -321,10 +321,40 @@ | |
| 321 | for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){ |
| 322 | if( !IsUTF8TrailByte(zString[i]) ) return i; |
| 323 | } |
| 324 | return maxByteIndex; |
| 325 | } |
| 326 | |
| 327 | /* |
| 328 | ** Display UTF-8 on the console. Return the number of |
| 329 | ** Characters written. If stdout or stderr is redirected |
| 330 | ** to a file, -1 is returned and nothing is written |
| 331 |
| --- src/utf8.c | |
| +++ src/utf8.c | |
| @@ -321,10 +321,40 @@ | |
| 321 | for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){ |
| 322 | if( !IsUTF8TrailByte(zString[i]) ) return i; |
| 323 | } |
| 324 | return maxByteIndex; |
| 325 | } |
| 326 | |
| 327 | /* |
| 328 | ** Return the byte index in the UTF-8 string zString at which the code point |
| 329 | ** with index nCodePoint begins. If the string holds fewer than nCodePoint |
| 330 | ** sequences, the index of its NUL terminator is returned. A NULL zString or |
| 331 | ** an nCodePoint of zero or less yields 0. Each incomplete, ill-formed or |
| 332 | ** overlong sequence is counted as one sequence. The invalid lead bytes 0xC0 |
| 333 | ** to 0xC1 and 0xF5 to 0xF7 may start (ill-formed) 2- and 4-byte sequences, |
| 334 | ** respectively; the other invalid lead bytes 0xF8 to 0xFF are counted as |
| 335 | ** 1-byte sequences, exactly like lone trail bytes. |
| 336 | */ |
| 337 | int utf8_codepoint_index(const char *zString, int nCodePoint){ |
| 338 | int i; /* Byte index into zString; becomes the result. */ |
| 339 | int lenUTF8; /* Number of sequences counted so far. */ |
| 340 | if( zString==0 ) return 0; |
| 341 | for(i=0, lenUTF8=0; zString[i]!=0 && lenUTF8<nCodePoint; i++, lenUTF8++){ |
| 342 | char c = zString[i]; |
| 343 | int cchUTF8=1; /* Bytes consumed for this sequence. */ |
| 344 | int maxUTF8=1; /* Length implied by the lead byte. */ |
| 345 | if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ |
| 346 | else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ |
| 347 | else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ |
| 348 | while( cchUTF8<maxUTF8 && |
| 349 | (zString[i+1]&0xc0)==0x80 ){ /* Trail byte 10vvvvvv; NUL never matches */ |
| 350 | cchUTF8++; |
| 351 | i++; |
| 352 | } |
| 353 | } |
| 354 | return i; |
| 355 | } |
| 356 | |
| 357 | /* |
| 358 | ** Display UTF-8 on the console. Return the number of |
| 359 | ** Characters written. If stdout or stderr is redirected |
| 360 | ** to a file, -1 is returned and nothing is written |
| 361 |