Fossil SCM
Properly truncate a UTF-8 encoded title using a function by @florian.balmer per [http://fossil-scm.org/forum/forumpost/52b141aa91]
Commit
35ad8eca06ffad05f6dfa4606c307cf54dbe67b2234c1b9dc4de3c097e726063
Parent
00e2f0511e2bcd5…
2 files changed: src/forum.c (+3 -9), src/utf8.c (+23)
src/forum.c: +3 -9
| --- src/forum.c | ||
| +++ src/forum.c | ||
| @@ -554,22 +554,16 @@ | ||
| 554 | 554 | /* truncate the title when longer than max allowed; |
| 555 | 555 | * in case of UTF-8 make sure the truncated string remains valid, |
| 556 | 556 | * otherwise (different encoding?) pass as-is |
| 557 | 557 | */ |
| 558 | 558 | if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){ |
| 559 | - Blob truncated; | |
| 560 | 559 | int len; |
| 561 | - blob_copy(&truncated, &title); | |
| 562 | - for( len = mxForumPostTitleLen; len; --len ){ | |
| 563 | - blob_truncate(&truncated, len); | |
| 564 | - if( !invalid_utf8(&truncated) ) break; | |
| 565 | - } | |
| 560 | + len = utf8_nearest_codepoint(blob_str(&title), mxForumPostTitleLen); | |
| 566 | 561 | if( len ){ |
| 567 | - blob_append(&truncated, "...", 3); | |
| 568 | - blob_copy(&title, &truncated); | |
| 562 | + blob_truncate(&title, len); | |
| 563 | + blob_append(&title, "...", 3); | |
| 569 | 564 | } |
| 570 | - blob_reset(&truncated); | |
| 571 | 565 | } |
| 572 | 566 | style_header("%s%s", blob_str(&title), blob_size(&title) ? " - Forum" : "Forum"); |
| 573 | 567 | blob_reset(&title); |
| 574 | 568 | fossil_free(zThreadTitle); |
| 575 | 569 | return 0; |
| 576 | 570 |
| --- src/forum.c | |
| +++ src/forum.c | |
| @@ -554,22 +554,16 @@ | |
| 554 | /* truncate the title when longer than max allowed; |
| 555 | * in case of UTF-8 make sure the truncated string remains valid, |
| 556 | * otherwise (different encoding?) pass as-is |
| 557 | */ |
| 558 | if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){ |
| 559 | Blob truncated; |
| 560 | int len; |
| 561 | blob_copy(&truncated, &title); |
| 562 | for( len = mxForumPostTitleLen; len; --len ){ |
| 563 | blob_truncate(&truncated, len); |
| 564 | if( !invalid_utf8(&truncated) ) break; |
| 565 | } |
| 566 | if( len ){ |
| 567 | blob_append(&truncated, "...", 3); |
| 568 | blob_copy(&title, &truncated); |
| 569 | } |
| 570 | blob_reset(&truncated); |
| 571 | } |
| 572 | style_header("%s%s", blob_str(&title), blob_size(&title) ? " - Forum" : "Forum"); |
| 573 | blob_reset(&title); |
| 574 | fossil_free(zThreadTitle); |
| 575 | return 0; |
| 576 |
| --- src/forum.c | |
| +++ src/forum.c | |
| @@ -554,22 +554,16 @@ | |
| 554 | /* truncate the title when longer than max allowed; |
| 555 | * in case of UTF-8 make sure the truncated string remains valid, |
| 556 | * otherwise (different encoding?) pass as-is |
| 557 | */ |
| 558 | if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){ |
| 559 | int len; |
| 560 | len = utf8_nearest_codepoint(blob_str(&title), mxForumPostTitleLen); |
| 561 | if( len ){ |
| 562 | blob_truncate(&title, len); |
| 563 | blob_append(&title, "...", 3); |
| 564 | } |
| 565 | } |
| 566 | style_header("%s%s", blob_str(&title), blob_size(&title) ? " - Forum" : "Forum"); |
| 567 | blob_reset(&title); |
| 568 | fossil_free(zThreadTitle); |
| 569 | return 0; |
| 570 |
src/utf8.c: +23
| --- src/utf8.c | ||
| +++ src/utf8.c | ||
| @@ -298,10 +298,33 @@ | ||
| 298 | 298 | fossil_free(pOld); |
| 299 | 299 | #else |
| 300 | 300 | /* No-op on all other unix */ |
| 301 | 301 | #endif |
| 302 | 302 | } |
| 303 | + | |
| 304 | +/* | |
| 305 | +** For a given index in a UTF-8 string, return the nearest index that is the | |
| 306 | +** start of a new code point. The returned index is equal or lower than the | |
| 307 | +** given index. The end of the string (the null-terminator) is considered a | |
| 308 | +** valid start index. The given index is returned unchanged if the string | |
| 309 | +** contains invalid UTF-8 (i.e. overlong runs of trail bytes). | |
| 310 | +** This function is useful to find code point boundaries for truncation, for | |
| 311 | +** example, so that no incomplete UTF-8 sequences are left at the end of the | |
| 312 | +** truncated string. | |
| 313 | +** This function does not attempt to keep logical and/or visual constructs | |
| 314 | +** spanning across multiple code points intact, that is no attempts are made | |
| 315 | +** keep combining characters together with their base characters, or to keep | |
| 316 | +** more complex grapheme clusters intact. | |
| 317 | +*/ | |
| 318 | +#define IsUTF8TrailByte(c) ( (c&0xc0)==0x80 ) | |
| 319 | +int utf8_nearest_codepoint(const char *zString, int maxByteIndex){ | |
| 320 | + int i,n; | |
| 321 | + for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){ | |
| 322 | + if( !IsUTF8TrailByte(zString[i]) ) return i; | |
| 323 | + } | |
| 324 | + return maxByteIndex; | |
| 325 | +} | |
| 303 | 326 | |
| 304 | 327 | /* |
| 305 | 328 | ** Display UTF-8 on the console. Return the number of |
| 306 | 329 | ** Characters written. If stdout or stderr is redirected |
| 307 | 330 | ** to a file, -1 is returned and nothing is written |
| 308 | 331 |
| --- src/utf8.c | |
| +++ src/utf8.c | |
| @@ -298,10 +298,33 @@ | |
| 298 | fossil_free(pOld); |
| 299 | #else |
| 300 | /* No-op on all other unix */ |
| 301 | #endif |
| 302 | } |
| 303 | |
| 304 | /* |
| 305 | ** Display UTF-8 on the console. Return the number of |
| 306 | ** Characters written. If stdout or stderr is redirected |
| 307 | ** to a file, -1 is returned and nothing is written |
| 308 |
| --- src/utf8.c | |
| +++ src/utf8.c | |
| @@ -298,10 +298,33 @@ | |
| 298 | fossil_free(pOld); |
| 299 | #else |
| 300 | /* No-op on all other unix */ |
| 301 | #endif |
| 302 | } |
| 303 | |
| 304 | /* |
| 305 | ** For a given index in a UTF-8 string, return the nearest index that is the |
| 306 | ** start of a new code point. The returned index is equal or lower than the |
| 307 | ** given index. The end of the string (the null-terminator) is considered a |
| 308 | ** valid start index. The given index is returned unchanged if the string |
| 309 | ** contains invalid UTF-8 (i.e. overlong runs of trail bytes). |
| 310 | ** This function is useful to find code point boundaries for truncation, for |
| 311 | ** example, so that no incomplete UTF-8 sequences are left at the end of the |
| 312 | ** truncated string. |
| 313 | ** This function does not attempt to keep logical and/or visual constructs |
| 314 | ** spanning across multiple code points intact, that is no attempts are made |
| 315 | ** keep combining characters together with their base characters, or to keep |
| 316 | ** more complex grapheme clusters intact. |
| 317 | */ |
| 318 | #define IsUTF8TrailByte(c) ( (c&0xc0)==0x80 ) |
| 319 | int utf8_nearest_codepoint(const char *zString, int maxByteIndex){ |
| 320 | int i,n; |
| 321 | for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){ |
| 322 | if( !IsUTF8TrailByte(zString[i]) ) return i; |
| 323 | } |
| 324 | return maxByteIndex; |
| 325 | } |
| 326 | |
| 327 | /* |
| 328 | ** Display UTF-8 on the console. Return the number of |
| 329 | ** Characters written. If stdout or stderr is redirected |
| 330 | ** to a file, -1 is returned and nothing is written |
| 331 |