Fossil SCM

Properly truncate a UTF-8 encoded title using a function by @florian.balmer per [http://fossil-scm.org/forum/forumpost/52b141aa91]

ashepilko 2020-03-03 19:13 trunk
Commit 35ad8eca06ffad05f6dfa4606c307cf54dbe67b2234c1b9dc4de3c097e726063
2 files changed +3 -9 +23
+3 -9
--- src/forum.c
+++ src/forum.c
@@ -554,22 +554,16 @@
554554
/* truncate the title when longer than max allowed;
555555
* in case of UTF-8 make sure the truncated string remains valid,
556556
* otherwise (different encoding?) pass as-is
557557
*/
558558
if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){
559
- Blob truncated;
560559
int len;
561
- blob_copy(&truncated, &title);
562
- for( len = mxForumPostTitleLen; len; --len ){
563
- blob_truncate(&truncated, len);
564
- if( !invalid_utf8(&truncated) ) break;
565
- }
560
+ len = utf8_nearest_codepoint(blob_str(&title), mxForumPostTitleLen);
566561
if( len ){
567
- blob_append(&truncated, "...", 3);
568
- blob_copy(&title, &truncated);
562
+ blob_truncate(&title, len);
563
+ blob_append(&title, "...", 3);
569564
}
570
- blob_reset(&truncated);
571565
}
572566
style_header("%s%s", blob_str(&title), blob_size(&title) ? " - Forum" : "Forum");
573567
blob_reset(&title);
574568
fossil_free(zThreadTitle);
575569
return 0;
576570
--- src/forum.c
+++ src/forum.c
@@ -554,22 +554,16 @@
554 /* truncate the title when longer than max allowed;
555 * in case of UTF-8 make sure the truncated string remains valid,
556 * otherwise (different encoding?) pass as-is
557 */
558 if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){
559 Blob truncated;
560 int len;
561 blob_copy(&truncated, &title);
562 for( len = mxForumPostTitleLen; len; --len ){
563 blob_truncate(&truncated, len);
564 if( !invalid_utf8(&truncated) ) break;
565 }
566 if( len ){
567 blob_append(&truncated, "...", 3);
568 blob_copy(&title, &truncated);
569 }
570 blob_reset(&truncated);
571 }
572 style_header("%s%s", blob_str(&title), blob_size(&title) ? " - Forum" : "Forum");
573 blob_reset(&title);
574 fossil_free(zThreadTitle);
575 return 0;
576
--- src/forum.c
+++ src/forum.c
@@ -554,22 +554,16 @@
554 /* truncate the title when longer than max allowed;
555 * in case of UTF-8 make sure the truncated string remains valid,
556 * otherwise (different encoding?) pass as-is
557 */
558 if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){
 
559 int len;
560 len = utf8_nearest_codepoint(blob_str(&title), mxForumPostTitleLen);
 
 
 
 
561 if( len ){
562 blob_truncate(&title, len);
563 blob_append(&title, "...", 3);
564 }
 
565 }
566 style_header("%s%s", blob_str(&title), blob_size(&title) ? " - Forum" : "Forum");
567 blob_reset(&title);
568 fossil_free(zThreadTitle);
569 return 0;
570
+23
--- src/utf8.c
+++ src/utf8.c
@@ -298,10 +298,33 @@
298298
fossil_free(pOld);
299299
#else
300300
/* No-op on all other unix */
301301
#endif
302302
}
303
+
304
+/*
305
+** For a given index in a UTF-8 string, return the nearest index that is the
306
+** start of a new code point. The returned index is equal to or lower than the
307
+** given index. The end of the string (the null-terminator) is considered a
308
+** valid start index. The given index is returned unchanged if the string
309
+** contains invalid UTF-8 (i.e. overlong runs of trail bytes).
310
+** This function is useful to find code point boundaries for truncation, for
311
+** example, so that no incomplete UTF-8 sequences are left at the end of the
312
+** truncated string.
313
+** This function does not attempt to keep logical and/or visual constructs
314
+** spanning across multiple code points intact; that is, no attempts are made
315
+** to keep combining characters together with their base characters, or to keep
316
+** more complex grapheme clusters intact.
317
+*/
318
+#define IsUTF8TrailByte(c) ( (c&0xc0)==0x80 )
319
+int utf8_nearest_codepoint(const char *zString, int maxByteIndex){
320
+ int i,n;
321
+ for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
322
+ if( !IsUTF8TrailByte(zString[i]) ) return i;
323
+ }
324
+ return maxByteIndex;
325
+}
303326
304327
/*
305328
** Display UTF-8 on the console. Return the number of
306329
** Characters written. If stdout or stderr is redirected
307330
** to a file, -1 is returned and nothing is written
308331
--- src/utf8.c
+++ src/utf8.c
@@ -298,10 +298,33 @@
298 fossil_free(pOld);
299 #else
300 /* No-op on all other unix */
301 #endif
302 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
304 /*
305 ** Display UTF-8 on the console. Return the number of
306 ** Characters written. If stdout or stderr is redirected
307 ** to a file, -1 is returned and nothing is written
308
--- src/utf8.c
+++ src/utf8.c
@@ -298,10 +298,33 @@
298 fossil_free(pOld);
299 #else
300 /* No-op on all other unix */
301 #endif
302 }
303
304 /*
305 ** For a given index in a UTF-8 string, return the nearest index that is the
306 ** start of a new code point. The returned index is equal to or lower than the
307 ** given index. The end of the string (the null-terminator) is considered a
308 ** valid start index. The given index is returned unchanged if the string
309 ** contains invalid UTF-8 (i.e. overlong runs of trail bytes).
310 ** This function is useful to find code point boundaries for truncation, for
311 ** example, so that no incomplete UTF-8 sequences are left at the end of the
312 ** truncated string.
313 ** This function does not attempt to keep logical and/or visual constructs
314 ** spanning across multiple code points intact; that is, no attempts are made
315 ** to keep combining characters together with their base characters, or to keep
316 ** more complex grapheme clusters intact.
317 */
318 #define IsUTF8TrailByte(c) ( (c&0xc0)==0x80 )
319 int utf8_nearest_codepoint(const char *zString, int maxByteIndex){
320 int i,n;
321 for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
322 if( !IsUTF8TrailByte(zString[i]) ) return i;
323 }
324 return maxByteIndex;
325 }
326
327 /*
328 ** Display UTF-8 on the console. Return the number of
329 ** Characters written. If stdout or stderr is redirected
330 ** to a file, -1 is returned and nothing is written
331

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button