Fossil SCM

Improve truncation of UTF-8 encoded title using a function by @florian.balmer per [https://fossil-scm.org/forum/forumpost/6d90d5d99c]

ashepilko 2020-03-06 17:08 trunk
Commit d076853d10a2f2f7b1812b1db4c45ef522060e0fdcddd6b865d79a164906bea8
2 files changed +1 -1 +30
+1 -1
--- src/forum.c
+++ src/forum.c
@@ -555,11 +555,11 @@
555555
* in case of UTF-8 make sure the truncated string remains valid,
556556
* otherwise (different encoding?) pass as-is
557557
*/
558558
if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){
559559
int len;
560
- len = utf8_nearest_codepoint(blob_str(&title), mxForumPostTitleLen);
560
+ len = utf8_codepoint_index(blob_str(&title), mxForumPostTitleLen);
561561
if( len ){
562562
blob_truncate(&title, len);
563563
blob_append(&title, "...", 3);
564564
}
565565
}
566566
--- src/forum.c
+++ src/forum.c
@@ -555,11 +555,11 @@
555 * in case of UTF-8 make sure the truncated string remains valid,
556 * otherwise (different encoding?) pass as-is
557 */
558 if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){
559 int len;
560 len = utf8_nearest_codepoint(blob_str(&title), mxForumPostTitleLen);
561 if( len ){
562 blob_truncate(&title, len);
563 blob_append(&title, "...", 3);
564 }
565 }
566
--- src/forum.c
+++ src/forum.c
@@ -555,11 +555,11 @@
555 * in case of UTF-8 make sure the truncated string remains valid,
556 * otherwise (different encoding?) pass as-is
557 */
558 if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){
559 int len;
560 len = utf8_codepoint_index(blob_str(&title), mxForumPostTitleLen);
561 if( len ){
562 blob_truncate(&title, len);
563 blob_append(&title, "...", 3);
564 }
565 }
566
+30
--- src/utf8.c
+++ src/utf8.c
@@ -321,10 +321,40 @@
321321
for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
322322
if( !IsUTF8TrailByte(zString[i]) ) return i;
323323
}
324324
return maxByteIndex;
325325
}
326
+
327
+/*
328
+** Find the byte index corresponding to the given code point index in a UTF-8
329
+** string. If the string contains fewer than the given number of code points,
330
+** the index of the end of the string (the null-terminator) is returned.
331
+** Incomplete, ill-formed and overlong sequences are counted as one sequence.
332
+** The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate
333
+** (ill-formed) 2- and 4-byte sequences, respectively, the other invalid lead
334
+** bytes 0xF8 to 0xFF are treated as invalid 1-byte sequences (as lone trail
335
+** bytes).
336
+*/
337
+int utf8_codepoint_index(const char *zString, int nCodePoint){
338
+ int i; /* Counted bytes. */
339
+ int lenUTF8; /* Counted UTF-8 sequences. */
340
+ if( zString==0 ) return 0;
341
+ for(i=0, lenUTF8=0; zString[i]!=0 && lenUTF8<nCodePoint; i++, lenUTF8++){
342
+ char c = zString[i];
343
+ int cchUTF8=1; /* Code units consumed. */
344
+ int maxUTF8=1; /* Expected sequence length. */
345
+ if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
346
+ else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
347
+ else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
348
+ while( cchUTF8<maxUTF8 &&
349
+ (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
350
+ cchUTF8++;
351
+ i++;
352
+ }
353
+ }
354
+ return i;
355
+}
326356
327357
/*
328358
** Display UTF-8 on the console. Return the number of
329359
** Characters written. If stdout or stderr is redirected
330360
** to a file, -1 is returned and nothing is written
331361
--- src/utf8.c
+++ src/utf8.c
@@ -321,10 +321,40 @@
321 for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
322 if( !IsUTF8TrailByte(zString[i]) ) return i;
323 }
324 return maxByteIndex;
325 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
327 /*
328 ** Display UTF-8 on the console. Return the number of
329 ** Characters written. If stdout or stderr is redirected
330 ** to a file, -1 is returned and nothing is written
331
--- src/utf8.c
+++ src/utf8.c
@@ -321,10 +321,40 @@
321 for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
322 if( !IsUTF8TrailByte(zString[i]) ) return i;
323 }
324 return maxByteIndex;
325 }
326
327 /*
328 ** Find the byte index corresponding to the given code point index in a UTF-8
329 ** string. If the string contains fewer than the given number of code points,
330 ** the index of the end of the string (the null-terminator) is returned.
331 ** Incomplete, ill-formed and overlong sequences are counted as one sequence.
332 ** The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate
333 ** (ill-formed) 2- and 4-byte sequences, respectively, the other invalid lead
334 ** bytes 0xF8 to 0xFF are treated as invalid 1-byte sequences (as lone trail
335 ** bytes).
336 */
337 int utf8_codepoint_index(const char *zString, int nCodePoint){
338 int i; /* Counted bytes. */
339 int lenUTF8; /* Counted UTF-8 sequences. */
340 if( zString==0 ) return 0;
341 for(i=0, lenUTF8=0; zString[i]!=0 && lenUTF8<nCodePoint; i++, lenUTF8++){
342 char c = zString[i];
343 int cchUTF8=1; /* Code units consumed. */
344 int maxUTF8=1; /* Expected sequence length. */
345 if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
346 else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
347 else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
348 while( cchUTF8<maxUTF8 &&
349 (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
350 cchUTF8++;
351 i++;
352 }
353 }
354 return i;
355 }
356
357 /*
358 ** Display UTF-8 on the console. Return the number of
359 ** Characters written. If stdout or stderr is redirected
360 ** to a file, -1 is returned and nothing is written
361

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button