Fossil SCM

Properly truncate a UTF-8 encoded title using a function by @florian.balmer per [http://fossil-scm.org/forum/forumpost/52b141aa91]

ashepilko 2020-03-03 19:13 trunk

Commit 35ad8eca06ffad05f6dfa4606c307cf54dbe67b2234c1b9dc4de3c097e726063

Parent 00e2f0511e2bcd5…

2 files changed +3 -9 +23

M src/forum.c

+3 -9

		--- src/forum.c
		+++ src/forum.c
		@@ -554,22 +554,16 @@
554	554	/* truncate the title when longer than max allowed;
555	555	* in case of UTF-8 make sure the truncated string remains valid,
556	556	* otherwise (different encoding?) pass as-is
557	557	*/
558	558	if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){
559		- Blob truncated;
560	559	int len;
561		- blob_copy(&truncated, &title);
562		- for( len = mxForumPostTitleLen; len; --len ){
563		- blob_truncate(&truncated, len);
564		- if( !invalid_utf8(&truncated) ) break;
565		- }
	560	+ len = utf8_nearest_codepoint(blob_str(&title), mxForumPostTitleLen);
566	561	if( len ){
567		- blob_append(&truncated, "...", 3);
568		- blob_copy(&title, &truncated);
	562	+ blob_truncate(&title, len);
	563	+ blob_append(&title, "...", 3);
569	564	}
570		- blob_reset(&truncated);
571	565	}
572	566	style_header("%s%s", blob_str(&title), blob_size(&title) ? " - Forum" : "Forum");
573	567	blob_reset(&title);
574	568	fossil_free(zThreadTitle);
575	569	return 0;
576	570

	--- src/forum.c
	+++ src/forum.c
	@@ -554,22 +554,16 @@
554	/* truncate the title when longer than max allowed;
555	* in case of UTF-8 make sure the truncated string remains valid,
556	* otherwise (different encoding?) pass as-is
557	*/
558	if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){
559	Blob truncated;
560	int len;
561	blob_copy(&truncated, &title);
562	for( len = mxForumPostTitleLen; len; --len ){
563	blob_truncate(&truncated, len);
564	if( !invalid_utf8(&truncated) ) break;
565	}
566	if( len ){
567	blob_append(&truncated, "...", 3);
568	blob_copy(&title, &truncated);
569	}
570	blob_reset(&truncated);
571	}
572	style_header("%s%s", blob_str(&title), blob_size(&title) ? " - Forum" : "Forum");
573	blob_reset(&title);
574	fossil_free(zThreadTitle);
575	return 0;
576

	--- src/forum.c
	+++ src/forum.c
	@@ -554,22 +554,16 @@
554	/* truncate the title when longer than max allowed;
555	* in case of UTF-8 make sure the truncated string remains valid,
556	* otherwise (different encoding?) pass as-is
557	*/
558	if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){

559	int len;
560	len = utf8_nearest_codepoint(blob_str(&title), mxForumPostTitleLen);




561	if( len ){
562	blob_truncate(&title, len);
563	blob_append(&title, "...", 3);
564	}

565	}
566	style_header("%s%s", blob_str(&title), blob_size(&title) ? " - Forum" : "Forum");
567	blob_reset(&title);
568	fossil_free(zThreadTitle);
569	return 0;
570

M src/utf8.c

+23

		--- src/utf8.c
		+++ src/utf8.c
		@@ -298,10 +298,33 @@
298	298	fossil_free(pOld);
299	299	#else
300	300	/* No-op on all other unix */
301	301	#endif
302	302	}
	303	+
	304	+/*
	305	+** For a given index in a UTF-8 string, return the nearest index that is the
	306	+** start of a new code point. The returned index is equal or lower than the
	307	+** given index. The end of the string (the null-terminator) is considered a
	308	+** valid start index. The given index is returned unchanged if the string
	309	+** contains invalid UTF-8 (i.e. overlong runs of trail bytes).
	310	+** This function is useful to find code point boundaries for truncation, for
	311	+** example, so that no incomplete UTF-8 sequences are left at the end of the
	312	+** truncated string.
	313	+** This function does not attempt to keep logical and/or visual constructs
	314	+** spanning across multiple code points intact, that is no attempts are made
	315	+** to keep combining characters together with their base characters, or to keep
	316	+** more complex grapheme clusters intact.
	317	+*/
	318	+#define IsUTF8TrailByte(c) ( (c&0xc0)==0x80 )
	319	+int utf8_nearest_codepoint(const char *zString, int maxByteIndex){
	320	+ int i,n;
	321	+ for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
	322	+ if( !IsUTF8TrailByte(zString[i]) ) return i;
	323	+ }
	324	+ return maxByteIndex;
	325	+}
303	326
304	327	/*
305	328	** Display UTF-8 on the console. Return the number of
306	329	** Characters written. If stdout or stderr is redirected
307	330	** to a file, -1 is returned and nothing is written
308	331

	--- src/utf8.c
	+++ src/utf8.c
	@@ -298,10 +298,33 @@
298	fossil_free(pOld);
299	#else
300	/* No-op on all other unix */
301	#endif
302	}























303
304	/*
305	** Display UTF-8 on the console. Return the number of
306	** Characters written. If stdout or stderr is redirected
307	** to a file, -1 is returned and nothing is written
308

	--- src/utf8.c
	+++ src/utf8.c
	@@ -298,10 +298,33 @@
298	fossil_free(pOld);
299	#else
300	/* No-op on all other unix */
301	#endif
302	}
303
/*
** For a given byte index in a UTF-8 string, return the nearest index that is
** the start of a code point. The returned index is equal to or lower than
** the given index. The end of the string (the null-terminator) is considered
** a valid start index, because a NUL byte is not a trail byte.
**
** The given index is returned unchanged if no code point start is found
** among the (at most) 4 bytes examined, i.e. the byte at maxByteIndex and
** up to 3 bytes before it. This happens if the string contains invalid
** UTF-8 (overlong runs of trail bytes), if the scan reaches the start of
** the string while still seeing trail bytes, or if maxByteIndex is negative.
**
** The bytes zString[0..maxByteIndex] are read without any length check, so
** the caller must make sure that maxByteIndex does not point beyond the
** null-terminator of zString.
**
** This function is useful to find code point boundaries for truncation, for
** example, so that no incomplete UTF-8 sequences are left at the end of the
** truncated string.
**
** This function does not attempt to keep logical and/or visual constructs
** spanning across multiple code points intact, that is, no attempts are made
** to keep combining characters together with their base characters, or to
** keep more complex grapheme clusters intact.
*/
/* True if byte c has the bit pattern 10xxxxxx. The argument is parenthesized
** so that expressions such as IsUTF8TrailByte(a|b) are evaluated as a whole. */
#define IsUTF8TrailByte(c) ( ((c)&0xc0)==0x80 )
int utf8_nearest_codepoint(const char *zString, int maxByteIndex){
  int i,n;
  /* A UTF-8 sequence has at most 3 trail bytes, so a lead byte (or ASCII
  ** byte) must show up within 4 bytes when scanning backwards through
  ** well-formed input. */
  for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
    if( !IsUTF8TrailByte(zString[i]) ) return i;
  }
  /* No code point start found: hand back the index unchanged. */
  return maxByteIndex;
}
326
327	/*
328	** Display UTF-8 on the console. Return the number of
329	** Characters written. If stdout or stderr is redirected
330	** to a file, -1 is returned and nothing is written
331