Fossil SCM

Improve truncation of UTF-8 encoded title using a function by @florian.balmer per [https://fossil-scm.org/forum/forumpost/6d90d5d99c]

ashepilko 2020-03-06 17:08 trunk

Commit d076853d10a2f2f7b1812b1db4c45ef522060e0fdcddd6b865d79a164906bea8

Parent b70a76e35495d50…

2 files changed +1 -1 +30

~ src/forum.c ~ src/utf8.c

M src/forum.c

+1 -1

		--- src/forum.c
		+++ src/forum.c
		@@ -555,11 +555,11 @@
555	555	* in case of UTF-8 make sure the truncated string remains valid,
556	556	* otherwise (different encoding?) pass as-is
557	557	*/
558	558	if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){
559	559	int len;
560		- len = utf8_nearest_codepoint(blob_str(&title), mxForumPostTitleLen);
	560	+ len = utf8_codepoint_index(blob_str(&title), mxForumPostTitleLen);
561	561	if( len ){
562	562	blob_truncate(&title, len);
563	563	blob_append(&title, "...", 3);
564	564	}
565	565	}
566	566

	--- src/forum.c
	+++ src/forum.c
	@@ -555,11 +555,11 @@
555	* in case of UTF-8 make sure the truncated string remains valid,
556	* otherwise (different encoding?) pass as-is
557	*/
558	if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){
559	int len;
560	len = utf8_nearest_codepoint(blob_str(&title), mxForumPostTitleLen);
561	if( len ){
562	blob_truncate(&title, len);
563	blob_append(&title, "...", 3);
564	}
565	}
566

	--- src/forum.c
	+++ src/forum.c
	@@ -555,11 +555,11 @@
555	* in case of UTF-8 make sure the truncated string remains valid,
556	* otherwise (different encoding?) pass as-is
557	*/
558	if( mxForumPostTitleLen>0 && blob_size(&title)>mxForumPostTitleLen ){
559	int len;
560	len = utf8_codepoint_index(blob_str(&title), mxForumPostTitleLen);
561	if( len ){
562	blob_truncate(&title, len);
563	blob_append(&title, "...", 3);
564	}
565	}
566

M src/utf8.c

+30

		--- src/utf8.c
		+++ src/utf8.c
		@@ -321,10 +321,40 @@
321	321	for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
322	322	if( !IsUTF8TrailByte(zString[i]) ) return i;
323	323	}
324	324	return maxByteIndex;
325	325	}
	326	+
	327	+/*
	328	+** Find the byte index corresponding to the given code point index in a UTF-8
	329	+** string. If the string contains fewer than the given number of code points,
	330	+** the index of the end of the string (the null-terminator) is returned.
	331	+** Incomplete, ill-formed and overlong sequences are counted as one sequence.
	332	+** The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate
	333	+** (ill-formed) 2- and 4-byte sequences, respectively, the other invalid lead
	334	+** bytes 0xF8 to 0xFF are treated as invalid 1-byte sequences (as lone trail
	335	+** bytes).
	336	+*/
	337	+int utf8_codepoint_index(const char *zString, int nCodePoint){
	338	+ int i; /* Counted bytes. */
	339	+ int lenUTF8; /* Counted UTF-8 sequences. */
	340	+ if( zString==0 ) return 0;
	341	+ for(i=0, lenUTF8=0; zString[i]!=0 && lenUTF8<nCodePoint; i++, lenUTF8++){
	342	+ char c = zString[i];
	343	+ int cchUTF8=1; /* Code units consumed. */
	344	+ int maxUTF8=1; /* Expected sequence length. */
	345	+ if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
	346	+ else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
	347	+ else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
	348	+ while( cchUTF8<maxUTF8 &&
	349	+ (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
	350	+ cchUTF8++;
	351	+ i++;
	352	+ }
	353	+ }
	354	+ return i;
	355	+}
326	356
327	357	/*
328	358	** Display UTF-8 on the console. Return the number of
329	359	** Characters written. If stdout or stderr is redirected
330	360	** to a file, -1 is returned and nothing is written
331	361

	--- src/utf8.c
	+++ src/utf8.c
	@@ -321,10 +321,40 @@
321	for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
322	if( !IsUTF8TrailByte(zString[i]) ) return i;
323	}
324	return maxByteIndex;
325	}






























326
327	/*
328	** Display UTF-8 on the console. Return the number of
329	** Characters written. If stdout or stderr is redirected
330	** to a file, -1 is returned and nothing is written
331

	--- src/utf8.c
	+++ src/utf8.c
	@@ -321,10 +321,40 @@
321	for( n=0, i=maxByteIndex; n<4 && i>=0; n++, i-- ){
322	if( !IsUTF8TrailByte(zString[i]) ) return i;
323	}
324	return maxByteIndex;
325	}
326
327	/*
328	** Find the byte index corresponding to the given code point index in a UTF-8
329	** string. If the string contains fewer than the given number of code points,
330	** the index of the end of the string (the null-terminator) is returned.
331	** Incomplete, ill-formed and overlong sequences are counted as one sequence.
332	** The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate
333	** (ill-formed) 2- and 4-byte sequences, respectively, the other invalid lead
334	** bytes 0xF8 to 0xFF are treated as invalid 1-byte sequences (as lone trail
335	** bytes).
336	*/
337	int utf8_codepoint_index(const char *zString, int nCodePoint){
338	int i; /* Counted bytes. */
339	int lenUTF8; /* Counted UTF-8 sequences. */
340	if( zString==0 ) return 0;
341	for(i=0, lenUTF8=0; zString[i]!=0 && lenUTF8<nCodePoint; i++, lenUTF8++){
342	char c = zString[i];
343	int cchUTF8=1; /* Code units consumed. */
344	int maxUTF8=1; /* Expected sequence length. */
345	if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
346	else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
347	else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
348	while( cchUTF8<maxUTF8 &&
349	(zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
350	cchUTF8++;
351	i++;
352	}
353	}
354	return i;
355	}
356
357	/*
358	** Display UTF-8 on the console. Return the number of
359	** Characters written. If stdout or stderr is redirected
360	** to a file, -1 is returned and nothing is written
361

Fossil SCM

Keyboard Shortcuts