Fossil SCM

Quick test whether the `cli_wcwidth()` function recently added to the SQLite shell (https://sqlite.org/src/vdiff?branch=variable-width-char) can be used by the comment formatter to take character widths into account when calculating word-break positions. TODOs: (0) Fix the "modern" (i.e. non-legacy) comment formatter being off by one if a fullwidth character only fits partially. (1) Add tests for the comment formatters with non-ASCII input. (2) Implement a modified `decodeUtf8()` function (which is static, anyway) that also accepts single-byte UTF-8 characters and may allow for some simplifications to the comment formatter algorithms.

florian 2024-09-27 04:52 trunk

Commit b2dbdc8afbff1c162400696cae7e8a80e444cd6129b01c496fda15ad59e8f68c

Parent c20aa86727773e3…

2 files changed +1 -1 +10

~ extsrc/shell.c ~ src/comformat.c

M extsrc/shell.c

+1 -1

		--- extsrc/shell.c
		+++ extsrc/shell.c
		@@ -1024,11 +1024,11 @@
1024	1024	** Compute the value and length of a multi-byte UTF-8 character that
1025	1025	** begins at z[0]. Return the length. Write the Unicode value into *pU.
1026	1026	**
1027	1027	** This routine only works for multi-byte UTF-8 characters.
1028	1028	*/
1029		-static int decodeUtf8(const unsigned char *z, int *pU){
	1029	+int decodeUtf8(const unsigned char *z, int *pU){
1030	1030	if( (z[0] & 0xe0)==0xc0 && (z[1] & 0xc0)==0x80 ){
1031	1031	*pU = ((z[0] & 0x1f)<<6) | (z[1] & 0x3f);
1032	1032	return 2;
1033	1033	}
1034	1034	if( (z[0] & 0xf0)==0xe0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80 ){
1035	1035

	--- extsrc/shell.c
	+++ extsrc/shell.c
	@@ -1024,11 +1024,11 @@
1024	** Compute the value and length of a multi-byte UTF-8 character that
1025	** begins at z[0]. Return the length. Write the Unicode value into *pU.
1026	**
1027	** This routine only works for multi-byte UTF-8 characters.
1028	*/
1029	static int decodeUtf8(const unsigned char *z, int *pU){
1030	if( (z[0] & 0xe0)==0xc0 && (z[1] & 0xc0)==0x80 ){
1031	*pU = ((z[0] & 0x1f)<<6) | (z[1] & 0x3f);
1032	return 2;
1033	}
1034	if( (z[0] & 0xf0)==0xe0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80 ){
1035

	--- extsrc/shell.c
	+++ extsrc/shell.c
	@@ -1024,11 +1024,11 @@
1024	** Compute the value and length of a multi-byte UTF-8 character that
1025	** begins at z[0]. Return the length. Write the Unicode value into *pU.
1026	**
1027	** This routine only works for multi-byte UTF-8 characters.
1028	*/
1029	int decodeUtf8(const unsigned char *z, int *pU){
1030	if( (z[0] & 0xe0)==0xc0 && (z[1] & 0xc0)==0x80 ){
1031	*pU = ((z[0] & 0x1f)<<6) | (z[1] & 0x3f);
1032	return 2;
1033	}
1034	if( (z[0] & 0xf0)==0xe0 && (z[1] & 0xc0)==0x80 && (z[2] & 0xc0)==0x80 ){
1035

M src/comformat.c

+10

		--- src/comformat.c
		+++ src/comformat.c
		@@ -294,10 +294,15 @@
294	294	while( cchUTF8<maxUTF8 &&
295	295	(zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
296	296	cchUTF8++;
297	297	zBuf[iBuf++] = zLine[index++];
298	298	}
	299	+ if( cchUTF8>1 ){
	300	+ int utf32;
	301	+ decodeUtf8(&zLine[index-cchUTF8],&utf32);
	302	+ useChars += cli_wcwidth(utf32) - 1;
	303	+ }
299	304	maxChars -= useChars;
300	305	if( maxChars<=0 ) break;
301	306	if( c=='\n' ) break;
302	307	}
303	308	if( charCnt>0 ){
		@@ -380,10 +385,15 @@
380	385	(zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
381	386	cchUTF8++;
382	387	zBuf[k++] = zText[++i];
383	388	}
384	389	}
	390	+ if( cchUTF8>1 ){
	391	+ int utf32;
	392	+ decodeUtf8(&zText[k-cchUTF8],&utf32);
	393	+ kc += cli_wcwidth(utf32) - 1;
	394	+ }
385	395	else if( fossil_isspace(c) ){
386	396	si = i;
387	397	sk = k;
388	398	if( k==0 || zBuf[k-1]!=' ' ){
389	399	zBuf[k++] = ' ';
390	400

	--- src/comformat.c
	+++ src/comformat.c
	@@ -294,10 +294,15 @@
294	while( cchUTF8<maxUTF8 &&
295	(zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
296	cchUTF8++;
297	zBuf[iBuf++] = zLine[index++];
298	}





299	maxChars -= useChars;
300	if( maxChars<=0 ) break;
301	if( c=='\n' ) break;
302	}
303	if( charCnt>0 ){
	@@ -380,10 +385,15 @@
380	(zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
381	cchUTF8++;
382	zBuf[k++] = zText[++i];
383	}
384	}





385	else if( fossil_isspace(c) ){
386	si = i;
387	sk = k;
388	if( k==0 || zBuf[k-1]!=' ' ){
389	zBuf[k++] = ' ';
390

	--- src/comformat.c
	+++ src/comformat.c
	@@ -294,10 +294,15 @@
294	while( cchUTF8<maxUTF8 &&
295	(zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
296	cchUTF8++;
297	zBuf[iBuf++] = zLine[index++];
298	}
299	if( cchUTF8>1 ){
300	int utf32;
301	decodeUtf8(&zLine[index-cchUTF8],&utf32);
302	useChars += cli_wcwidth(utf32) - 1;
303	}
304	maxChars -= useChars;
305	if( maxChars<=0 ) break;
306	if( c=='\n' ) break;
307	}
308	if( charCnt>0 ){
	@@ -380,10 +385,15 @@
385	(zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
386	cchUTF8++;
387	zBuf[k++] = zText[++i];
388	}
389	}
390	if( cchUTF8>1 ){
391	int utf32;
392	decodeUtf8(&zText[k-cchUTF8],&utf32);
393	kc += cli_wcwidth(utf32) - 1;
394	}
395	else if( fossil_isspace(c) ){
396	si = i;
397	sk = k;
398	if( k==0 || zBuf[k-1]!=' ' ){
399	zBuf[k++] = ' ';
400

Fossil SCM

Keyboard Shortcuts