Fossil SCM

Enhance the comment_print() subroutine so that it understands that VT100 escape codes are zero-width characters, and allocates text to lines accordingly.

drh 2025-02-26 18:07 trunk

Commit 32f954a1f2bfa6accba603c712bf914b86766dfd6155f98cf0a41a0a57bccfbc

Parent 25f43cc634e789d…

1 file changed +27 -12

~ src/comformat.c

M src/comformat.c

+27 -12

		--- src/comformat.c
		+++ src/comformat.c
		@@ -234,29 +234,44 @@
234	234	}
235	235	return 0; /* NOT REACHED */
236	236	}
237	237
238	238	/*
239		-** Return information about the next (single- or multi-byte) character in the
240		-** specified UTF-8 string: The number of UTF-8 code units (in this case: bytes)
241		-** and the decoded UTF-32 code point. Incomplete, ill-formed and overlong
242		-** sequences are consumed together as one invalid code point. The invalid lead
243		-** bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2-
244		-** and 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF
245		-** are treated as invalid 1-byte sequences (as lone trail bytes), all resulting
246		-** in one invalid code point. Invalid UTF-8 sequences encoding a non-scalar code
247		-** point (UTF-16 surrogates U+D800 to U+DFFF) are allowed.
	239	+** Return information about the next (single- or multi-byte) character in
	240	+** z[0]. Two values are computed:
	241	+**
	242	+** * The number of bytes needed to represent the character.
	243	+** * The UTF code point value.
	244	+**
	245	+** Incomplete, ill-formed and overlong sequences are consumed together as
	246	+** one invalid code point. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to
	247	+** 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte sequences,
	248	+** respectively, the other invalid lead bytes 0xF8 to 0xFF are treated
	249	+** as invalid 1-byte sequences (as lone trail bytes), all resulting
	250	+** in one invalid code point. Invalid UTF-8 sequences encoding a
	251	+** non-scalar code point (UTF-16 surrogates U+D800 to U+DFFF) are allowed.
	252	+**
	253	+** ANSI escape sequences of the form "\033[...X" are interpreted as a
	254	+** zero-width character.
248	255	*/
249	256	void char_info_utf8(
250		- const char *z,
251		- int *pCchUTF8,
252		- int *pUtf32
	257	+ const char *z, /* The character to be analyzed */
	258	+ int *pCchUTF8, /* OUT: The number of bytes used by this character */
	259	+ int *pUtf32 /* OUT: The UTF8 code point (used to determine width) */
253	260	){
254	261	int i = 0; /* Counted bytes. */
255	262	int cchUTF8 = 1; /* Code units consumed. */
256	263	int maxUTF8 = 1; /* Expected sequence length. */
257	264	char c = z[i++];
	265	+ if( c==0x1b && z[i]=='[' ){
	266	+ do{
	267	+ i++;
	268	+ }while( fossil_isdigit(z[i]) || z[i]==';' );
	269	+ *pCchUTF8 = i+1;
	270	+ *pUtf32 = 0x301; /* A zero-width character */
	271	+ return;
	272	+ }
258	273	if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */
259	274	*pCchUTF8 = 1;
260	275	*pUtf32 = (int)z[0];
261	276	return;
262	277	}
263	278

	--- src/comformat.c
	+++ src/comformat.c
	@@ -234,29 +234,44 @@
234	}
235	return 0; /* NOT REACHED */
236	}
237
238	/*
239	** Return information about the next (single- or multi-byte) character in the
240	** specified UTF-8 string: The number of UTF-8 code units (in this case: bytes)
241	** and the decoded UTF-32 code point. Incomplete, ill-formed and overlong
242	** sequences are consumed together as one invalid code point. The invalid lead
243	** bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2-
244	** and 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF
245	** are treated as invalid 1-byte sequences (as lone trail bytes), all resulting
246	** in one invalid code point. Invalid UTF-8 sequences encoding a non-scalar code
247	** point (UTF-16 surrogates U+D800 to U+DFFF) are allowed.







248	*/
249	void char_info_utf8(
250	const char *z,
251	int *pCchUTF8,
252	int *pUtf32
253	){
254	int i = 0; /* Counted bytes. */
255	int cchUTF8 = 1; /* Code units consumed. */
256	int maxUTF8 = 1; /* Expected sequence length. */
257	char c = z[i++];








258	if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */
259	*pCchUTF8 = 1;
260	*pUtf32 = (int)z[0];
261	return;
262	}
263

	--- src/comformat.c
	+++ src/comformat.c
	@@ -234,29 +234,44 @@
234	}
235	return 0; /* NOT REACHED */
236	}
237
238	/*
239	** Return information about the next (single- or multi-byte) character in
240	** z[0]. Two values are computed:
241	**
242	** * The number of bytes needed to represent the character.
243	** * The UTF code point value.
244	**
245	** Incomplete, ill-formed and overlong sequences are consumed together as
246	** one invalid code point. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to
247	** 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte sequences,
248	** respectively, the other invalid lead bytes 0xF8 to 0xFF are treated
249	** as invalid 1-byte sequences (as lone trail bytes), all resulting
250	** in one invalid code point. Invalid UTF-8 sequences encoding a
251	** non-scalar code point (UTF-16 surrogates U+D800 to U+DFFF) are allowed.
252	**
253	** ANSI escape sequences of the form "\033[...X" are interpreted as a
254	** zero-width character.
255	*/
256	void char_info_utf8(
257	const char *z, /* The character to be analyzed */
258	int *pCchUTF8, /* OUT: The number of bytes used by this character */
259	int *pUtf32 /* OUT: The UTF8 code point (used to determine width) */
260	){
261	int i = 0; /* Counted bytes. */
262	int cchUTF8 = 1; /* Code units consumed. */
263	int maxUTF8 = 1; /* Expected sequence length. */
264	char c = z[i++];
265	if( c==0x1b && z[i]=='[' ){
266	do{
267	i++;
268	}while( fossil_isdigit(z[i]) || z[i]==';' );
269	*pCchUTF8 = i+1;
270	*pUtf32 = 0x301; /* A zero-width character */
271	return;
272	}
273	if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */
274	*pCchUTF8 = 1;
275	*pUtf32 = (int)z[0];
276	return;
277	}
278

Fossil SCM

Keyboard Shortcuts