Fossil SCM

Enhance the comment_print() subroutine so that it understands that VT100 escape codes are zero-width characters, and allocates text to lines accordingly.

drh 2025-02-26 18:07 trunk
Commit 32f954a1f2bfa6accba603c712bf914b86766dfd6155f98cf0a41a0a57bccfbc
1 file changed +27 -12
+27 -12
--- src/comformat.c
+++ src/comformat.c
@@ -234,29 +234,44 @@
234234
}
235235
return 0; /* NOT REACHED */
236236
}
237237
238238
/*
239
-** Return information about the next (single- or multi-byte) character in the
240
-** specified UTF-8 string: The number of UTF-8 code units (in this case: bytes)
241
-** and the decoded UTF-32 code point. Incomplete, ill-formed and overlong
242
-** sequences are consumed together as one invalid code point. The invalid lead
243
-** bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2-
244
-** and 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF
245
-** are treated as invalid 1-byte sequences (as lone trail bytes), all resulting
246
-** in one invalid code point. Invalid UTF-8 sequences encoding a non-scalar code
247
-** point (UTF-16 surrogates U+D800 to U+DFFF) are allowed.
239
+** Return information about the next (single- or multi-byte) character in
240
+** z[0]. Two values are computed:
241
+**
242
+** * The number of bytes needed to represent the character.
243
+** * The UTF code point value.
244
+**
245
+** Incomplete, ill-formed and overlong sequences are consumed together as
246
+** one invalid code point. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to
247
+** 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte sequences,
248
+** respectively, the other invalid lead bytes 0xF8 to 0xFF are treated
249
+** as invalid 1-byte sequences (as lone trail bytes), all resulting
250
+** in one invalid code point. Invalid UTF-8 sequences encoding a
251
+** non-scalar code point (UTF-16 surrogates U+D800 to U+DFFF) are allowed.
252
+**
253
+** ANSI escape sequences of the form "\033[...X" are interpreted as a
254
+** zero-width character.
248255
*/
249256
void char_info_utf8(
250
- const char *z,
251
- int *pCchUTF8,
252
- int *pUtf32
257
+ const char *z, /* The character to be analyzed */
258
+ int *pCchUTF8, /* OUT: The number of bytes used by this character */
259
+ int *pUtf32 /* OUT: The UTF8 code point (used to determine width) */
253260
){
254261
int i = 0; /* Counted bytes. */
255262
int cchUTF8 = 1; /* Code units consumed. */
256263
int maxUTF8 = 1; /* Expected sequence length. */
257264
char c = z[i++];
265
+ if( c==0x1b && z[i]=='[' ){
266
+ do{
267
+ i++;
268
+ }while( fossil_isdigit(z[i]) || z[i]==';' );
269
+ *pCchUTF8 = i+1;
270
+ *pUtf32 = 0x301; /* A zero-width character */
271
+ return;
272
+ }
258273
if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */
259274
*pCchUTF8 = 1;
260275
*pUtf32 = (int)z[0];
261276
return;
262277
}
263278
--- src/comformat.c
+++ src/comformat.c
@@ -234,29 +234,44 @@
234 }
235 return 0; /* NOT REACHED */
236 }
237
238 /*
239 ** Return information about the next (single- or multi-byte) character in the
240 ** specified UTF-8 string: The number of UTF-8 code units (in this case: bytes)
241 ** and the decoded UTF-32 code point. Incomplete, ill-formed and overlong
242 ** sequences are consumed together as one invalid code point. The invalid lead
243 ** bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2-
244 ** and 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF
245 ** are treated as invalid 1-byte sequences (as lone trail bytes), all resulting
246 ** in one invalid code point. Invalid UTF-8 sequences encoding a non-scalar code
247 ** point (UTF-16 surrogates U+D800 to U+DFFF) are allowed.
 
 
 
 
 
 
 
248 */
249 void char_info_utf8(
250 const char *z,
251 int *pCchUTF8,
252 int *pUtf32
253 ){
254 int i = 0; /* Counted bytes. */
255 int cchUTF8 = 1; /* Code units consumed. */
256 int maxUTF8 = 1; /* Expected sequence length. */
257 char c = z[i++];
 
 
 
 
 
 
 
 
258 if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */
259 *pCchUTF8 = 1;
260 *pUtf32 = (int)z[0];
261 return;
262 }
263
--- src/comformat.c
+++ src/comformat.c
@@ -234,29 +234,44 @@
234 }
235 return 0; /* NOT REACHED */
236 }
237
238 /*
239 ** Return information about the next (single- or multi-byte) character in
240 ** z[0]. Two values are computed:
241 **
242 ** * The number of bytes needed to represent the character.
243 ** * The UTF code point value.
244 **
245 ** Incomplete, ill-formed and overlong sequences are consumed together as
246 ** one invalid code point. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to
247 ** 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte sequences,
248 ** respectively, the other invalid lead bytes 0xF8 to 0xFF are treated
249 ** as invalid 1-byte sequences (as lone trail bytes), all resulting
250 ** in one invalid code point. Invalid UTF-8 sequences encoding a
251 ** non-scalar code point (UTF-16 surrogates U+D800 to U+DFFF) are allowed.
252 **
253 ** ANSI escape sequences of the form "\033[...X" are interpreted as a
254 ** zero-width character.
255 */
256 void char_info_utf8(
257 const char *z, /* The character to be analyzed */
258 int *pCchUTF8, /* OUT: The number of bytes used by this character */
259 int *pUtf32 /* OUT: The UTF8 code point (used to determine width) */
260 ){
261 int i = 0; /* Counted bytes. */
262 int cchUTF8 = 1; /* Code units consumed. */
263 int maxUTF8 = 1; /* Expected sequence length. */
264 char c = z[i++];
265 if( c==0x1b && z[i]=='[' ){
266 do{
267 i++;
268 }while( fossil_isdigit(z[i]) || z[i]==';' );
269 *pCchUTF8 = i+1;
270 *pUtf32 = 0x301; /* A zero-width character */
271 return;
272 }
273 if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */
274 *pCchUTF8 = 1;
275 *pUtf32 = (int)z[0];
276 return;
277 }
278

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button