Fossil SCM
Enhance the comment_print() subroutine so that it understands that VT100 escape codes are zero-width characters, and allocates text to lines accordingly.
Commit
32f954a1f2bfa6accba603c712bf914b86766dfd6155f98cf0a41a0a57bccfbc
Parent
25f43cc634e789d…
1 file changed
+27
-12
+27
-12
| --- src/comformat.c | ||
| +++ src/comformat.c | ||
| @@ -234,29 +234,44 @@ | ||
| 234 | 234 | } |
| 235 | 235 | return 0; /* NOT REACHED */ |
| 236 | 236 | } |
| 237 | 237 | |
| 238 | 238 | /* |
| 239 | -** Return information about the next (single- or multi-byte) character in the | |
| 240 | -** specified UTF-8 string: The number of UTF-8 code units (in this case: bytes) | |
| 241 | -** and the decoded UTF-32 code point. Incomplete, ill-formed and overlong | |
| 242 | -** sequences are consumed together as one invalid code point. The invalid lead | |
| 243 | -** bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- | |
| 244 | -** and 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF | |
| 245 | -** are treated as invalid 1-byte sequences (as lone trail bytes), all resulting | |
| 246 | -** in one invalid code point. Invalid UTF-8 sequences encoding a non-scalar code | |
| 247 | -** point (UTF-16 surrogates U+D800 to U+DFFF) are allowed. | |
| 239 | +** Return information about the next (single- or multi-byte) character in | |
| 240 | +** z[0]. Two values are computed: | |
| 241 | +** | |
| 242 | +** * The number of bytes needed to represent the character. | |
| 243 | +** * The UTF code point value. | |
| 244 | +** | |
| 245 | +** Incomplete, ill-formed and overlong sequences are consumed together as | |
| 246 | +** one invalid code point. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to | |
| 247 | +** 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte sequences, | |
| 248 | +** respectively, the other invalid lead bytes 0xF8 to 0xFF are treated | |
| 249 | +** as invalid 1-byte sequences (as lone trail bytes), all resulting | |
| 250 | +** in one invalid code point. Invalid UTF-8 sequences encoding a | |
| 251 | +** non-scalar code point (UTF-16 surrogates U+D800 to U+DFFF) are allowed. | |
| 252 | +** | |
| 253 | +** ANSI escape sequences of the form "\033[...X" are interpreted as a | |
| 254 | +** zero-width character. | |
| 248 | 255 | */ |
| 249 | 256 | void char_info_utf8( |
| 250 | - const char *z, | |
| 251 | - int *pCchUTF8, | |
| 252 | - int *pUtf32 | |
| 257 | + const char *z, /* The character to be analyzed */ | |
| 258 | + int *pCchUTF8, /* OUT: The number of bytes used by this character */ | |
| 259 | + int *pUtf32 /* OUT: The UTF8 code point (used to determine width) */ | |
| 253 | 260 | ){ |
| 254 | 261 | int i = 0; /* Counted bytes. */ |
| 255 | 262 | int cchUTF8 = 1; /* Code units consumed. */ |
| 256 | 263 | int maxUTF8 = 1; /* Expected sequence length. */ |
| 257 | 264 | char c = z[i++]; |
| 265 | + if( c==0x1b && z[i]=='[' ){ | |
| 266 | + do{ | |
| 267 | + i++; | |
| 268 | + }while( fossil_isdigit(z[i]) || z[i]==';' ); | |
| 269 | + *pCchUTF8 = i+1; | |
| 270 | + *pUtf32 = 0x301; /* A zero-width character */ | |
| 271 | + return; | |
| 272 | + } | |
| 258 | 273 | if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */ |
| 259 | 274 | *pCchUTF8 = 1; |
| 260 | 275 | *pUtf32 = (int)z[0]; |
| 261 | 276 | return; |
| 262 | 277 | } |
| 263 | 278 |
| --- src/comformat.c | |
| +++ src/comformat.c | |
| @@ -234,29 +234,44 @@ | |
| 234 | } |
| 235 | return 0; /* NOT REACHED */ |
| 236 | } |
| 237 | |
| 238 | /* |
| 239 | ** Return information about the next (single- or multi-byte) character in the |
| 240 | ** specified UTF-8 string: The number of UTF-8 code units (in this case: bytes) |
| 241 | ** and the decoded UTF-32 code point. Incomplete, ill-formed and overlong |
| 242 | ** sequences are consumed together as one invalid code point. The invalid lead |
| 243 | ** bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- |
| 244 | ** and 4-byte sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF |
| 245 | ** are treated as invalid 1-byte sequences (as lone trail bytes), all resulting |
| 246 | ** in one invalid code point. Invalid UTF-8 sequences encoding a non-scalar code |
| 247 | ** point (UTF-16 surrogates U+D800 to U+DFFF) are allowed. |
| 248 | */ |
| 249 | void char_info_utf8( |
| 250 | const char *z, |
| 251 | int *pCchUTF8, |
| 252 | int *pUtf32 |
| 253 | ){ |
| 254 | int i = 0; /* Counted bytes. */ |
| 255 | int cchUTF8 = 1; /* Code units consumed. */ |
| 256 | int maxUTF8 = 1; /* Expected sequence length. */ |
| 257 | char c = z[i++]; |
| 258 | if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */ |
| 259 | *pCchUTF8 = 1; |
| 260 | *pUtf32 = (int)z[0]; |
| 261 | return; |
| 262 | } |
| 263 |
| --- src/comformat.c | |
| +++ src/comformat.c | |
| @@ -234,29 +234,44 @@ | |
| 234 | } |
| 235 | return 0; /* NOT REACHED */ |
| 236 | } |
| 237 | |
| 238 | /* |
| 239 | ** Return information about the next (single- or multi-byte) character in |
| 240 | ** z[0]. Two values are computed: |
| 241 | ** |
| 242 | ** * The number of bytes needed to represent the character. |
| 243 | ** * The UTF code point value. |
| 244 | ** |
| 245 | ** Incomplete, ill-formed and overlong sequences are consumed together as |
| 246 | ** one invalid code point. The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to |
| 247 | ** 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte sequences, |
| 248 | ** respectively, the other invalid lead bytes 0xF8 to 0xFF are treated |
| 249 | ** as invalid 1-byte sequences (as lone trail bytes), all resulting |
| 250 | ** in one invalid code point. Invalid UTF-8 sequences encoding a |
| 251 | ** non-scalar code point (UTF-16 surrogates U+D800 to U+DFFF) are allowed. |
| 252 | ** |
| 253 | ** ANSI escape sequences of the form "\033[...X" are interpreted as a |
| 254 | ** zero-width character. |
| 255 | */ |
| 256 | void char_info_utf8( |
| 257 | const char *z, /* The character to be analyzed */ |
| 258 | int *pCchUTF8, /* OUT: The number of bytes used by this character */ |
| 259 | int *pUtf32 /* OUT: The UTF8 code point (used to determine width) */ |
| 260 | ){ |
| 261 | int i = 0; /* Counted bytes. */ |
| 262 | int cchUTF8 = 1; /* Code units consumed. */ |
| 263 | int maxUTF8 = 1; /* Expected sequence length. */ |
| 264 | char c = z[i++]; |
| 265 | if( c==0x1b && z[i]=='[' ){ |
| 266 | do{ |
| 267 | i++; |
| 268 | }while( fossil_isdigit(z[i]) || z[i]==';' ); |
| 269 | *pCchUTF8 = i+1; |
| 270 | *pUtf32 = 0x301; /* A zero-width character */ |
| 271 | return; |
| 272 | } |
| 273 | if( (c&0x80)==0x00 ){ /* 7-bit ASCII character. */ |
| 274 | *pCchUTF8 = 1; |
| 275 | *pUtf32 = (int)z[0]; |
| 276 | return; |
| 277 | } |
| 278 |