Fossil SCM
For better word-breaking results with the (non-legacy) comment printing algorithm, make sure the lookahead to the next space character is UTF-8-aware. Also make sure the per-line remaining character count is decremented properly for UTF-8 sequences. The critical points now handle UTF-8 sequences correctly, and they could be enhanced to work with the effective display width, if required (to handle combining characters and East Asian Wide and Fullwidth characters).
Commit
c9ec3d1886367b546a37e674df1bff9913d8664a
Parent
29d3a2ed4ee03c7…
1 file changed
+56
-7
+56
-7
| --- src/comformat.c | ||
| +++ src/comformat.c | ||
| @@ -120,22 +120,67 @@ | ||
| 120 | 120 | ** zero if such a character cannot be found. For the purposes of this |
| 121 | 121 | ** algorithm, the NUL character is treated the same as a spacing character. |
| 122 | 122 | */ |
| 123 | 123 | static int comment_next_space( |
| 124 | 124 | const char *zLine, /* [in] The comment line being printed. */ |
| 125 | - int index /* [in] The current character index being handled. */ | |
| 125 | + int index, /* [in] The current character index being handled. */ | |
| 126 | + int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */ | |
| 126 | 127 | ){ |
| 127 | 128 | int nextIndex = index + 1; |
| 129 | + int fNonASCII=0; | |
| 128 | 130 | for(;;){ |
| 129 | 131 | char c = zLine[nextIndex]; |
| 132 | + if ( (c&0x80)==0x80 ) fNonASCII=1; | |
| 130 | 133 | if( c==0 || fossil_isspace(c) ){ |
| 134 | + if ( distUTF8 ){ | |
| 135 | + if ( fNonASCII!=0 ){ | |
| 136 | + *distUTF8 = strlen_utf8(&zLine[index], nextIndex-index); | |
| 137 | + }else{ | |
| 138 | + *distUTF8 = nextIndex-index; | |
| 139 | + } | |
| 140 | + } | |
| 131 | 141 | return nextIndex; |
| 132 | 142 | } |
| 133 | 143 | nextIndex++; |
| 134 | 144 | } |
| 135 | 145 | return 0; /* NOT REACHED */ |
| 136 | 146 | } |
| 147 | + | |
| 148 | +/* | |
| 149 | +** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and | |
| 150 | +** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0 | |
| 151 | +** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte | |
| 152 | +** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are | |
| 153 | +** treated as invalid 1-byte sequences (as lone trail bytes). | |
| 154 | +** Combining characters and East Asian Wide and Fullwidth characters are counted | |
| 155 | +** as one, so this function does not calculate the effective "display width". | |
| 156 | +*/ | |
| 157 | +int strlen_utf8(const char *zString, int lengthBytes) | |
| 158 | +{ | |
| 159 | +#if 0 | |
| 160 | + assert( lengthBytes>=0 ); | |
| 161 | +#endif | |
| 162 | + int lengthUTF8=0; /* Counted UTF-8 sequences. */ | |
| 163 | + int i; | |
| 164 | + for( i=0; i<lengthBytes; i++ ){ | |
| 165 | + char c = zString[i]; | |
| 166 | + lengthUTF8++; | |
| 167 | + if ( (c&0xc0)==0xc0 ){ /* Any UTF-8 lead byte 11xxxxxx */ | |
| 168 | + int cchUTF8=1; /* Code units consumed. */ | |
| 169 | + int maxUTF8=1; /* Expected sequence length. */ | |
| 170 | + if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ | |
| 171 | + else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ | |
| 172 | + else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ | |
| 173 | + while( i<lengthBytes-1 && | |
| 174 | + cchUTF8<maxUTF8 && | |
| 175 | + (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ | |
| 176 | + i++; | |
| 177 | + } | |
| 178 | + } | |
| 179 | + } | |
| 180 | + return lengthUTF8; | |
| 181 | +} | |
| 137 | 182 | |
| 138 | 183 | /* |
| 139 | 184 | ** This function is called when printing a logical comment line to calculate |
| 140 | 185 | ** the necessary indenting. The caller needs to emit the indenting spaces. |
| 141 | 186 | */ |
| @@ -227,23 +272,25 @@ | ||
| 227 | 272 | if( c=='\n' ){ |
| 228 | 273 | lineCnt++; |
| 229 | 274 | charCnt = 0; |
| 230 | 275 | useChars = 0; |
| 231 | 276 | }else if( c=='\t' ){ |
| 232 | - int nextIndex = comment_next_space(zLine, index); | |
| 233 | - if( nextIndex<=0 || (nextIndex-index)>maxChars ){ | |
| 277 | + int distUTF8; | |
| 278 | + int nextIndex = comment_next_space(zLine, index, &distUTF8); | |
| 279 | + if( nextIndex<=0 || distUTF8>maxChars ){ | |
| 234 | 280 | break; |
| 235 | 281 | } |
| 236 | 282 | charCnt++; |
| 237 | 283 | useChars = COMMENT_TAB_WIDTH; |
| 238 | 284 | if( maxChars<useChars ){ |
| 239 | 285 | zBuf[iBuf++] = ' '; |
| 240 | 286 | break; |
| 241 | 287 | } |
| 242 | 288 | }else if( wordBreak && fossil_isspace(c) ){ |
| 243 | - int nextIndex = comment_next_space(zLine, index); | |
| 244 | - if( nextIndex<=0 || (nextIndex-index)>maxChars ){ | |
| 289 | + int distUTF8; | |
| 290 | + int nextIndex = comment_next_space(zLine, index, &distUTF8); | |
| 291 | + if( nextIndex<=0 || distUTF8>maxChars ){ | |
| 245 | 292 | break; |
| 246 | 293 | } |
| 247 | 294 | charCnt++; |
| 248 | 295 | }else{ |
| 249 | 296 | charCnt++; |
| @@ -267,14 +314,16 @@ | ||
| 267 | 314 | while( cchUTF8<maxUTF8 && |
| 268 | 315 | (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 269 | 316 | cchUTF8++; |
| 270 | 317 | zBuf[iBuf++] = zLine[index++]; |
| 271 | 318 | } |
| 319 | + maxChars--; | |
| 272 | 320 | } |
| 273 | - else | |
| 321 | + else { | |
| 274 | 322 | zBuf[iBuf++] = c; |
| 275 | - if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars; | |
| 323 | + maxChars -= useChars; | |
| 324 | + } | |
| 276 | 325 | if( maxChars<=0 ) break; |
| 277 | 326 | if( c=='\n' ) break; |
| 278 | 327 | } |
| 279 | 328 | if( charCnt>0 ){ |
| 280 | 329 | zBuf[iBuf++] = '\n'; |
| 281 | 330 |
| --- src/comformat.c | |
| +++ src/comformat.c | |
| @@ -120,22 +120,67 @@ | |
| 120 | ** zero if such a character cannot be found. For the purposes of this |
| 121 | ** algorithm, the NUL character is treated the same as a spacing character. |
| 122 | */ |
| 123 | static int comment_next_space( |
| 124 | const char *zLine, /* [in] The comment line being printed. */ |
| 125 | int index /* [in] The current character index being handled. */ |
| 126 | ){ |
| 127 | int nextIndex = index + 1; |
| 128 | for(;;){ |
| 129 | char c = zLine[nextIndex]; |
| 130 | if( c==0 || fossil_isspace(c) ){ |
| 131 | return nextIndex; |
| 132 | } |
| 133 | nextIndex++; |
| 134 | } |
| 135 | return 0; /* NOT REACHED */ |
| 136 | } |
| 137 | |
| 138 | /* |
| 139 | ** This function is called when printing a logical comment line to calculate |
| 140 | ** the necessary indenting. The caller needs to emit the indenting spaces. |
| 141 | */ |
| @@ -227,23 +272,25 @@ | |
| 227 | if( c=='\n' ){ |
| 228 | lineCnt++; |
| 229 | charCnt = 0; |
| 230 | useChars = 0; |
| 231 | }else if( c=='\t' ){ |
| 232 | int nextIndex = comment_next_space(zLine, index); |
| 233 | if( nextIndex<=0 || (nextIndex-index)>maxChars ){ |
| 234 | break; |
| 235 | } |
| 236 | charCnt++; |
| 237 | useChars = COMMENT_TAB_WIDTH; |
| 238 | if( maxChars<useChars ){ |
| 239 | zBuf[iBuf++] = ' '; |
| 240 | break; |
| 241 | } |
| 242 | }else if( wordBreak && fossil_isspace(c) ){ |
| 243 | int nextIndex = comment_next_space(zLine, index); |
| 244 | if( nextIndex<=0 || (nextIndex-index)>maxChars ){ |
| 245 | break; |
| 246 | } |
| 247 | charCnt++; |
| 248 | }else{ |
| 249 | charCnt++; |
| @@ -267,14 +314,16 @@ | |
| 267 | while( cchUTF8<maxUTF8 && |
| 268 | (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 269 | cchUTF8++; |
| 270 | zBuf[iBuf++] = zLine[index++]; |
| 271 | } |
| 272 | } |
| 273 | else |
| 274 | zBuf[iBuf++] = c; |
| 275 | if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars; |
| 276 | if( maxChars<=0 ) break; |
| 277 | if( c=='\n' ) break; |
| 278 | } |
| 279 | if( charCnt>0 ){ |
| 280 | zBuf[iBuf++] = '\n'; |
| 281 |
| --- src/comformat.c | |
| +++ src/comformat.c | |
| @@ -120,22 +120,67 @@ | |
| 120 | ** zero if such a character cannot be found. For the purposes of this |
| 121 | ** algorithm, the NUL character is treated the same as a spacing character. |
| 122 | */ |
static int comment_next_space(
  const char *zLine, /* [in] The comment line being printed. */
  int index,         /* [in] The current character index being handled. */
  int *distUTF8      /* [out] Distance to next space in UTF-8 sequences. */
){
  int iEnd;            /* Byte index of the terminating space or NUL. */
  int sawHighBit = 0;  /* Set once a byte with the high bit set is passed. */

  /*
  ** Scanning starts one byte past index and stops at the first NUL or at
  ** the first byte accepted by fossil_isspace().
  */
  for(iEnd=index+1; zLine[iEnd]!=0 && !fossil_isspace(zLine[iEnd]); iEnd++){
    if( (zLine[iEnd]&0x80)!=0 ) sawHighBit = 1;
  }

  if( distUTF8!=0 ){
    /*
    ** When only 7-bit bytes were passed, the byte distance is reported
    ** directly; otherwise the span starting at index is handed to
    ** strlen_utf8() to count sequences instead of bytes.
    */
    *distUTF8 = sawHighBit ? strlen_utf8(&zLine[index], iEnd-index)
                           : iEnd-index;
  }
  return iEnd;
}
| 147 | |
/*
** Count the number of UTF-8 sequences in the first lengthBytes bytes of
** zString.
**
** Parameters:
**   zString      Bytes to examine; need not be NUL-terminated, only the
**                first lengthBytes bytes are read.
**   lengthBytes  Number of bytes to examine. Zero or a negative value
**                yields a count of 0.
**
** Returns the number of counted sequences.
**
** Incomplete, ill-formed and overlong sequences are counted as one sequence.
** The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to
** initiate (ill-formed) 2- and 4-byte sequences, respectively, the other
** invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte sequences
** (as lone trail bytes). A lead byte absorbs at most as many trail bytes as
** its bit pattern announces; any further trail bytes are each counted as a
** separate (lone) sequence.
** Combining characters and East Asian Wide and Fullwidth characters are counted
** as one, so this function does not calculate the effective "display width".
*/
int strlen_utf8(const char *zString, int lengthBytes)
{
  int lengthUTF8=0;     /* Counted UTF-8 sequences. */
  int i;
  for( i=0; i<lengthBytes; i++ ){
    char c = zString[i];
    lengthUTF8++;       /* Every byte reached here starts a new sequence. */
    if( (c&0xc0)==0xc0 ){               /* Any UTF-8 lead byte 11xxxxxx */
      int cchUTF8=1;                    /* Code units consumed. */
      int maxUTF8=1;                    /* Expected sequence length. */
      if( (c&0xe0)==0xc0 )maxUTF8=2;        /* UTF-8 lead byte 110vvvvv */
      else if( (c&0xf0)==0xe0 )maxUTF8=3;   /* UTF-8 lead byte 1110vvvv */
      else if( (c&0xf8)==0xf0 )maxUTF8=4;   /* UTF-8 lead byte 11110vvv */
      /*
      ** Skip the trail bytes that belong to this lead byte. cchUTF8 must
      ** advance together with i, otherwise the maxUTF8 limit never applies
      ** and surplus trail bytes would be folded into this sequence.
      */
      while( i<lengthBytes-1 &&
             cchUTF8<maxUTF8 &&
             (zString[i+1]&0xc0)==0x80 ){   /* UTF-8 trail byte 10vvvvvv */
        cchUTF8++;
        i++;
      }
    }
  }
  return lengthUTF8;
}
| 182 | |
| 183 | /* |
| 184 | ** This function is called when printing a logical comment line to calculate |
| 185 | ** the necessary indenting. The caller needs to emit the indenting spaces. |
| 186 | */ |
| @@ -227,23 +272,25 @@ | |
| 272 | if( c=='\n' ){ |
| 273 | lineCnt++; |
| 274 | charCnt = 0; |
| 275 | useChars = 0; |
| 276 | }else if( c=='\t' ){ |
| 277 | int distUTF8; |
| 278 | int nextIndex = comment_next_space(zLine, index, &distUTF8); |
| 279 | if( nextIndex<=0 || distUTF8>maxChars ){ |
| 280 | break; |
| 281 | } |
| 282 | charCnt++; |
| 283 | useChars = COMMENT_TAB_WIDTH; |
| 284 | if( maxChars<useChars ){ |
| 285 | zBuf[iBuf++] = ' '; |
| 286 | break; |
| 287 | } |
| 288 | }else if( wordBreak && fossil_isspace(c) ){ |
| 289 | int distUTF8; |
| 290 | int nextIndex = comment_next_space(zLine, index, &distUTF8); |
| 291 | if( nextIndex<=0 || distUTF8>maxChars ){ |
| 292 | break; |
| 293 | } |
| 294 | charCnt++; |
| 295 | }else{ |
| 296 | charCnt++; |
| @@ -267,14 +314,16 @@ | |
| 314 | while( cchUTF8<maxUTF8 && |
| 315 | (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 316 | cchUTF8++; |
| 317 | zBuf[iBuf++] = zLine[index++]; |
| 318 | } |
| 319 | maxChars--; |
| 320 | } |
| 321 | else { |
| 322 | zBuf[iBuf++] = c; |
| 323 | maxChars -= useChars; |
| 324 | } |
| 325 | if( maxChars<=0 ) break; |
| 326 | if( c=='\n' ) break; |
| 327 | } |
| 328 | if( charCnt>0 ){ |
| 329 | zBuf[iBuf++] = '\n'; |
| 330 |