Fossil SCM
Improvements to the command-line comment formatter so that it works better with non-ASCII characters.
Commit
1c84a0c14ac3e00e830309881ebf89077a39de942b3f770a4a9fc3afc3bfc31e
Parent
7d034d34bac7521…
1 file changed
+132
-28
+132
-28
| --- src/comformat.c | ||
| +++ src/comformat.c | ||
| @@ -2,11 +2,11 @@ | ||
| 2 | 2 | ** Copyright (c) 2007 D. Richard Hipp |
| 3 | 3 | ** |
| 4 | 4 | ** This program is free software; you can redistribute it and/or |
| 5 | 5 | ** modify it under the terms of the Simplified BSD License (also |
| 6 | 6 | ** known as the "2-Clause License" or "FreeBSD License".) |
| 7 | - | |
| 7 | +** | |
| 8 | 8 | ** This program is distributed in the hope that it will be useful, |
| 9 | 9 | ** but without any warranty; without even the implied warranty of |
| 10 | 10 | ** merchantability or fitness for a particular purpose. |
| 11 | 11 | ** |
| 12 | 12 | ** Author contact information: |
| @@ -95,21 +95,20 @@ | ||
| 95 | 95 | #endif |
| 96 | 96 | } |
| 97 | 97 | |
| 98 | 98 | /* |
| 99 | 99 | ** This function checks the current line being printed against the original |
| 100 | -** comment text. Upon matching, it emits a new line and updates the provided | |
| 101 | -** character and line counts, if applicable. | |
| 100 | +** comment text. Upon matching, it updates the provided character and line | |
| 101 | +** counts, if applicable. The caller needs to emit a new line, if desired. | |
| 102 | 102 | */ |
| 103 | 103 | static int comment_check_orig( |
| 104 | 104 | const char *zOrigText, /* [in] Original comment text ONLY, may be NULL. */ |
| 105 | 105 | const char *zLine, /* [in] The comment line to print. */ |
| 106 | 106 | int *pCharCnt, /* [in/out] Pointer to the line character count. */ |
| 107 | 107 | int *pLineCnt /* [in/out] Pointer to the total line count. */ |
| 108 | 108 | ){ |
| 109 | 109 | if( zOrigText && fossil_strcmp(zLine, zOrigText)==0 ){ |
| 110 | - fossil_print("\n"); | |
| 111 | 110 | if( pCharCnt ) *pCharCnt = 0; |
| 112 | 111 | if( pLineCnt ) (*pLineCnt)++; |
| 113 | 112 | return 1; |
| 114 | 113 | } |
| 115 | 114 | return 0; |
| @@ -121,37 +120,76 @@ | ||
| 121 | 120 | ** zero if such a character cannot be found. For the purposes of this |
| 122 | 121 | ** algorithm, the NUL character is treated the same as a spacing character. |
| 123 | 122 | */ |
| 124 | 123 | static int comment_next_space( |
| 125 | 124 | const char *zLine, /* [in] The comment line being printed. */ |
| 126 | - int index /* [in] The current character index being handled. */ | |
| 125 | + int index, /* [in] The current character index being handled. */ | |
| 126 | + int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */ | |
| 127 | 127 | ){ |
| 128 | 128 | int nextIndex = index + 1; |
| 129 | + int fNonASCII=0; | |
| 129 | 130 | for(;;){ |
| 130 | 131 | char c = zLine[nextIndex]; |
| 132 | + if( (c&0x80)==0x80 ) fNonASCII=1; | |
| 131 | 133 | if( c==0 || fossil_isspace(c) ){ |
| 134 | + if( distUTF8 ){ | |
| 135 | + if( fNonASCII!=0 ){ | |
| 136 | + *distUTF8 = strlen_utf8(&zLine[index], nextIndex-index); | |
| 137 | + }else{ | |
| 138 | + *distUTF8 = nextIndex-index; | |
| 139 | + } | |
| 140 | + } | |
| 132 | 141 | return nextIndex; |
| 133 | 142 | } |
| 134 | 143 | nextIndex++; |
| 135 | 144 | } |
| 136 | 145 | return 0; /* NOT REACHED */ |
| 137 | 146 | } |
| 138 | 147 | |
| 139 | 148 | /* |
| 140 | -** This function is called when printing a logical comment line to perform | |
| 141 | -** the necessary indenting. | |
| 149 | +** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and | |
| 150 | +** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0 | |
| 151 | +** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte | |
| 152 | +** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are | |
| 153 | +** treated as invalid 1-byte sequences (as lone trail bytes). | |
| 154 | +** Combining characters and East Asian Wide and Fullwidth characters are counted | |
| 155 | +** as one, so this function does not calculate the effective "display width". | |
| 156 | +*/ | |
| 157 | +int strlen_utf8(const char *zString, int lengthBytes){ | |
| 158 | + int i; /* Counted bytes. */ | |
| 159 | + int lengthUTF8; /* Counted UTF-8 sequences. */ | |
| 160 | +#if 0 | |
| 161 | + assert( lengthBytes>=0 ); | |
| 162 | +#endif | |
| 163 | + for(i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++){ | |
| 164 | + char c = zString[i]; | |
| 165 | + int cchUTF8=1; /* Code units consumed. */ | |
| 166 | + int maxUTF8=1; /* Expected sequence length. */ | |
| 167 | + if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ | |
| 168 | + else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ | |
| 169 | + else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ | |
| 170 | + while( cchUTF8<maxUTF8 && | |
| 171 | + i<lengthBytes-1 && | |
| 172 | + (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ | |
| 173 | + cchUTF8++; | |
| 174 | + i++; | |
| 175 | + } | |
| 176 | + } | |
| 177 | + return lengthUTF8; | |
| 178 | +} | |
| 179 | + | |
| 180 | +/* | |
| 181 | +** This function is called when printing a logical comment line to calculate | |
| 182 | +** the necessary indenting. The caller needs to emit the indenting spaces. | |
| 142 | 183 | */ |
| 143 | -static void comment_print_indent( | |
| 184 | +static void comment_calc_indent( | |
| 144 | 185 | const char *zLine, /* [in] The comment line being printed. */ |
| 145 | 186 | int indent, /* [in] Number of spaces to indent, zero for none. */ |
| 146 | 187 | int trimCrLf, /* [in] Non-zero to trim leading/trailing CR/LF. */ |
| 147 | 188 | int trimSpace, /* [in] Non-zero to trim leading/trailing spaces. */ |
| 148 | 189 | int *piIndex /* [in/out] Pointer to first non-space character. */ |
| 149 | 190 | ){ |
| 150 | - if( indent>0 ){ | |
| 151 | - fossil_print("%*s", indent, ""); | |
| 152 | - } | |
| 153 | 191 | if( zLine && piIndex ){ |
| 154 | 192 | int index = *piIndex; |
| 155 | 193 | if( trimCrLf ){ |
| 156 | 194 | while( zLine[index]=='\r' || zLine[index]=='\n' ){ index++; } |
| 157 | 195 | } |
| @@ -179,26 +217,56 @@ | ||
| 179 | 217 | int wordBreak, /* [in] Non-zero to try breaking on word boundaries. */ |
| 180 | 218 | int origBreak, /* [in] Non-zero to break before original comment. */ |
| 181 | 219 | int *pLineCnt, /* [in/out] Pointer to the total line count. */ |
| 182 | 220 | const char **pzLine /* [out] Pointer to the end of the logical line. */ |
| 183 | 221 | ){ |
| 184 | - int index = 0, charCnt = 0, lineCnt = 0, maxChars; | |
| 222 | + int index = 0, charCnt = 0, lineCnt = 0, maxChars, i; | |
| 223 | + char zBuf[400]; int iBuf=0; /* Output buffer and counter. */ | |
| 224 | + int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */ | |
| 185 | 225 | if( !zLine ) return; |
| 186 | 226 | if( lineChars<=0 ) return; |
| 187 | - comment_print_indent(zLine, indent, trimCrLf, trimSpace, &index); | |
| 227 | +#if 0 | |
| 228 | + assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */ | |
| 229 | + assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */ | |
| 230 | +#endif | |
| 231 | + if( indent>sizeof(zBuf)-6 ){ | |
| 232 | + /* Limit initial indent to fit output buffer. */ | |
| 233 | + indent = sizeof(zBuf)-6; | |
| 234 | + } | |
| 235 | + comment_calc_indent(zLine, indent, trimCrLf, trimSpace, &index); | |
| 236 | + if( indent>0 ){ | |
| 237 | + for(i=0; i<indent; i++){ | |
| 238 | + zBuf[iBuf++] = ' '; | |
| 239 | + } | |
| 240 | + } | |
| 241 | + if( origIndent>sizeof(zBuf)-6 ){ | |
| 242 | + /* Limit line indent to fit output buffer. */ | |
| 243 | + origIndent = sizeof(zBuf)-6; | |
| 244 | + } | |
| 188 | 245 | maxChars = lineChars; |
| 189 | 246 | for(;;){ |
| 190 | 247 | int useChars = 1; |
| 191 | 248 | char c = zLine[index]; |
| 249 | + /* Flush the output buffer if there's no space left for at least one more | |
| 250 | + ** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces, | |
| 251 | + ** a new line, and a terminating NULL. */ | |
| 252 | + if( iBuf>sizeof(zBuf)-origIndent-6 ){ | |
| 253 | + zBuf[iBuf]=0; | |
| 254 | + iBuf=0; | |
| 255 | + fossil_print("%s", zBuf); | |
| 256 | + } | |
| 192 | 257 | if( c==0 ){ |
| 193 | 258 | break; |
| 194 | 259 | }else{ |
| 195 | 260 | if( origBreak && index>0 ){ |
| 196 | 261 | const char *zCurrent = &zLine[index]; |
| 197 | 262 | if( comment_check_orig(zOrigText, zCurrent, &charCnt, &lineCnt) ){ |
| 198 | - comment_print_indent(zCurrent, origIndent, trimCrLf, trimSpace, | |
| 199 | - &index); | |
| 263 | + zBuf[iBuf++] = '\n'; | |
| 264 | + comment_calc_indent(zLine, origIndent, trimCrLf, trimSpace, &index); | |
| 265 | + for( i=0; i<origIndent; i++ ){ | |
| 266 | + zBuf[iBuf++] = ' '; | |
| 267 | + } | |
| 200 | 268 | maxChars = lineChars; |
| 201 | 269 | } |
| 202 | 270 | } |
| 203 | 271 | index++; |
| 204 | 272 | } |
| @@ -205,38 +273,57 @@ | ||
| 205 | 273 | if( c=='\n' ){ |
| 206 | 274 | lineCnt++; |
| 207 | 275 | charCnt = 0; |
| 208 | 276 | useChars = 0; |
| 209 | 277 | }else if( c=='\t' ){ |
| 210 | - int nextIndex = comment_next_space(zLine, index); | |
| 211 | - if( nextIndex<=0 || (nextIndex-index)>maxChars ){ | |
| 278 | + int distUTF8; | |
| 279 | + int nextIndex = comment_next_space(zLine, index, &distUTF8); | |
| 280 | + if( nextIndex<=0 || distUTF8>maxChars ){ | |
| 212 | 281 | break; |
| 213 | 282 | } |
| 214 | 283 | charCnt++; |
| 215 | 284 | useChars = COMMENT_TAB_WIDTH; |
| 216 | 285 | if( maxChars<useChars ){ |
| 217 | - fossil_print(" "); | |
| 286 | + zBuf[iBuf++] = ' '; | |
| 218 | 287 | break; |
| 219 | 288 | } |
| 220 | 289 | }else if( wordBreak && fossil_isspace(c) ){ |
| 221 | - int nextIndex = comment_next_space(zLine, index); | |
| 222 | - if( nextIndex<=0 || (nextIndex-index)>maxChars ){ | |
| 290 | + int distUTF8; | |
| 291 | + int nextIndex = comment_next_space(zLine, index, &distUTF8); | |
| 292 | + if( nextIndex<=0 || distUTF8>maxChars ){ | |
| 223 | 293 | break; |
| 224 | 294 | } |
| 225 | 295 | charCnt++; |
| 226 | 296 | }else{ |
| 227 | 297 | charCnt++; |
| 228 | 298 | } |
| 229 | 299 | assert( c!='\n' || charCnt==0 ); |
| 230 | - fossil_print("%c", c); | |
| 231 | - if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars; | |
| 300 | + zBuf[iBuf++] = c; | |
| 301 | + /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */ | |
| 302 | + cchUTF8=1; /* Code units consumed. */ | |
| 303 | + maxUTF8=1; /* Expected sequence length. */ | |
| 304 | + if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ | |
| 305 | + else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ | |
| 306 | + else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ | |
| 307 | + while( cchUTF8<maxUTF8 && | |
| 308 | + (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ | |
| 309 | + cchUTF8++; | |
| 310 | + zBuf[iBuf++] = zLine[index++]; | |
| 311 | + } | |
| 312 | + maxChars -= useChars; | |
| 232 | 313 | if( maxChars<=0 ) break; |
| 233 | 314 | if( c=='\n' ) break; |
| 234 | 315 | } |
| 235 | 316 | if( charCnt>0 ){ |
| 236 | - fossil_print("\n"); | |
| 317 | + zBuf[iBuf++] = '\n'; | |
| 237 | 318 | lineCnt++; |
| 319 | + } | |
| 320 | + /* Flush the remaining output buffer. */ | |
| 321 | + if( iBuf>0 ){ | |
| 322 | + zBuf[iBuf]=0; | |
| 323 | + iBuf=0; | |
| 324 | + fossil_print("%s", zBuf); | |
| 238 | 325 | } |
| 239 | 326 | if( pLineCnt ){ |
| 240 | 327 | *pLineCnt += lineCnt; |
| 241 | 328 | } |
| 242 | 329 | if( pzLine ){ |
| @@ -259,25 +346,27 @@ | ||
| 259 | 346 | const char *zText, /* The comment text to be printed. */ |
| 260 | 347 | int indent, /* Number of spaces to indent each non-initial line. */ |
| 261 | 348 | int width /* Maximum number of characters per line. */ |
| 262 | 349 | ){ |
| 263 | 350 | int maxChars = width - indent; |
| 264 | - int si, sk, i, k; | |
| 351 | + int si, sk, i, k, kc; | |
| 265 | 352 | int doIndent = 0; |
| 266 | 353 | char *zBuf; |
| 267 | 354 | char zBuffer[400]; |
| 268 | 355 | int lineCnt = 0; |
| 356 | + int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */ | |
| 269 | 357 | |
| 270 | 358 | if( width<0 ){ |
| 271 | 359 | comment_set_maxchars(indent, &maxChars); |
| 272 | 360 | } |
| 273 | 361 | if( zText==0 ) zText = "(NULL)"; |
| 274 | 362 | if( maxChars<=0 ){ |
| 275 | 363 | maxChars = strlen(zText); |
| 276 | 364 | } |
| 277 | - if( maxChars >= (sizeof(zBuffer)) ){ | |
| 278 | - zBuf = fossil_malloc(maxChars+1); | |
| 365 | + /* Ensure the buffer can hold the longest-possible UTF-8 sequences. */ | |
| 366 | + if( maxChars >= (sizeof(zBuffer)/4-1) ){ | |
| 367 | + zBuf = fossil_malloc(maxChars*4+1); | |
| 279 | 368 | }else{ |
| 280 | 369 | zBuf = zBuffer; |
| 281 | 370 | } |
| 282 | 371 | for(;;){ |
| 283 | 372 | while( fossil_isspace(zText[0]) ){ zText++; } |
| @@ -287,13 +376,28 @@ | ||
| 287 | 376 | lineCnt = 1; |
| 288 | 377 | } |
| 289 | 378 | if( zBuf!=zBuffer) fossil_free(zBuf); |
| 290 | 379 | return lineCnt; |
| 291 | 380 | } |
| 292 | - for(sk=si=i=k=0; zText[i] && k<maxChars; i++){ | |
| 381 | + for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){ | |
| 293 | 382 | char c = zText[i]; |
| 294 | - if( fossil_isspace(c) ){ | |
| 383 | + kc++; /* Count complete UTF-8 sequences. */ | |
| 384 | + /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */ | |
| 385 | + cchUTF8=1; /* Code units consumed. */ | |
| 386 | + maxUTF8=1; /* Expected sequence length. */ | |
| 387 | + if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ | |
| 388 | + else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ | |
| 389 | + else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ | |
| 390 | + if( maxUTF8>1 ){ | |
| 391 | + zBuf[k++] = c; | |
| 392 | + while( cchUTF8<maxUTF8 && | |
| 393 | + (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ | |
| 394 | + cchUTF8++; | |
| 395 | + zBuf[k++] = zText[++i]; | |
| 396 | + } | |
| 397 | + } | |
| 398 | + else if( fossil_isspace(c) ){ | |
| 295 | 399 | si = i; |
| 296 | 400 | sk = k; |
| 297 | 401 | if( k==0 || zBuf[k-1]!=' ' ){ |
| 298 | 402 | zBuf[k++] = ' '; |
| 299 | 403 | } |
| 300 | 404 |
| --- src/comformat.c | |
| +++ src/comformat.c | |
| @@ -2,11 +2,11 @@ | |
| 2 | ** Copyright (c) 2007 D. Richard Hipp |
| 3 | ** |
| 4 | ** This program is free software; you can redistribute it and/or |
| 5 | ** modify it under the terms of the Simplified BSD License (also |
| 6 | ** known as the "2-Clause License" or "FreeBSD License".) |
| 7 | |
| 8 | ** This program is distributed in the hope that it will be useful, |
| 9 | ** but without any warranty; without even the implied warranty of |
| 10 | ** merchantability or fitness for a particular purpose. |
| 11 | ** |
| 12 | ** Author contact information: |
| @@ -95,21 +95,20 @@ | |
| 95 | #endif |
| 96 | } |
| 97 | |
| 98 | /* |
| 99 | ** This function checks the current line being printed against the original |
| 100 | ** comment text. Upon matching, it emits a new line and updates the provided |
| 101 | ** character and line counts, if applicable. |
| 102 | */ |
| 103 | static int comment_check_orig( |
| 104 | const char *zOrigText, /* [in] Original comment text ONLY, may be NULL. */ |
| 105 | const char *zLine, /* [in] The comment line to print. */ |
| 106 | int *pCharCnt, /* [in/out] Pointer to the line character count. */ |
| 107 | int *pLineCnt /* [in/out] Pointer to the total line count. */ |
| 108 | ){ |
| 109 | if( zOrigText && fossil_strcmp(zLine, zOrigText)==0 ){ |
| 110 | fossil_print("\n"); |
| 111 | if( pCharCnt ) *pCharCnt = 0; |
| 112 | if( pLineCnt ) (*pLineCnt)++; |
| 113 | return 1; |
| 114 | } |
| 115 | return 0; |
| @@ -121,37 +120,76 @@ | |
| 121 | ** zero if such a character cannot be found. For the purposes of this |
| 122 | ** algorithm, the NUL character is treated the same as a spacing character. |
| 123 | */ |
| 124 | static int comment_next_space( |
| 125 | const char *zLine, /* [in] The comment line being printed. */ |
| 126 | int index /* [in] The current character index being handled. */ |
| 127 | ){ |
| 128 | int nextIndex = index + 1; |
| 129 | for(;;){ |
| 130 | char c = zLine[nextIndex]; |
| 131 | if( c==0 || fossil_isspace(c) ){ |
| 132 | return nextIndex; |
| 133 | } |
| 134 | nextIndex++; |
| 135 | } |
| 136 | return 0; /* NOT REACHED */ |
| 137 | } |
| 138 | |
| 139 | /* |
| 140 | ** This function is called when printing a logical comment line to perform |
| 141 | ** the necessary indenting. |
| 142 | */ |
| 143 | static void comment_print_indent( |
| 144 | const char *zLine, /* [in] The comment line being printed. */ |
| 145 | int indent, /* [in] Number of spaces to indent, zero for none. */ |
| 146 | int trimCrLf, /* [in] Non-zero to trim leading/trailing CR/LF. */ |
| 147 | int trimSpace, /* [in] Non-zero to trim leading/trailing spaces. */ |
| 148 | int *piIndex /* [in/out] Pointer to first non-space character. */ |
| 149 | ){ |
| 150 | if( indent>0 ){ |
| 151 | fossil_print("%*s", indent, ""); |
| 152 | } |
| 153 | if( zLine && piIndex ){ |
| 154 | int index = *piIndex; |
| 155 | if( trimCrLf ){ |
| 156 | while( zLine[index]=='\r' || zLine[index]=='\n' ){ index++; } |
| 157 | } |
| @@ -179,26 +217,56 @@ | |
| 179 | int wordBreak, /* [in] Non-zero to try breaking on word boundaries. */ |
| 180 | int origBreak, /* [in] Non-zero to break before original comment. */ |
| 181 | int *pLineCnt, /* [in/out] Pointer to the total line count. */ |
| 182 | const char **pzLine /* [out] Pointer to the end of the logical line. */ |
| 183 | ){ |
| 184 | int index = 0, charCnt = 0, lineCnt = 0, maxChars; |
| 185 | if( !zLine ) return; |
| 186 | if( lineChars<=0 ) return; |
| 187 | comment_print_indent(zLine, indent, trimCrLf, trimSpace, &index); |
| 188 | maxChars = lineChars; |
| 189 | for(;;){ |
| 190 | int useChars = 1; |
| 191 | char c = zLine[index]; |
| 192 | if( c==0 ){ |
| 193 | break; |
| 194 | }else{ |
| 195 | if( origBreak && index>0 ){ |
| 196 | const char *zCurrent = &zLine[index]; |
| 197 | if( comment_check_orig(zOrigText, zCurrent, &charCnt, &lineCnt) ){ |
| 198 | comment_print_indent(zCurrent, origIndent, trimCrLf, trimSpace, |
| 199 | &index); |
| 200 | maxChars = lineChars; |
| 201 | } |
| 202 | } |
| 203 | index++; |
| 204 | } |
| @@ -205,38 +273,57 @@ | |
| 205 | if( c=='\n' ){ |
| 206 | lineCnt++; |
| 207 | charCnt = 0; |
| 208 | useChars = 0; |
| 209 | }else if( c=='\t' ){ |
| 210 | int nextIndex = comment_next_space(zLine, index); |
| 211 | if( nextIndex<=0 || (nextIndex-index)>maxChars ){ |
| 212 | break; |
| 213 | } |
| 214 | charCnt++; |
| 215 | useChars = COMMENT_TAB_WIDTH; |
| 216 | if( maxChars<useChars ){ |
| 217 | fossil_print(" "); |
| 218 | break; |
| 219 | } |
| 220 | }else if( wordBreak && fossil_isspace(c) ){ |
| 221 | int nextIndex = comment_next_space(zLine, index); |
| 222 | if( nextIndex<=0 || (nextIndex-index)>maxChars ){ |
| 223 | break; |
| 224 | } |
| 225 | charCnt++; |
| 226 | }else{ |
| 227 | charCnt++; |
| 228 | } |
| 229 | assert( c!='\n' || charCnt==0 ); |
| 230 | fossil_print("%c", c); |
| 231 | if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars; |
| 232 | if( maxChars<=0 ) break; |
| 233 | if( c=='\n' ) break; |
| 234 | } |
| 235 | if( charCnt>0 ){ |
| 236 | fossil_print("\n"); |
| 237 | lineCnt++; |
| 238 | } |
| 239 | if( pLineCnt ){ |
| 240 | *pLineCnt += lineCnt; |
| 241 | } |
| 242 | if( pzLine ){ |
| @@ -259,25 +346,27 @@ | |
| 259 | const char *zText, /* The comment text to be printed. */ |
| 260 | int indent, /* Number of spaces to indent each non-initial line. */ |
| 261 | int width /* Maximum number of characters per line. */ |
| 262 | ){ |
| 263 | int maxChars = width - indent; |
| 264 | int si, sk, i, k; |
| 265 | int doIndent = 0; |
| 266 | char *zBuf; |
| 267 | char zBuffer[400]; |
| 268 | int lineCnt = 0; |
| 269 | |
| 270 | if( width<0 ){ |
| 271 | comment_set_maxchars(indent, &maxChars); |
| 272 | } |
| 273 | if( zText==0 ) zText = "(NULL)"; |
| 274 | if( maxChars<=0 ){ |
| 275 | maxChars = strlen(zText); |
| 276 | } |
| 277 | if( maxChars >= (sizeof(zBuffer)) ){ |
| 278 | zBuf = fossil_malloc(maxChars+1); |
| 279 | }else{ |
| 280 | zBuf = zBuffer; |
| 281 | } |
| 282 | for(;;){ |
| 283 | while( fossil_isspace(zText[0]) ){ zText++; } |
| @@ -287,13 +376,28 @@ | |
| 287 | lineCnt = 1; |
| 288 | } |
| 289 | if( zBuf!=zBuffer) fossil_free(zBuf); |
| 290 | return lineCnt; |
| 291 | } |
| 292 | for(sk=si=i=k=0; zText[i] && k<maxChars; i++){ |
| 293 | char c = zText[i]; |
| 294 | if( fossil_isspace(c) ){ |
| 295 | si = i; |
| 296 | sk = k; |
| 297 | if( k==0 || zBuf[k-1]!=' ' ){ |
| 298 | zBuf[k++] = ' '; |
| 299 | } |
| 300 |
| --- src/comformat.c | |
| +++ src/comformat.c | |
| @@ -2,11 +2,11 @@ | |
| 2 | ** Copyright (c) 2007 D. Richard Hipp |
| 3 | ** |
| 4 | ** This program is free software; you can redistribute it and/or |
| 5 | ** modify it under the terms of the Simplified BSD License (also |
| 6 | ** known as the "2-Clause License" or "FreeBSD License".) |
| 7 | ** |
| 8 | ** This program is distributed in the hope that it will be useful, |
| 9 | ** but without any warranty; without even the implied warranty of |
| 10 | ** merchantability or fitness for a particular purpose. |
| 11 | ** |
| 12 | ** Author contact information: |
| @@ -95,21 +95,20 @@ | |
| 95 | #endif |
| 96 | } |
| 97 | |
| 98 | /* |
| 99 | ** This function checks the current line being printed against the original |
| 100 | ** comment text. Upon matching, it updates the provided character and line |
| 101 | ** counts, if applicable. The caller needs to emit a new line, if desired. |
| 102 | */ |
| 103 | static int comment_check_orig( |
| 104 | const char *zOrigText, /* [in] Original comment text ONLY, may be NULL. */ |
| 105 | const char *zLine, /* [in] The comment line to print. */ |
| 106 | int *pCharCnt, /* [in/out] Pointer to the line character count. */ |
| 107 | int *pLineCnt /* [in/out] Pointer to the total line count. */ |
| 108 | ){ |
| 109 | if( zOrigText && fossil_strcmp(zLine, zOrigText)==0 ){ |
| 110 | if( pCharCnt ) *pCharCnt = 0; |
| 111 | if( pLineCnt ) (*pLineCnt)++; |
| 112 | return 1; |
| 113 | } |
| 114 | return 0; |
| @@ -121,37 +120,76 @@ | |
| 120 | ** zero if such a character cannot be found. For the purposes of this |
| 121 | ** algorithm, the NUL character is treated the same as a spacing character. |
| 122 | */ |
| 123 | static int comment_next_space( |
| 124 | const char *zLine, /* [in] The comment line being printed. */ |
| 125 | int index, /* [in] The current character index being handled. */ |
| 126 | int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */ |
| 127 | ){ |
| 128 | int nextIndex = index + 1; |
| 129 | int fNonASCII=0; |
| 130 | for(;;){ |
| 131 | char c = zLine[nextIndex]; |
| 132 | if( (c&0x80)==0x80 ) fNonASCII=1; |
| 133 | if( c==0 || fossil_isspace(c) ){ |
| 134 | if( distUTF8 ){ |
| 135 | if( fNonASCII!=0 ){ |
| 136 | *distUTF8 = strlen_utf8(&zLine[index], nextIndex-index); |
| 137 | }else{ |
| 138 | *distUTF8 = nextIndex-index; |
| 139 | } |
| 140 | } |
| 141 | return nextIndex; |
| 142 | } |
| 143 | nextIndex++; |
| 144 | } |
| 145 | return 0; /* NOT REACHED */ |
| 146 | } |
| 147 | |
| 148 | /* |
| 149 | ** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and |
| 150 | ** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0 |
| 151 | ** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte |
| 152 | ** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are |
| 153 | ** treated as invalid 1-byte sequences (as lone trail bytes). |
| 154 | ** Combining characters and East Asian Wide and Fullwidth characters are counted |
| 155 | ** as one, so this function does not calculate the effective "display width". |
| 156 | */ |
| 157 | int strlen_utf8(const char *zString, int lengthBytes){ |
| 158 | int i; /* Counted bytes. */ |
| 159 | int lengthUTF8; /* Counted UTF-8 sequences. */ |
| 160 | #if 0 |
| 161 | assert( lengthBytes>=0 ); |
| 162 | #endif |
| 163 | for(i=0, lengthUTF8=0; i<lengthBytes; i++, lengthUTF8++){ |
| 164 | char c = zString[i]; |
| 165 | int cchUTF8=1; /* Code units consumed. */ |
| 166 | int maxUTF8=1; /* Expected sequence length. */ |
| 167 | if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ |
| 168 | else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ |
| 169 | else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ |
| 170 | while( cchUTF8<maxUTF8 && |
| 171 | i<lengthBytes-1 && |
| 172 | (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 173 | cchUTF8++; |
| 174 | i++; |
| 175 | } |
| 176 | } |
| 177 | return lengthUTF8; |
| 178 | } |
| 179 | |
| 180 | /* |
| 181 | ** This function is called when printing a logical comment line to calculate |
| 182 | ** the necessary indenting. The caller needs to emit the indenting spaces. |
| 183 | */ |
| 184 | static void comment_calc_indent( |
| 185 | const char *zLine, /* [in] The comment line being printed. */ |
| 186 | int indent, /* [in] Number of spaces to indent, zero for none. */ |
| 187 | int trimCrLf, /* [in] Non-zero to trim leading/trailing CR/LF. */ |
| 188 | int trimSpace, /* [in] Non-zero to trim leading/trailing spaces. */ |
| 189 | int *piIndex /* [in/out] Pointer to first non-space character. */ |
| 190 | ){ |
| 191 | if( zLine && piIndex ){ |
| 192 | int index = *piIndex; |
| 193 | if( trimCrLf ){ |
| 194 | while( zLine[index]=='\r' || zLine[index]=='\n' ){ index++; } |
| 195 | } |
| @@ -179,26 +217,56 @@ | |
| 217 | int wordBreak, /* [in] Non-zero to try breaking on word boundaries. */ |
| 218 | int origBreak, /* [in] Non-zero to break before original comment. */ |
| 219 | int *pLineCnt, /* [in/out] Pointer to the total line count. */ |
| 220 | const char **pzLine /* [out] Pointer to the end of the logical line. */ |
| 221 | ){ |
| 222 | int index = 0, charCnt = 0, lineCnt = 0, maxChars, i; |
| 223 | char zBuf[400]; int iBuf=0; /* Output buffer and counter. */ |
| 224 | int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */ |
| 225 | if( !zLine ) return; |
| 226 | if( lineChars<=0 ) return; |
| 227 | #if 0 |
| 228 | assert( indent<sizeof(zBuf)-5 ); /* See following comments to explain */ |
| 229 | assert( origIndent<sizeof(zBuf)-5 ); /* these limits. */ |
| 230 | #endif |
| 231 | if( indent>sizeof(zBuf)-6 ){ |
| 232 | /* Limit initial indent to fit output buffer. */ |
| 233 | indent = sizeof(zBuf)-6; |
| 234 | } |
| 235 | comment_calc_indent(zLine, indent, trimCrLf, trimSpace, &index); |
| 236 | if( indent>0 ){ |
| 237 | for(i=0; i<indent; i++){ |
| 238 | zBuf[iBuf++] = ' '; |
| 239 | } |
| 240 | } |
| 241 | if( origIndent>sizeof(zBuf)-6 ){ |
| 242 | /* Limit line indent to fit output buffer. */ |
| 243 | origIndent = sizeof(zBuf)-6; |
| 244 | } |
| 245 | maxChars = lineChars; |
| 246 | for(;;){ |
| 247 | int useChars = 1; |
| 248 | char c = zLine[index]; |
| 249 | /* Flush the output buffer if there's no space left for at least one more |
| 250 | ** (potentially 4-byte) UTF-8 sequence, one level of indentation spaces, |
| 251 | ** a new line, and a terminating NULL. */ |
| 252 | if( iBuf>sizeof(zBuf)-origIndent-6 ){ |
| 253 | zBuf[iBuf]=0; |
| 254 | iBuf=0; |
| 255 | fossil_print("%s", zBuf); |
| 256 | } |
| 257 | if( c==0 ){ |
| 258 | break; |
| 259 | }else{ |
| 260 | if( origBreak && index>0 ){ |
| 261 | const char *zCurrent = &zLine[index]; |
| 262 | if( comment_check_orig(zOrigText, zCurrent, &charCnt, &lineCnt) ){ |
| 263 | zBuf[iBuf++] = '\n'; |
| 264 | comment_calc_indent(zLine, origIndent, trimCrLf, trimSpace, &index); |
| 265 | for( i=0; i<origIndent; i++ ){ |
| 266 | zBuf[iBuf++] = ' '; |
| 267 | } |
| 268 | maxChars = lineChars; |
| 269 | } |
| 270 | } |
| 271 | index++; |
| 272 | } |
| @@ -205,38 +273,57 @@ | |
| 273 | if( c=='\n' ){ |
| 274 | lineCnt++; |
| 275 | charCnt = 0; |
| 276 | useChars = 0; |
| 277 | }else if( c=='\t' ){ |
| 278 | int distUTF8; |
| 279 | int nextIndex = comment_next_space(zLine, index, &distUTF8); |
| 280 | if( nextIndex<=0 || distUTF8>maxChars ){ |
| 281 | break; |
| 282 | } |
| 283 | charCnt++; |
| 284 | useChars = COMMENT_TAB_WIDTH; |
| 285 | if( maxChars<useChars ){ |
| 286 | zBuf[iBuf++] = ' '; |
| 287 | break; |
| 288 | } |
| 289 | }else if( wordBreak && fossil_isspace(c) ){ |
| 290 | int distUTF8; |
| 291 | int nextIndex = comment_next_space(zLine, index, &distUTF8); |
| 292 | if( nextIndex<=0 || distUTF8>maxChars ){ |
| 293 | break; |
| 294 | } |
| 295 | charCnt++; |
| 296 | }else{ |
| 297 | charCnt++; |
| 298 | } |
| 299 | assert( c!='\n' || charCnt==0 ); |
| 300 | zBuf[iBuf++] = c; |
| 301 | /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */ |
| 302 | cchUTF8=1; /* Code units consumed. */ |
| 303 | maxUTF8=1; /* Expected sequence length. */ |
| 304 | if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ |
| 305 | else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ |
| 306 | else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ |
| 307 | while( cchUTF8<maxUTF8 && |
| 308 | (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 309 | cchUTF8++; |
| 310 | zBuf[iBuf++] = zLine[index++]; |
| 311 | } |
| 312 | maxChars -= useChars; |
| 313 | if( maxChars<=0 ) break; |
| 314 | if( c=='\n' ) break; |
| 315 | } |
| 316 | if( charCnt>0 ){ |
| 317 | zBuf[iBuf++] = '\n'; |
| 318 | lineCnt++; |
| 319 | } |
| 320 | /* Flush the remaining output buffer. */ |
| 321 | if( iBuf>0 ){ |
| 322 | zBuf[iBuf]=0; |
| 323 | iBuf=0; |
| 324 | fossil_print("%s", zBuf); |
| 325 | } |
| 326 | if( pLineCnt ){ |
| 327 | *pLineCnt += lineCnt; |
| 328 | } |
| 329 | if( pzLine ){ |
| @@ -259,25 +346,27 @@ | |
| 346 | const char *zText, /* The comment text to be printed. */ |
| 347 | int indent, /* Number of spaces to indent each non-initial line. */ |
| 348 | int width /* Maximum number of characters per line. */ |
| 349 | ){ |
| 350 | int maxChars = width - indent; |
| 351 | int si, sk, i, k, kc; |
| 352 | int doIndent = 0; |
| 353 | char *zBuf; |
| 354 | char zBuffer[400]; |
| 355 | int lineCnt = 0; |
| 356 | int cchUTF8, maxUTF8; /* Helper variables to count UTF-8 sequences. */ |
| 357 | |
| 358 | if( width<0 ){ |
| 359 | comment_set_maxchars(indent, &maxChars); |
| 360 | } |
| 361 | if( zText==0 ) zText = "(NULL)"; |
| 362 | if( maxChars<=0 ){ |
| 363 | maxChars = strlen(zText); |
| 364 | } |
| 365 | /* Ensure the buffer can hold the longest-possible UTF-8 sequences. */ |
| 366 | if( maxChars >= (sizeof(zBuffer)/4-1) ){ |
| 367 | zBuf = fossil_malloc(maxChars*4+1); |
| 368 | }else{ |
| 369 | zBuf = zBuffer; |
| 370 | } |
| 371 | for(;;){ |
| 372 | while( fossil_isspace(zText[0]) ){ zText++; } |
| @@ -287,13 +376,28 @@ | |
| 376 | lineCnt = 1; |
| 377 | } |
| 378 | if( zBuf!=zBuffer) fossil_free(zBuf); |
| 379 | return lineCnt; |
| 380 | } |
| 381 | for(sk=si=i=k=kc=0; zText[i] && kc<maxChars; i++){ |
| 382 | char c = zText[i]; |
| 383 | kc++; /* Count complete UTF-8 sequences. */ |
| 384 | /* Skip over UTF-8 sequences, see comment on strlen_utf8() for details. */ |
| 385 | cchUTF8=1; /* Code units consumed. */ |
| 386 | maxUTF8=1; /* Expected sequence length. */ |
| 387 | if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */ |
| 388 | else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */ |
| 389 | else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */ |
| 390 | if( maxUTF8>1 ){ |
| 391 | zBuf[k++] = c; |
| 392 | while( cchUTF8<maxUTF8 && |
| 393 | (zText[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */ |
| 394 | cchUTF8++; |
| 395 | zBuf[k++] = zText[++i]; |
| 396 | } |
| 397 | } |
| 398 | else if( fossil_isspace(c) ){ |
| 399 | si = i; |
| 400 | sk = k; |
| 401 | if( k==0 || zBuf[k-1]!=' ' ){ |
| 402 | zBuf[k++] = ' '; |
| 403 | } |
| 404 |