Fossil SCM

For better word breaking results with the (non-legacy) comment printing algorithm, make sure the lookahead to the next space character is UTF-8-aware. Also make sure the per-line remaining character count is decremented properly for UTF-8 sequences. The critical points in the code now handle UTF-8 sequences correctly, and they could be enhanced to work with the effective display width, if required (to handle combining characters, and East Asian Wide and Fullwidth characters).

florian 2018-11-16 11:14 UTC comment-formatter-utf8
Commit c9ec3d1886367b546a37e674df1bff9913d8664a
1 file changed +56 -7
+56 -7
--- src/comformat.c
+++ src/comformat.c
@@ -120,22 +120,67 @@
120120
** zero if such a character cannot be found. For the purposes of this
121121
** algorithm, the NUL character is treated the same as a spacing character.
122122
*/
123123
static int comment_next_space(
124124
const char *zLine, /* [in] The comment line being printed. */
125
- int index /* [in] The current character index being handled. */
125
+ int index, /* [in] The current character index being handled. */
126
+ int *distUTF8 /* [out] Distance to next space in UTF-8 sequences. */
126127
){
127128
int nextIndex = index + 1;
129
+ int fNonASCII=0;
128130
for(;;){
129131
char c = zLine[nextIndex];
132
+ if ( (c&0x80)==0x80 ) fNonASCII=1;
130133
if( c==0 || fossil_isspace(c) ){
134
+ if ( distUTF8 ){
135
+ if ( fNonASCII!=0 ){
136
+ *distUTF8 = strlen_utf8(&zLine[index], nextIndex-index);
137
+ }else{
138
+ *distUTF8 = nextIndex-index;
139
+ }
140
+ }
131141
return nextIndex;
132142
}
133143
nextIndex++;
134144
}
135145
return 0; /* NOT REACHED */
136146
}
147
+
148
+/*
149
+** Count the number of UTF-8 sequences in a string. Incomplete, ill-formed and
150
+** overlong sequences are counted as one sequence. The invalid lead bytes 0xC0
151
+** to 0xC1 and 0xF5 to 0xF7 are allowed to initiate (ill-formed) 2- and 4-byte
152
+** sequences, respectively, the other invalid lead bytes 0xF8 to 0xFF are
153
+** treated as invalid 1-byte sequences (as lone trail bytes).
154
+** Combining characters and East Asian Wide and Fullwidth characters are counted
155
+** as one, so this function does not calculate the effective "display width".
156
+*/
157
+int strlen_utf8(const char *zString, int lengthBytes)
158
+{
159
+#if 0
160
+ assert( lengthBytes>=0 );
161
+#endif
162
+ int lengthUTF8=0; /* Counted UTF-8 sequences. */
163
+ int i;
164
+ for( i=0; i<lengthBytes; i++ ){
165
+ char c = zString[i];
166
+ lengthUTF8++;
167
+ if ( (c&0xc0)==0xc0 ){ /* Any UTF-8 lead byte 11xxxxxx */
168
+ int cchUTF8=1; /* Code units consumed. */
169
+ int maxUTF8=1; /* Expected sequence length. */
170
+ if( (c&0xe0)==0xc0 )maxUTF8=2; /* UTF-8 lead byte 110vvvvv */
171
+ else if( (c&0xf0)==0xe0 )maxUTF8=3; /* UTF-8 lead byte 1110vvvv */
172
+ else if( (c&0xf8)==0xf0 )maxUTF8=4; /* UTF-8 lead byte 11110vvv */
173
+ while( i<lengthBytes-1 &&
174
+ cchUTF8<maxUTF8 &&
175
+ (zString[i+1]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
176
+ i++;
177
+ }
178
+ }
179
+ }
180
+ return lengthUTF8;
181
+}
137182
138183
/*
139184
** This function is called when printing a logical comment line to calculate
140185
** the necessary indenting. The caller needs to emit the indenting spaces.
141186
*/
@@ -227,23 +272,25 @@
227272
if( c=='\n' ){
228273
lineCnt++;
229274
charCnt = 0;
230275
useChars = 0;
231276
}else if( c=='\t' ){
232
- int nextIndex = comment_next_space(zLine, index);
233
- if( nextIndex<=0 || (nextIndex-index)>maxChars ){
277
+ int distUTF8;
278
+ int nextIndex = comment_next_space(zLine, index, &distUTF8);
279
+ if( nextIndex<=0 || distUTF8>maxChars ){
234280
break;
235281
}
236282
charCnt++;
237283
useChars = COMMENT_TAB_WIDTH;
238284
if( maxChars<useChars ){
239285
zBuf[iBuf++] = ' ';
240286
break;
241287
}
242288
}else if( wordBreak && fossil_isspace(c) ){
243
- int nextIndex = comment_next_space(zLine, index);
244
- if( nextIndex<=0 || (nextIndex-index)>maxChars ){
289
+ int distUTF8;
290
+ int nextIndex = comment_next_space(zLine, index, &distUTF8);
291
+ if( nextIndex<=0 || distUTF8>maxChars ){
245292
break;
246293
}
247294
charCnt++;
248295
}else{
249296
charCnt++;
@@ -267,14 +314,16 @@
267314
while( cchUTF8<maxUTF8 &&
268315
(zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
269316
cchUTF8++;
270317
zBuf[iBuf++] = zLine[index++];
271318
}
319
+ maxChars--;
272320
}
273
- else
321
+ else {
274322
zBuf[iBuf++] = c;
275
- if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars;
323
+ maxChars -= useChars;
324
+ }
276325
if( maxChars<=0 ) break;
277326
if( c=='\n' ) break;
278327
}
279328
if( charCnt>0 ){
280329
zBuf[iBuf++] = '\n';
281330
--- src/comformat.c
+++ src/comformat.c
@@ -120,22 +120,67 @@
120 ** zero if such a character cannot be found. For the purposes of this
121 ** algorithm, the NUL character is treated the same as a spacing character.
122 */
123 static int comment_next_space(
124 const char *zLine, /* [in] The comment line being printed. */
125 int index /* [in] The current character index being handled. */
 
126 ){
127 int nextIndex = index + 1;
 
128 for(;;){
129 char c = zLine[nextIndex];
 
130 if( c==0 || fossil_isspace(c) ){
 
 
 
 
 
 
 
131 return nextIndex;
132 }
133 nextIndex++;
134 }
135 return 0; /* NOT REACHED */
136 }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
138 /*
139 ** This function is called when printing a logical comment line to calculate
140 ** the necessary indenting. The caller needs to emit the indenting spaces.
141 */
@@ -227,23 +272,25 @@
227 if( c=='\n' ){
228 lineCnt++;
229 charCnt = 0;
230 useChars = 0;
231 }else if( c=='\t' ){
232 int nextIndex = comment_next_space(zLine, index);
233 if( nextIndex<=0 || (nextIndex-index)>maxChars ){
 
234 break;
235 }
236 charCnt++;
237 useChars = COMMENT_TAB_WIDTH;
238 if( maxChars<useChars ){
239 zBuf[iBuf++] = ' ';
240 break;
241 }
242 }else if( wordBreak && fossil_isspace(c) ){
243 int nextIndex = comment_next_space(zLine, index);
244 if( nextIndex<=0 || (nextIndex-index)>maxChars ){
 
245 break;
246 }
247 charCnt++;
248 }else{
249 charCnt++;
@@ -267,14 +314,16 @@
267 while( cchUTF8<maxUTF8 &&
268 (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
269 cchUTF8++;
270 zBuf[iBuf++] = zLine[index++];
271 }
 
272 }
273 else
274 zBuf[iBuf++] = c;
275 if( (c&0x80)==0 || (zLine[index+1]&0xc0)!=0xc0 ) maxChars -= useChars;
 
276 if( maxChars<=0 ) break;
277 if( c=='\n' ) break;
278 }
279 if( charCnt>0 ){
280 zBuf[iBuf++] = '\n';
281
--- src/comformat.c
+++ src/comformat.c
@@ -120,22 +120,67 @@
120 ** zero if such a character cannot be found. For the purposes of this
121 ** algorithm, the NUL character is treated the same as a spacing character.
122 */
static int comment_next_space(
  const char *zLine, /* [in] The comment line being printed. */
  int index,         /* [in] The current character index being handled. */
  int *distUTF8      /* [out] Distance to next space in UTF-8 sequences. */
){
  int sawHighBit = 0;  /* Set once any scanned byte has bit 7 set. */
  int iScan;           /* Byte index currently being examined. */

  /* Scan forward, starting one byte past index, until a NUL or a spacing
  ** character is found.  The byte at index itself is never examined. */
  for( iScan=index+1; ; iScan++ ){
    char ch = zLine[iScan];
    if( (ch&0x80)!=0 ) sawHighBit = 1;
    if( ch==0 || fossil_isspace(ch) ) break;
  }

  /* The distance is only reported when the caller asked for it.  For pure
  ** ASCII input the byte distance already equals the sequence count, so
  ** the call to strlen_utf8() is skipped. */
  if( distUTF8 ){
    int nBytes = iScan - index;
    if( sawHighBit ){
      *distUTF8 = strlen_utf8(&zLine[index], nBytes);
    }else{
      *distUTF8 = nBytes;
    }
  }
  return iScan;
}
147
/*
** Count the number of UTF-8 sequences in the first lengthBytes bytes of
** zString.  The bytes are examined as raw data: a NUL byte does not end the
** scan, and a lengthBytes of zero or less yields a count of 0.
**
** Incomplete, ill-formed and overlong sequences are counted as one sequence.
** The invalid lead bytes 0xC0 to 0xC1 and 0xF5 to 0xF7 are allowed to
** initiate (ill-formed) 2- and 4-byte sequences, respectively, the other
** invalid lead bytes 0xF8 to 0xFF are treated as invalid 1-byte sequences
** (as lone trail bytes).  A lead byte absorbs at most as many trail bytes as
** its bit pattern announces; any further trail bytes, like all lone trail
** bytes, are counted as one sequence each.
**
** Combining characters and East Asian Wide and Fullwidth characters are
** counted as one, so this function does not calculate the effective
** "display width".
*/
int strlen_utf8(const char *zString, int lengthBytes)
{
  int lengthUTF8 = 0;   /* Counted UTF-8 sequences. */
  int i;                /* Index of the byte being examined. */
  for( i=0; i<lengthBytes; i++ ){
    /* Read the byte as unsigned so the bit tests do not depend on whether
    ** plain char is signed on this platform. */
    unsigned char c = (unsigned char)zString[i];
    lengthUTF8++;
    if( (c&0xc0)==0xc0 ){                   /* Any UTF-8 lead byte 11xxxxxx */
      int cchUTF8 = 1;                      /* Code units consumed. */
      int maxUTF8 = 1;                      /* Expected sequence length. */
      if( (c&0xe0)==0xc0 ) maxUTF8 = 2;     /* UTF-8 lead byte 110vvvvv */
      else if( (c&0xf0)==0xe0 ) maxUTF8 = 3;/* UTF-8 lead byte 1110vvvv */
      else if( (c&0xf8)==0xf0 ) maxUTF8 = 4;/* UTF-8 lead byte 11110vvv */
      /* Absorb the trail bytes belonging to this lead byte.  cchUTF8 must
      ** advance with every consumed byte, otherwise the maxUTF8 limit never
      ** takes effect and surplus trail bytes are swallowed as well. */
      while( cchUTF8<maxUTF8
          && i<lengthBytes-1
          && (((unsigned char)zString[i+1])&0xc0)==0x80 ){ /* 10vvvvvv */
        cchUTF8++;
        i++;
      }
    }
  }
  return lengthUTF8;
}
182
183 /*
184 ** This function is called when printing a logical comment line to calculate
185 ** the necessary indenting. The caller needs to emit the indenting spaces.
186 */
@@ -227,23 +272,25 @@
272 if( c=='\n' ){
273 lineCnt++;
274 charCnt = 0;
275 useChars = 0;
276 }else if( c=='\t' ){
277 int distUTF8;
278 int nextIndex = comment_next_space(zLine, index, &distUTF8);
279 if( nextIndex<=0 || distUTF8>maxChars ){
280 break;
281 }
282 charCnt++;
283 useChars = COMMENT_TAB_WIDTH;
284 if( maxChars<useChars ){
285 zBuf[iBuf++] = ' ';
286 break;
287 }
288 }else if( wordBreak && fossil_isspace(c) ){
289 int distUTF8;
290 int nextIndex = comment_next_space(zLine, index, &distUTF8);
291 if( nextIndex<=0 || distUTF8>maxChars ){
292 break;
293 }
294 charCnt++;
295 }else{
296 charCnt++;
@@ -267,14 +314,16 @@
314 while( cchUTF8<maxUTF8 &&
315 (zLine[index]&0xc0)==0x80 ){ /* UTF-8 trail byte 10vvvvvv */
316 cchUTF8++;
317 zBuf[iBuf++] = zLine[index++];
318 }
319 maxChars--;
320 }
321 else {
322 zBuf[iBuf++] = c;
323 maxChars -= useChars;
324 }
325 if( maxChars<=0 ) break;
326 if( c=='\n' ) break;
327 }
328 if( charCnt>0 ){
329 zBuf[iBuf++] = '\n';
330

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button