Fossil SCM

Fixed an edge case in invalid_utf8() where a valid three-byte sequence (one starting with 0xe0) could be misidentified as invalid UTF-8.

scott 2016-06-09 23:59 trunk
Commit 314cdab0d49d74237ddca987c3e0faa0870373cd
1 file changed +10
--- src/lookslike.c
+++ src/lookslike.c
@@ -161,11 +161,21 @@
161161
if( c2>=0x80 ){
162162
if( ((c2<0xc2) || (c2>=0xf4) || ((c&0xc0)!=0x80)) &&
163163
(((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80))) ){
164164
return LOOK_INVALID; /* Invalid UTF-8 */
165165
}
166
+ /* the first byte of the sequence is okay
167
+ ** but we need to check the rest
168
+ ** convert next byte to a prefix byte of the next shorter sequence
169
+ ** or a simple space character if the two byte seq was valid
170
+ */
166171
c = (c2 >= 0xe0) ? (c2<<1)+1 : ' ';
172
+ /* edge case: if three byte sequence started with 0xe0
173
+ ** it becomes 0xc1, which is a too short two byte sequence
174
+ ** so fix it up to be the start of a valid two byte sequence
175
+ */
176
+ if (c == 0xc1) c = 0xc2;
167177
}
168178
}
169179
return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
170180
}
171181
172182
--- src/lookslike.c
+++ src/lookslike.c
@@ -161,11 +161,21 @@
161 if( c2>=0x80 ){
162 if( ((c2<0xc2) || (c2>=0xf4) || ((c&0xc0)!=0x80)) &&
163 (((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80))) ){
164 return LOOK_INVALID; /* Invalid UTF-8 */
165 }
 
 
 
 
 
166 c = (c2 >= 0xe0) ? (c2<<1)+1 : ' ';
 
 
 
 
 
167 }
168 }
169 return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
170 }
171
172
--- src/lookslike.c
+++ src/lookslike.c
@@ -161,11 +161,21 @@
161 if( c2>=0x80 ){
162 if( ((c2<0xc2) || (c2>=0xf4) || ((c&0xc0)!=0x80)) &&
163 (((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80))) ){
164 return LOOK_INVALID; /* Invalid UTF-8 */
165 }
166 /* the first byte of the sequence is okay
167 ** but we need to check the rest
168 ** convert next byte to a prefix byte of the next shorter sequence
169 ** or a simple space character if the two byte seq was valid
170 */
171 c = (c2 >= 0xe0) ? (c2<<1)+1 : ' ';
172 /* edge case: if three byte sequence started with 0xe0
173 ** it becomes 0xc1, which is a too short two byte sequence
174 ** so fix it up to be the start of a valid two byte sequence
175 */
176 if (c == 0xc1) c = 0xc2;
177 }
178 }
179 return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
180 }
181
182

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button