Fossil SCM
Fixed an edge case in invalid_utf8() where a valid three-byte sequence could be misidentified as invalid
Commit
314cdab0d49d74237ddca987c3e0faa0870373cd
Parent
c7e9625d4238f32…
1 file changed
+10
+10
| --- src/lookslike.c | ||
| +++ src/lookslike.c | ||
| @@ -161,11 +161,21 @@ | ||
| 161 | 161 | if( c2>=0x80 ){ |
| 162 | 162 | if( ((c2<0xc2) || (c2>=0xf4) || ((c&0xc0)!=0x80)) && |
| 163 | 163 | (((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80))) ){ |
| 164 | 164 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 165 | 165 | } |
| 166 | + /* the first byte of the sequence is okay | |
| 167 | + ** but we need to check the rest | |
| 168 | + ** convert next byte to a prefix byte of the next shorter sequence | |
| 169 | + ** or a simple space character if the two byte seq was valid | |
| 170 | + */ | |
| 166 | 171 | c = (c2 >= 0xe0) ? (c2<<1)+1 : ' '; |
| 172 | + /* edge case: if three byte sequence started with 0xe0 | |
| 173 | + ** it becomes 0xc1, which is a too short two byte sequence | |
| 174 | + ** so fix it up to be the start of a valid two byte sequence | |
| 175 | + */ | |
| 176 | + if (c == 0xc1) c = 0xc2; | |
| 167 | 177 | } |
| 168 | 178 | } |
| 169 | 179 | return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */ |
| 170 | 180 | } |
| 171 | 181 | |
| 172 | 182 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -161,11 +161,21 @@ | |
| 161 | if( c2>=0x80 ){ |
| 162 | if( ((c2<0xc2) || (c2>=0xf4) || ((c&0xc0)!=0x80)) && |
| 163 | (((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80))) ){ |
| 164 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 165 | } |
| 166 | c = (c2 >= 0xe0) ? (c2<<1)+1 : ' '; |
| 167 | } |
| 168 | } |
| 169 | return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */ |
| 170 | } |
| 171 | |
| 172 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -161,11 +161,21 @@ | |
| 161 | if( c2>=0x80 ){ |
| 162 | if( ((c2<0xc2) || (c2>=0xf4) || ((c&0xc0)!=0x80)) && |
| 163 | (((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80))) ){ |
| 164 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 165 | } |
| 166 | /* the first byte of the sequence is okay |
| 167 | ** but we need to check the rest |
| 168 | ** convert next byte to a prefix byte of the next shorter sequence |
| 169 | ** or a simple space character if the two byte seq was valid |
| 170 | */ |
| 171 | c = (c2 >= 0xe0) ? (c2<<1)+1 : ' '; |
| 172 | /* edge case: if three byte sequence started with 0xe0 |
| 173 | ** it becomes 0xc1, which is a too short two byte sequence |
| 174 | ** so fix it up to be the start of a valid two byte sequence |
| 175 | */ |
| 176 | if (c == 0xc1) c = 0xc2; |
| 177 | } |
| 178 | } |
| 179 | return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */ |
| 180 | } |
| 181 | |
| 182 |