Fossil SCM

If the table is encoded as start-value/size, a variable and a comparison can be saved. This should be even faster.

jan.nijtmans 2016-06-18 16:50 invalid_utf8_improvements
Commit 758e3d318893fe5478bbcade2a5826574a07ec62
1 file changed +10 -10
+10 -10
--- src/lookslike.c
+++ src/lookslike.c
@@ -147,18 +147,19 @@
147147
** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
148148
** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149149
** more continuation byte is expected.
150150
*/
151151
152
-/* definitions for various UTF-8 sequence lengths */
153
-#define US2A 0x7F, 0x80 /* for lead byte 0xC0 */
154
-#define US2B 0x7F, 0xBF /* for lead bytes 0xC2-0xDF */
155
-#define US3A 0x9F, 0xBF /* for lead byte 0xE0 */
156
-#define US3B 0x7F, 0xBF /* for lead bytes 0xE1-0xEF */
157
-#define US4A 0x8F, 0xBF /* for lead byte 0xF0 */
158
-#define US4B 0x7F, 0xBF /* for lead bytes 0xF1-0xF3 */
159
-#define US4C 0x7F, 0x8F /* for lead byte 0xF4 */
152
+/* definitions for various UTF-8 sequence lengths, encoded as start value
153
+ * and size of each valid range belonging to some lead byte*/
154
+#define US2A 0x80, 0x01 /* for lead byte 0xC0 */
155
+#define US2B 0x80, 0x40 /* for lead bytes 0xC2-0xDF */
156
+#define US3A 0xA0, 0x20 /* for lead byte 0xE0 */
157
+#define US3B 0x80, 0x40 /* for lead bytes 0xE1-0xEF */
158
+#define US4A 0x90, 0x30 /* for lead byte 0xF0 */
159
+#define US4B 0x80, 0x40 /* for lead bytes 0xF1-0xF3 */
160
+#define US4C 0x80, 0x10 /* for lead byte 0xF4 */
160161
#define US0A 0xFF, 0x00 /* for any other lead byte */
161162
162163
/* a table used for quick lookup of the definition that goes with a
163164
* particular lead byte */
164165
static const unsigned char lb_tab[] = {
@@ -189,16 +190,15 @@
189190
190191
if( n==0 ) return 0; /* Empty file -> OK */
191192
c = *z;
192193
while( --n>0 ){
193194
if( c>=0x80 ){
194
- unsigned char fb = *++z; /* follow-up byte after lead byte */
195195
const unsigned char *def; /* pointer to range table*/
196196
197197
c <<= 1; /* multiply by 2 and get rid of highest bit */
198198
def = &lb_tab[c]; /* search fb's valid range in table */
199
- if( (fb<=def[0]) || (fb>def[1]) ){
199
+ if( (unsigned int)(*++z-def[0])>=def[1] ){
200200
return LOOK_INVALID; /* Invalid UTF-8 */
201201
}
202202
c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
203203
} else {
204204
c = *++z;
205205
--- src/lookslike.c
+++ src/lookslike.c
@@ -147,18 +147,19 @@
147 ** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
148 ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149 ** more continuation byte is expected.
150 */
151
152 /* definitions for various UTF-8 sequence lengths */
153 #define US2A 0x7F, 0x80 /* for lead byte 0xC0 */
154 #define US2B 0x7F, 0xBF /* for lead bytes 0xC2-0xDF */
155 #define US3A 0x9F, 0xBF /* for lead byte 0xE0 */
156 #define US3B 0x7F, 0xBF /* for lead bytes 0xE1-0xEF */
157 #define US4A 0x8F, 0xBF /* for lead byte 0xF0 */
158 #define US4B 0x7F, 0xBF /* for lead bytes 0xF1-0xF3 */
159 #define US4C 0x7F, 0x8F /* for lead byte 0xF4 */
 
160 #define US0A 0xFF, 0x00 /* for any other lead byte */
161
162 /* a table used for quick lookup of the definition that goes with a
163 * particular lead byte */
164 static const unsigned char lb_tab[] = {
@@ -189,16 +190,15 @@
189
190 if( n==0 ) return 0; /* Empty file -> OK */
191 c = *z;
192 while( --n>0 ){
193 if( c>=0x80 ){
194 unsigned char fb = *++z; /* follow-up byte after lead byte */
195 const unsigned char *def; /* pointer to range table*/
196
197 c <<= 1; /* multiply by 2 and get rid of highest bit */
198 def = &lb_tab[c]; /* search fb's valid range in table */
199 if( (fb<=def[0]) || (fb>def[1]) ){
200 return LOOK_INVALID; /* Invalid UTF-8 */
201 }
202 c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
203 } else {
204 c = *++z;
205
--- src/lookslike.c
+++ src/lookslike.c
@@ -147,18 +147,19 @@
147 ** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
148 ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149 ** more continuation byte is expected.
150 */
151
152 /* definitions for various UTF-8 sequence lengths, encoded as start value
153 * and size of each valid range belonging to some lead byte*/
154 #define US2A 0x80, 0x01 /* for lead byte 0xC0 */
155 #define US2B 0x80, 0x40 /* for lead bytes 0xC2-0xDF */
156 #define US3A 0xA0, 0x20 /* for lead byte 0xE0 */
157 #define US3B 0x80, 0x40 /* for lead bytes 0xE1-0xEF */
158 #define US4A 0x90, 0x30 /* for lead byte 0xF0 */
159 #define US4B 0x80, 0x40 /* for lead bytes 0xF1-0xF3 */
160 #define US4C 0x80, 0x10 /* for lead byte 0xF4 */
161 #define US0A 0xFF, 0x00 /* for any other lead byte */
162
163 /* a table used for quick lookup of the definition that goes with a
164 * particular lead byte */
165 static const unsigned char lb_tab[] = {
@@ -189,16 +190,15 @@
190
191 if( n==0 ) return 0; /* Empty file -> OK */
192 c = *z;
193 while( --n>0 ){
194 if( c>=0x80 ){
 
195 const unsigned char *def; /* pointer to range table*/
196
197 c <<= 1; /* multiply by 2 and get rid of highest bit */
198 def = &lb_tab[c]; /* search fb's valid range in table */
199 if( (unsigned int)(*++z-def[0])>=def[1] ){
200 return LOOK_INVALID; /* Invalid UTF-8 */
201 }
202 c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
203 } else {
204 c = *++z;
205

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button