Fossil SCM
If the table is encoded as start-value/size pairs, one variable and one comparison can be saved. This should be even faster.
Commit
758e3d318893fe5478bbcade2a5826574a07ec62
Parent
7f067f29400dea1…
1 file changed
+10
-10
+10
-10
| --- src/lookslike.c | ||
| +++ src/lookslike.c | ||
| @@ -147,18 +147,19 @@ | ||
| 147 | 147 | ** 0xc0..0xdf it means that 'c' is expected to contain the last continuation |
| 148 | 148 | ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one |
| 149 | 149 | ** more continuation byte is expected. |
| 150 | 150 | */ |
| 151 | 151 | |
| 152 | -/* definitions for various UTF-8 sequence lengths */ | |
| 153 | -#define US2A 0x7F, 0x80 /* for lead byte 0xC0 */ | |
| 154 | -#define US2B 0x7F, 0xBF /* for lead bytes 0xC2-0xDF */ | |
| 155 | -#define US3A 0x9F, 0xBF /* for lead byte 0xE0 */ | |
| 156 | -#define US3B 0x7F, 0xBF /* for lead bytes 0xE1-0xEF */ | |
| 157 | -#define US4A 0x8F, 0xBF /* for lead byte 0xF0 */ | |
| 158 | -#define US4B 0x7F, 0xBF /* for lead bytes 0xF1-0xF3 */ | |
| 159 | -#define US4C 0x7F, 0x8F /* for lead byte 0xF4 */ | |
| 152 | +/* definitions for various UTF-8 sequence lengths, encoded as start value | |
| 153 | + * and size of each valid range belonging to some lead byte*/ | |
| 154 | +#define US2A 0x80, 0x01 /* for lead byte 0xC0 */ | |
| 155 | +#define US2B 0x80, 0x40 /* for lead bytes 0xC2-0xDF */ | |
| 156 | +#define US3A 0xA0, 0x20 /* for lead byte 0xE0 */ | |
| 157 | +#define US3B 0x80, 0x40 /* for lead bytes 0xE1-0xEF */ | |
| 158 | +#define US4A 0x90, 0x30 /* for lead byte 0xF0 */ | |
| 159 | +#define US4B 0x80, 0x40 /* for lead bytes 0xF1-0xF3 */ | |
| 160 | +#define US4C 0x80, 0x10 /* for lead byte 0xF4 */ | |
| 160 | 161 | #define US0A 0xFF, 0x00 /* for any other lead byte */ |
| 161 | 162 | |
| 162 | 163 | /* a table used for quick lookup of the definition that goes with a |
| 163 | 164 | * particular lead byte */ |
| 164 | 165 | static const unsigned char lb_tab[] = { |
| @@ -189,16 +190,15 @@ | ||
| 189 | 190 | |
| 190 | 191 | if( n==0 ) return 0; /* Empty file -> OK */ |
| 191 | 192 | c = *z; |
| 192 | 193 | while( --n>0 ){ |
| 193 | 194 | if( c>=0x80 ){ |
| 194 | - unsigned char fb = *++z; /* follow-up byte after lead byte */ | |
| 195 | 195 | const unsigned char *def; /* pointer to range table*/ |
| 196 | 196 | |
| 197 | 197 | c <<= 1; /* multiply by 2 and get rid of highest bit */ |
| 198 | 198 | def = &lb_tab[c]; /* search fb's valid range in table */ |
| 199 | - if( (fb<=def[0]) || (fb>def[1]) ){ | |
| 199 | + if( (unsigned int)(*++z-def[0])>=def[1] ){ | |
| 200 | 200 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 201 | 201 | } |
| 202 | 202 | c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */ |
| 203 | 203 | } else { |
| 204 | 204 | c = *++z; |
| 205 | 205 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -147,18 +147,19 @@ | |
| 147 | ** 0xc0..0xdf it means that 'c' is expected to contain the last continuation |
| 148 | ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one |
| 149 | ** more continuation byte is expected. |
| 150 | */ |
| 151 | |
| 152 | /* definitions for various UTF-8 sequence lengths */ |
| 153 | #define US2A 0x7F, 0x80 /* for lead byte 0xC0 */ |
| 154 | #define US2B 0x7F, 0xBF /* for lead bytes 0xC2-0xDF */ |
| 155 | #define US3A 0x9F, 0xBF /* for lead byte 0xE0 */ |
| 156 | #define US3B 0x7F, 0xBF /* for lead bytes 0xE1-0xEF */ |
| 157 | #define US4A 0x8F, 0xBF /* for lead byte 0xF0 */ |
| 158 | #define US4B 0x7F, 0xBF /* for lead bytes 0xF1-0xF3 */ |
| 159 | #define US4C 0x7F, 0x8F /* for lead byte 0xF4 */ |
| 160 | #define US0A 0xFF, 0x00 /* for any other lead byte */ |
| 161 | |
| 162 | /* a table used for quick lookup of the definition that goes with a |
| 163 | * particular lead byte */ |
| 164 | static const unsigned char lb_tab[] = { |
| @@ -189,16 +190,15 @@ | |
| 189 | |
| 190 | if( n==0 ) return 0; /* Empty file -> OK */ |
| 191 | c = *z; |
| 192 | while( --n>0 ){ |
| 193 | if( c>=0x80 ){ |
| 194 | unsigned char fb = *++z; /* follow-up byte after lead byte */ |
| 195 | const unsigned char *def; /* pointer to range table*/ |
| 196 | |
| 197 | c <<= 1; /* multiply by 2 and get rid of highest bit */ |
| 198 | def = &lb_tab[c]; /* search fb's valid range in table */ |
| 199 | if( (fb<=def[0]) || (fb>def[1]) ){ |
| 200 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 201 | } |
| 202 | c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */ |
| 203 | } else { |
| 204 | c = *++z; |
| 205 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -147,18 +147,19 @@ | |
| 147 | ** 0xc0..0xdf it means that 'c' is expected to contain the last continuation |
| 148 | ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one |
| 149 | ** more continuation byte is expected. |
| 150 | */ |
| 151 | |
| 152 | /* definitions for various UTF-8 sequence lengths, encoded as start value |
| 153 | * and size of each valid range belonging to some lead byte*/ |
| 154 | #define US2A 0x80, 0x01 /* for lead byte 0xC0 */ |
| 155 | #define US2B 0x80, 0x40 /* for lead bytes 0xC2-0xDF */ |
| 156 | #define US3A 0xA0, 0x20 /* for lead byte 0xE0 */ |
| 157 | #define US3B 0x80, 0x40 /* for lead bytes 0xE1-0xEF */ |
| 158 | #define US4A 0x90, 0x30 /* for lead byte 0xF0 */ |
| 159 | #define US4B 0x80, 0x40 /* for lead bytes 0xF1-0xF3 */ |
| 160 | #define US4C 0x80, 0x10 /* for lead byte 0xF4 */ |
| 161 | #define US0A 0xFF, 0x00 /* for any other lead byte */ |
| 162 | |
| 163 | /* a table used for quick lookup of the definition that goes with a |
| 164 | * particular lead byte */ |
| 165 | static const unsigned char lb_tab[] = { |
| @@ -189,16 +190,15 @@ | |
| 190 | |
| 191 | if( n==0 ) return 0; /* Empty file -> OK */ |
| 192 | c = *z; |
| 193 | while( --n>0 ){ |
| 194 | if( c>=0x80 ){ |
| 195 | const unsigned char *def; /* pointer to range table*/ |
| 196 | |
| 197 | c <<= 1; /* multiply by 2 and get rid of highest bit */ |
| 198 | def = &lb_tab[c]; /* search fb's valid range in table */ |
| 199 | if( (unsigned int)(*++z-def[0])>=def[1] ){ |
| 200 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 201 | } |
| 202 | c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */ |
| 203 | } else { |
| 204 | c = *++z; |
| 205 |