Fossil SCM
Further invalid_utf8() improvement: Save one indirection and a check, and make the table size even smaller.
Commit
6a59dbbb99b12982b4c3adea7d2a8d002600b293
Parent
60349a661749067…
1 file changed
+27
-40
| --- src/lookslike.c | ||
| +++ src/lookslike.c | ||
| @@ -148,51 +148,38 @@ | ||
| 148 | 148 | ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one |
| 149 | 149 | ** more continuation byte is expected. |
| 150 | 150 | */ |
| 151 | 151 | |
| 152 | 152 | /* definitions for various UTF-8 sequence lengths */ |
| 153 | -static const unsigned char us2a[] = { /* for lead byte 0xC0 */ | |
| 154 | - 0x80, 0x80 | |
| 155 | -}; | |
| 156 | -static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */ | |
| 157 | - 0x80, 0xBF | |
| 158 | -}; | |
| 159 | -static const unsigned char us3a[] = { /* for lead byte 0xE0 */ | |
| 160 | - 0xA0, 0xBF | |
| 161 | -}; | |
| 162 | -static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */ | |
| 163 | - 0x80, 0xBF | |
| 164 | -}; | |
| 165 | -static const unsigned char us4a[] = { /* for lead byte 0xF0 */ | |
| 166 | - 0x90, 0xBF | |
| 167 | -}; | |
| 168 | -static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */ | |
| 169 | - 0x80, 0xBF | |
| 170 | -}; | |
| 171 | -static const unsigned char us4c[] = { /* for lead byte 0xF4 */ | |
| 172 | - 0x80, 0x8F | |
| 173 | -}; | |
| 153 | +#define US2A 0x80, 0x80 /* for lead byte 0xC0 */ | |
| 154 | +#define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */ | |
| 155 | +#define US3A 0xA0, 0xBF /* for lead byte 0xE0 */ | |
| 156 | +#define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */ | |
| 157 | +#define US4A 0x90, 0xBF /* for lead byte 0xF0 */ | |
| 158 | +#define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */ | |
| 159 | +#define US4C 0x80, 0x8F /* for lead byte 0xF4 */ | |
| 160 | +#define US0A 0xFF, 0x00 /* for any other lead byte */ | |
| 174 | 161 | |
| 175 | 162 | /* a table used for quick lookup of the definition that goes with a |
| 176 | 163 | * particular lead byte */ |
| 177 | -static const unsigned char* const lb_tab[] = { | |
| 178 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 179 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 180 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 181 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 182 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 183 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 184 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 185 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 186 | - us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 187 | - us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 188 | - us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 189 | - us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 190 | - us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b, | |
| 191 | - us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b, | |
| 192 | - us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL, | |
| 193 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL | |
| 164 | +static const unsigned char lb_tab[] = { | |
| 165 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 166 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 167 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 168 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 169 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 170 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 171 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 172 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 173 | + US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 174 | + US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 175 | + US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 176 | + US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 177 | + US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, | |
| 178 | + US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, | |
| 179 | + US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, | |
| 180 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A | |
| 194 | 181 | }; |
| 195 | 182 | |
| 196 | 183 | int invalid_utf8( |
| 197 | 184 | const Blob *pContent |
| 198 | 185 | ){ |
| @@ -204,12 +191,12 @@ | ||
| 204 | 191 | c = *z; |
| 205 | 192 | while( --n>0 ){ |
| 206 | 193 | c2 = c; |
| 207 | 194 | c = *++z; |
| 208 | 195 | if( c2>=0x80 ){ |
| 209 | - const unsigned char *def = lb_tab[(c2)-0x80]; | |
| 210 | - if( !def || (c<*def++) || (c>*def++) ){ | |
| 196 | + const unsigned char *def = &lb_tab[(2*c2)-0x100]; | |
| 197 | + if( (c<*def++) || (c>*def++) ){ | |
| 211 | 198 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 212 | 199 | } |
| 213 | 200 | if( c2>=0xe0 ){ |
| 214 | 201 | c = (c2<<1)|3; |
| 215 | 202 | }else{ |
| 216 | 203 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -148,51 +148,38 @@ | |
| 148 | ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one |
| 149 | ** more continuation byte is expected. |
| 150 | */ |
| 151 | |
| 152 | /* definitions for various UTF-8 sequence lengths */ |
| 153 | static const unsigned char us2a[] = { /* for lead byte 0xC0 */ |
| 154 | 0x80, 0x80 |
| 155 | }; |
| 156 | static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */ |
| 157 | 0x80, 0xBF |
| 158 | }; |
| 159 | static const unsigned char us3a[] = { /* for lead byte 0xE0 */ |
| 160 | 0xA0, 0xBF |
| 161 | }; |
| 162 | static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */ |
| 163 | 0x80, 0xBF |
| 164 | }; |
| 165 | static const unsigned char us4a[] = { /* for lead byte 0xF0 */ |
| 166 | 0x90, 0xBF |
| 167 | }; |
| 168 | static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */ |
| 169 | 0x80, 0xBF |
| 170 | }; |
| 171 | static const unsigned char us4c[] = { /* for lead byte 0xF4 */ |
| 172 | 0x80, 0x8F |
| 173 | }; |
| 174 | |
| 175 | /* a table used for quick lookup of the definition that goes with a |
| 176 | * particular lead byte */ |
| 177 | static const unsigned char* const lb_tab[] = { |
| 178 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 179 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 180 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 181 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 182 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 183 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 184 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 185 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 186 | us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b, |
| 187 | us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, |
| 188 | us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, |
| 189 | us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, |
| 190 | us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b, |
| 191 | us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b, |
| 192 | us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL, |
| 193 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL |
| 194 | }; |
| 195 | |
| 196 | int invalid_utf8( |
| 197 | const Blob *pContent |
| 198 | ){ |
| @@ -204,12 +191,12 @@ | |
| 204 | c = *z; |
| 205 | while( --n>0 ){ |
| 206 | c2 = c; |
| 207 | c = *++z; |
| 208 | if( c2>=0x80 ){ |
| 209 | const unsigned char *def = lb_tab[(c2)-0x80]; |
| 210 | if( !def || (c<*def++) || (c>*def++) ){ |
| 211 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 212 | } |
| 213 | if( c2>=0xe0 ){ |
| 214 | c = (c2<<1)|3; |
| 215 | }else{ |
| 216 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -148,51 +148,38 @@ | |
| 148 | ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one |
| 149 | ** more continuation byte is expected. |
| 150 | */ |
| 151 | |
| 152 | /* definitions for various UTF-8 sequence lengths */ |
| 153 | #define US2A 0x80, 0x80 /* for lead byte 0xC0 */ |
| 154 | #define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */ |
| 155 | #define US3A 0xA0, 0xBF /* for lead byte 0xE0 */ |
| 156 | #define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */ |
| 157 | #define US4A 0x90, 0xBF /* for lead byte 0xF0 */ |
| 158 | #define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */ |
| 159 | #define US4C 0x80, 0x8F /* for lead byte 0xF4 */ |
| 160 | #define US0A 0xFF, 0x00 /* for any other lead byte */ |
| 161 | |
| 162 | /* a table used for quick lookup of the definition that goes with a |
| 163 | * particular lead byte */ |
| 164 | static const unsigned char lb_tab[] = { |
| 165 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 166 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 167 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 168 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 169 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 170 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 171 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 172 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 173 | US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, |
| 174 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 175 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 176 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 177 | US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, |
| 178 | US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, |
| 179 | US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, |
| 180 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A |
| 181 | }; |
| 182 | |
| 183 | int invalid_utf8( |
| 184 | const Blob *pContent |
| 185 | ){ |
| @@ -204,12 +191,12 @@ | |
| 191 | c = *z; |
| 192 | while( --n>0 ){ |
| 193 | c2 = c; |
| 194 | c = *++z; |
| 195 | if( c2>=0x80 ){ |
| 196 | const unsigned char *def = &lb_tab[(2*c2)-0x100]; |
| 197 | if( (c<*def++) || (c>*def++) ){ |
| 198 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 199 | } |
| 200 | if( c2>=0xe0 ){ |
| 201 | c = (c2<<1)|3; |
| 202 | }else{ |
| 203 |