Fossil SCM
More optimizations, taken over from trunk.
Commit: ec7f6b2e71c5001416d37d2d5a88b63ab1b0da23
Parent: c22ec007ea9a62a…
2 files changed
+28
-43
+28
-43
+28
-43
| --- src/lookslike.c | ||
| +++ src/lookslike.c | ||
| @@ -143,51 +143,38 @@ | ||
| 143 | 143 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 144 | 144 | ** wikipedia article referenced previously). |
| 145 | 145 | */ |
| 146 | 146 | |
| 147 | 147 | /* definitions for various UTF-8 sequence lengths */ |
| 148 | -static const unsigned char us2a[] = { /* for lead byte 0xC0 */ | |
| 149 | - 2, 0x80, 0x80 | |
| 150 | -}; | |
| 151 | -static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */ | |
| 152 | - 2, 0x80, 0xBF | |
| 153 | -}; | |
| 154 | -static const unsigned char us3a[] = { /* for lead byte 0xE0 */ | |
| 155 | - 3, 0xA0, 0xBF | |
| 156 | -}; | |
| 157 | -static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */ | |
| 158 | - 3, 0x80, 0xBF | |
| 159 | -}; | |
| 160 | -static const unsigned char us4a[] = { /* for lead byte 0xF0 */ | |
| 161 | - 4, 0x90, 0xBF | |
| 162 | -}; | |
| 163 | -static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */ | |
| 164 | - 4, 0x80, 0xBF | |
| 165 | -}; | |
| 166 | -static const unsigned char us4c[] = { /* for lead byte 0xF4 */ | |
| 167 | - 4, 0x80, 0x8F | |
| 168 | -}; | |
| 148 | +#define US2A 2, 0x80, 0x80 /* for lead byte 0xC0 */ | |
| 149 | +#define US2B 2, 0x80, 0xBF /* for lead bytes 0xC2-0xDF */ | |
| 150 | +#define US3A 3, 0xA0, 0xBF /* for lead byte 0xE0 */ | |
| 151 | +#define US3B 3, 0x80, 0xBF /* for lead bytes 0xE1-0xEF */ | |
| 152 | +#define US4A 4, 0x90, 0xBF /* for lead byte 0xF0 */ | |
| 153 | +#define US4B 4, 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */ | |
| 154 | +#define US4C 4, 0x80, 0x8F /* for lead byte 0xF4 */ | |
| 155 | +#define US0A 0xFF, 0xFF, 0x00 /* for any other lead byte */ | |
| 169 | 156 | |
| 170 | 157 | /* a table used for quick lookup of the definition that goes with a |
| 171 | 158 | * particular lead byte */ |
| 172 | -static const unsigned char* const lb_tab[] = { | |
| 173 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 174 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 175 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 176 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 177 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 178 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 179 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 180 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 181 | - us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 182 | - us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 183 | - us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 184 | - us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 185 | - us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b, | |
| 186 | - us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b, | |
| 187 | - us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL, | |
| 188 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL | |
| 159 | +static const unsigned char lb_tab[] = { | |
| 160 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 161 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 162 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 163 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 164 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 165 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 166 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 167 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 168 | + US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 169 | + US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 170 | + US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 171 | + US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 172 | + US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, | |
| 173 | + US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, | |
| 174 | + US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, | |
| 175 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A | |
| 189 | 176 | }; |
| 190 | 177 | |
| 191 | 178 | int invalid_utf8( |
| 192 | 179 | const Blob *pContent |
| 193 | 180 | ){ |
| @@ -201,23 +188,21 @@ | ||
| 201 | 188 | if( *z<0x80 ){ |
| 202 | 189 | ++z; |
| 203 | 190 | --n; |
| 204 | 191 | }else{ |
| 205 | 192 | /* get the definition for this lead byte */ |
| 206 | - const unsigned char* def = lb_tab[(*z++)-0x80]; | |
| 193 | + const unsigned char* def = &lb_tab[(3 * *z++)-0x180]; | |
| 207 | 194 | unsigned char len; |
| 208 | 195 | |
| 209 | - /* if the definition doesn't exist, return invalid */ | |
| 210 | - if( !def ) return LOOK_INVALID; | |
| 211 | 196 | /* get the expected sequence length */ |
| 212 | - len = *def++; | |
| 197 | + len = *def; | |
| 213 | 198 | /* if there aren't enough bytes left, return invalid */ |
| 214 | 199 | if( n<len ) { |
| 215 | 200 | return LOOK_INVALID; |
| 216 | 201 | } |
| 217 | 202 | /* we already know byte #0 is good, so check the remaining bytes */ |
| 218 | - if( (*z<*def++) || (*z++>*def++) ){ | |
| 203 | + if( (*z<*++def) || (*z++>*++def) ){ | |
| 219 | 204 | /* if the byte is outside the allowed range for this definition, |
| 220 | 205 | * return invalid */ |
| 221 | 206 | return LOOK_INVALID; |
| 222 | 207 | } |
| 223 | 208 | if( len > 2 ){ |
| 224 | 209 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -143,51 +143,38 @@ | |
| 143 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 144 | ** wikipedia article referenced previously). |
| 145 | */ |
| 146 | |
| 147 | /* definitions for various UTF-8 sequence lengths */ |
| 148 | static const unsigned char us2a[] = { /* for lead byte 0xC0 */ |
| 149 | 2, 0x80, 0x80 |
| 150 | }; |
| 151 | static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */ |
| 152 | 2, 0x80, 0xBF |
| 153 | }; |
| 154 | static const unsigned char us3a[] = { /* for lead byte 0xE0 */ |
| 155 | 3, 0xA0, 0xBF |
| 156 | }; |
| 157 | static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */ |
| 158 | 3, 0x80, 0xBF |
| 159 | }; |
| 160 | static const unsigned char us4a[] = { /* for lead byte 0xF0 */ |
| 161 | 4, 0x90, 0xBF |
| 162 | }; |
| 163 | static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */ |
| 164 | 4, 0x80, 0xBF |
| 165 | }; |
| 166 | static const unsigned char us4c[] = { /* for lead byte 0xF4 */ |
| 167 | 4, 0x80, 0x8F |
| 168 | }; |
| 169 | |
| 170 | /* a table used for quick lookup of the definition that goes with a |
| 171 | * particular lead byte */ |
| 172 | static const unsigned char* const lb_tab[] = { |
| 173 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 174 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 175 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 176 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 177 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 178 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 179 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 180 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 181 | us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b, |
| 182 | us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, |
| 183 | us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, |
| 184 | us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, |
| 185 | us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b, |
| 186 | us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b, |
| 187 | us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL, |
| 188 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL |
| 189 | }; |
| 190 | |
| 191 | int invalid_utf8( |
| 192 | const Blob *pContent |
| 193 | ){ |
| @@ -201,23 +188,21 @@ | |
| 201 | if( *z<0x80 ){ |
| 202 | ++z; |
| 203 | --n; |
| 204 | }else{ |
| 205 | /* get the definition for this lead byte */ |
| 206 | const unsigned char* def = lb_tab[(*z++)-0x80]; |
| 207 | unsigned char len; |
| 208 | |
| 209 | /* if the definition doesn't exist, return invalid */ |
| 210 | if( !def ) return LOOK_INVALID; |
| 211 | /* get the expected sequence length */ |
| 212 | len = *def++; |
| 213 | /* if there aren't enough bytes left, return invalid */ |
| 214 | if( n<len ) { |
| 215 | return LOOK_INVALID; |
| 216 | } |
| 217 | /* we already know byte #0 is good, so check the remaining bytes */ |
| 218 | if( (*z<*def++) || (*z++>*def++) ){ |
| 219 | /* if the byte is outside the allowed range for this definition, |
| 220 | * return invalid */ |
| 221 | return LOOK_INVALID; |
| 222 | } |
| 223 | if( len > 2 ){ |
| 224 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -143,51 +143,38 @@ | |
| 143 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 144 | ** wikipedia article referenced previously). |
| 145 | */ |
| 146 | |
| 147 | /* definitions for various UTF-8 sequence lengths */ |
| 148 | #define US2A 2, 0x80, 0x80 /* for lead byte 0xC0 */ |
| 149 | #define US2B 2, 0x80, 0xBF /* for lead bytes 0xC2-0xDF */ |
| 150 | #define US3A 3, 0xA0, 0xBF /* for lead byte 0xE0 */ |
| 151 | #define US3B 3, 0x80, 0xBF /* for lead bytes 0xE1-0xEF */ |
| 152 | #define US4A 4, 0x90, 0xBF /* for lead byte 0xF0 */ |
| 153 | #define US4B 4, 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */ |
| 154 | #define US4C 4, 0x80, 0x8F /* for lead byte 0xF4 */ |
| 155 | #define US0A 0xFF, 0xFF, 0x00 /* for any other lead byte */ |
| 156 | |
| 157 | /* a table used for quick lookup of the definition that goes with a |
| 158 | * particular lead byte */ |
| 159 | static const unsigned char lb_tab[] = { |
| 160 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 161 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 162 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 163 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 164 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 165 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 166 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 167 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 168 | US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, |
| 169 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 170 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 171 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 172 | US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, |
| 173 | US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, |
| 174 | US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, |
| 175 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A |
| 176 | }; |
| 177 | |
| 178 | int invalid_utf8( |
| 179 | const Blob *pContent |
| 180 | ){ |
| @@ -201,23 +188,21 @@ | |
| 188 | if( *z<0x80 ){ |
| 189 | ++z; |
| 190 | --n; |
| 191 | }else{ |
| 192 | /* get the definition for this lead byte */ |
| 193 | const unsigned char* def = &lb_tab[(3 * *z++)-0x180]; |
| 194 | unsigned char len; |
| 195 | |
| 196 | /* get the expected sequence length */ |
| 197 | len = *def; |
| 198 | /* if there aren't enough bytes left, return invalid */ |
| 199 | if( n<len ) { |
| 200 | return LOOK_INVALID; |
| 201 | } |
| 202 | /* we already know byte #0 is good, so check the remaining bytes */ |
| 203 | if( (*z<*++def) || (*z++>*++def) ){ |
| 204 | /* if the byte is outside the allowed range for this definition, |
| 205 | * return invalid */ |
| 206 | return LOOK_INVALID; |
| 207 | } |
| 208 | if( len > 2 ){ |
| 209 |
+28
-43
| --- src/lookslike.c | ||
| +++ src/lookslike.c | ||
| @@ -143,51 +143,38 @@ | ||
| 143 | 143 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 144 | 144 | ** wikipedia article referenced previously). |
| 145 | 145 | */ |
| 146 | 146 | |
| 147 | 147 | /* definitions for various UTF-8 sequence lengths */ |
| 148 | -static const unsigned char us2a[] = { /* for lead byte 0xC0 */ | |
| 149 | - 2, 0x80, 0x80 | |
| 150 | -}; | |
| 151 | -static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */ | |
| 152 | - 2, 0x80, 0xBF | |
| 153 | -}; | |
| 154 | -static const unsigned char us3a[] = { /* for lead byte 0xE0 */ | |
| 155 | - 3, 0xA0, 0xBF | |
| 156 | -}; | |
| 157 | -static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */ | |
| 158 | - 3, 0x80, 0xBF | |
| 159 | -}; | |
| 160 | -static const unsigned char us4a[] = { /* for lead byte 0xF0 */ | |
| 161 | - 4, 0x90, 0xBF | |
| 162 | -}; | |
| 163 | -static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */ | |
| 164 | - 4, 0x80, 0xBF | |
| 165 | -}; | |
| 166 | -static const unsigned char us4c[] = { /* for lead byte 0xF4 */ | |
| 167 | - 4, 0x80, 0x8F | |
| 168 | -}; | |
| 148 | +#define US2A 2, 0x80, 0x80 /* for lead byte 0xC0 */ | |
| 149 | +#define US2B 2, 0x80, 0xBF /* for lead bytes 0xC2-0xDF */ | |
| 150 | +#define US3A 3, 0xA0, 0xBF /* for lead byte 0xE0 */ | |
| 151 | +#define US3B 3, 0x80, 0xBF /* for lead bytes 0xE1-0xEF */ | |
| 152 | +#define US4A 4, 0x90, 0xBF /* for lead byte 0xF0 */ | |
| 153 | +#define US4B 4, 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */ | |
| 154 | +#define US4C 4, 0x80, 0x8F /* for lead byte 0xF4 */ | |
| 155 | +#define US0A 0xFF, 0xFF, 0x00 /* for any other lead byte */ | |
| 169 | 156 | |
| 170 | 157 | /* a table used for quick lookup of the definition that goes with a |
| 171 | 158 | * particular lead byte */ |
| 172 | -static const unsigned char* const lb_tab[] = { | |
| 173 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 174 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 175 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 176 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 177 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 178 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 179 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 180 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 181 | - us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 182 | - us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 183 | - us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 184 | - us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 185 | - us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b, | |
| 186 | - us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b, | |
| 187 | - us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL, | |
| 188 | - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL | |
| 159 | +static const unsigned char lb_tab[] = { | |
| 160 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 161 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 162 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 163 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 164 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 165 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 166 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 167 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 168 | + US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 169 | + US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 170 | + US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 171 | + US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 172 | + US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, | |
| 173 | + US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, | |
| 174 | + US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, | |
| 175 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A | |
| 189 | 176 | }; |
| 190 | 177 | |
| 191 | 178 | int invalid_utf8( |
| 192 | 179 | const Blob *pContent |
| 193 | 180 | ){ |
| @@ -201,23 +188,21 @@ | ||
| 201 | 188 | if( *z<0x80 ){ |
| 202 | 189 | ++z; |
| 203 | 190 | --n; |
| 204 | 191 | }else{ |
| 205 | 192 | /* get the definition for this lead byte */ |
| 206 | - const unsigned char* def = lb_tab[(*z++)-0x80]; | |
| 193 | + const unsigned char* def = &lb_tab[(3 * *z++)-0x180]; | |
| 207 | 194 | unsigned char len; |
| 208 | 195 | |
| 209 | - /* if the definition doesn't exist, return invalid */ | |
| 210 | - if( !def ) return LOOK_INVALID; | |
| 211 | 196 | /* get the expected sequence length */ |
| 212 | - len = *def++; | |
| 197 | + len = *def; | |
| 213 | 198 | /* if there aren't enough bytes left, return invalid */ |
| 214 | 199 | if( n<len ) { |
| 215 | 200 | return LOOK_INVALID; |
| 216 | 201 | } |
| 217 | 202 | /* we already know byte #0 is good, so check the remaining bytes */ |
| 218 | - if( (*z<*def++) || (*z++>*def++) ){ | |
| 203 | + if( (*z<*++def) || (*z++>*++def) ){ | |
| 219 | 204 | /* if the byte is outside the allowed range for this definition, |
| 220 | 205 | * return invalid */ |
| 221 | 206 | return LOOK_INVALID; |
| 222 | 207 | } |
| 223 | 208 | if( len > 2 ){ |
| 224 | 209 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -143,51 +143,38 @@ | |
| 143 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 144 | ** wikipedia article referenced previously). |
| 145 | */ |
| 146 | |
| 147 | /* definitions for various UTF-8 sequence lengths */ |
| 148 | static const unsigned char us2a[] = { /* for lead byte 0xC0 */ |
| 149 | 2, 0x80, 0x80 |
| 150 | }; |
| 151 | static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */ |
| 152 | 2, 0x80, 0xBF |
| 153 | }; |
| 154 | static const unsigned char us3a[] = { /* for lead byte 0xE0 */ |
| 155 | 3, 0xA0, 0xBF |
| 156 | }; |
| 157 | static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */ |
| 158 | 3, 0x80, 0xBF |
| 159 | }; |
| 160 | static const unsigned char us4a[] = { /* for lead byte 0xF0 */ |
| 161 | 4, 0x90, 0xBF |
| 162 | }; |
| 163 | static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */ |
| 164 | 4, 0x80, 0xBF |
| 165 | }; |
| 166 | static const unsigned char us4c[] = { /* for lead byte 0xF4 */ |
| 167 | 4, 0x80, 0x8F |
| 168 | }; |
| 169 | |
| 170 | /* a table used for quick lookup of the definition that goes with a |
| 171 | * particular lead byte */ |
| 172 | static const unsigned char* const lb_tab[] = { |
| 173 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 174 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 175 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 176 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 177 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 178 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 179 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 180 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 181 | us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b, |
| 182 | us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, |
| 183 | us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, |
| 184 | us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, |
| 185 | us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b, |
| 186 | us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b, |
| 187 | us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL, |
| 188 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL |
| 189 | }; |
| 190 | |
| 191 | int invalid_utf8( |
| 192 | const Blob *pContent |
| 193 | ){ |
| @@ -201,23 +188,21 @@ | |
| 201 | if( *z<0x80 ){ |
| 202 | ++z; |
| 203 | --n; |
| 204 | }else{ |
| 205 | /* get the definition for this lead byte */ |
| 206 | const unsigned char* def = lb_tab[(*z++)-0x80]; |
| 207 | unsigned char len; |
| 208 | |
| 209 | /* if the definition doesn't exist, return invalid */ |
| 210 | if( !def ) return LOOK_INVALID; |
| 211 | /* get the expected sequence length */ |
| 212 | len = *def++; |
| 213 | /* if there aren't enough bytes left, return invalid */ |
| 214 | if( n<len ) { |
| 215 | return LOOK_INVALID; |
| 216 | } |
| 217 | /* we already know byte #0 is good, so check the remaining bytes */ |
| 218 | if( (*z<*def++) || (*z++>*def++) ){ |
| 219 | /* if the byte is outside the allowed range for this definition, |
| 220 | * return invalid */ |
| 221 | return LOOK_INVALID; |
| 222 | } |
| 223 | if( len > 2 ){ |
| 224 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -143,51 +143,38 @@ | |
| 143 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 144 | ** wikipedia article referenced previously). |
| 145 | */ |
| 146 | |
| 147 | /* definitions for various UTF-8 sequence lengths */ |
| 148 | #define US2A 2, 0x80, 0x80 /* for lead byte 0xC0 */ |
| 149 | #define US2B 2, 0x80, 0xBF /* for lead bytes 0xC2-0xDF */ |
| 150 | #define US3A 3, 0xA0, 0xBF /* for lead byte 0xE0 */ |
| 151 | #define US3B 3, 0x80, 0xBF /* for lead bytes 0xE1-0xEF */ |
| 152 | #define US4A 4, 0x90, 0xBF /* for lead byte 0xF0 */ |
| 153 | #define US4B 4, 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */ |
| 154 | #define US4C 4, 0x80, 0x8F /* for lead byte 0xF4 */ |
| 155 | #define US0A 0xFF, 0xFF, 0x00 /* for any other lead byte */ |
| 156 | |
| 157 | /* a table used for quick lookup of the definition that goes with a |
| 158 | * particular lead byte */ |
| 159 | static const unsigned char lb_tab[] = { |
| 160 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 161 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 162 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 163 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 164 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 165 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 166 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 167 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 168 | US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, |
| 169 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 170 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 171 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 172 | US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, |
| 173 | US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, |
| 174 | US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, |
| 175 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A |
| 176 | }; |
| 177 | |
| 178 | int invalid_utf8( |
| 179 | const Blob *pContent |
| 180 | ){ |
| @@ -201,23 +188,21 @@ | |
| 188 | if( *z<0x80 ){ |
| 189 | ++z; |
| 190 | --n; |
| 191 | }else{ |
| 192 | /* get the definition for this lead byte */ |
| 193 | const unsigned char* def = &lb_tab[(3 * *z++)-0x180]; |
| 194 | unsigned char len; |
| 195 | |
| 196 | /* get the expected sequence length */ |
| 197 | len = *def; |
| 198 | /* if there aren't enough bytes left, return invalid */ |
| 199 | if( n<len ) { |
| 200 | return LOOK_INVALID; |
| 201 | } |
| 202 | /* we already know byte #0 is good, so check the remaining bytes */ |
| 203 | if( (*z<*++def) || (*z++>*++def) ){ |
| 204 | /* if the byte is outside the allowed range for this definition, |
| 205 | * return invalid */ |
| 206 | return LOOK_INVALID; |
| 207 | } |
| 208 | if( len > 2 ){ |
| 209 |