Fossil SCM
performance optimizations
Commit
635f3b0300cffc2aa01ece178fe9684ca8120f0c
Parent
8a65d6f05c51962…
1 file changed
+39
-22
+39
-22
| --- src/lookslike.c | ||
| +++ src/lookslike.c | ||
| @@ -143,24 +143,23 @@ | ||
| 143 | 143 | ** Java and Tcl use it. This function also considers valid |
| 144 | 144 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 145 | 145 | ** wikipedia article referenced previously). |
| 146 | 146 | */ |
| 147 | 147 | |
| 148 | -int invalid_utf8(const Blob *pContent) | |
| 148 | +int invalid_utf8_b(const Blob *pContent) | |
| 149 | 149 | { |
| 150 | 150 | /* definitions for various utf-8 sequence lengths */ |
| 151 | - static unsigned char def_1a[] = { 1, 0x00, 0x7F }; | |
| 152 | 151 | static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 }; |
| 153 | 152 | static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF }; |
| 154 | 153 | static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF }; |
| 155 | 154 | static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF }; |
| 156 | 155 | static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF }; |
| 157 | 156 | static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF }; |
| 158 | 157 | static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF }; |
| 159 | 158 | |
| 160 | 159 | /* an array of all the definitions */ |
| 161 | - static unsigned char* def_arr[] = { def_1a, def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL }; | |
| 160 | + static unsigned char* def_arr[] = { def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL }; | |
| 162 | 161 | |
| 163 | 162 | /* a table used for quick lookup of the definition that goes with a particular lead byte */ |
| 164 | 163 | static unsigned char* lb_tab[256] = { NULL }; |
| 165 | 164 | |
| 166 | 165 | /* a pointer to the table; NULL means not yet setup */ |
| @@ -189,29 +188,47 @@ | ||
| 189 | 188 | unsigned int n = blob_size(pContent); |
| 190 | 189 | |
| 191 | 190 | /* while we haven't checked all the bytes in the buffer */ |
| 192 | 191 | while (n > 0) |
| 193 | 192 | { |
| 194 | - /* get the definition for this lead byte */ | |
| 195 | - unsigned char* def = lb_ptr[*z]; | |
| 196 | - unsigned char i; | |
| 197 | - | |
| 198 | - /* if the definition doesn't exist, or there aren't enough bytes left, return invalid */ | |
| 199 | - if (!def || (n < def[0])) | |
| 200 | - return LOOK_INVALID; | |
| 201 | - | |
| 202 | - /* we already know byte #0 is good, so check the remaining bytes */ | |
| 203 | - for (i = 1; i < def[0]; ++i) | |
| 204 | - { | |
| 205 | - /* if the byte is outside the allowed range for this definition, return invalid */ | |
| 206 | - if ((z[i] < def[1 + i * 2 + 0]) || (z[i] > def[1 + i * 2 + 1])) | |
| 207 | - return LOOK_INVALID; | |
| 208 | - } | |
| 209 | - | |
| 210 | - /* advance to the next sequence */ | |
| 211 | - z += def[0]; | |
| 212 | - n -= def[0]; | |
| 193 | + /* ascii is trivial */ | |
| 194 | + if (*z < 0x80) | |
| 195 | + { | |
| 196 | + ++z; | |
| 197 | + --n; | |
| 198 | + } | |
| 199 | + else | |
| 200 | + { | |
| 201 | + /* get the definition for this lead byte */ | |
| 202 | + unsigned char* def = lb_ptr[*z++]; | |
| 203 | + unsigned char i, len; | |
| 204 | + | |
| 205 | + /* if the definition doesn't exist, return invalid */ | |
| 206 | + if (!def) | |
| 207 | + return LOOK_INVALID; | |
| 208 | + | |
| 209 | + /* get the expected sequence length */ | |
| 210 | + len = *def; | |
| 211 | + | |
| 212 | + /* if there aren't enough bytes left, return invalid */ | |
| 213 | + if (n < len) | |
| 214 | + return LOOK_INVALID; | |
| 215 | + | |
| 216 | + /* skip the length & lead byte range */ | |
| 217 | + def += 3; | |
| 218 | + | |
| 219 | + /* we already know byte #0 is good, so check the remaining bytes */ | |
| 220 | + for (i = 1; i < len; ++i) | |
| 221 | + { | |
| 222 | + /* if the byte is outside the allowed range for this definition, return invalid */ | |
| 223 | + if ((*z < *def++) || (*z++ > *def++)) | |
| 224 | + return LOOK_INVALID; | |
| 225 | + } | |
| 226 | + | |
| 227 | + /* advance to the next sequence */ | |
| 228 | + n -= len; | |
| 229 | + } | |
| 213 | 230 | } |
| 214 | 231 | |
| 215 | 232 | /* we made it all the way through the buffer so it's not invalid */ |
| 216 | 233 | return 0; |
| 217 | 234 | } |
| 218 | 235 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -143,24 +143,23 @@ | |
| 143 | ** Java and Tcl use it. This function also considers valid |
| 144 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 145 | ** wikipedia article referenced previously). |
| 146 | */ |
| 147 | |
| 148 | int invalid_utf8(const Blob *pContent) |
| 149 | { |
| 150 | /* definitions for various utf-8 sequence lengths */ |
| 151 | static unsigned char def_1a[] = { 1, 0x00, 0x7F }; |
| 152 | static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 }; |
| 153 | static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF }; |
| 154 | static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF }; |
| 155 | static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF }; |
| 156 | static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF }; |
| 157 | static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF }; |
| 158 | static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF }; |
| 159 | |
| 160 | /* an array of all the definitions */ |
| 161 | static unsigned char* def_arr[] = { def_1a, def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL }; |
| 162 | |
| 163 | /* a table used for quick lookup of the definition that goes with a particular lead byte */ |
| 164 | static unsigned char* lb_tab[256] = { NULL }; |
| 165 | |
| 166 | /* a pointer to the table; NULL means not yet setup */ |
| @@ -189,29 +188,47 @@ | |
| 189 | unsigned int n = blob_size(pContent); |
| 190 | |
| 191 | /* while we haven't checked all the bytes in the buffer */ |
| 192 | while (n > 0) |
| 193 | { |
| 194 | /* get the definition for this lead byte */ |
| 195 | unsigned char* def = lb_ptr[*z]; |
| 196 | unsigned char i; |
| 197 | |
| 198 | /* if the definition doesn't exist, or there aren't enough bytes left, return invalid */ |
| 199 | if (!def || (n < def[0])) |
| 200 | return LOOK_INVALID; |
| 201 | |
| 202 | /* we already know byte #0 is good, so check the remaining bytes */ |
| 203 | for (i = 1; i < def[0]; ++i) |
| 204 | { |
| 205 | /* if the byte is outside the allowed range for this definition, return invalid */ |
| 206 | if ((z[i] < def[1 + i * 2 + 0]) || (z[i] > def[1 + i * 2 + 1])) |
| 207 | return LOOK_INVALID; |
| 208 | } |
| 209 | |
| 210 | /* advance to the next sequence */ |
| 211 | z += def[0]; |
| 212 | n -= def[0]; |
| 213 | } |
| 214 | |
| 215 | /* we made it all the way through the buffer so it's not invalid */ |
| 216 | return 0; |
| 217 | } |
| 218 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -143,24 +143,23 @@ | |
| 143 | ** Java and Tcl use it. This function also considers valid |
| 144 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 145 | ** wikipedia article referenced previously). |
| 146 | */ |
| 147 | |
| 148 | int invalid_utf8_b(const Blob *pContent) |
| 149 | { |
| 150 | /* definitions for various utf-8 sequence lengths */ |
| 151 | static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 }; |
| 152 | static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF }; |
| 153 | static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF }; |
| 154 | static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF }; |
| 155 | static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF }; |
| 156 | static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF }; |
| 157 | static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF }; |
| 158 | |
| 159 | /* an array of all the definitions */ |
| 160 | static unsigned char* def_arr[] = { def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL }; |
| 161 | |
| 162 | /* a table used for quick lookup of the definition that goes with a particular lead byte */ |
| 163 | static unsigned char* lb_tab[256] = { NULL }; |
| 164 | |
| 165 | /* a pointer to the table; NULL means not yet setup */ |
| @@ -189,29 +188,47 @@ | |
| 188 | unsigned int n = blob_size(pContent); |
| 189 | |
| 190 | /* while we haven't checked all the bytes in the buffer */ |
| 191 | while (n > 0) |
| 192 | { |
| 193 | /* ascii is trivial */ |
| 194 | if (*z < 0x80) |
| 195 | { |
| 196 | ++z; |
| 197 | --n; |
| 198 | } |
| 199 | else |
| 200 | { |
| 201 | /* get the definition for this lead byte */ |
| 202 | unsigned char* def = lb_ptr[*z++]; |
| 203 | unsigned char i, len; |
| 204 | |
| 205 | /* if the definition doesn't exist, return invalid */ |
| 206 | if (!def) |
| 207 | return LOOK_INVALID; |
| 208 | |
| 209 | /* get the expected sequence length */ |
| 210 | len = *def; |
| 211 | |
| 212 | /* if there aren't enough bytes left, return invalid */ |
| 213 | if (n < len) |
| 214 | return LOOK_INVALID; |
| 215 | |
| 216 | /* skip the length & lead byte range */ |
| 217 | def += 3; |
| 218 | |
| 219 | /* we already know byte #0 is good, so check the remaining bytes */ |
| 220 | for (i = 1; i < len; ++i) |
| 221 | { |
| 222 | /* if the byte is outside the allowed range for this definition, return invalid */ |
| 223 | if ((*z < *def++) || (*z++ > *def++)) |
| 224 | return LOOK_INVALID; |
| 225 | } |
| 226 | |
| 227 | /* advance to the next sequence */ |
| 228 | n -= len; |
| 229 | } |
| 230 | } |
| 231 | |
| 232 | /* we made it all the way through the buffer so it's not invalid */ |
| 233 | return 0; |
| 234 | } |
| 235 |