Fossil SCM
Restructured invalid_utf8 so that it no longer has to initialize the lead-byte lookup table on the first pass, and shrank the size of that table.
Commit
d3fc377276b80b413d1d0f2eed2cd5d57517d029
Parent
d22c72bc9248b5e…
1 file changed
+48
-58
+48
-58
| --- src/lookslike.c | ||
| +++ src/lookslike.c | ||
| @@ -141,81 +141,71 @@ | ||
| 141 | 141 | ** which is not considered invalid here: Some languages like |
| 142 | 142 | ** Java and Tcl use it. This function also considers valid |
| 143 | 143 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 144 | 144 | ** wikipedia article referenced previously). |
| 145 | 145 | */ |
| 146 | + | |
| 147 | +/* definitions for various UTF-8 sequence lengths */ | |
| 148 | +static const unsigned char us2a[] = { | |
| 149 | + 2, 0xC0, 0xC0, 0x80, 0x80 | |
| 150 | +}; | |
| 151 | +static const unsigned char us2b[] = { | |
| 152 | + 2, 0xC2, 0xDF, 0x80, 0xBF | |
| 153 | +}; | |
| 154 | +static const unsigned char us3a[] = { | |
| 155 | + 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF | |
| 156 | +}; | |
| 157 | +static const unsigned char us3b[] = { | |
| 158 | + 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF | |
| 159 | +}; | |
| 160 | +static const unsigned char us4a[] = { | |
| 161 | + 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF | |
| 162 | +}; | |
| 163 | +static const unsigned char us4b[] = { | |
| 164 | + 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF | |
| 165 | +}; | |
| 166 | +static const unsigned char us4c[] = { | |
| 167 | + 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF | |
| 168 | +}; | |
| 169 | + | |
| 170 | +/* a table used for quick lookup of the definition that goes with a | |
| 171 | + * particular lead byte */ | |
| 172 | +static const unsigned char* lb_tab[] = { | |
| 173 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 174 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 175 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 176 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 177 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 178 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 179 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 180 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, | |
| 181 | + us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 182 | + us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 183 | + us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 184 | + us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, | |
| 185 | + us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b, | |
| 186 | + us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b, | |
| 187 | + us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL, | |
| 188 | + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL | |
| 189 | +}; | |
| 146 | 190 | |
| 147 | 191 | int invalid_utf8( |
| 148 | 192 | const Blob *pContent |
| 149 | 193 | ){ |
| 150 | - /* definitions for various UTF-8 sequence lengths */ | |
| 151 | - static unsigned char def_2a[] = { | |
| 152 | - 2, 0xC0, 0xC0, 0x80, 0x80 | |
| 153 | - }; | |
| 154 | - static unsigned char def_2b[] = { | |
| 155 | - 2, 0xC2, 0xDF, 0x80, 0xBF | |
| 156 | - }; | |
| 157 | - static unsigned char def_3a[] = { | |
| 158 | - 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF | |
| 159 | - }; | |
| 160 | - static unsigned char def_3b[] = { | |
| 161 | - 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF | |
| 162 | - }; | |
| 163 | - static unsigned char def_4a[] = { | |
| 164 | - 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF | |
| 165 | - }; | |
| 166 | - static unsigned char def_4b[] = { | |
| 167 | - 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF | |
| 168 | - }; | |
| 169 | - static unsigned char def_4c[] = { | |
| 170 | - 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF | |
| 171 | - }; | |
| 172 | - | |
| 173 | - /* an array of all the definitions */ | |
| 174 | - static unsigned char* def_arr[] = { | |
| 175 | - def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL | |
| 176 | - }; | |
| 177 | - | |
| 178 | - /* a table used for quick lookup of the definition that goes with a | |
| 179 | - * particular lead byte */ | |
| 180 | - static unsigned char* lb_tab[256] = { NULL }; | |
| 181 | - | |
| 182 | - /* a pointer to the table; NULL means not yet setup */ | |
| 183 | - static unsigned char** lb_ptr = NULL; | |
| 184 | - | |
| 185 | 194 | /* buffer pointer and size */ |
| 186 | - const unsigned char *z; | |
| 187 | - unsigned int n; | |
| 188 | - | |
| 189 | - /* if the table pointer hasn't been initialized */ | |
| 190 | - if( lb_ptr==NULL ){ | |
| 191 | - unsigned char** pp; | |
| 192 | - /* for each definition, set the lead byte table pointer to the | |
| 193 | - * proper definition */ | |
| 194 | - lb_ptr = lb_tab; | |
| 195 | - pp = def_arr; | |
| 196 | - while( *pp!=NULL ){ | |
| 197 | - unsigned char lo = pp[0][1]; | |
| 198 | - unsigned char hi = pp[0][2]; | |
| 199 | - unsigned char i; | |
| 200 | - for(i=lo; i<=hi; ++i){ | |
| 201 | - lb_ptr[i] = pp[0]; | |
| 202 | - } | |
| 203 | - ++pp; | |
| 204 | - } | |
| 205 | - } | |
| 206 | - z = (unsigned char *)blob_buffer(pContent); | |
| 207 | - n = blob_size(pContent); | |
| 195 | + const unsigned char *z = (unsigned char *)blob_buffer(pContent); | |
| 196 | + unsigned int n = blob_size(pContent); | |
| 197 | + | |
| 208 | 198 | /* while we haven't checked all the bytes in the buffer */ |
| 209 | 199 | while( n>0 ){ |
| 210 | 200 | /* ascii is trivial */ |
| 211 | 201 | if( *z<0x80 ){ |
| 212 | 202 | ++z; |
| 213 | 203 | --n; |
| 214 | 204 | }else{ |
| 215 | 205 | /* get the definition for this lead byte */ |
| 216 | - unsigned char* def = lb_ptr[*z++]; | |
| 206 | + unsigned char* def = lb_tab[(*z++)-0x80]; | |
| 217 | 207 | unsigned char i, len; |
| 218 | 208 | |
| 219 | 209 | /* if the definition doesn't exist, return invalid */ |
| 220 | 210 | if( !def ) return LOOK_INVALID; |
| 221 | 211 | /* get the expected sequence length */ |
| 222 | 212 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -141,81 +141,71 @@ | |
| 141 | ** which is not considered invalid here: Some languages like |
| 142 | ** Java and Tcl use it. This function also considers valid |
| 143 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 144 | ** wikipedia article referenced previously). |
| 145 | */ |
| 146 | |
| 147 | int invalid_utf8( |
| 148 | const Blob *pContent |
| 149 | ){ |
| 150 | /* definitions for various UTF-8 sequence lengths */ |
| 151 | static unsigned char def_2a[] = { |
| 152 | 2, 0xC0, 0xC0, 0x80, 0x80 |
| 153 | }; |
| 154 | static unsigned char def_2b[] = { |
| 155 | 2, 0xC2, 0xDF, 0x80, 0xBF |
| 156 | }; |
| 157 | static unsigned char def_3a[] = { |
| 158 | 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF |
| 159 | }; |
| 160 | static unsigned char def_3b[] = { |
| 161 | 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF |
| 162 | }; |
| 163 | static unsigned char def_4a[] = { |
| 164 | 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF |
| 165 | }; |
| 166 | static unsigned char def_4b[] = { |
| 167 | 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF |
| 168 | }; |
| 169 | static unsigned char def_4c[] = { |
| 170 | 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF |
| 171 | }; |
| 172 | |
| 173 | /* an array of all the definitions */ |
| 174 | static unsigned char* def_arr[] = { |
| 175 | def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL |
| 176 | }; |
| 177 | |
| 178 | /* a table used for quick lookup of the definition that goes with a |
| 179 | * particular lead byte */ |
| 180 | static unsigned char* lb_tab[256] = { NULL }; |
| 181 | |
| 182 | /* a pointer to the table; NULL means not yet setup */ |
| 183 | static unsigned char** lb_ptr = NULL; |
| 184 | |
| 185 | /* buffer pointer and size */ |
| 186 | const unsigned char *z; |
| 187 | unsigned int n; |
| 188 | |
| 189 | /* if the table pointer hasn't been initialized */ |
| 190 | if( lb_ptr==NULL ){ |
| 191 | unsigned char** pp; |
| 192 | /* for each definition, set the lead byte table pointer to the |
| 193 | * proper definition */ |
| 194 | lb_ptr = lb_tab; |
| 195 | pp = def_arr; |
| 196 | while( *pp!=NULL ){ |
| 197 | unsigned char lo = pp[0][1]; |
| 198 | unsigned char hi = pp[0][2]; |
| 199 | unsigned char i; |
| 200 | for(i=lo; i<=hi; ++i){ |
| 201 | lb_ptr[i] = pp[0]; |
| 202 | } |
| 203 | ++pp; |
| 204 | } |
| 205 | } |
| 206 | z = (unsigned char *)blob_buffer(pContent); |
| 207 | n = blob_size(pContent); |
| 208 | /* while we haven't checked all the bytes in the buffer */ |
| 209 | while( n>0 ){ |
| 210 | /* ascii is trivial */ |
| 211 | if( *z<0x80 ){ |
| 212 | ++z; |
| 213 | --n; |
| 214 | }else{ |
| 215 | /* get the definition for this lead byte */ |
| 216 | unsigned char* def = lb_ptr[*z++]; |
| 217 | unsigned char i, len; |
| 218 | |
| 219 | /* if the definition doesn't exist, return invalid */ |
| 220 | if( !def ) return LOOK_INVALID; |
| 221 | /* get the expected sequence length */ |
| 222 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -141,81 +141,71 @@ | |
| 141 | ** which is not considered invalid here: Some languages like |
| 142 | ** Java and Tcl use it. This function also considers valid |
| 143 | ** the derivatives CESU-8 & WTF-8 (as described in the same |
| 144 | ** wikipedia article referenced previously). |
| 145 | */ |
| 146 | |
| 147 | /* definitions for various UTF-8 sequence lengths */ |
| 148 | static const unsigned char us2a[] = { |
| 149 | 2, 0xC0, 0xC0, 0x80, 0x80 |
| 150 | }; |
| 151 | static const unsigned char us2b[] = { |
| 152 | 2, 0xC2, 0xDF, 0x80, 0xBF |
| 153 | }; |
| 154 | static const unsigned char us3a[] = { |
| 155 | 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF |
| 156 | }; |
| 157 | static const unsigned char us3b[] = { |
| 158 | 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF |
| 159 | }; |
| 160 | static const unsigned char us4a[] = { |
| 161 | 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF |
| 162 | }; |
| 163 | static const unsigned char us4b[] = { |
| 164 | 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF |
| 165 | }; |
| 166 | static const unsigned char us4c[] = { |
| 167 | 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF |
| 168 | }; |
| 169 | |
| 170 | /* a table used for quick lookup of the definition that goes with a |
| 171 | * particular lead byte */ |
| 172 | static const unsigned char* lb_tab[] = { |
| 173 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 174 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 175 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 176 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 177 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 178 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 179 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 180 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, |
| 181 | us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b, |
| 182 | us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, |
| 183 | us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, |
| 184 | us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b, |
| 185 | us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b, |
| 186 | us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b, |
| 187 | us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL, |
| 188 | NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL |
| 189 | }; |
| 190 | |
| 191 | int invalid_utf8( |
| 192 | const Blob *pContent |
| 193 | ){ |
| 194 | /* buffer pointer and size */ |
| 195 | const unsigned char *z = (unsigned char *)blob_buffer(pContent); |
| 196 | unsigned int n = blob_size(pContent); |
| 197 | |
| 198 | /* while we haven't checked all the bytes in the buffer */ |
| 199 | while( n>0 ){ |
| 200 | /* ascii is trivial */ |
| 201 | if( *z<0x80 ){ |
| 202 | ++z; |
| 203 | --n; |
| 204 | }else{ |
| 205 | /* get the definition for this lead byte */ |
| 206 | unsigned char* def = lb_tab[(*z++)-0x80]; |
| 207 | unsigned char i, len; |
| 208 | |
| 209 | /* if the definition doesn't exist, return invalid */ |
| 210 | if( !def ) return LOOK_INVALID; |
| 211 | /* get the expected sequence length */ |
| 212 |