Fossil SCM

Restructured invalid_utf8 so that it no longer has to initialize the lookup table on the first pass, and shrank the size of the table.

sdr 2016-06-14 18:06 invalid_utf8_table
Commit d3fc377276b80b413d1d0f2eed2cd5d57517d029
1 file changed +48 -58
+48 -58
--- src/lookslike.c
+++ src/lookslike.c
@@ -141,81 +141,71 @@
141141
** which is not considered invalid here: Some languages like
142142
** Java and Tcl use it. This function also considers valid
143143
** the derivatives CESU-8 & WTF-8 (as described in the same
144144
** wikipedia article referenced previously).
145145
*/
146
+
147
+/* definitions for various UTF-8 sequence lengths */
148
+static const unsigned char us2a[] = {
149
+ 2, 0xC0, 0xC0, 0x80, 0x80
150
+};
151
+static const unsigned char us2b[] = {
152
+ 2, 0xC2, 0xDF, 0x80, 0xBF
153
+};
154
+static const unsigned char us3a[] = {
155
+ 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF
156
+};
157
+static const unsigned char us3b[] = {
158
+ 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF
159
+};
160
+static const unsigned char us4a[] = {
161
+ 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
162
+};
163
+static const unsigned char us4b[] = {
164
+ 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
165
+};
166
+static const unsigned char us4c[] = {
167
+ 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
168
+};
169
+
170
+/* a table used for quick lookup of the definition that goes with a
171
+ * particular lead byte */
172
+static const unsigned char* lb_tab[] = {
173
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
174
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
175
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
176
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
177
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
178
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
180
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
181
+ us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
182
+ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
183
+ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
184
+ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
185
+ us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
186
+ us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
187
+ us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
188
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
189
+};
146190
147191
int invalid_utf8(
148192
const Blob *pContent
149193
){
150
- /* definitions for various UTF-8 sequence lengths */
151
- static unsigned char def_2a[] = {
152
- 2, 0xC0, 0xC0, 0x80, 0x80
153
- };
154
- static unsigned char def_2b[] = {
155
- 2, 0xC2, 0xDF, 0x80, 0xBF
156
- };
157
- static unsigned char def_3a[] = {
158
- 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF
159
- };
160
- static unsigned char def_3b[] = {
161
- 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF
162
- };
163
- static unsigned char def_4a[] = {
164
- 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
165
- };
166
- static unsigned char def_4b[] = {
167
- 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
168
- };
169
- static unsigned char def_4c[] = {
170
- 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
171
- };
172
-
173
- /* an array of all the definitions */
174
- static unsigned char* def_arr[] = {
175
- def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL
176
- };
177
-
178
- /* a table used for quick lookup of the definition that goes with a
179
- * particular lead byte */
180
- static unsigned char* lb_tab[256] = { NULL };
181
-
182
- /* a pointer to the table; NULL means not yet setup */
183
- static unsigned char** lb_ptr = NULL;
184
-
185194
/* buffer pointer and size */
186
- const unsigned char *z;
187
- unsigned int n;
188
-
189
- /* if the table pointer hasn't been initialized */
190
- if( lb_ptr==NULL ){
191
- unsigned char** pp;
192
- /* for each definition, set the lead byte table pointer to the
193
- * proper definition */
194
- lb_ptr = lb_tab;
195
- pp = def_arr;
196
- while( *pp!=NULL ){
197
- unsigned char lo = pp[0][1];
198
- unsigned char hi = pp[0][2];
199
- unsigned char i;
200
- for(i=lo; i<=hi; ++i){
201
- lb_ptr[i] = pp[0];
202
- }
203
- ++pp;
204
- }
205
- }
206
- z = (unsigned char *)blob_buffer(pContent);
207
- n = blob_size(pContent);
195
+ const unsigned char *z = (unsigned char *)blob_buffer(pContent);
196
+ unsigned int n = blob_size(pContent);
197
+
208198
/* while we haven't checked all the bytes in the buffer */
209199
while( n>0 ){
210200
/* ascii is trivial */
211201
if( *z<0x80 ){
212202
++z;
213203
--n;
214204
}else{
215205
/* get the definition for this lead byte */
216
- unsigned char* def = lb_ptr[*z++];
206
+ unsigned char* def = lb_tab[(*z++)-0x80];
217207
unsigned char i, len;
218208
219209
/* if the definition doesn't exist, return invalid */
220210
if( !def ) return LOOK_INVALID;
221211
/* get the expected sequence length */
222212
--- src/lookslike.c
+++ src/lookslike.c
@@ -141,81 +141,71 @@
141 ** which is not considered invalid here: Some languages like
142 ** Java and Tcl use it. This function also considers valid
143 ** the derivatives CESU-8 & WTF-8 (as described in the same
144 ** wikipedia article referenced previously).
145 */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
147 int invalid_utf8(
148 const Blob *pContent
149 ){
150 /* definitions for various UTF-8 sequence lengths */
151 static unsigned char def_2a[] = {
152 2, 0xC0, 0xC0, 0x80, 0x80
153 };
154 static unsigned char def_2b[] = {
155 2, 0xC2, 0xDF, 0x80, 0xBF
156 };
157 static unsigned char def_3a[] = {
158 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF
159 };
160 static unsigned char def_3b[] = {
161 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF
162 };
163 static unsigned char def_4a[] = {
164 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
165 };
166 static unsigned char def_4b[] = {
167 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
168 };
169 static unsigned char def_4c[] = {
170 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
171 };
172
173 /* an array of all the definitions */
174 static unsigned char* def_arr[] = {
175 def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL
176 };
177
178 /* a table used for quick lookup of the definition that goes with a
179 * particular lead byte */
180 static unsigned char* lb_tab[256] = { NULL };
181
182 /* a pointer to the table; NULL means not yet setup */
183 static unsigned char** lb_ptr = NULL;
184
185 /* buffer pointer and size */
186 const unsigned char *z;
187 unsigned int n;
188
189 /* if the table pointer hasn't been initialized */
190 if( lb_ptr==NULL ){
191 unsigned char** pp;
192 /* for each definition, set the lead byte table pointer to the
193 * proper definition */
194 lb_ptr = lb_tab;
195 pp = def_arr;
196 while( *pp!=NULL ){
197 unsigned char lo = pp[0][1];
198 unsigned char hi = pp[0][2];
199 unsigned char i;
200 for(i=lo; i<=hi; ++i){
201 lb_ptr[i] = pp[0];
202 }
203 ++pp;
204 }
205 }
206 z = (unsigned char *)blob_buffer(pContent);
207 n = blob_size(pContent);
208 /* while we haven't checked all the bytes in the buffer */
209 while( n>0 ){
210 /* ascii is trivial */
211 if( *z<0x80 ){
212 ++z;
213 --n;
214 }else{
215 /* get the definition for this lead byte */
216 unsigned char* def = lb_ptr[*z++];
217 unsigned char i, len;
218
219 /* if the definition doesn't exist, return invalid */
220 if( !def ) return LOOK_INVALID;
221 /* get the expected sequence length */
222
--- src/lookslike.c
+++ src/lookslike.c
@@ -141,81 +141,71 @@
141 ** which is not considered invalid here: Some languages like
142 ** Java and Tcl use it. This function also considers valid
143 ** the derivatives CESU-8 & WTF-8 (as described in the same
144 ** wikipedia article referenced previously).
145 */
146
147 /* definitions for various UTF-8 sequence lengths */
148 static const unsigned char us2a[] = {
149 2, 0xC0, 0xC0, 0x80, 0x80
150 };
151 static const unsigned char us2b[] = {
152 2, 0xC2, 0xDF, 0x80, 0xBF
153 };
154 static const unsigned char us3a[] = {
155 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF
156 };
157 static const unsigned char us3b[] = {
158 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF
159 };
160 static const unsigned char us4a[] = {
161 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF
162 };
163 static const unsigned char us4b[] = {
164 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF
165 };
166 static const unsigned char us4c[] = {
167 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF
168 };
169
170 /* a table used for quick lookup of the definition that goes with a
171 * particular lead byte */
172 static const unsigned char* lb_tab[] = {
173 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
174 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
175 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
176 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
177 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
178 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
180 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
181 us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
182 us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
183 us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
184 us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
185 us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
186 us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
187 us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
188 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
189 };
190
191 int invalid_utf8(
192 const Blob *pContent
193 ){
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194 /* buffer pointer and size */
195 const unsigned char *z = (unsigned char *)blob_buffer(pContent);
196 unsigned int n = blob_size(pContent);
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198 /* while we haven't checked all the bytes in the buffer */
199 while( n>0 ){
200 /* ascii is trivial */
201 if( *z<0x80 ){
202 ++z;
203 --n;
204 }else{
205 /* get the definition for this lead byte */
206 unsigned char* def = lb_tab[(*z++)-0x80];
207 unsigned char i, len;
208
209 /* if the definition doesn't exist, return invalid */
210 if( !def ) return LOOK_INVALID;
211 /* get the expected sequence length */
212

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button