Fossil SCM

performance optimizations

sdr 2016-06-10 20:45 invalid_utf8_table
Commit 635f3b0300cffc2aa01ece178fe9684ca8120f0c
1 file changed +39 -22
+39 -22
--- src/lookslike.c
+++ src/lookslike.c
@@ -143,24 +143,23 @@
143143
** Java and Tcl use it. This function also considers valid
144144
** the derivatives CESU-8 & WTF-8 (as described in the same
145145
** wikipedia article referenced previously).
146146
*/
147147
148
-int invalid_utf8(const Blob *pContent)
148
+int invalid_utf8_b(const Blob *pContent)
149149
{
150150
/* definitions for various utf-8 sequence lengths */
151
- static unsigned char def_1a[] = { 1, 0x00, 0x7F };
152151
static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
153152
static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
154153
static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
155154
static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
156155
static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
157156
static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
158157
static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };
159158
160159
/* an array of all the definitions */
161
- static unsigned char* def_arr[] = { def_1a, def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };
160
+ static unsigned char* def_arr[] = { def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };
162161
163162
/* a table used for quick lookup of the definition that goes with a particular lead byte */
164163
static unsigned char* lb_tab[256] = { NULL };
165164
166165
/* a pointer to the table; NULL means not yet setup */
@@ -189,29 +188,47 @@
189188
unsigned int n = blob_size(pContent);
190189
191190
/* while we haven't checked all the bytes in the buffer */
192191
while (n > 0)
193192
{
194
- /* get the definition for this lead byte */
195
- unsigned char* def = lb_ptr[*z];
196
- unsigned char i;
197
-
198
- /* if the definition doesn't exist, or there aren't enough bytes left, return invalid */
199
- if (!def || (n < def[0]))
200
- return LOOK_INVALID;
201
-
202
- /* we already know byte #0 is good, so check the remaining bytes */
203
- for (i = 1; i < def[0]; ++i)
204
- {
205
- /* if the byte is outside the allowed range for this definition, return invalid */
206
- if ((z[i] < def[1 + i * 2 + 0]) || (z[i] > def[1 + i * 2 + 1]))
207
- return LOOK_INVALID;
208
- }
209
-
210
- /* advance to the next sequence */
211
- z += def[0];
212
- n -= def[0];
193
+ /* ascii is trivial */
194
+ if (*z < 0x80)
195
+ {
196
+ ++z;
197
+ --n;
198
+ }
199
+ else
200
+ {
201
+ /* get the definition for this lead byte */
202
+ unsigned char* def = lb_ptr[*z++];
203
+ unsigned char i, len;
204
+
205
+ /* if the definition doesn't exist, return invalid */
206
+ if (!def)
207
+ return LOOK_INVALID;
208
+
209
+ /* get the expected sequence length */
210
+ len = *def;
211
+
212
+ /* if there aren't enough bytes left, return invalid */
213
+ if (n < len)
214
+ return LOOK_INVALID;
215
+
216
+ /* skip the length & lead byte range */
217
+ def += 3;
218
+
219
+ /* we already know byte #0 is good, so check the remaining bytes */
220
+ for (i = 1; i < len; ++i)
221
+ {
222
+ /* if the byte is outside the allowed range for this definition, return invalid */
223
+ if ((*z < *def++) || (*z++ > *def++))
224
+ return LOOK_INVALID;
225
+ }
226
+
227
+ /* advance to the next sequence */
228
+ n -= len;
229
+ }
213230
}
214231
215232
/* we made it all the way through the buffer so it's not invalid */
216233
return 0;
217234
}
218235
--- src/lookslike.c
+++ src/lookslike.c
@@ -143,24 +143,23 @@
143 ** Java and Tcl use it. This function also considers valid
144 ** the derivatives CESU-8 & WTF-8 (as described in the same
145 ** wikipedia article referenced previously).
146 */
147
148 int invalid_utf8(const Blob *pContent)
149 {
150 /* definitions for various utf-8 sequence lengths */
151 static unsigned char def_1a[] = { 1, 0x00, 0x7F };
152 static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
153 static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
154 static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
155 static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
156 static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
157 static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
158 static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };
159
160 /* an array of all the definitions */
161 static unsigned char* def_arr[] = { def_1a, def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };
162
163 /* a table used for quick lookup of the definition that goes with a particular lead byte */
164 static unsigned char* lb_tab[256] = { NULL };
165
166 /* a pointer to the table; NULL means not yet setup */
@@ -189,29 +188,47 @@
189 unsigned int n = blob_size(pContent);
190
191 /* while we haven't checked all the bytes in the buffer */
192 while (n > 0)
193 {
194 /* get the definition for this lead byte */
195 unsigned char* def = lb_ptr[*z];
196 unsigned char i;
197
198 /* if the definition doesn't exist, or there aren't enough bytes left, return invalid */
199 if (!def || (n < def[0]))
200 return LOOK_INVALID;
201
202 /* we already know byte #0 is good, so check the remaining bytes */
203 for (i = 1; i < def[0]; ++i)
204 {
205 /* if the byte is outside the allowed range for this definition, return invalid */
206 if ((z[i] < def[1 + i * 2 + 0]) || (z[i] > def[1 + i * 2 + 1]))
207 return LOOK_INVALID;
208 }
209
210 /* advance to the next sequence */
211 z += def[0];
212 n -= def[0];
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213 }
214
215 /* we made it all the way through the buffer so it's not invalid */
216 return 0;
217 }
218
--- src/lookslike.c
+++ src/lookslike.c
@@ -143,24 +143,23 @@
143 ** Java and Tcl use it. This function also considers valid
144 ** the derivatives CESU-8 & WTF-8 (as described in the same
145 ** wikipedia article referenced previously).
146 */
147
148 int invalid_utf8_b(const Blob *pContent)
149 {
150 /* definitions for various utf-8 sequence lengths */
 
151 static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
152 static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
153 static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
154 static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
155 static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
156 static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
157 static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };
158
159 /* an array of all the definitions */
160 static unsigned char* def_arr[] = { def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };
161
162 /* a table used for quick lookup of the definition that goes with a particular lead byte */
163 static unsigned char* lb_tab[256] = { NULL };
164
165 /* a pointer to the table; NULL means not yet setup */
@@ -189,29 +188,47 @@
188 unsigned int n = blob_size(pContent);
189
190 /* while we haven't checked all the bytes in the buffer */
191 while (n > 0)
192 {
193 /* ascii is trivial */
194 if (*z < 0x80)
195 {
196 ++z;
197 --n;
198 }
199 else
200 {
201 /* get the definition for this lead byte */
202 unsigned char* def = lb_ptr[*z++];
203 unsigned char i, len;
204
205 /* if the definition doesn't exist, return invalid */
206 if (!def)
207 return LOOK_INVALID;
208
209 /* get the expected sequence length */
210 len = *def;
211
212 /* if there aren't enough bytes left, return invalid */
213 if (n < len)
214 return LOOK_INVALID;
215
216 /* skip the length & lead byte range */
217 def += 3;
218
219 /* we already know byte #0 is good, so check the remaining bytes */
220 for (i = 1; i < len; ++i)
221 {
222 /* if the byte is outside the allowed range for this definition, return invalid */
223 if ((*z < *def++) || (*z++ > *def++))
224 return LOOK_INVALID;
225 }
226
227 /* advance to the next sequence */
228 n -= len;
229 }
230 }
231
232 /* we made it all the way through the buffer so it's not invalid */
233 return 0;
234 }
235

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button