Fossil SCM

Further invalid_utf8() improvement: Save one indirection and a check, and make the table size even smaller.

jan.nijtmans 2016-06-16 09:44 trunk
Commit 6a59dbbb99b12982b4c3adea7d2a8d002600b293
1 file changed +27 -40
+27 -40
--- src/lookslike.c
+++ src/lookslike.c
@@ -148,51 +148,38 @@
148 148
** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149 149
** more continuation byte is expected.
150 150
*/
151 151
152 152
/* definitions for various UTF-8 sequence lengths */
153
-static const unsigned char us2a[] = { /* for lead byte 0xC0 */
154
- 0x80, 0x80
155
-};
156
-static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */
157
- 0x80, 0xBF
158
-};
159
-static const unsigned char us3a[] = { /* for lead byte 0xE0 */
160
- 0xA0, 0xBF
161
-};
162
-static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */
163
- 0x80, 0xBF
164
-};
165
-static const unsigned char us4a[] = { /* for lead byte 0xF0 */
166
- 0x90, 0xBF
167
-};
168
-static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */
169
- 0x80, 0xBF
170
-};
171
-static const unsigned char us4c[] = { /* for lead byte 0xF4 */
172
- 0x80, 0x8F
173
-};
153
+#define US2A 0x80, 0x80 /* for lead byte 0xC0 */
154
+#define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */
155
+#define US3A 0xA0, 0xBF /* for lead byte 0xE0 */
156
+#define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */
157
+#define US4A 0x90, 0xBF /* for lead byte 0xF0 */
158
+#define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
159
+#define US4C 0x80, 0x8F /* for lead byte 0xF4 */
160
+#define US0A 0xFF, 0x00 /* for any other lead byte */
174 161
175 162
/* a table used for quick lookup of the definition that goes with a
176 163
* particular lead byte */
177
-static const unsigned char* const lb_tab[] = {
178
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
180
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
181
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
182
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
183
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
184
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
185
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
186
- us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
187
- us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
188
- us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
189
- us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
190
- us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
191
- us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
192
- us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
193
- NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
164
+static const unsigned char lb_tab[] = {
165
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
166
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
167
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
168
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
169
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
170
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
171
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
172
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
173
+ US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
174
+ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
175
+ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
176
+ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
177
+ US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
178
+ US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
179
+ US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
180
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
194 181
};
195 182
196 183
int invalid_utf8(
197 184
const Blob *pContent
198 185
){
@@ -204,12 +191,12 @@
204 191
c = *z;
205 192
while( --n>0 ){
206 193
c2 = c;
207 194
c = *++z;
208 195
if( c2>=0x80 ){
209
- const unsigned char *def = lb_tab[(c2)-0x80];
210
- if( !def || (c<*def++) || (c>*def++) ){
196
+ const unsigned char *def = &lb_tab[(2*c2)-0x100];
197
+ if( (c<*def++) || (c>*def++) ){
211 198
return LOOK_INVALID; /* Invalid UTF-8 */
212 199
}
213 200
if( c2>=0xe0 ){
214 201
c = (c2<<1)|3;
215 202
}else{
216 203
--- src/lookslike.c
+++ src/lookslike.c
@@ -148,51 +148,38 @@
148 ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149 ** more continuation byte is expected.
150 */
151
152 /* definitions for various UTF-8 sequence lengths */
153 static const unsigned char us2a[] = { /* for lead byte 0xC0 */
154 0x80, 0x80
155 };
156 static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */
157 0x80, 0xBF
158 };
159 static const unsigned char us3a[] = { /* for lead byte 0xE0 */
160 0xA0, 0xBF
161 };
162 static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */
163 0x80, 0xBF
164 };
165 static const unsigned char us4a[] = { /* for lead byte 0xF0 */
166 0x90, 0xBF
167 };
168 static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */
169 0x80, 0xBF
170 };
171 static const unsigned char us4c[] = { /* for lead byte 0xF4 */
172 0x80, 0x8F
173 };
174
175 /* a table used for quick lookup of the definition that goes with a
176 * particular lead byte */
177 static const unsigned char* const lb_tab[] = {
178 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
180 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
181 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
182 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
183 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
184 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
185 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
186 us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
187 us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
188 us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
189 us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
190 us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
191 us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
192 us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
193 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
194 };
195
196 int invalid_utf8(
197 const Blob *pContent
198 ){
@@ -204,12 +191,12 @@
204 c = *z;
205 while( --n>0 ){
206 c2 = c;
207 c = *++z;
208 if( c2>=0x80 ){
209 const unsigned char *def = lb_tab[(c2)-0x80];
210 if( !def || (c<*def++) || (c>*def++) ){
211 return LOOK_INVALID; /* Invalid UTF-8 */
212 }
213 if( c2>=0xe0 ){
214 c = (c2<<1)|3;
215 }else{
216
--- src/lookslike.c
+++ src/lookslike.c
@@ -148,51 +148,38 @@
148 ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149 ** more continuation byte is expected.
150 */
151
152 /* definitions for various UTF-8 sequence lengths */
153 #define US2A 0x80, 0x80 /* for lead byte 0xC0 */
154 #define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */
155 #define US3A 0xA0, 0xBF /* for lead byte 0xE0 */
156 #define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */
157 #define US4A 0x90, 0xBF /* for lead byte 0xF0 */
158 #define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
159 #define US4C 0x80, 0x8F /* for lead byte 0xF4 */
160 #define US0A 0xFF, 0x00 /* for any other lead byte */
 
 
 
 
 
 
 
 
 
 
 
 
 
161
162 /* a table used for quick lookup of the definition that goes with a
163 * particular lead byte */
164 static const unsigned char lb_tab[] = {
165 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
166 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
167 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
168 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
169 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
170 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
171 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
172 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
173 US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
174 US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
175 US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
176 US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
177 US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
178 US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
179 US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
180 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
181 };
182
183 int invalid_utf8(
184 const Blob *pContent
185 ){
@@ -204,12 +191,12 @@
191 c = *z;
192 while( --n>0 ){
193 c2 = c;
194 c = *++z;
195 if( c2>=0x80 ){
196 const unsigned char *def = &lb_tab[(2*c2)-0x100];
197 if( (c<*def++) || (c>*def++) ){
198 return LOOK_INVALID; /* Invalid UTF-8 */
199 }
200 if( c2>=0xe0 ){
201 c = (c2<<1)|3;
202 }else{
203

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button