Fossil SCM

micro-optimizing invalid_utf8 function, should be as fast as possible now

jan.nijtmans 2016-06-26 17:05 trunk merge
Commit 7c08a68503a45da327ac55ca9252d9a71b43ff17
1 file changed +52 -47
+52 -47
--- src/lookslike.c
+++ src/lookslike.c
@@ -50,10 +50,41 @@
5050
#define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */
5151
#define LOOK_BINARY (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
5252
#define LOOK_EOL (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */
5353
#endif /* INTERFACE */
5454
55
+/* definitions for various UTF-8 sequence lengths, encoded as start value
56
+ * and size of each valid range belonging to some lead byte*/
57
+#define US2A 0x80, 0x01 /* for lead byte 0xC0 */
58
+#define US2B 0x80, 0x40 /* for lead bytes 0xC2-0xDF */
59
+#define US3A 0xA0, 0x20 /* for lead byte 0xE0 */
60
+#define US3B 0x80, 0x40 /* for lead bytes 0xE1-0xEF */
61
+#define US4A 0x90, 0x30 /* for lead byte 0xF0 */
62
+#define US4B 0x80, 0x40 /* for lead bytes 0xF1-0xF3 */
63
+#define US4C 0x80, 0x10 /* for lead byte 0xF4 */
64
+#define US0A 0x00, 0x00 /* for any other lead byte */
65
+
66
+/* a table used for quick lookup of the definition that goes with a
67
+ * particular lead byte */
68
+static const unsigned char lb_tab[] = {
69
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
70
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
71
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
72
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
73
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
74
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
75
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
76
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
77
+ US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
78
+ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
79
+ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
80
+ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
81
+ US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
82
+ US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
83
+ US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
84
+ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
85
+};
5586
5687
/*
5788
** This function attempts to scan each logical line within the blob to
5889
** determine the type of content it appears to contain. The return value
5990
** is a combination of one or more of the LOOK_XXX flags (see above):
@@ -135,72 +166,46 @@
135166
}
136167
137168
/*
138169
** Checks for proper UTF-8. It uses the method described in:
139170
** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
140
-** except for the "overlong form" of \u0000 which is not considered invalid
141
-** here: Some languages like Java and Tcl use it. This function also
142
-** considers valid the derivatives CESU-8 & WTF-8 (as described in the
143
-** same wikipedia article referenced previously). For UTF-8 characters
144
-** > 7f, the variable 'c2' not necessary means the previous character.
145
-** It's number of higher 1-bits indicate the number of continuation bytes
146
-** that are expected to be followed. E.g. when 'c2' has a value in the range
147
-** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
148
-** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149
-** more continuation byte is expected.
171
+** except for the "overlong form" of \u0000 which is not considered
172
+** invalid here: Some languages like Java and Tcl use it. This function
173
+** also considers valid the derivatives CESU-8 & WTF-8 (as described in
174
+** the same wikipedia article referenced previously). For UTF-8 characters
175
+** > 0x7f, the variable 'c' does not necessarily hold the real lead byte.
176
+** Its number of leading 1-bits indicates the number of continuation
177
+** bytes that are expected to follow. E.g. when 'c' has a value
178
+** in the range 0xc0..0xdf it means that after 'c' a single continuation
179
+** byte is expected. A value 0xe0..0xef means that after 'c' two more
180
+** continuation bytes are expected.
150181
*/
151182
152
-/* definitions for various UTF-8 sequence lengths */
153
-#define US2A 0x80, 0x80 /* for lead byte 0xC0 */
154
-#define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */
155
-#define US3A 0xA0, 0xBF /* for lead byte 0xE0 */
156
-#define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */
157
-#define US4A 0x90, 0xBF /* for lead byte 0xF0 */
158
-#define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
159
-#define US4C 0x80, 0x8F /* for lead byte 0xF4 */
160
-#define US0A 0xFF, 0x00 /* for any other lead byte */
161
-
162
-/* a table used for quick lookup of the definition that goes with a
163
- * particular lead byte */
164
-static const unsigned char lb_tab[] = {
165
- US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
166
- US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
167
- US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
168
- US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
169
- US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
170
- US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
171
- US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
172
- US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
173
-};
174
-
175183
int invalid_utf8(
176184
const Blob *pContent
177185
){
178186
const unsigned char *z = (unsigned char *) blob_buffer(pContent);
179187
unsigned int n = blob_size(pContent);
180
- unsigned char c, c2;
188
+ unsigned char c; /* lead byte to be handled. */
181189
182190
if( n==0 ) return 0; /* Empty file -> OK */
183191
c = *z;
184192
while( --n>0 ){
185
- c2 = c;
186
- c = *++z;
187
- if( c2>=0xC0 ){
188
- const unsigned char *def = &lb_tab[(2*c2)-0x180];
189
- if( (c<*def) || (c>*++def) ){
193
+ if( c>=0x80 ){
194
+ const unsigned char *def; /* pointer to range table*/
195
+
196
+ c <<= 1; /* multiply by 2 and get rid of highest bit */
197
+ def = &lb_tab[c]; /* look up the lead byte's valid range in the table */
198
+ if( (unsigned int)(*++z-def[0])>=def[1] ){
190199
return LOOK_INVALID; /* Invalid UTF-8 */
191200
}
192
- if( c2>=0xe0 ){
193
- c = (c2<<1)|3;
194
- }else{
195
- c = ' ';
196
- }
197
- }else if( c2>=0x80 ){
198
- return LOOK_INVALID;
201
+ c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
202
+ } else {
203
+ c = *++z;
199204
}
200205
}
201
- return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
206
+ return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */
202207
}
203208
204209
/*
205210
** Define the type needed to represent a Unicode (UTF-16) character.
206211
*/
207212
--- src/lookslike.c
+++ src/lookslike.c
@@ -50,10 +50,41 @@
50 #define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */
51 #define LOOK_BINARY (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
52 #define LOOK_EOL (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */
53 #endif /* INTERFACE */
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
56 /*
57 ** This function attempts to scan each logical line within the blob to
58 ** determine the type of content it appears to contain. The return value
59 ** is a combination of one or more of the LOOK_XXX flags (see above):
@@ -135,72 +166,46 @@
135 }
136
137 /*
138 ** Checks for proper UTF-8. It uses the method described in:
139 ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
140 ** except for the "overlong form" of \u0000 which is not considered invalid
141 ** here: Some languages like Java and Tcl use it. This function also
142 ** considers valid the derivatives CESU-8 & WTF-8 (as described in the
143 ** same wikipedia article referenced previously). For UTF-8 characters
144 ** > 7f, the variable 'c2' not necessary means the previous character.
145 ** It's number of higher 1-bits indicate the number of continuation bytes
146 ** that are expected to be followed. E.g. when 'c2' has a value in the range
147 ** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
148 ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149 ** more continuation byte is expected.
150 */
151
152 /* definitions for various UTF-8 sequence lengths */
153 #define US2A 0x80, 0x80 /* for lead byte 0xC0 */
154 #define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */
155 #define US3A 0xA0, 0xBF /* for lead byte 0xE0 */
156 #define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */
157 #define US4A 0x90, 0xBF /* for lead byte 0xF0 */
158 #define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */
159 #define US4C 0x80, 0x8F /* for lead byte 0xF4 */
160 #define US0A 0xFF, 0x00 /* for any other lead byte */
161
162 /* a table used for quick lookup of the definition that goes with a
163 * particular lead byte */
164 static const unsigned char lb_tab[] = {
165 US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
166 US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
167 US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
168 US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
169 US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
170 US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
171 US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
172 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
173 };
174
175 int invalid_utf8(
176 const Blob *pContent
177 ){
178 const unsigned char *z = (unsigned char *) blob_buffer(pContent);
179 unsigned int n = blob_size(pContent);
180 unsigned char c, c2;
181
182 if( n==0 ) return 0; /* Empty file -> OK */
183 c = *z;
184 while( --n>0 ){
185 c2 = c;
186 c = *++z;
187 if( c2>=0xC0 ){
188 const unsigned char *def = &lb_tab[(2*c2)-0x180];
189 if( (c<*def) || (c>*++def) ){
 
190 return LOOK_INVALID; /* Invalid UTF-8 */
191 }
192 if( c2>=0xe0 ){
193 c = (c2<<1)|3;
194 }else{
195 c = ' ';
196 }
197 }else if( c2>=0x80 ){
198 return LOOK_INVALID;
199 }
200 }
201 return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
202 }
203
204 /*
205 ** Define the type needed to represent a Unicode (UTF-16) character.
206 */
207
--- src/lookslike.c
+++ src/lookslike.c
@@ -50,10 +50,41 @@
50 #define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */
51 #define LOOK_BINARY (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
52 #define LOOK_EOL (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */
53 #endif /* INTERFACE */
54
55 /* definitions for various UTF-8 sequence lengths, encoded as start value
56 * and size of each valid range belonging to some lead byte*/
57 #define US2A 0x80, 0x01 /* for lead byte 0xC0 */
58 #define US2B 0x80, 0x40 /* for lead bytes 0xC2-0xDF */
59 #define US3A 0xA0, 0x20 /* for lead byte 0xE0 */
60 #define US3B 0x80, 0x40 /* for lead bytes 0xE1-0xEF */
61 #define US4A 0x90, 0x30 /* for lead byte 0xF0 */
62 #define US4B 0x80, 0x40 /* for lead bytes 0xF1-0xF3 */
63 #define US4C 0x80, 0x10 /* for lead byte 0xF4 */
64 #define US0A 0x00, 0x00 /* for any other lead byte */
65
66 /* a table used for quick lookup of the definition that goes with a
67 * particular lead byte */
68 static const unsigned char lb_tab[] = {
69 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
70 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
71 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
72 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
73 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
74 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
75 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
76 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
77 US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
78 US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
79 US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
80 US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
81 US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
82 US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
83 US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
84 US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
85 };
86
87 /*
88 ** This function attempts to scan each logical line within the blob to
89 ** determine the type of content it appears to contain. The return value
90 ** is a combination of one or more of the LOOK_XXX flags (see above):
@@ -135,72 +166,46 @@
166 }
167
168 /*
169 ** Checks for proper UTF-8. It uses the method described in:
170 ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
171 ** except for the "overlong form" of \u0000 which is not considered
172 ** invalid here: Some languages like Java and Tcl use it. This function
173 ** also considers valid the derivatives CESU-8 & WTF-8 (as described in
174 ** the same wikipedia article referenced previously). For UTF-8 characters
175 ** > 0x7f, the variable 'c' does not necessarily hold the real lead byte.
176 ** Its number of leading 1-bits indicates the number of continuation
177 ** bytes that are expected to follow. E.g. when 'c' has a value
178 ** in the range 0xc0..0xdf it means that after 'c' a single continuation
179 ** byte is expected. A value 0xe0..0xef means that after 'c' two more
180 ** continuation bytes are expected.
181 */
182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183 int invalid_utf8(
184 const Blob *pContent
185 ){
186 const unsigned char *z = (unsigned char *) blob_buffer(pContent);
187 unsigned int n = blob_size(pContent);
188 unsigned char c; /* lead byte to be handled. */
189
190 if( n==0 ) return 0; /* Empty file -> OK */
191 c = *z;
192 while( --n>0 ){
193 if( c>=0x80 ){
194 const unsigned char *def; /* pointer to range table*/
195
196 c <<= 1; /* multiply by 2 and get rid of highest bit */
197 def = &lb_tab[c]; /* look up the lead byte's valid range in the table */
198 if( (unsigned int)(*++z-def[0])>=def[1] ){
199 return LOOK_INVALID; /* Invalid UTF-8 */
200 }
201 c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
202 } else {
203 c = *++z;
 
 
 
 
204 }
205 }
206 return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */
207 }
208
209 /*
210 ** Define the type needed to represent a Unicode (UTF-16) character.
211 */
212

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button