Fossil SCM

Use a faster table-based approach when checking for invalid UTF-8, instead of complex bit operations.

jan.nijtmans 2016-06-16 09:13 trunk
Commit 60349a6617490676a2ea1a31fdce56decb641dc6
1 file changed +53 -10
+53 -10
--- src/lookslike.c
+++ src/lookslike.c
@@ -132,25 +132,72 @@
132132
flags |= LOOK_LONG; /* Very long line -> binary */
133133
}
134134
return flags;
135135
}
136136
137
-
138137
/*
139138
** Checks for proper UTF-8. It uses the method described in:
140139
** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
141140
** except for the "overlong form" of \u0000 which is not considered invalid
142
-** here: Some languages like Java and Tcl use it. For UTF-8 characters
141
+** here: Some languages like Java and Tcl use it. This function also
142
+** considers valid the derivatives CESU-8 & WTF-8 (as described in the
143
+** same wikipedia article referenced previously). For UTF-8 characters
143144
** > 7f, the variable 'c2' not necessary means the previous character.
144145
** It's number of higher 1-bits indicate the number of continuation bytes
145146
** that are expected to be followed. E.g. when 'c2' has a value in the range
146147
** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
147148
** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
148149
** more continuation byte is expected.
149150
*/
150151
151
-int invalid_utf8(const Blob *pContent){
152
+/* definitions for various UTF-8 sequence lengths */
153
+static const unsigned char us2a[] = { /* for lead byte 0xC0 */
154
+ 0x80, 0x80
155
+};
156
+static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */
157
+ 0x80, 0xBF
158
+};
159
+static const unsigned char us3a[] = { /* for lead byte 0xE0 */
160
+ 0xA0, 0xBF
161
+};
162
+static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */
163
+ 0x80, 0xBF
164
+};
165
+static const unsigned char us4a[] = { /* for lead byte 0xF0 */
166
+ 0x90, 0xBF
167
+};
168
+static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */
169
+ 0x80, 0xBF
170
+};
171
+static const unsigned char us4c[] = { /* for lead byte 0xF4 */
172
+ 0x80, 0x8F
173
+};
174
+
175
+/* a table used for quick lookup of the definition that goes with a
176
+ * particular lead byte */
177
+static const unsigned char* const lb_tab[] = {
178
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
180
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
181
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
182
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
183
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
184
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
185
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
186
+ us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
187
+ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
188
+ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
189
+ us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
190
+ us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
191
+ us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
192
+ us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
193
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
194
+};
195
+
196
+int invalid_utf8(
197
+ const Blob *pContent
198
+){
152199
const unsigned char *z = (unsigned char *) blob_buffer(pContent);
153200
unsigned int n = blob_size(pContent);
154201
unsigned char c, c2;
155202
156203
if( n==0 ) return 0; /* Empty file -> OK */
@@ -157,27 +204,23 @@
157204
c = *z;
158205
while( --n>0 ){
159206
c2 = c;
160207
c = *++z;
161208
if( c2>=0x80 ){
162
- if( ((c&0xc0)!=0x80) || (((c2<0xc2) || (c2>=0xf4)) &&
163
- (((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80)))) ){
209
+ const unsigned char *def = lb_tab[(c2)-0x80];
210
+ if( !def || (c<*def++) || (c>*def++) ){
164211
return LOOK_INVALID; /* Invalid UTF-8 */
165212
}
166213
if( c2>=0xe0 ){
167
- if ((c2==0xf0 && c<0x90)||(c2==0xe0 && c<0xa0) ){
168
- return LOOK_INVALID; /* Invalid UTF-8, too short */
169
- }
170214
c = (c2<<1)|3;
171215
}else{
172216
c = ' ';
173217
}
174218
}
175219
}
176220
return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
177221
}
178
-
179222
180223
/*
181224
** Define the type needed to represent a Unicode (UTF-16) character.
182225
*/
183226
#ifndef WCHAR_T
@@ -405,11 +448,11 @@
405448
fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
406449
}
407450
if( fUnicode ){
408451
lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
409452
}else{
410
- lookFlags = looks_like_utf8(&blob, 0)|invalid_utf8(&blob);
453
+ lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob);
411454
}
412455
}
413456
fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
414457
fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
415458
fossil_print("Starts with UTF-16 BOM: %s\n",
416459
--- src/lookslike.c
+++ src/lookslike.c
@@ -132,25 +132,72 @@
132 flags |= LOOK_LONG; /* Very long line -> binary */
133 }
134 return flags;
135 }
136
137
138 /*
139 ** Checks for proper UTF-8. It uses the method described in:
140 ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
141 ** except for the "overlong form" of \u0000 which is not considered invalid
142 ** here: Some languages like Java and Tcl use it. For UTF-8 characters
 
 
143 ** > 7f, the variable 'c2' not necessary means the previous character.
144 ** It's number of higher 1-bits indicate the number of continuation bytes
145 ** that are expected to be followed. E.g. when 'c2' has a value in the range
146 ** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
147 ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
148 ** more continuation byte is expected.
149 */
150
151 int invalid_utf8(const Blob *pContent){
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152 const unsigned char *z = (unsigned char *) blob_buffer(pContent);
153 unsigned int n = blob_size(pContent);
154 unsigned char c, c2;
155
156 if( n==0 ) return 0; /* Empty file -> OK */
@@ -157,27 +204,23 @@
157 c = *z;
158 while( --n>0 ){
159 c2 = c;
160 c = *++z;
161 if( c2>=0x80 ){
162 if( ((c&0xc0)!=0x80) || (((c2<0xc2) || (c2>=0xf4)) &&
163 (((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80)))) ){
164 return LOOK_INVALID; /* Invalid UTF-8 */
165 }
166 if( c2>=0xe0 ){
167 if ((c2==0xf0 && c<0x90)||(c2==0xe0 && c<0xa0) ){
168 return LOOK_INVALID; /* Invalid UTF-8, too short */
169 }
170 c = (c2<<1)|3;
171 }else{
172 c = ' ';
173 }
174 }
175 }
176 return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
177 }
178
179
180 /*
181 ** Define the type needed to represent a Unicode (UTF-16) character.
182 */
183 #ifndef WCHAR_T
@@ -405,11 +448,11 @@
405 fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
406 }
407 if( fUnicode ){
408 lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
409 }else{
410 lookFlags = looks_like_utf8(&blob, 0)|invalid_utf8(&blob);
411 }
412 }
413 fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
414 fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
415 fossil_print("Starts with UTF-16 BOM: %s\n",
416
--- src/lookslike.c
+++ src/lookslike.c
@@ -132,25 +132,72 @@
132 flags |= LOOK_LONG; /* Very long line -> binary */
133 }
134 return flags;
135 }
136
 
137 /*
138 ** Checks for proper UTF-8. It uses the method described in:
139 ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
140 ** except for the "overlong form" of \u0000 which is not considered invalid
141 ** here: Some languages like Java and Tcl use it. This function also
142 ** considers valid the derivatives CESU-8 & WTF-8 (as described in the
143 ** same wikipedia article referenced previously). For UTF-8 characters
144 ** > 7f, the variable 'c2' not necessary means the previous character.
145 ** It's number of higher 1-bits indicate the number of continuation bytes
146 ** that are expected to be followed. E.g. when 'c2' has a value in the range
147 ** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
148 ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
149 ** more continuation byte is expected.
150 */
151
152 /* definitions for various UTF-8 sequence lengths */
153 static const unsigned char us2a[] = { /* for lead byte 0xC0 */
154 0x80, 0x80
155 };
156 static const unsigned char us2b[] = { /* for lead bytes 0xC2-0xDF */
157 0x80, 0xBF
158 };
159 static const unsigned char us3a[] = { /* for lead byte 0xE0 */
160 0xA0, 0xBF
161 };
162 static const unsigned char us3b[] = { /* for lead bytes 0xE1-0xEF */
163 0x80, 0xBF
164 };
165 static const unsigned char us4a[] = { /* for lead byte 0xF0 */
166 0x90, 0xBF
167 };
168 static const unsigned char us4b[] = { /* for lead bytes 0xF1-0xF3 */
169 0x80, 0xBF
170 };
171 static const unsigned char us4c[] = { /* for lead byte 0xF4 */
172 0x80, 0x8F
173 };
174
175 /* a table used for quick lookup of the definition that goes with a
176 * particular lead byte */
177 static const unsigned char* const lb_tab[] = {
178 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
179 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
180 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
181 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
182 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
183 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
184 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
185 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
186 us2a, NULL, us2b, us2b, us2b, us2b, us2b, us2b,
187 us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
188 us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
189 us2b, us2b, us2b, us2b, us2b, us2b, us2b, us2b,
190 us3a, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
191 us3b, us3b, us3b, us3b, us3b, us3b, us3b, us3b,
192 us4a, us4b, us4b, us4b, us4c, NULL, NULL, NULL,
193 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
194 };
195
196 int invalid_utf8(
197 const Blob *pContent
198 ){
199 const unsigned char *z = (unsigned char *) blob_buffer(pContent);
200 unsigned int n = blob_size(pContent);
201 unsigned char c, c2;
202
203 if( n==0 ) return 0; /* Empty file -> OK */
@@ -157,27 +204,23 @@
204 c = *z;
205 while( --n>0 ){
206 c2 = c;
207 c = *++z;
208 if( c2>=0x80 ){
209 const unsigned char *def = lb_tab[(c2)-0x80];
210 if( !def || (c<*def++) || (c>*def++) ){
211 return LOOK_INVALID; /* Invalid UTF-8 */
212 }
213 if( c2>=0xe0 ){
 
 
 
214 c = (c2<<1)|3;
215 }else{
216 c = ' ';
217 }
218 }
219 }
220 return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
221 }
 
222
223 /*
224 ** Define the type needed to represent a Unicode (UTF-16) character.
225 */
226 #ifndef WCHAR_T
@@ -405,11 +448,11 @@
448 fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
449 }
450 if( fUnicode ){
451 lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
452 }else{
453 lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob);
454 }
455 }
456 fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
457 fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
458 fossil_print("Starts with UTF-16 BOM: %s\n",
459

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button