Fossil SCM

proposed new invalid_utf8 function

sdr 2016-06-10 08:07 trunk
Commit e58334a00799c303c6ad187e6c84c2151457b146
1 file changed +73 -36
+73 -36
--- src/lookslike.c
+++ src/lookslike.c
@@ -136,49 +136,86 @@
136136
137137
138138
/*
139139
** Checks for proper UTF-8. It uses the method described in:
140140
** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
141
-** except for the "overlong form" of \u0000 which is not considered invalid
142
-** here: Some languages like Java and Tcl use it. For UTF-8 characters
143
-** > 7f, the variable 'c2' does not necessarily hold the previous character.
144
-** Its number of leading 1-bits indicates the number of continuation bytes
145
-** that are expected to follow. E.g. when 'c2' has a value in the range
146
-** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
147
-** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
148
-** more continuation byte is expected.
141
+** except for the "overlong form" of \u0000 (Modified UTF-8)
142
+** which is not considered invalid here: Some languages like
143
+** Java and Tcl use it. This function also accepts as valid
144
+** the derivatives CESU-8 and WTF-8 (as described in the same
145
+** Wikipedia article referenced above).
149146
*/
150147
151
-int invalid_utf8(const Blob *pContent){
152
- const unsigned char *z = (unsigned char *) blob_buffer(pContent);
148
+int invalid_utf8(const Blob *pContent)
149
+{
150
+ /* definitions for various utf-8 sequence lengths */
151
+ static unsigned char def_1a[] = { 1, 0x00, 0x7F };
152
+ static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
153
+ static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
154
+ static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
155
+ static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
156
+ static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
157
+ static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
158
+ static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };
159
+
160
+ /* an array of all the definitions */
161
+ static unsigned char* def_arr[] = { def_1a, def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };
162
+
163
+ /* a table used for quick lookup of the definition that goes with a particular lead byte */
164
+ static unsigned char* lb_tab[256] = { NULL };
165
+
166
+ /* a pointer to the table; NULL means not yet set up */
167
+ static unsigned char** lb_ptr = NULL;
168
+
169
+ /* if the table pointer hasn't been initialized */
170
+ if (lb_ptr == NULL)
171
+ {
172
+ lb_ptr = lb_tab;
173
+
174
+ /* for each definition, set the lead byte table pointer to the proper definition */
175
+ unsigned char** pp = def_arr;
176
+ while (*pp != NULL)
177
+ {
178
+ unsigned char lo = pp[0][1];
179
+ unsigned char hi = pp[0][2];
180
+ unsigned char i;
181
+ for (i = lo; i <= hi; ++i)
182
+ lb_ptr[i] = pp[0];
183
+ ++pp;
184
+ }
185
+ }
186
+
187
+ /* buffer pointer and size */
188
+ const unsigned char *z = (unsigned char *)blob_buffer(pContent);
153189
unsigned int n = blob_size(pContent);
154
- unsigned char c, c2;
155
-
156
- if( n==0 ) return 0; /* Empty file -> OK */
157
- c = *z;
158
- while( --n>0 ){
159
- c2 = c;
160
- c = *++z;
161
- if( c2>=0x80 ){
162
- if( ((c2<0xc2) || (c2>=0xf4) || ((c&0xc0)!=0x80)) &&
163
- (((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80))) ){
164
- return LOOK_INVALID; /* Invalid UTF-8 */
165
- }
166
- /* the first byte of the sequence is okay
167
- ** but we need to check the rest
168
- ** convert next byte to a prefix byte of the next shorter sequence
169
- ** or a simple space character if the two byte seq was valid
170
- */
171
- c = (c2 >= 0xe0) ? (c2<<1)+1 : ' ';
172
- /* edge case: if three byte sequence started with 0xe0
173
- ** it becomes 0xc1, which is a too short two byte sequence
174
- ** so fix it up to be the start of a valid two byte sequence
175
- */
176
- if (c == 0xc1) c = 0xc2;
177
- }
178
- }
179
- return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
190
+
191
+ /* while we haven't checked all the bytes in the buffer */
192
+ while (n > 0)
193
+ {
194
+ /* get the definition for this lead byte */
195
+ unsigned char* def = lb_ptr[*z];
196
+ unsigned char i;
197
+
198
+ /* if the definition doesn't exist, or there aren't enough bytes left, return invalid */
199
+ if (!def || (n < def[0]))
200
+ return LOOK_INVALID;
201
+
202
+ /* we already know byte #0 is good, so check the remaining bytes */
203
+ for (i = 1; i < def[0]; ++i)
204
+ {
205
+ /* if the byte is outside the allowed range for this definition, return invalid */
206
+ if ((z[i] < def[1 + i * 2 + 0]) || (z[i] > def[1 + i * 2 + 1]))
207
+ return LOOK_INVALID;
208
+ }
209
+
210
+ /* advance to the next sequence */
211
+ z += def[0];
212
+ n -= def[0];
213
+ }
214
+
215
+ /* we made it all the way through the buffer so it's not invalid */
216
+ return 0;
180217
}
181218
182219
183220
/*
184221
** Define the type needed to represent a Unicode (UTF-16) character.
185222
--- src/lookslike.c
+++ src/lookslike.c
@@ -136,49 +136,86 @@
136
137
138 /*
139 ** Checks for proper UTF-8. It uses the method described in:
140 ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
141 ** except for the "overlong form" of \u0000 which is not considered invalid
142 ** here: Some languages like Java and Tcl use it. For UTF-8 characters
143 ** > 7f, the variable 'c2' does not necessarily hold the previous character.
144 ** Its number of leading 1-bits indicates the number of continuation bytes
145 ** that are expected to follow. E.g. when 'c2' has a value in the range
146 ** 0xc0..0xdf it means that 'c' is expected to contain the last continuation
147 ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one
148 ** more continuation byte is expected.
149 */
150
151 int invalid_utf8(const Blob *pContent){
152 const unsigned char *z = (unsigned char *) blob_buffer(pContent);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153 unsigned int n = blob_size(pContent);
154 unsigned char c, c2;
155
156 if( n==0 ) return 0; /* Empty file -> OK */
157 c = *z;
158 while( --n>0 ){
159 c2 = c;
160 c = *++z;
161 if( c2>=0x80 ){
162 if( ((c2<0xc2) || (c2>=0xf4) || ((c&0xc0)!=0x80)) &&
163 (((c2!=0xf4) || (c>=0x90)) && ((c2!=0xc0) || (c!=0x80))) ){
164 return LOOK_INVALID; /* Invalid UTF-8 */
165 }
166 /* the first byte of the sequence is okay
167 ** but we need to check the rest
168 ** convert next byte to a prefix byte of the next shorter sequence
169 ** or a simple space character if the two byte seq was valid
170 */
171 c = (c2 >= 0xe0) ? (c2<<1)+1 : ' ';
172 /* edge case: if three byte sequence started with 0xe0
173 ** it becomes 0xc1, which is a too short two byte sequence
174 ** so fix it up to be the start of a valid two byte sequence
175 */
176 if (c == 0xc1) c = 0xc2;
177 }
178 }
179 return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */
 
180 }
181
182
183 /*
184 ** Define the type needed to represent a Unicode (UTF-16) character.
185
--- src/lookslike.c
+++ src/lookslike.c
@@ -136,49 +136,86 @@
136
137
138 /*
139 ** Checks for proper UTF-8. It uses the method described in:
140 ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
141 ** except for the "overlong form" of \u0000 (Modified UTF-8)
142 ** which is not considered invalid here: Some languages like
143 ** Java and Tcl use it. This function also accepts as valid
144 ** the derivatives CESU-8 and WTF-8 (as described in the same
145 ** Wikipedia article referenced above).
 
 
 
146 */
147
148 int invalid_utf8(const Blob *pContent)
149 {
150 /* definitions for various utf-8 sequence lengths */
151 static unsigned char def_1a[] = { 1, 0x00, 0x7F };
152 static unsigned char def_2a[] = { 2, 0xC0, 0xC0, 0x80, 0x80 };
153 static unsigned char def_2b[] = { 2, 0xC2, 0xDF, 0x80, 0xBF };
154 static unsigned char def_3a[] = { 3, 0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF };
155 static unsigned char def_3b[] = { 3, 0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF };
156 static unsigned char def_4a[] = { 4, 0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
157 static unsigned char def_4b[] = { 4, 0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF };
158 static unsigned char def_4c[] = { 4, 0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF };
159
160 /* an array of all the definitions */
161 static unsigned char* def_arr[] = { def_1a, def_2a, def_2b, def_3a, def_3b, def_4a, def_4b, def_4c, NULL };
162
163 /* a table used for quick lookup of the definition that goes with a particular lead byte */
164 static unsigned char* lb_tab[256] = { NULL };
165
166 /* a pointer to the table; NULL means not yet set up */
167 static unsigned char** lb_ptr = NULL;
168
169 /* if the table pointer hasn't been initialized */
170 if (lb_ptr == NULL)
171 {
172 lb_ptr = lb_tab;
173
174 /* for each definition, set the lead byte table pointer to the proper definition */
175 unsigned char** pp = def_arr;
176 while (*pp != NULL)
177 {
178 unsigned char lo = pp[0][1];
179 unsigned char hi = pp[0][2];
180 unsigned char i;
181 for (i = lo; i <= hi; ++i)
182 lb_ptr[i] = pp[0];
183 ++pp;
184 }
185 }
186
187 /* buffer pointer and size */
188 const unsigned char *z = (unsigned char *)blob_buffer(pContent);
189 unsigned int n = blob_size(pContent);
190
191 /* while we haven't checked all the bytes in the buffer */
192 while (n > 0)
193 {
194 /* get the definition for this lead byte */
195 unsigned char* def = lb_ptr[*z];
196 unsigned char i;
197
198 /* if the definition doesn't exist, or there aren't enough bytes left, return invalid */
199 if (!def || (n < def[0]))
200 return LOOK_INVALID;
201
202 /* we already know byte #0 is good, so check the remaining bytes */
203 for (i = 1; i < def[0]; ++i)
204 {
205 /* if the byte is outside the allowed range for this definition, return invalid */
206 if ((z[i] < def[1 + i * 2 + 0]) || (z[i] > def[1 + i * 2 + 1]))
207 return LOOK_INVALID;
208 }
209
210 /* advance to the next sequence */
211 z += def[0];
212 n -= def[0];
213 }
214
215 /* we made it all the way through the buffer so it's not invalid */
216 return 0;
217 }
218
219
220 /*
221 ** Define the type needed to represent a Unicode (UTF-16) character.
222

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button