Fossil SCM
micro-optimizing invalid_utf8 function, should be as fast as possible now
Commit
7c08a68503a45da327ac55ca9252d9a71b43ff17
Parent
e1034c4c35195ef…
1 file changed
+52
-47
+52
-47
| --- src/lookslike.c | ||
| +++ src/lookslike.c | ||
| @@ -50,10 +50,41 @@ | ||
| 50 | 50 | #define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */ |
| 51 | 51 | #define LOOK_BINARY (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */ |
| 52 | 52 | #define LOOK_EOL (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */ |
| 53 | 53 | #endif /* INTERFACE */ |
| 54 | 54 | |
| 55 | +/* definitions for various UTF-8 sequence lengths, encoded as start value | |
| 56 | + * and size of each valid range belonging to some lead byte*/ | |
| 57 | +#define US2A 0x80, 0x01 /* for lead byte 0xC0 */ | |
| 58 | +#define US2B 0x80, 0x40 /* for lead bytes 0xC2-0xDF */ | |
| 59 | +#define US3A 0xA0, 0x20 /* for lead byte 0xE0 */ | |
| 60 | +#define US3B 0x80, 0x40 /* for lead bytes 0xE1-0xEF */ | |
| 61 | +#define US4A 0x90, 0x30 /* for lead byte 0xF0 */ | |
| 62 | +#define US4B 0x80, 0x40 /* for lead bytes 0xF1-0xF3 */ | |
| 63 | +#define US4C 0x80, 0x10 /* for lead byte 0xF4 */ | |
| 64 | +#define US0A 0x00, 0x00 /* for any other lead byte */ | |
| 65 | + | |
| 66 | +/* a table used for quick lookup of the definition that goes with a | |
| 67 | + * particular lead byte */ | |
| 68 | +static const unsigned char lb_tab[] = { | |
| 69 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 70 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 71 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 72 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 73 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 74 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 75 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 76 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 77 | + US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 78 | + US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 79 | + US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 80 | + US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 81 | + US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, | |
| 82 | + US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, | |
| 83 | + US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, | |
| 84 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A | |
| 85 | +}; | |
| 55 | 86 | |
| 56 | 87 | /* |
| 57 | 88 | ** This function attempts to scan each logical line within the blob to |
| 58 | 89 | ** determine the type of content it appears to contain. The return value |
| 59 | 90 | ** is a combination of one or more of the LOOK_XXX flags (see above): |
| @@ -135,72 +166,46 @@ | ||
| 135 | 166 | } |
| 136 | 167 | |
| 137 | 168 | /* |
| 138 | 169 | ** Checks for proper UTF-8. It uses the method described in: |
| 139 | 170 | ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences |
| 140 | -** except for the "overlong form" of \u0000 which is not considered invalid | |
| 141 | -** here: Some languages like Java and Tcl use it. This function also | |
| 142 | -** considers valid the derivatives CESU-8 & WTF-8 (as described in the | |
| 143 | -** same wikipedia article referenced previously). For UTF-8 characters | |
| 144 | -** > 7f, the variable 'c2' not necessary means the previous character. | |
| 145 | -** It's number of higher 1-bits indicate the number of continuation bytes | |
| 146 | -** that are expected to be followed. E.g. when 'c2' has a value in the range | |
| 147 | -** 0xc0..0xdf it means that 'c' is expected to contain the last continuation | |
| 148 | -** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one | |
| 149 | -** more continuation byte is expected. | |
| 171 | +** except for the "overlong form" of \u0000 which is not considered | |
| 172 | +** invalid here: Some languages like Java and Tcl use it. This function | |
| 173 | +** also considers valid the derivatives CESU-8 & WTF-8 (as described in | |
| 174 | +** the same wikipedia article referenced previously). For UTF-8 characters | |
| 175 | +** > 0x7f, the variable 'c' does not necessarily hold the real lead byte. | |
| 176 | +** Its number of high-order 1-bits indicates the number of continuation | |
| 177 | +** bytes that are expected to follow. E.g. when 'c' has a value | |
| 178 | +** in the range 0xc0..0xdf it means that after 'c' a single continuation | |
| 179 | +** byte is expected. A value 0xe0..0xef means that after 'c' two more | |
| 180 | +** continuation bytes are expected. | |
| 150 | 181 | */ |
| 151 | 182 | |
| 152 | -/* definitions for various UTF-8 sequence lengths */ | |
| 153 | -#define US2A 0x80, 0x80 /* for lead byte 0xC0 */ | |
| 154 | -#define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */ | |
| 155 | -#define US3A 0xA0, 0xBF /* for lead byte 0xE0 */ | |
| 156 | -#define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */ | |
| 157 | -#define US4A 0x90, 0xBF /* for lead byte 0xF0 */ | |
| 158 | -#define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */ | |
| 159 | -#define US4C 0x80, 0x8F /* for lead byte 0xF4 */ | |
| 160 | -#define US0A 0xFF, 0x00 /* for any other lead byte */ | |
| 161 | - | |
| 162 | -/* a table used for quick lookup of the definition that goes with a | |
| 163 | - * particular lead byte */ | |
| 164 | -static const unsigned char lb_tab[] = { | |
| 165 | - US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 166 | - US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 167 | - US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 168 | - US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, | |
| 169 | - US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, | |
| 170 | - US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, | |
| 171 | - US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, | |
| 172 | - US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A | |
| 173 | -}; | |
| 174 | - | |
| 175 | 183 | int invalid_utf8( |
| 176 | 184 | const Blob *pContent |
| 177 | 185 | ){ |
| 178 | 186 | const unsigned char *z = (unsigned char *) blob_buffer(pContent); |
| 179 | 187 | unsigned int n = blob_size(pContent); |
| 180 | - unsigned char c, c2; | |
| 188 | + unsigned char c; /* lead byte to be handled. */ | |
| 181 | 189 | |
| 182 | 190 | if( n==0 ) return 0; /* Empty file -> OK */ |
| 183 | 191 | c = *z; |
| 184 | 192 | while( --n>0 ){ |
| 185 | - c2 = c; | |
| 186 | - c = *++z; | |
| 187 | - if( c2>=0xC0 ){ | |
| 188 | - const unsigned char *def = &lb_tab[(2*c2)-0x180]; | |
| 189 | - if( (c<*def) || (c>*++def) ){ | |
| 193 | + if( c>=0x80 ){ | |
| 194 | + const unsigned char *def; /* pointer to range table*/ | |
| 195 | + | |
| 196 | + c <<= 1; /* multiply by 2 and get rid of highest bit */ | |
| 197 | + def = &lb_tab[c]; /* search fb's valid range in table */ | |
| 198 | + if( (unsigned int)(*++z-def[0])>=def[1] ){ | |
| 190 | 199 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 191 | 200 | } |
| 192 | - if( c2>=0xe0 ){ | |
| 193 | - c = (c2<<1)|3; | |
| 194 | - }else{ | |
| 195 | - c = ' '; | |
| 196 | - } | |
| 197 | - }else if( c2>=0x80 ){ | |
| 198 | - return LOOK_INVALID; | |
| 201 | + c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */ | |
| 202 | + } else { | |
| 203 | + c = *++z; | |
| 199 | 204 | } |
| 200 | 205 | } |
| 201 | - return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */ | |
| 206 | + return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */ | |
| 202 | 207 | } |
| 203 | 208 | |
| 204 | 209 | /* |
| 205 | 210 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 206 | 211 | */ |
| 207 | 212 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -50,10 +50,41 @@ | |
| 50 | #define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */ |
| 51 | #define LOOK_BINARY (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */ |
| 52 | #define LOOK_EOL (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */ |
| 53 | #endif /* INTERFACE */ |
| 54 | |
| 55 | |
| 56 | /* |
| 57 | ** This function attempts to scan each logical line within the blob to |
| 58 | ** determine the type of content it appears to contain. The return value |
| 59 | ** is a combination of one or more of the LOOK_XXX flags (see above): |
| @@ -135,72 +166,46 @@ | |
| 135 | } |
| 136 | |
| 137 | /* |
| 138 | ** Checks for proper UTF-8. It uses the method described in: |
| 139 | ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences |
| 140 | ** except for the "overlong form" of \u0000 which is not considered invalid |
| 141 | ** here: Some languages like Java and Tcl use it. This function also |
| 142 | ** considers valid the derivatives CESU-8 & WTF-8 (as described in the |
| 143 | ** same wikipedia article referenced previously). For UTF-8 characters |
| 144 | ** > 7f, the variable 'c2' not necessary means the previous character. |
| 145 | ** It's number of higher 1-bits indicate the number of continuation bytes |
| 146 | ** that are expected to be followed. E.g. when 'c2' has a value in the range |
| 147 | ** 0xc0..0xdf it means that 'c' is expected to contain the last continuation |
| 148 | ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one |
| 149 | ** more continuation byte is expected. |
| 150 | */ |
| 151 | |
| 152 | /* definitions for various UTF-8 sequence lengths */ |
| 153 | #define US2A 0x80, 0x80 /* for lead byte 0xC0 */ |
| 154 | #define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */ |
| 155 | #define US3A 0xA0, 0xBF /* for lead byte 0xE0 */ |
| 156 | #define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */ |
| 157 | #define US4A 0x90, 0xBF /* for lead byte 0xF0 */ |
| 158 | #define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */ |
| 159 | #define US4C 0x80, 0x8F /* for lead byte 0xF4 */ |
| 160 | #define US0A 0xFF, 0x00 /* for any other lead byte */ |
| 161 | |
| 162 | /* a table used for quick lookup of the definition that goes with a |
| 163 | * particular lead byte */ |
| 164 | static const unsigned char lb_tab[] = { |
| 165 | US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, |
| 166 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 167 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 168 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 169 | US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, |
| 170 | US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, |
| 171 | US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, |
| 172 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A |
| 173 | }; |
| 174 | |
| 175 | int invalid_utf8( |
| 176 | const Blob *pContent |
| 177 | ){ |
| 178 | const unsigned char *z = (unsigned char *) blob_buffer(pContent); |
| 179 | unsigned int n = blob_size(pContent); |
| 180 | unsigned char c, c2; |
| 181 | |
| 182 | if( n==0 ) return 0; /* Empty file -> OK */ |
| 183 | c = *z; |
| 184 | while( --n>0 ){ |
| 185 | c2 = c; |
| 186 | c = *++z; |
| 187 | if( c2>=0xC0 ){ |
| 188 | const unsigned char *def = &lb_tab[(2*c2)-0x180]; |
| 189 | if( (c<*def) || (c>*++def) ){ |
| 190 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 191 | } |
| 192 | if( c2>=0xe0 ){ |
| 193 | c = (c2<<1)|3; |
| 194 | }else{ |
| 195 | c = ' '; |
| 196 | } |
| 197 | }else if( c2>=0x80 ){ |
| 198 | return LOOK_INVALID; |
| 199 | } |
| 200 | } |
| 201 | return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */ |
| 202 | } |
| 203 | |
| 204 | /* |
| 205 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 206 | */ |
| 207 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -50,10 +50,41 @@ | |
| 50 | #define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */ |
| 51 | #define LOOK_BINARY (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */ |
| 52 | #define LOOK_EOL (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */ |
| 53 | #endif /* INTERFACE */ |
| 54 | |
| 55 | /* definitions for various UTF-8 sequence lengths, encoded as start value |
| 56 | * and size of each valid range belonging to some lead byte*/ |
| 57 | #define US2A 0x80, 0x01 /* for lead byte 0xC0 */ |
| 58 | #define US2B 0x80, 0x40 /* for lead bytes 0xC2-0xDF */ |
| 59 | #define US3A 0xA0, 0x20 /* for lead byte 0xE0 */ |
| 60 | #define US3B 0x80, 0x40 /* for lead bytes 0xE1-0xEF */ |
| 61 | #define US4A 0x90, 0x30 /* for lead byte 0xF0 */ |
| 62 | #define US4B 0x80, 0x40 /* for lead bytes 0xF1-0xF3 */ |
| 63 | #define US4C 0x80, 0x10 /* for lead byte 0xF4 */ |
| 64 | #define US0A 0x00, 0x00 /* for any other lead byte */ |
| 65 | |
| 66 | /* a table used for quick lookup of the definition that goes with a |
| 67 | * particular lead byte */ |
| 68 | static const unsigned char lb_tab[] = { |
| 69 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 70 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 71 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 72 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 73 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 74 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 75 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 76 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 77 | US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, |
| 78 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 79 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 80 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 81 | US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, |
| 82 | US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B, |
| 83 | US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A, |
| 84 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A |
| 85 | }; |
| 86 | |
| 87 | /* |
| 88 | ** This function attempts to scan each logical line within the blob to |
| 89 | ** determine the type of content it appears to contain. The return value |
| 90 | ** is a combination of one or more of the LOOK_XXX flags (see above): |
| @@ -135,72 +166,46 @@ | |
| 166 | } |
| 167 | |
| 168 | /* |
| 169 | ** Checks for proper UTF-8. It uses the method described in: |
| 170 | ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences |
| 171 | ** except for the "overlong form" of \u0000 which is not considered |
| 172 | ** invalid here: Some languages like Java and Tcl use it. This function |
| 173 | ** also considers valid the derivatives CESU-8 & WTF-8 (as described in |
| 174 | ** the same wikipedia article referenced previously). For UTF-8 characters |
| 175 | ** > 0x7f, the variable 'c' does not necessarily hold the real lead byte. |
| 176 | ** Its number of high-order 1-bits indicates the number of continuation |
| 177 | ** bytes that are expected to follow. E.g. when 'c' has a value |
| 178 | ** in the range 0xc0..0xdf it means that after 'c' a single continuation |
| 179 | ** byte is expected. A value 0xe0..0xef means that after 'c' two more |
| 180 | ** continuation bytes are expected. |
| 181 | */ |
| 182 | |
| 183 | int invalid_utf8( |
| 184 | const Blob *pContent |
| 185 | ){ |
| 186 | const unsigned char *z = (unsigned char *) blob_buffer(pContent); |
| 187 | unsigned int n = blob_size(pContent); |
| 188 | unsigned char c; /* lead byte to be handled. */ |
| 189 | |
| 190 | if( n==0 ) return 0; /* Empty file -> OK */ |
| 191 | c = *z; |
| 192 | while( --n>0 ){ |
| 193 | if( c>=0x80 ){ |
| 194 | const unsigned char *def; /* pointer to range table*/ |
| 195 | |
| 196 | c <<= 1; /* multiply by 2 and get rid of highest bit */ |
| 197 | def = &lb_tab[c]; /* search fb's valid range in table */ |
| 198 | if( (unsigned int)(*++z-def[0])>=def[1] ){ |
| 199 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 200 | } |
| 201 | c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */ |
| 202 | } else { |
| 203 | c = *++z; |
| 204 | } |
| 205 | } |
| 206 | return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */ |
| 207 | } |
| 208 | |
| 209 | /* |
| 210 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 211 | */ |
| 212 |