Fossil SCM
Juggle variables and code around, making it as efficient and readable as possible. Also add more comments.
Commit
7f067f29400dea123adce3f822e43e19bf278dc4
Parent
6bcfe1d22c13281…
1 file changed
+27
-21
+27
-21
| --- src/lookslike.c | ||
| +++ src/lookslike.c | ||
| @@ -148,22 +148,30 @@ | ||
| 148 | 148 | ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one |
| 149 | 149 | ** more continuation byte is expected. |
| 150 | 150 | */ |
| 151 | 151 | |
| 152 | 152 | /* definitions for various UTF-8 sequence lengths */ |
| 153 | -#define US2A 0x80, 0x80 /* for lead byte 0xC0 */ | |
| 154 | -#define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */ | |
| 155 | -#define US3A 0xA0, 0xBF /* for lead byte 0xE0 */ | |
| 156 | -#define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */ | |
| 157 | -#define US4A 0x90, 0xBF /* for lead byte 0xF0 */ | |
| 158 | -#define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */ | |
| 159 | -#define US4C 0x80, 0x8F /* for lead byte 0xF4 */ | |
| 153 | +#define US2A 0x7F, 0x80 /* for lead byte 0xC0 */ | |
| 154 | +#define US2B 0x7F, 0xBF /* for lead bytes 0xC2-0xDF */ | |
| 155 | +#define US3A 0x9F, 0xBF /* for lead byte 0xE0 */ | |
| 156 | +#define US3B 0x7F, 0xBF /* for lead bytes 0xE1-0xEF */ | |
| 157 | +#define US4A 0x8F, 0xBF /* for lead byte 0xF0 */ | |
| 158 | +#define US4B 0x7F, 0xBF /* for lead bytes 0xF1-0xF3 */ | |
| 159 | +#define US4C 0x7F, 0x8F /* for lead byte 0xF4 */ | |
| 160 | 160 | #define US0A 0xFF, 0x00 /* for any other lead byte */ |
| 161 | 161 | |
| 162 | 162 | /* a table used for quick lookup of the definition that goes with a |
| 163 | 163 | * particular lead byte */ |
| 164 | 164 | static const unsigned char lb_tab[] = { |
| 165 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 166 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 167 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 168 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 169 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 170 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 171 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 172 | + US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, | |
| 165 | 173 | US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, |
| 166 | 174 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 167 | 175 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 168 | 176 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 169 | 177 | US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, |
| @@ -175,32 +183,30 @@ | ||
| 175 | 183 | int invalid_utf8( |
| 176 | 184 | const Blob *pContent |
| 177 | 185 | ){ |
| 178 | 186 | const unsigned char *z = (unsigned char *) blob_buffer(pContent); |
| 179 | 187 | unsigned int n = blob_size(pContent); |
| 180 | - unsigned char c, c2; | |
| 188 | + unsigned char c; /* lead byte to be handled. */ | |
| 181 | 189 | |
| 182 | 190 | if( n==0 ) return 0; /* Empty file -> OK */ |
| 183 | 191 | c = *z; |
| 184 | 192 | while( --n>0 ){ |
| 185 | - c2 = c; | |
| 186 | - c = *++z; | |
| 187 | - if( c2>=0xC0 ){ | |
| 188 | - const unsigned char *def = &lb_tab[(2*c2)-0x180]; | |
| 189 | - if( (c<*def) || (c>*++def) ){ | |
| 193 | + if( c>=0x80 ){ | |
| 194 | + unsigned char fb = *++z; /* follow-up byte after lead byte */ | |
| 195 | + const unsigned char *def; /* pointer to range table*/ | |
| 196 | + | |
| 197 | + c <<= 1; /* multiply by 2 and get rid of highest bit */ | |
| 198 | + def = &lb_tab[c]; /* search fb's valid range in table */ | |
| 199 | + if( (fb<=def[0]) || (fb>def[1]) ){ | |
| 190 | 200 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 191 | 201 | } |
| 192 | - if( c2>=0xe0 ){ | |
| 193 | - c = (c2<<1)|3; | |
| 194 | - }else{ | |
| 195 | - c = ' '; | |
| 196 | - } | |
| 197 | - }else if( c2>=0x80 ){ | |
| 198 | - return LOOK_INVALID; | |
| 202 | + c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */ | |
| 203 | + } else { | |
| 204 | + c = *++z; | |
| 199 | 205 | } |
| 200 | 206 | } |
| 201 | - return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */ | |
| 207 | + return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */ | |
| 202 | 208 | } |
| 203 | 209 | |
| 204 | 210 | /* |
| 205 | 211 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 206 | 212 | */ |
| 207 | 213 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -148,22 +148,30 @@ | |
| 148 | ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one |
| 149 | ** more continuation byte is expected. |
| 150 | */ |
| 151 | |
| 152 | /* definitions for various UTF-8 sequence lengths */ |
| 153 | #define US2A 0x80, 0x80 /* for lead byte 0xC0 */ |
| 154 | #define US2B 0x80, 0xBF /* for lead bytes 0xC2-0xDF */ |
| 155 | #define US3A 0xA0, 0xBF /* for lead byte 0xE0 */ |
| 156 | #define US3B 0x80, 0xBF /* for lead bytes 0xE1-0xEF */ |
| 157 | #define US4A 0x90, 0xBF /* for lead byte 0xF0 */ |
| 158 | #define US4B 0x80, 0xBF /* for lead bytes 0xF1-0xF3 */ |
| 159 | #define US4C 0x80, 0x8F /* for lead byte 0xF4 */ |
| 160 | #define US0A 0xFF, 0x00 /* for any other lead byte */ |
| 161 | |
| 162 | /* a table used for quick lookup of the definition that goes with a |
| 163 | * particular lead byte */ |
| 164 | static const unsigned char lb_tab[] = { |
| 165 | US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, |
| 166 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 167 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 168 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 169 | US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, |
| @@ -175,32 +183,30 @@ | |
| 175 | int invalid_utf8( |
| 176 | const Blob *pContent |
| 177 | ){ |
| 178 | const unsigned char *z = (unsigned char *) blob_buffer(pContent); |
| 179 | unsigned int n = blob_size(pContent); |
| 180 | unsigned char c, c2; |
| 181 | |
| 182 | if( n==0 ) return 0; /* Empty file -> OK */ |
| 183 | c = *z; |
| 184 | while( --n>0 ){ |
| 185 | c2 = c; |
| 186 | c = *++z; |
| 187 | if( c2>=0xC0 ){ |
| 188 | const unsigned char *def = &lb_tab[(2*c2)-0x180]; |
| 189 | if( (c<*def) || (c>*++def) ){ |
| 190 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 191 | } |
| 192 | if( c2>=0xe0 ){ |
| 193 | c = (c2<<1)|3; |
| 194 | }else{ |
| 195 | c = ' '; |
| 196 | } |
| 197 | }else if( c2>=0x80 ){ |
| 198 | return LOOK_INVALID; |
| 199 | } |
| 200 | } |
| 201 | return (c>=0x80) ? LOOK_INVALID : 0; /* Last byte must be ASCII. */ |
| 202 | } |
| 203 | |
| 204 | /* |
| 205 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 206 | */ |
| 207 |
| --- src/lookslike.c | |
| +++ src/lookslike.c | |
| @@ -148,22 +148,30 @@ | |
| 148 | ** byte of a UTF-8 character. A value 0xe0..0xef means that after 'c' one |
| 149 | ** more continuation byte is expected. |
| 150 | */ |
| 151 | |
| 152 | /* definitions for various UTF-8 sequence lengths */ |
| 153 | #define US2A 0x7F, 0x80 /* for lead byte 0xC0 */ |
| 154 | #define US2B 0x7F, 0xBF /* for lead bytes 0xC2-0xDF */ |
| 155 | #define US3A 0x9F, 0xBF /* for lead byte 0xE0 */ |
| 156 | #define US3B 0x7F, 0xBF /* for lead bytes 0xE1-0xEF */ |
| 157 | #define US4A 0x8F, 0xBF /* for lead byte 0xF0 */ |
| 158 | #define US4B 0x7F, 0xBF /* for lead bytes 0xF1-0xF3 */ |
| 159 | #define US4C 0x7F, 0x8F /* for lead byte 0xF4 */ |
| 160 | #define US0A 0xFF, 0x00 /* for any other lead byte */ |
| 161 | |
| 162 | /* a table used for quick lookup of the definition that goes with a |
| 163 | * particular lead byte */ |
| 164 | static const unsigned char lb_tab[] = { |
| 165 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 166 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 167 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 168 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 169 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 170 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 171 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 172 | US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A, |
| 173 | US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B, |
| 174 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 175 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 176 | US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B, |
| 177 | US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B, |
| @@ -175,32 +183,30 @@ | |
| 183 | int invalid_utf8( |
| 184 | const Blob *pContent |
| 185 | ){ |
| 186 | const unsigned char *z = (unsigned char *) blob_buffer(pContent); |
| 187 | unsigned int n = blob_size(pContent); |
| 188 | unsigned char c; /* lead byte to be handled. */ |
| 189 | |
| 190 | if( n==0 ) return 0; /* Empty file -> OK */ |
| 191 | c = *z; |
| 192 | while( --n>0 ){ |
| 193 | if( c>=0x80 ){ |
| 194 | unsigned char fb = *++z; /* follow-up byte after lead byte */ |
| 195 | const unsigned char *def; /* pointer to range table*/ |
| 196 | |
| 197 | c <<= 1; /* multiply by 2 and get rid of highest bit */ |
| 198 | def = &lb_tab[c]; /* search fb's valid range in table */ |
| 199 | if( (fb<=def[0]) || (fb>def[1]) ){ |
| 200 | return LOOK_INVALID; /* Invalid UTF-8 */ |
| 201 | } |
| 202 | c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */ |
| 203 | } else { |
| 204 | c = *++z; |
| 205 | } |
| 206 | } |
| 207 | return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */ |
| 208 | } |
| 209 | |
| 210 | /* |
| 211 | ** Define the type needed to represent a Unicode (UTF-16) character. |
| 212 | */ |
| 213 |