Fossil SCM

fossil-scm / src / lookslike.c
Source Blame History 596 lines
d70ea7e… drh 1 /*
d70ea7e… drh 2 ** Copyright (c) 2013 D. Richard Hipp
d70ea7e… drh 3 **
d70ea7e… drh 4 ** This program is free software; you can redistribute it and/or
d70ea7e… drh 5 ** modify it under the terms of the Simplified BSD License (also
d70ea7e… drh 6 ** known as the "2-Clause License" or "FreeBSD License".)
d70ea7e… drh 7
d70ea7e… drh 8 ** This program is distributed in the hope that it will be useful,
d70ea7e… drh 9 ** but without any warranty; without even the implied warranty of
d70ea7e… drh 10 ** merchantability or fitness for a particular purpose.
d70ea7e… drh 11 **
d70ea7e… drh 12 ** Author contact information:
d70ea7e… drh 13 ** [email protected]
d70ea7e… drh 14 ** http://www.hwaci.com/drh/
d70ea7e… drh 15 **
d70ea7e… drh 16 *******************************************************************************
d70ea7e… drh 17 **
d70ea7e… drh 18 ** This file contains code used to try to guess if a particular file is
d70ea7e… drh 19 ** text or binary, what types of line endings it uses, is it UTF8 or
d70ea7e… drh 20 ** UTF16, etc.
d70ea7e… drh 21 */
d70ea7e… drh 22 #include "config.h"
d70ea7e… drh 23 #include "lookslike.h"
d70ea7e… drh 24 #include <assert.h>
d70ea7e… drh 25
d70ea7e… drh 26
#if INTERFACE

/*
** Evaluates to non-zero when a UTF-8 scan of the blob raises any of the
** LOOK_BINARY flags, meaning the content MAY be binary; evaluates to zero
** otherwise.  LOOK_BINARY is also passed as the stop mask of the scan.
*/
#define looks_like_binary(blob) \
    ((looks_like_utf8((blob), LOOK_BINARY) & LOOK_BINARY) != LOOK_NONE)

/*
** Status bits reported by looks_like_utf8() and looks_like_utf16() to
** describe what was seen in the blob content.  LOOK_BINARY and LOOK_EOL
** are masks that combine several of the single-bit flags.
*/
#define LOOK_NONE    ((int)0x00000000) /* Nothing special was found. */
#define LOOK_NUL     ((int)0x00000001) /* One or more NUL chars were found. */
#define LOOK_CR      ((int)0x00000002) /* One or more CR chars were found. */
#define LOOK_LONE_CR ((int)0x00000004) /* An unpaired CR char was found. */
#define LOOK_LF      ((int)0x00000008) /* One or more LF chars were found. */
#define LOOK_LONE_LF ((int)0x00000010) /* An unpaired LF char was found. */
#define LOOK_CRLF    ((int)0x00000020) /* One or more CR/LF pairs were found. */
#define LOOK_LONG    ((int)0x00000040) /* An over length line was found. */
#define LOOK_ODD     ((int)0x00000080) /* An odd number of bytes was found. */
#define LOOK_SHORT   ((int)0x00000100) /* Unable to perform full check. */
#define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */
#define LOOK_BINARY  (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
#define LOOK_EOL     (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */
#endif /* INTERFACE */
d70ea7e… drh 54
7c08a68… jan.nijtmans 55 /* definitions for various UTF-8 sequence lengths, encoded as start value
7c08a68… jan.nijtmans 56 * and size of each valid range belonging to some lead byte*/
#define US2A  0x80, 0x01 /* for lead byte 0xC0 */
#define US2B  0x80, 0x40 /* for lead bytes 0xC2-0xDF */
#define US3A  0xA0, 0x20 /* for lead byte 0xE0 */
#define US3B  0x80, 0x40 /* for lead bytes 0xE1-0xEF */
#define US4A  0x90, 0x30 /* for lead byte 0xF0 */
#define US4B  0x80, 0x40 /* for lead bytes 0xF1-0xF3 */
#define US4C  0x80, 0x10 /* for lead byte 0xF4 */
#define US0A  0x00, 0x00 /* for any other lead byte */

/*
** Lookup table giving, for each byte value 0x80..0xFF, the first accepted
** value and the number of accepted values of the byte that follows it.
** Every entry is a (start, size) pair, so the pair for byte B begins at
** offset 2*(B-0x80).  A size of zero means no following byte is accepted.
*/
static const unsigned char lb_tab[256] = {
  /* 0x80-0x87 */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0x88-0x8F */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0x90-0x97 */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0x98-0x9F */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xA0-0xA7 */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xA8-0xAF */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xB0-0xB7 */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xB8-0xBF */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xC0-0xC7 */ US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
  /* 0xC8-0xCF */ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  /* 0xD0-0xD7 */ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  /* 0xD8-0xDF */ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  /* 0xE0-0xE7 */ US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  /* 0xE8-0xEF */ US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  /* 0xF0-0xF7 */ US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
  /* 0xF8-0xFF */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
};
d70ea7e… drh 86
d70ea7e… drh 87 /*
d70ea7e… drh 88 ** This function attempts to scan each logical line within the blob to
d70ea7e… drh 89 ** determine the type of content it appears to contain. The return value
d70ea7e… drh 90 ** is a combination of one or more of the LOOK_XXX flags (see above):
d70ea7e… drh 91 **
d70ea7e… drh 92 ** !LOOK_BINARY -- The content appears to consist entirely of text; however,
d70ea7e… drh 93 ** the encoding may not be UTF-8.
d70ea7e… drh 94 **
d70ea7e… drh 95 ** LOOK_BINARY -- The content appears to be binary because it contains one
d70ea7e… drh 96 ** or more embedded NUL characters or an extremely long line.
d70ea7e… drh 97 ** Since this function does not understand UTF-16, it may
d70ea7e… drh 98 ** falsely consider UTF-16 text to be binary.
d70ea7e… drh 99 **
d70ea7e… drh 100 ** Additional flags (i.e. those other than the ones included in LOOK_BINARY)
d70ea7e… drh 101 ** may be present in the result as well; however, they should not impact the
d70ea7e… drh 102 ** determination of text versus binary content.
d70ea7e… drh 103 **
d70ea7e… drh 104 ************************************ WARNING **********************************
d70ea7e… drh 105 **
d70ea7e… drh 106 ** This function does not validate that the blob content is properly formed
d70ea7e… drh 107 ** UTF-8. It assumes that all code points are the same size. It does not
d70ea7e… drh 108 ** validate any code points. It makes no attempt to detect if any [invalid]
d70ea7e… drh 109 ** switches between UTF-8 and other encodings occur.
d70ea7e… drh 110 **
d70ea7e… drh 111 ** The only code points that this function cares about are the NUL character,
d70ea7e… drh 112 ** carriage-return, and line-feed.
d70ea7e… drh 113 **
d70ea7e… drh 114 ** This function examines the contents of the blob until one of the flags
d70ea7e… drh 115 ** specified in "stopFlags" is set.
d70ea7e… drh 116 **
d70ea7e… drh 117 ************************************ WARNING **********************************
d70ea7e… drh 118 */
d70ea7e… drh 119 int looks_like_utf8(const Blob *pContent, int stopFlags){
d70ea7e… drh 120 const char *z = blob_buffer(pContent);
d70ea7e… drh 121 unsigned int n = blob_size(pContent);
d70ea7e… drh 122 int j, c, flags = LOOK_NONE; /* Assume UTF-8 text, prove otherwise */
d70ea7e… drh 123
d70ea7e… drh 124 if( n==0 ) return flags; /* Empty file -> text */
d70ea7e… drh 125 c = *z;
d70ea7e… drh 126 if( c==0 ){
d70ea7e… drh 127 flags |= LOOK_NUL; /* NUL character in a file -> binary */
d70ea7e… drh 128 }else if( c=='\r' ){
3161968… mistachkin 129 flags |= LOOK_CR;
d70ea7e… drh 130 if( n<=1 || z[1]!='\n' ){
6e3fceb… mistachkin 131 flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
d70ea7e… drh 132 }
d70ea7e… drh 133 }
d70ea7e… drh 134 j = (c!='\n');
3161968… mistachkin 135 if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */
d70ea7e… drh 136 while( !(flags&stopFlags) && --n>0 ){
d70ea7e… drh 137 int c2 = c;
d70ea7e… drh 138 c = *++z; ++j;
d70ea7e… drh 139 if( c==0 ){
d70ea7e… drh 140 flags |= LOOK_NUL; /* NUL character in a file -> binary */
d70ea7e… drh 141 }else if( c=='\n' ){
3161968… mistachkin 142 flags |= LOOK_LF;
d70ea7e… drh 143 if( c2=='\r' ){
3161968… mistachkin 144 flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
d70ea7e… drh 145 }else{
d70ea7e… drh 146 flags |= LOOK_LONE_LF;
d70ea7e… drh 147 }
d70ea7e… drh 148 if( j>LENGTH_MASK ){
d70ea7e… drh 149 flags |= LOOK_LONG; /* Very long line -> binary */
d70ea7e… drh 150 }
d70ea7e… drh 151 j = 0;
d70ea7e… drh 152 }else if( c=='\r' ){
3161968… mistachkin 153 flags |= LOOK_CR;
d70ea7e… drh 154 if( n<=1 || z[1]!='\n' ){
6e3fceb… mistachkin 155 flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
d70ea7e… drh 156 }
d70ea7e… drh 157 }
d70ea7e… drh 158 }
d70ea7e… drh 159 if( n ){
d70ea7e… drh 160 flags |= LOOK_SHORT; /* The whole blob was not examined */
d70ea7e… drh 161 }
d70ea7e… drh 162 if( j>LENGTH_MASK ){
d70ea7e… drh 163 flags |= LOOK_LONG; /* Very long line -> binary */
d70ea7e… drh 164 }
d70ea7e… drh 165 return flags;
d70ea7e… drh 166 }
d70ea7e… drh 167
5f24da1… jan.nijtmans 168 /*
5f24da1… jan.nijtmans 169 ** Checks for proper UTF-8. It uses the method described in:
5f24da1… jan.nijtmans 170 ** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
7c08a68… jan.nijtmans 171 ** except for the "overlong form" of \u0000 which is not considered
7c08a68… jan.nijtmans 172 ** invalid here: Some languages like Java and Tcl use it. This function
7c08a68… jan.nijtmans 173 ** also considers valid the derivatives CESU-8 & WTF-8 (as described in
7c08a68… jan.nijtmans 174 ** the same wikipedia article referenced previously). For UTF-8 characters
7c08a68… jan.nijtmans 175 ** > 0x7f, the variable 'c' does not necessarily hold the real lead byte.
7c08a68… jan.nijtmans 176 ** Its number of high-order 1-bits indicates the number of continuation
7c08a68… jan.nijtmans 177 ** bytes that are expected to follow. E.g. when 'c' has a value
7c08a68… jan.nijtmans 178 ** in the range 0xc0..0xdf it means that after 'c' a single continuation
7c08a68… jan.nijtmans 179 ** byte is expected. A value 0xe0..0xef means that after 'c' two more
7c08a68… jan.nijtmans 180 ** continuation bytes are expected.
5f24da1… jan.nijtmans 181 */
5f24da1… jan.nijtmans 182
60349a6… jan.nijtmans 183 int invalid_utf8(
60349a6… jan.nijtmans 184 const Blob *pContent
60349a6… jan.nijtmans 185 ){
5f24da1… jan.nijtmans 186 const unsigned char *z = (unsigned char *) blob_buffer(pContent);
5f24da1… jan.nijtmans 187 unsigned int n = blob_size(pContent);
7c08a68… jan.nijtmans 188 unsigned char c; /* lead byte to be handled. */
5f24da1… jan.nijtmans 189
5f24da1… jan.nijtmans 190 if( n==0 ) return 0; /* Empty file -> OK */
5f24da1… jan.nijtmans 191 c = *z;
5f24da1… jan.nijtmans 192 while( --n>0 ){
7c08a68… jan.nijtmans 193 if( c>=0x80 ){
7c08a68… jan.nijtmans 194 const unsigned char *def; /* pointer to range table*/
7c08a68… jan.nijtmans 195
7c08a68… jan.nijtmans 196 c <<= 1; /* multiply by 2 and get rid of highest bit */
7c08a68… jan.nijtmans 197 def = &lb_tab[c]; /* search fb's valid range in table */
7c08a68… jan.nijtmans 198 if( (unsigned int)(*++z-def[0])>=def[1] ){
5f24da1… jan.nijtmans 199 return LOOK_INVALID; /* Invalid UTF-8 */
5f24da1… jan.nijtmans 200 }
7c08a68… jan.nijtmans 201 c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
7c08a68… jan.nijtmans 202 } else {
7c08a68… jan.nijtmans 203 c = *++z;
1ca5983… jan.nijtmans 204 }
1ca5983… jan.nijtmans 205 }
7c08a68… jan.nijtmans 206 return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */
1ca5983… jan.nijtmans 207 }
d70ea7e… drh 208
d70ea7e… drh 209 /*
d70ea7e… drh 210 ** Define the type needed to represent a Unicode (UTF-16) character.
d70ea7e… drh 211 */
#if !defined(WCHAR_T)
#  if defined(_WIN32)
#    define WCHAR_T wchar_t          /* native wide character on Windows */
#  else
#    define WCHAR_T unsigned short   /* everywhere else */
#  endif
#endif
d70ea7e… drh 219
d70ea7e… drh 220 /*
d70ea7e… drh 221 ** Maximum length of a line in a text file, in UTF-16 characters. (4096)
d70ea7e… drh 222 ** The number of bytes represented by this value cannot exceed LENGTH_MASK
d70ea7e… drh 223 ** bytes, because that is the line buffer size used by the diff engine.
d70ea7e… drh 224 */
#define UTF16_LENGTH_MASK_SZ  (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
#define UTF16_LENGTH_MASK     ((1<<UTF16_LENGTH_MASK_SZ)-1)

/*
** Byte-order helpers for looks_like_utf16().  UTF16_SWAP exchanges the two
** low-order bytes of a UTF-16 character; UTF16_SWAP_IF does so only when
** "expr" is true.  Both evaluate "ch" more than once.
*/
#define UTF16_SWAP(ch)         ((((ch) >> 8) & 0xff) | (((ch) << 8) & 0xff00))
#define UTF16_SWAP_IF(expr,ch) ((expr) ? UTF16_SWAP((ch)) : (ch))
d70ea7e… drh 234
d70ea7e… drh 235 /*
d70ea7e… drh 236 ** This function attempts to scan each logical line within the blob to
d70ea7e… drh 237 ** determine the type of content it appears to contain. The return value
d70ea7e… drh 238 ** is a combination of one or more of the LOOK_XXX flags (see above):
d70ea7e… drh 239 **
d70ea7e… drh 240 ** !LOOK_BINARY -- The content appears to consist entirely of text; however,
d70ea7e… drh 241 ** the encoding may not be UTF-16.
d70ea7e… drh 242 **
d70ea7e… drh 243 ** LOOK_BINARY -- The content appears to be binary because it contains one
d70ea7e… drh 244 ** or more embedded NUL characters or an extremely long line.
d70ea7e… drh 245 ** Since this function does not understand UTF-8, it may
d70ea7e… drh 246 ** falsely consider UTF-8 text to be binary.
d70ea7e… drh 247 **
d70ea7e… drh 248 ** Additional flags (i.e. those other than the ones included in LOOK_BINARY)
d70ea7e… drh 249 ** may be present in the result as well; however, they should not impact the
d70ea7e… drh 250 ** determination of text versus binary content.
d70ea7e… drh 251 **
d70ea7e… drh 252 ************************************ WARNING **********************************
d70ea7e… drh 253 **
d70ea7e… drh 254 ** This function does not validate that the blob content is properly formed
d70ea7e… drh 255 ** UTF-16. It assumes that all code points are the same size. It does not
d70ea7e… drh 256 ** validate any code points. It makes no attempt to detect if any [invalid]
d70ea7e… drh 257 ** switches between the UTF-16be and UTF-16le encodings occur.
d70ea7e… drh 258 **
d70ea7e… drh 259 ** The only code points that this function cares about are the NUL character,
d70ea7e… drh 260 ** carriage-return, and line-feed.
d70ea7e… drh 261 **
d70ea7e… drh 262 ** This function examines the contents of the blob until one of the flags
d70ea7e… drh 263 ** specified in "stopFlags" is set.
d70ea7e… drh 264 **
d70ea7e… drh 265 ************************************ WARNING **********************************
d70ea7e… drh 266 */
d70ea7e… drh 267 int looks_like_utf16(const Blob *pContent, int bReverse, int stopFlags){
d70ea7e… drh 268 const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
d70ea7e… drh 269 unsigned int n = blob_size(pContent);
d70ea7e… drh 270 int j, c, flags = LOOK_NONE; /* Assume UTF-16 text, prove otherwise */
d70ea7e… drh 271
d70ea7e… drh 272 if( n%sizeof(WCHAR_T) ){
d70ea7e… drh 273 flags |= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */
d70ea7e… drh 274 }
275da70… danield 275 if( n<sizeof(WCHAR_T) ) return flags;/* Zero or One byte -> binary (UTF-8?) */
d70ea7e… drh 276 c = *z;
d70ea7e… drh 277 if( bReverse ){
d70ea7e… drh 278 c = UTF16_SWAP(c);
d70ea7e… drh 279 }
d70ea7e… drh 280 if( c==0 ){
d70ea7e… drh 281 flags |= LOOK_NUL; /* NUL character in a file -> binary */
d70ea7e… drh 282 }else if( c=='\r' ){
3161968… mistachkin 283 flags |= LOOK_CR;
d70ea7e… drh 284 if( n<(2*sizeof(WCHAR_T)) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
6e3fceb… mistachkin 285 flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
d70ea7e… drh 286 }
d70ea7e… drh 287 }
d70ea7e… drh 288 j = (c!='\n');
3161968… mistachkin 289 if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */
7458a18… jan.nijtmans 290 while( !(flags&stopFlags) && ((n-=sizeof(WCHAR_T))>=sizeof(WCHAR_T)) ){
d70ea7e… drh 291 int c2 = c;
d70ea7e… drh 292 c = *++z;
d70ea7e… drh 293 if( bReverse ){
d70ea7e… drh 294 c = UTF16_SWAP(c);
d70ea7e… drh 295 }
d70ea7e… drh 296 ++j;
d70ea7e… drh 297 if( c==0 ){
d70ea7e… drh 298 flags |= LOOK_NUL; /* NUL character in a file -> binary */
d70ea7e… drh 299 }else if( c=='\n' ){
3161968… mistachkin 300 flags |= LOOK_LF;
d70ea7e… drh 301 if( c2=='\r' ){
3161968… mistachkin 302 flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
d70ea7e… drh 303 }else{
d70ea7e… drh 304 flags |= LOOK_LONE_LF;
d70ea7e… drh 305 }
d70ea7e… drh 306 if( j>UTF16_LENGTH_MASK ){
d70ea7e… drh 307 flags |= LOOK_LONG; /* Very long line -> binary */
d70ea7e… drh 308 }
d70ea7e… drh 309 j = 0;
d70ea7e… drh 310 }else if( c=='\r' ){
3161968… mistachkin 311 flags |= LOOK_CR;
d70ea7e… drh 312 if( n<(2*sizeof(WCHAR_T)) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
6e3fceb… mistachkin 313 flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
d70ea7e… drh 314 }
d70ea7e… drh 315 }
d70ea7e… drh 316 }
d70ea7e… drh 317 if( n ){
d70ea7e… drh 318 flags |= LOOK_SHORT; /* The whole blob was not examined */
d70ea7e… drh 319 }
d70ea7e… drh 320 if( j>UTF16_LENGTH_MASK ){
d70ea7e… drh 321 flags |= LOOK_LONG; /* Very long line -> binary */
d70ea7e… drh 322 }
d70ea7e… drh 323 return flags;
d70ea7e… drh 324 }
d70ea7e… drh 325
d70ea7e… drh 326 /*
d70ea7e… drh 327 ** This function returns an array of bytes representing the byte-order-mark
d70ea7e… drh 328 ** for UTF-8.
d70ea7e… drh 329 */
const unsigned char *get_utf8_bom(int *pnByte){
  /* The three BOM bytes, followed by three zero bytes of padding. */
  static const unsigned char aBom[6] = {
    0xef, 0xbb, 0xbf,
    0x00, 0x00, 0x00
  };
  /* Report the BOM length (without the padding) when the caller asks. */
  if( pnByte!=0 ){
    *pnByte = 3;
  }
  return aBom;
}
d70ea7e… drh 337
d70ea7e… drh 338 /*
d70ea7e… drh 339 ** This function returns non-zero if the blob starts with a UTF-8
d70ea7e… drh 340 ** byte-order-mark (BOM).
d70ea7e… drh 341 */
d70ea7e… drh 342 int starts_with_utf8_bom(const Blob *pContent, int *pnByte){
d70ea7e… drh 343 const char *z = blob_buffer(pContent);
d70ea7e… drh 344 int bomSize = 0;
d70ea7e… drh 345 const unsigned char *bom = get_utf8_bom(&bomSize);
d70ea7e… drh 346
d70ea7e… drh 347 if( pnByte ) *pnByte = bomSize;
53db40e… drh 348 if( (int)blob_size(pContent)<bomSize ) return 0;
d70ea7e… drh 349 return memcmp(z, bom, bomSize)==0;
d70ea7e… drh 350 }
d70ea7e… drh 351
d70ea7e… drh 352 /*
d70ea7e… drh 353 ** This function returns non-zero if the blob starts with a UTF-16
d70ea7e… drh 354 ** byte-order-mark (BOM), either in the endianness of the machine
d70ea7e… drh 355 ** or in reversed byte order. The UTF-32 BOM is ruled out by checking
d70ea7e… drh 356 ** if the UTF-16 BOM is not immediately followed by (utf16) 0.
d70ea7e… drh 357 ** pnByte is only set when the function returns 1.
d70ea7e… drh 358 **
d70ea7e… drh 359 ** pbReverse is always set, even when no BOM is found. Without a BOM,
d70ea7e… drh 360 ** it is set to 1 on little-endian and 0 on big-endian platforms. See
d70ea7e… drh 361 ** clause D98 of conformance (section 3.10) of the Unicode standard.
d70ea7e… drh 362 */
d70ea7e… drh 363 int starts_with_utf16_bom(
d70ea7e… drh 364 const Blob *pContent, /* IN: Blob content to perform BOM detection on. */
d70ea7e… drh 365 int *pnByte, /* OUT: The number of bytes used for the BOM. */
d70ea7e… drh 366 int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */
d70ea7e… drh 367 ){
f7c41be… drh 368 const unsigned char *z = (unsigned char *)blob_buffer(pContent);
d70ea7e… drh 369 int bomSize = sizeof(unsigned short);
d70ea7e… drh 370 int size = blob_size(pContent);
f7c41be… drh 371 unsigned short i0;
d70ea7e… drh 372
d70ea7e… drh 373 if( size<bomSize ) goto noBom; /* No: cannot read BOM. */
f7c41be… drh 374 if( size>=(2*bomSize) && z[2]==0 && z[3]==0 ) goto noBom;
f7c41be… drh 375 memcpy(&i0, z, sizeof(i0));
f7c41be… drh 376 if( i0==0xfeff ){
d70ea7e… drh 377 if( pbReverse ) *pbReverse = 0;
f7c41be… drh 378 }else if( i0==0xfffe ){
d70ea7e… drh 379 if( pbReverse ) *pbReverse = 1;
d70ea7e… drh 380 }else{
d70ea7e… drh 381 static const int one = 1;
d70ea7e… drh 382 noBom:
d70ea7e… drh 383 if( pbReverse ) *pbReverse = *(char *) &one;
d70ea7e… drh 384 return 0; /* No: UTF-16 byte-order-mark not found. */
d70ea7e… drh 385 }
d70ea7e… drh 386 if( pnByte ) *pnByte = bomSize;
d70ea7e… drh 387 return 1; /* Yes. */
d70ea7e… drh 388 }
d70ea7e… drh 389
d70ea7e… drh 390 /*
d70ea7e… drh 391 ** Returns non-zero if the specified content could be valid UTF-16.
d70ea7e… drh 392 */
d70ea7e… drh 393 int could_be_utf16(const Blob *pContent, int *pbReverse){
d70ea7e… drh 394 return (blob_size(pContent) % sizeof(WCHAR_T) == 0) ?
d70ea7e… drh 395 starts_with_utf16_bom(pContent, 0, pbReverse) : 0;
d70ea7e… drh 396 }
d70ea7e… drh 397
d70ea7e… drh 398
d70ea7e… drh 399 /*
d70ea7e… drh 400 ** COMMAND: test-looks-like-utf
d70ea7e… drh 401 **
d70ea7e… drh 402 ** Usage: %fossil test-looks-like-utf FILENAME
d70ea7e… drh 403 **
d70ea7e… drh 404 ** Options:
11384f1… drh 405 ** -n|--limit N Repeat looks-like function N times, for
4cb50c4… stephan 406 ** performance measurement. Default = 1
d70ea7e… drh 407 ** --utf8 Ignoring BOM and file size, force UTF-8 checking
d70ea7e… drh 408 ** --utf16 Ignoring BOM and file size, force UTF-16 checking
d70ea7e… drh 409 **
d70ea7e… drh 410 ** FILENAME is the name of a file to check for textual content in the UTF-8
d70ea7e… drh 411 ** and/or UTF-16 encodings.
d70ea7e… drh 412 */
d70ea7e… drh 413 void looks_like_utf_test_cmd(void){
503482a… jan.nijtmans 414 Blob blob; /* the contents of the specified file */
503482a… jan.nijtmans 415 int fUtf8 = 0; /* return value of starts_with_utf8_bom() */
503482a… jan.nijtmans 416 int fUtf16 = 0; /* return value of starts_with_utf16_bom() */
503482a… jan.nijtmans 417 int fUnicode = 0; /* return value of could_be_utf16() */
503482a… jan.nijtmans 418 int lookFlags = 0; /* output flags from looks_like_utf8/utf16() */
d70ea7e… drh 419 int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */
d70ea7e… drh 420 int fForceUtf8 = find_option("utf8",0,0)!=0;
d70ea7e… drh 421 int fForceUtf16 = find_option("utf16",0,0)!=0;
5f24da1… jan.nijtmans 422 const char *zCount = find_option("limit","n",1);
5f24da1… jan.nijtmans 423 int nRepeat = 1;
5f24da1… jan.nijtmans 424
d70ea7e… drh 425 if( g.argc!=3 ) usage("FILENAME");
5f24da1… jan.nijtmans 426 if( zCount ){
5f24da1… jan.nijtmans 427 nRepeat = atoi(zCount);
5f24da1… jan.nijtmans 428 }
1772357… drh 429 blob_read_from_file(&blob, g.argv[2], ExtFILE);
5f24da1… jan.nijtmans 430 while( --nRepeat >= 0 ){
5f24da1… jan.nijtmans 431 fUtf8 = starts_with_utf8_bom(&blob, 0);
5f24da1… jan.nijtmans 432 fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16);
5f24da1… jan.nijtmans 433 if( fForceUtf8 ){
5f24da1… jan.nijtmans 434 fUnicode = 0;
5f24da1… jan.nijtmans 435 }else{
09f2386… jan.nijtmans 436 fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
5f24da1… jan.nijtmans 437 }
5f24da1… jan.nijtmans 438 if( fUnicode ){
09f2386… jan.nijtmans 439 lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
5f24da1… jan.nijtmans 440 }else{
60349a6… jan.nijtmans 441 lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob);
5f24da1… jan.nijtmans 442 }
5f24da1… jan.nijtmans 443 }
d70ea7e… drh 444 fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
d70ea7e… drh 445 fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
d70ea7e… drh 446 fossil_print("Starts with UTF-16 BOM: %s\n",
d70ea7e… drh 447 fUtf16?(bRevUtf16?"reversed":"yes"):"no");
d70ea7e… drh 448 fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",
d70ea7e… drh 449 (lookFlags&LOOK_BINARY)?"no":"yes");
d70ea7e… drh 450 fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
d70ea7e… drh 451 fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no");
d70ea7e… drh 452 fossil_print("Has flag LOOK_LONE_CR: %s\n",
d70ea7e… drh 453 (lookFlags&LOOK_LONE_CR)?"yes":"no");
d70ea7e… drh 454 fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no");
d70ea7e… drh 455 fossil_print("Has flag LOOK_LONE_LF: %s\n",
d70ea7e… drh 456 (lookFlags&LOOK_LONE_LF)?"yes":"no");
d70ea7e… drh 457 fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no");
d70ea7e… drh 458 fossil_print("Has flag LOOK_LONG: %s\n",(lookFlags&LOOK_LONG)?"yes":"no");
d70ea7e… drh 459 fossil_print("Has flag LOOK_INVALID: %s\n",
d70ea7e… drh 460 (lookFlags&LOOK_INVALID)?"yes":"no");
d70ea7e… drh 461 fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no");
d70ea7e… drh 462 fossil_print("Has flag LOOK_SHORT: %s\n",(lookFlags&LOOK_SHORT)?"yes":"no");
d70ea7e… drh 463 blob_reset(&blob);
534c10f… stephan 464 }
534c10f… stephan 465
534c10f… stephan 466 /*
57f1e87… drh 467 ** Return true if z[i] is the whole word given by zWord in a context that
57f1e87… drh 468 ** might be an attempted SQL injection.
d3cb62f… drh 469 */
static int isWholeWord(const char *z, unsigned int i, const char *zWord, int n){
  char cBefore;    /* Character just before the candidate word */
  char cAfter;     /* Character just after the candidate word */

  /* A word at the very start of the text is never reported. */
  if( i==0 ) return 0;
  /* The n characters at z[i] must equal zWord, ignoring case.  This is
  ** checked before z[i+n] is read. */
  if( sqlite3_strnicmp(z+i, zWord, n)!=0 ) return 0;

  cBefore = z[i-1];
  cAfter = z[i+n];
  /* The word must not be glued to letters or digits on either side. */
  if( fossil_isalnum(cBefore) || fossil_isalnum(cAfter) ) return 0;
  /* Certain punctuation next to the word also disqualifies it.  strchr()
  ** matches the terminating NUL of its search string too, so a word that
  ** ends the text (cAfter==0) is rejected here as well. */
  if( strchr("-)_", cBefore)!=0 || strchr("(_", cAfter)!=0 ) return 0;
  return 1;
}
d3cb62f… drh 479
d3cb62f… drh 480 /*
534c10f… stephan 481 ** Returns true if the given text contains certain keywords or
eb7fad0… drh 482 ** punctuation which indicate that it might be an SQL injection attempt
8612122… drh 483 ** or Cross-site scripting attempt or some other kind of mischief.
eb7fad0… drh 484 **
8612122… drh 485 ** This is not a primary defense against vulnerabilities in the Fossil
8612122… drh 486 ** code. Rather, this is part of an effort to do early detection of malicious
8612122… drh 487 ** spiders to avoid them using up too many CPU cycles. Or, this routine
8612122… drh 488 ** can also be thought of as a secondary layer of defense against attacks.
d3cb62f… drh 489 */
int looks_like_attack(const char *zTxt){
  unsigned int i;      /* Index of the character being examined */
  int bSuspect = 0;    /* Set once a suspicious path or keyword is seen */

  if( zTxt==0 ) return 0;
  for(i=0; zTxt[i]; i++){
    const char c = zTxt[i];
    if( c=='<' || c==';' || c=='\'' ){
      /* These characters alone are enough; report at once. */
      return 1;
    }else if( c=='/' ){
      /* The comparison starts one past the current '/', and each 20-char
      ** pattern begins with '/' itself, so this matches a '/' followed by
      ** "/wp-content/plugins/" or "/wp-admin/admin-ajax". */
      if( strncmp(zTxt+i+1, "/wp-content/plugins/", 20)==0
       || strncmp(zTxt+i+1, "/wp-admin/admin-ajax", 20)==0
      ){
        bSuspect = 1;
      }
    }else if( c=='a' || c=='A' ){
      if( isWholeWord(zTxt, i, "and", 3) ) bSuspect = 1;
    }else if( c=='n' || c=='N' ){
      if( isWholeWord(zTxt, i, "null", 4) ) bSuspect = 1;
    }else if( c=='o' || c=='O' ){
      /* "order" counts only when followed by whitespace; "or" counts
      ** whenever it is a whole word. */
      if( (isWholeWord(zTxt, i, "order", 5) && fossil_isspace(zTxt[i+5]))
       || isWholeWord(zTxt, i, "or", 2)
      ){
        bSuspect = 1;
      }
    }else if( c=='s' || c=='S' ){
      if( isWholeWord(zTxt, i, "select", 6) ) bSuspect = 1;
    }else if( c=='w' || c=='W' ){
      if( isWholeWord(zTxt, i, "waitfor", 7) ) bSuspect = 1;
    }
  }
  if( bSuspect ){
    /* The test/markdown-test3.md document which is part of the Fossil
    ** source tree intentionally tries to fake an attack.  Requests for
    ** that document are not reported. */
    const char *zPathInfo = P("PATH_INFO");
    if( sqlite3_strglob("/doc/*/test/markdown-test3.md", zPathInfo)==0 ){
      bSuspect = 0;
    }
  }
  return bSuspect;
}
d3cb62f… drh 540
d3cb62f… drh 541 /*
d3cb62f… drh 542 ** This is a utility routine associated with the test-looks-like-attack
d3cb62f… drh 543 ** command.
d3cb62f… drh 544 **
d3cb62f… drh 545 ** Read input from zInFile and print only those lines that look like they
d3cb62f… drh 546 ** might be SQL injection.
d3cb62f… drh 547 **
d3cb62f… drh 548 ** Or if bInvert is true, then show the opposite - those lines that do NOT
d3cb62f… drh 549 ** look like SQL injection.
d3cb62f… drh 550 */
static void show_attack_lines(
  const char *zInFile,       /* Name of input file */
  int bInvert,               /* Invert the sense of the output (-v) */
  int bDeHttpize             /* De-httpize the inputs. (-d) */
){
  FILE *in;                  /* Stream the lines are read from */
  char zLine[10000];         /* One line of input; longer lines are split */

  /* A missing file name or "-" selects standard input. */
  if( zInFile==0 || strcmp(zInFile,"-")==0 ){
    in = stdin;
  }else{
    in = fopen(zInFile, "rb");
    if( in==0 ){
      fossil_fatal("cannot open \"%s\" for reading\n", zInFile);
    }
  }
  while( fgets(zLine, sizeof(zLine), in) ){
    /* Decode the line in place only when the caller asked for it.
    ** Previously the decoding was applied unconditionally, which made the
    ** bDeHttpize argument (the -d switch) have no effect. */
    if( bDeHttpize ){
      dehttpize(zLine);
    }
    /* Print lines that look like an attack, or, when bInvert is set,
    ** the lines that do not. */
    if( (looks_like_attack(zLine)!=0) ^ (bInvert!=0) ){
      fossil_print("%s", zLine);
    }
  }
  if( in!=stdin ) fclose(in);
}
d3cb62f… drh 574
d3cb62f… drh 575 /*
8612122… drh 576 ** COMMAND: test-looks-like-attack
d3cb62f… drh 577 **
d3cb62f… drh 578 ** Read lines of input from files named as arguments (or from standard
d3cb62f… drh 579 ** input if no arguments are provided) and print those that look like they
d3cb62f… drh 580 ** might be part of an SQL injection attack.
d3cb62f… drh 581 **
8612122… drh 582 ** Used to test the looks_like_attack() utility subroutine, possibly
d3cb62f… drh 583 ** by piping in actual server log data.
534c10f… stephan 584 */
8612122… drh 585 void test_looks_like_attack(void){
d3cb62f… drh 586 int i;
d3cb62f… drh 587 int bInvert = find_option("invert","v",0)!=0;
d3cb62f… drh 588 int bDeHttpize = find_option("dehttpize","d",0)!=0;
d3cb62f… drh 589 verify_all_options();
d3cb62f… drh 590 if( g.argc==2 ){
8612122… drh 591 show_attack_lines(0, bInvert, bDeHttpize);
d3cb62f… drh 592 }
d3cb62f… drh 593 for(i=2; i<g.argc; i++){
8612122… drh 594 show_attack_lines(g.argv[i], bInvert, bDeHttpize);
d3cb62f… drh 595 }
d70ea7e… drh 596 }

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button