|
d70ea7e…
|
drh
|
1 |
/* |
|
d70ea7e…
|
drh
|
2 |
** Copyright (c) 2013 D. Richard Hipp |
|
d70ea7e…
|
drh
|
3 |
** |
|
d70ea7e…
|
drh
|
4 |
** This program is free software; you can redistribute it and/or |
|
d70ea7e…
|
drh
|
5 |
** modify it under the terms of the Simplified BSD License (also |
|
d70ea7e…
|
drh
|
6 |
** known as the "2-Clause License" or "FreeBSD License".) |
|
d70ea7e…
|
drh
|
7 |
|
|
d70ea7e…
|
drh
|
8 |
** This program is distributed in the hope that it will be useful, |
|
d70ea7e…
|
drh
|
9 |
** but without any warranty; without even the implied warranty of |
|
d70ea7e…
|
drh
|
10 |
** merchantability or fitness for a particular purpose. |
|
d70ea7e…
|
drh
|
11 |
** |
|
d70ea7e…
|
drh
|
12 |
** Author contact information: |
|
d70ea7e…
|
drh
|
13 |
** [email protected] |
|
d70ea7e…
|
drh
|
14 |
** http://www.hwaci.com/drh/ |
|
d70ea7e…
|
drh
|
15 |
** |
|
d70ea7e…
|
drh
|
16 |
******************************************************************************* |
|
d70ea7e…
|
drh
|
17 |
** |
|
d70ea7e…
|
drh
|
18 |
** This file contains code used to try to guess if a particular file is |
|
d70ea7e…
|
drh
|
19 |
** text or binary, what types of line endings it uses, is it UTF8 or |
|
d70ea7e…
|
drh
|
20 |
** UTF16, etc. |
|
d70ea7e…
|
drh
|
21 |
*/ |
|
d70ea7e…
|
drh
|
22 |
#include "config.h" |
|
d70ea7e…
|
drh
|
23 |
#include "lookslike.h" |
|
d70ea7e…
|
drh
|
24 |
#include <assert.h> |
|
d70ea7e…
|
drh
|
25 |
|
|
d70ea7e…
|
drh
|
26 |
|
|
d70ea7e…
|
drh
|
27 |
#if INTERFACE

/*
** This macro is designed to return non-zero if the specified blob contains
** data that MAY be binary in nature; otherwise, zero will be returned.
**
** It calls looks_like_utf8() with LOOK_BINARY as the stop mask and then
** tests whether any of the LOOK_BINARY bits ended up set in the result.
*/
#define looks_like_binary(blob) \
    ((looks_like_utf8((blob), LOOK_BINARY) & LOOK_BINARY) != LOOK_NONE)

/*
** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
** to convey status information about the blob content.  Each flag is a
** distinct bit so that several of them can be OR-ed into one result.
*/
#define LOOK_NONE    ((int)0x00000000) /* Nothing special was found. */
#define LOOK_NUL     ((int)0x00000001) /* One or more NUL chars were found. */
#define LOOK_CR      ((int)0x00000002) /* One or more CR chars were found. */
#define LOOK_LONE_CR ((int)0x00000004) /* An unpaired CR char was found. */
#define LOOK_LF      ((int)0x00000008) /* One or more LF chars were found. */
#define LOOK_LONE_LF ((int)0x00000010) /* An unpaired LF char was found. */
#define LOOK_CRLF    ((int)0x00000020) /* One or more CR/LF pairs were found. */
#define LOOK_LONG    ((int)0x00000040) /* An over length line was found. */
#define LOOK_ODD     ((int)0x00000080) /* An odd number of bytes was found. */
#define LOOK_SHORT   ((int)0x00000100) /* Unable to perform full check. */
#define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */
/* Composite masks.  Note that LOOK_INVALID and LOOK_ODD are deliberately not
** part of LOOK_BINARY as defined here. */
#define LOOK_BINARY (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
#define LOOK_EOL (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */
#endif /* INTERFACE */
|
d70ea7e…
|
drh
|
54 |
|
|
7c08a68…
|
jan.nijtmans
|
55 |
/* Definitions for the various UTF-8 sequence lengths.  Each macro expands to
 * two table bytes: the first value accepted for the byte that follows a given
 * lead byte, and the number of consecutive values accepted from there.  A
 * size of zero means that no following byte is accepted at all. */
#define US2A  0x80, 0x01 /* for lead byte 0xC0 */
#define US2B  0x80, 0x40 /* for lead bytes 0xC2-0xDF */
#define US3A  0xA0, 0x20 /* for lead byte 0xE0 */
#define US3B  0x80, 0x40 /* for lead bytes 0xE1-0xEF */
#define US4A  0x90, 0x30 /* for lead byte 0xF0 */
#define US4B  0x80, 0x40 /* for lead bytes 0xF1-0xF3 */
#define US4C  0x80, 0x10 /* for lead byte 0xF4 */
#define US0A  0x00, 0x00 /* for any other lead byte */

/* A table used for quick lookup of the definition that goes with a
 * particular lead byte.  It holds 128 (start,size) pairs, one for each byte
 * value 0x80..0xFF.  invalid_utf8() indexes it with the lead byte shifted
 * left by one bit and truncated to 8 bits, which lands on the first byte of
 * the pair belonging to that lead byte. */
static const unsigned char lb_tab[] = {
  /* 0x80..0xBF: continuation bytes, never valid as a lead byte */
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xC0..0xDF: two-byte sequences (0xC1 is rejected) */
  US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  /* 0xE0..0xEF: three-byte sequences */
  US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  /* 0xF0..0xF4: four-byte sequences; 0xF5..0xFF are rejected */
  US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
};
|
d70ea7e…
|
drh
|
86 |
|
|
d70ea7e…
|
drh
|
87 |
/* |
|
d70ea7e…
|
drh
|
88 |
** This function attempts to scan each logical line within the blob to |
|
d70ea7e…
|
drh
|
89 |
** determine the type of content it appears to contain. The return value |
|
d70ea7e…
|
drh
|
90 |
** is a combination of one or more of the LOOK_XXX flags (see above): |
|
d70ea7e…
|
drh
|
91 |
** |
|
d70ea7e…
|
drh
|
92 |
** !LOOK_BINARY -- The content appears to consist entirely of text; however, |
|
d70ea7e…
|
drh
|
93 |
** the encoding may not be UTF-8. |
|
d70ea7e…
|
drh
|
94 |
** |
|
d70ea7e…
|
drh
|
95 |
** LOOK_BINARY -- The content appears to be binary because it contains one |
|
d70ea7e…
|
drh
|
96 |
** or more embedded NUL characters or an extremely long line. |
|
d70ea7e…
|
drh
|
97 |
** Since this function does not understand UTF-16, it may |
|
d70ea7e…
|
drh
|
98 |
** falsely consider UTF-16 text to be binary. |
|
d70ea7e…
|
drh
|
99 |
** |
|
d70ea7e…
|
drh
|
100 |
** Additional flags (i.e. those other than the ones included in LOOK_BINARY) |
|
d70ea7e…
|
drh
|
101 |
** may be present in the result as well; however, they should not impact the |
|
d70ea7e…
|
drh
|
102 |
** determination of text versus binary content. |
|
d70ea7e…
|
drh
|
103 |
** |
|
d70ea7e…
|
drh
|
104 |
************************************ WARNING ********************************** |
|
d70ea7e…
|
drh
|
105 |
** |
|
d70ea7e…
|
drh
|
106 |
** This function does not validate that the blob content is properly formed |
|
d70ea7e…
|
drh
|
107 |
** UTF-8. It assumes that all code points are the same size. It does not |
|
d70ea7e…
|
drh
|
108 |
** validate any code points. It makes no attempt to detect if any [invalid] |
|
d70ea7e…
|
drh
|
109 |
** switches between UTF-8 and other encodings occur. |
|
d70ea7e…
|
drh
|
110 |
** |
|
d70ea7e…
|
drh
|
111 |
** The only code points that this function cares about are the NUL character, |
|
d70ea7e…
|
drh
|
112 |
** carriage-return, and line-feed. |
|
d70ea7e…
|
drh
|
113 |
** |
|
d70ea7e…
|
drh
|
114 |
** This function examines the contents of the blob until one of the flags |
|
d70ea7e…
|
drh
|
115 |
** specified in "stopFlags" is set. |
|
d70ea7e…
|
drh
|
116 |
** |
|
d70ea7e…
|
drh
|
117 |
************************************ WARNING ********************************** |
|
d70ea7e…
|
drh
|
118 |
*/ |
|
d70ea7e…
|
drh
|
119 |
int looks_like_utf8(const Blob *pContent, int stopFlags){ |
|
d70ea7e…
|
drh
|
120 |
const char *z = blob_buffer(pContent); |
|
d70ea7e…
|
drh
|
121 |
unsigned int n = blob_size(pContent); |
|
d70ea7e…
|
drh
|
122 |
int j, c, flags = LOOK_NONE; /* Assume UTF-8 text, prove otherwise */ |
|
d70ea7e…
|
drh
|
123 |
|
|
d70ea7e…
|
drh
|
124 |
if( n==0 ) return flags; /* Empty file -> text */ |
|
d70ea7e…
|
drh
|
125 |
c = *z; |
|
d70ea7e…
|
drh
|
126 |
if( c==0 ){ |
|
d70ea7e…
|
drh
|
127 |
flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
|
d70ea7e…
|
drh
|
128 |
}else if( c=='\r' ){ |
|
3161968…
|
mistachkin
|
129 |
flags |= LOOK_CR; |
|
d70ea7e…
|
drh
|
130 |
if( n<=1 || z[1]!='\n' ){ |
|
6e3fceb…
|
mistachkin
|
131 |
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */ |
|
d70ea7e…
|
drh
|
132 |
} |
|
d70ea7e…
|
drh
|
133 |
} |
|
d70ea7e…
|
drh
|
134 |
j = (c!='\n'); |
|
3161968…
|
mistachkin
|
135 |
if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ |
|
d70ea7e…
|
drh
|
136 |
while( !(flags&stopFlags) && --n>0 ){ |
|
d70ea7e…
|
drh
|
137 |
int c2 = c; |
|
d70ea7e…
|
drh
|
138 |
c = *++z; ++j; |
|
d70ea7e…
|
drh
|
139 |
if( c==0 ){ |
|
d70ea7e…
|
drh
|
140 |
flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
|
d70ea7e…
|
drh
|
141 |
}else if( c=='\n' ){ |
|
3161968…
|
mistachkin
|
142 |
flags |= LOOK_LF; |
|
d70ea7e…
|
drh
|
143 |
if( c2=='\r' ){ |
|
3161968…
|
mistachkin
|
144 |
flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */ |
|
d70ea7e…
|
drh
|
145 |
}else{ |
|
d70ea7e…
|
drh
|
146 |
flags |= LOOK_LONE_LF; |
|
d70ea7e…
|
drh
|
147 |
} |
|
d70ea7e…
|
drh
|
148 |
if( j>LENGTH_MASK ){ |
|
d70ea7e…
|
drh
|
149 |
flags |= LOOK_LONG; /* Very long line -> binary */ |
|
d70ea7e…
|
drh
|
150 |
} |
|
d70ea7e…
|
drh
|
151 |
j = 0; |
|
d70ea7e…
|
drh
|
152 |
}else if( c=='\r' ){ |
|
3161968…
|
mistachkin
|
153 |
flags |= LOOK_CR; |
|
d70ea7e…
|
drh
|
154 |
if( n<=1 || z[1]!='\n' ){ |
|
6e3fceb…
|
mistachkin
|
155 |
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */ |
|
d70ea7e…
|
drh
|
156 |
} |
|
d70ea7e…
|
drh
|
157 |
} |
|
d70ea7e…
|
drh
|
158 |
} |
|
d70ea7e…
|
drh
|
159 |
if( n ){ |
|
d70ea7e…
|
drh
|
160 |
flags |= LOOK_SHORT; /* The whole blob was not examined */ |
|
d70ea7e…
|
drh
|
161 |
} |
|
d70ea7e…
|
drh
|
162 |
if( j>LENGTH_MASK ){ |
|
d70ea7e…
|
drh
|
163 |
flags |= LOOK_LONG; /* Very long line -> binary */ |
|
d70ea7e…
|
drh
|
164 |
} |
|
d70ea7e…
|
drh
|
165 |
return flags; |
|
d70ea7e…
|
drh
|
166 |
} |
|
d70ea7e…
|
drh
|
167 |
|
|
5f24da1…
|
jan.nijtmans
|
168 |
/* |
|
5f24da1…
|
jan.nijtmans
|
169 |
** Checks for proper UTF-8. It uses the method described in: |
|
5f24da1…
|
jan.nijtmans
|
170 |
** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences |
|
7c08a68…
|
jan.nijtmans
|
171 |
** except for the "overlong form" of \u0000 which is not considered |
|
7c08a68…
|
jan.nijtmans
|
172 |
** invalid here: Some languages like Java and Tcl use it. This function |
|
7c08a68…
|
jan.nijtmans
|
173 |
** also considers valid the derivatives CESU-8 & WTF-8 (as described in |
|
7c08a68…
|
jan.nijtmans
|
174 |
** the same wikipedia article referenced previously). For UTF-8 characters |
|
7c08a68…
|
jan.nijtmans
|
175 |
** > 0x7f, the variable 'c' not necessary means the real lead byte. |
|
7c08a68…
|
jan.nijtmans
|
176 |
** It's number of higher 1-bits indicate the number of continuation |
|
7c08a68…
|
jan.nijtmans
|
177 |
** bytes that are expected to be followed. E.g. when 'c' has a value |
|
7c08a68…
|
jan.nijtmans
|
178 |
** in the range 0xc0..0xdf it means that after 'c' a single continuation |
|
7c08a68…
|
jan.nijtmans
|
179 |
** byte is expected. A value 0xe0..0xef means that after 'c' two more |
|
7c08a68…
|
jan.nijtmans
|
180 |
** continuation bytes are expected. |
|
5f24da1…
|
jan.nijtmans
|
181 |
*/ |
|
5f24da1…
|
jan.nijtmans
|
182 |
|
|
60349a6…
|
jan.nijtmans
|
183 |
int invalid_utf8( |
|
60349a6…
|
jan.nijtmans
|
184 |
const Blob *pContent |
|
60349a6…
|
jan.nijtmans
|
185 |
){ |
|
5f24da1…
|
jan.nijtmans
|
186 |
const unsigned char *z = (unsigned char *) blob_buffer(pContent); |
|
5f24da1…
|
jan.nijtmans
|
187 |
unsigned int n = blob_size(pContent); |
|
7c08a68…
|
jan.nijtmans
|
188 |
unsigned char c; /* lead byte to be handled. */ |
|
5f24da1…
|
jan.nijtmans
|
189 |
|
|
5f24da1…
|
jan.nijtmans
|
190 |
if( n==0 ) return 0; /* Empty file -> OK */ |
|
5f24da1…
|
jan.nijtmans
|
191 |
c = *z; |
|
5f24da1…
|
jan.nijtmans
|
192 |
while( --n>0 ){ |
|
7c08a68…
|
jan.nijtmans
|
193 |
if( c>=0x80 ){ |
|
7c08a68…
|
jan.nijtmans
|
194 |
const unsigned char *def; /* pointer to range table*/ |
|
7c08a68…
|
jan.nijtmans
|
195 |
|
|
7c08a68…
|
jan.nijtmans
|
196 |
c <<= 1; /* multiply by 2 and get rid of highest bit */ |
|
7c08a68…
|
jan.nijtmans
|
197 |
def = &lb_tab[c]; /* search fb's valid range in table */ |
|
7c08a68…
|
jan.nijtmans
|
198 |
if( (unsigned int)(*++z-def[0])>=def[1] ){ |
|
5f24da1…
|
jan.nijtmans
|
199 |
return LOOK_INVALID; /* Invalid UTF-8 */ |
|
5f24da1…
|
jan.nijtmans
|
200 |
} |
|
7c08a68…
|
jan.nijtmans
|
201 |
c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */ |
|
7c08a68…
|
jan.nijtmans
|
202 |
} else { |
|
7c08a68…
|
jan.nijtmans
|
203 |
c = *++z; |
|
1ca5983…
|
jan.nijtmans
|
204 |
} |
|
1ca5983…
|
jan.nijtmans
|
205 |
} |
|
7c08a68…
|
jan.nijtmans
|
206 |
return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */ |
|
1ca5983…
|
jan.nijtmans
|
207 |
} |
|
d70ea7e…
|
drh
|
208 |
|
|
d70ea7e…
|
drh
|
209 |
/*
** Define the type needed to represent a Unicode (UTF-16) character.
** A definition of WCHAR_T made before this point takes precedence.
*/
#ifndef WCHAR_T
#  ifdef _WIN32
#    define WCHAR_T wchar_t
#  else
#    define WCHAR_T unsigned short
#  endif
#endif

/*
** Maximum length of a line in a text file, in UTF-16 characters.  (4096)
** The number of bytes represented by this value cannot exceed LENGTH_MASK
** bytes, because that is the line buffer size used by the diff engine.
**
** The bit count is LENGTH_MASK_SZ reduced by the difference in size between
** WCHAR_T and char.
*/
#define UTF16_LENGTH_MASK_SZ (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
#define UTF16_LENGTH_MASK ((1<<UTF16_LENGTH_MASK_SZ)-1)

/*
** This macro is used to swap the byte order of a UTF-16 character in the
** looks_like_utf16() function.  Both macros expand "ch" more than once, so
** the argument must be free of side effects.
*/
#define UTF16_SWAP(ch) ((((ch) << 8) & 0xff00) | (((ch) >> 8) & 0xff))
#define UTF16_SWAP_IF(expr,ch) ((expr) ? UTF16_SWAP((ch)) : (ch))
|
d70ea7e…
|
drh
|
234 |
|
|
d70ea7e…
|
drh
|
235 |
/* |
|
d70ea7e…
|
drh
|
236 |
** This function attempts to scan each logical line within the blob to |
|
d70ea7e…
|
drh
|
237 |
** determine the type of content it appears to contain. The return value |
|
d70ea7e…
|
drh
|
238 |
** is a combination of one or more of the LOOK_XXX flags (see above): |
|
d70ea7e…
|
drh
|
239 |
** |
|
d70ea7e…
|
drh
|
240 |
** !LOOK_BINARY -- The content appears to consist entirely of text; however, |
|
d70ea7e…
|
drh
|
241 |
** the encoding may not be UTF-16. |
|
d70ea7e…
|
drh
|
242 |
** |
|
d70ea7e…
|
drh
|
243 |
** LOOK_BINARY -- The content appears to be binary because it contains one |
|
d70ea7e…
|
drh
|
244 |
** or more embedded NUL characters or an extremely long line. |
|
d70ea7e…
|
drh
|
245 |
** Since this function does not understand UTF-8, it may |
|
d70ea7e…
|
drh
|
246 |
** falsely consider UTF-8 text to be binary. |
|
d70ea7e…
|
drh
|
247 |
** |
|
d70ea7e…
|
drh
|
248 |
** Additional flags (i.e. those other than the ones included in LOOK_BINARY) |
|
d70ea7e…
|
drh
|
249 |
** may be present in the result as well; however, they should not impact the |
|
d70ea7e…
|
drh
|
250 |
** determination of text versus binary content. |
|
d70ea7e…
|
drh
|
251 |
** |
|
d70ea7e…
|
drh
|
252 |
************************************ WARNING ********************************** |
|
d70ea7e…
|
drh
|
253 |
** |
|
d70ea7e…
|
drh
|
254 |
** This function does not validate that the blob content is properly formed |
|
d70ea7e…
|
drh
|
255 |
** UTF-16. It assumes that all code points are the same size. It does not |
|
d70ea7e…
|
drh
|
256 |
** validate any code points. It makes no attempt to detect if any [invalid] |
|
d70ea7e…
|
drh
|
257 |
** switches between the UTF-16be and UTF-16le encodings occur. |
|
d70ea7e…
|
drh
|
258 |
** |
|
d70ea7e…
|
drh
|
259 |
** The only code points that this function cares about are the NUL character, |
|
d70ea7e…
|
drh
|
260 |
** carriage-return, and line-feed. |
|
d70ea7e…
|
drh
|
261 |
** |
|
d70ea7e…
|
drh
|
262 |
** This function examines the contents of the blob until one of the flags |
|
d70ea7e…
|
drh
|
263 |
** specified in "stopFlags" is set. |
|
d70ea7e…
|
drh
|
264 |
** |
|
d70ea7e…
|
drh
|
265 |
************************************ WARNING ********************************** |
|
d70ea7e…
|
drh
|
266 |
*/ |
|
d70ea7e…
|
drh
|
267 |
int looks_like_utf16(const Blob *pContent, int bReverse, int stopFlags){ |
|
d70ea7e…
|
drh
|
268 |
const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
|
d70ea7e…
|
drh
|
269 |
unsigned int n = blob_size(pContent); |
|
d70ea7e…
|
drh
|
270 |
int j, c, flags = LOOK_NONE; /* Assume UTF-16 text, prove otherwise */ |
|
d70ea7e…
|
drh
|
271 |
|
|
d70ea7e…
|
drh
|
272 |
if( n%sizeof(WCHAR_T) ){ |
|
d70ea7e…
|
drh
|
273 |
flags |= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */ |
|
d70ea7e…
|
drh
|
274 |
} |
|
275da70…
|
danield
|
275 |
if( n<sizeof(WCHAR_T) ) return flags;/* Zero or One byte -> binary (UTF-8?) */ |
|
d70ea7e…
|
drh
|
276 |
c = *z; |
|
d70ea7e…
|
drh
|
277 |
if( bReverse ){ |
|
d70ea7e…
|
drh
|
278 |
c = UTF16_SWAP(c); |
|
d70ea7e…
|
drh
|
279 |
} |
|
d70ea7e…
|
drh
|
280 |
if( c==0 ){ |
|
d70ea7e…
|
drh
|
281 |
flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
|
d70ea7e…
|
drh
|
282 |
}else if( c=='\r' ){ |
|
3161968…
|
mistachkin
|
283 |
flags |= LOOK_CR; |
|
d70ea7e…
|
drh
|
284 |
if( n<(2*sizeof(WCHAR_T)) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ |
|
6e3fceb…
|
mistachkin
|
285 |
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */ |
|
d70ea7e…
|
drh
|
286 |
} |
|
d70ea7e…
|
drh
|
287 |
} |
|
d70ea7e…
|
drh
|
288 |
j = (c!='\n'); |
|
3161968…
|
mistachkin
|
289 |
if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ |
|
7458a18…
|
jan.nijtmans
|
290 |
while( !(flags&stopFlags) && ((n-=sizeof(WCHAR_T))>=sizeof(WCHAR_T)) ){ |
|
d70ea7e…
|
drh
|
291 |
int c2 = c; |
|
d70ea7e…
|
drh
|
292 |
c = *++z; |
|
d70ea7e…
|
drh
|
293 |
if( bReverse ){ |
|
d70ea7e…
|
drh
|
294 |
c = UTF16_SWAP(c); |
|
d70ea7e…
|
drh
|
295 |
} |
|
d70ea7e…
|
drh
|
296 |
++j; |
|
d70ea7e…
|
drh
|
297 |
if( c==0 ){ |
|
d70ea7e…
|
drh
|
298 |
flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
|
d70ea7e…
|
drh
|
299 |
}else if( c=='\n' ){ |
|
3161968…
|
mistachkin
|
300 |
flags |= LOOK_LF; |
|
d70ea7e…
|
drh
|
301 |
if( c2=='\r' ){ |
|
3161968…
|
mistachkin
|
302 |
flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */ |
|
d70ea7e…
|
drh
|
303 |
}else{ |
|
d70ea7e…
|
drh
|
304 |
flags |= LOOK_LONE_LF; |
|
d70ea7e…
|
drh
|
305 |
} |
|
d70ea7e…
|
drh
|
306 |
if( j>UTF16_LENGTH_MASK ){ |
|
d70ea7e…
|
drh
|
307 |
flags |= LOOK_LONG; /* Very long line -> binary */ |
|
d70ea7e…
|
drh
|
308 |
} |
|
d70ea7e…
|
drh
|
309 |
j = 0; |
|
d70ea7e…
|
drh
|
310 |
}else if( c=='\r' ){ |
|
3161968…
|
mistachkin
|
311 |
flags |= LOOK_CR; |
|
d70ea7e…
|
drh
|
312 |
if( n<(2*sizeof(WCHAR_T)) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ |
|
6e3fceb…
|
mistachkin
|
313 |
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */ |
|
d70ea7e…
|
drh
|
314 |
} |
|
d70ea7e…
|
drh
|
315 |
} |
|
d70ea7e…
|
drh
|
316 |
} |
|
d70ea7e…
|
drh
|
317 |
if( n ){ |
|
d70ea7e…
|
drh
|
318 |
flags |= LOOK_SHORT; /* The whole blob was not examined */ |
|
d70ea7e…
|
drh
|
319 |
} |
|
d70ea7e…
|
drh
|
320 |
if( j>UTF16_LENGTH_MASK ){ |
|
d70ea7e…
|
drh
|
321 |
flags |= LOOK_LONG; /* Very long line -> binary */ |
|
d70ea7e…
|
drh
|
322 |
} |
|
d70ea7e…
|
drh
|
323 |
return flags; |
|
d70ea7e…
|
drh
|
324 |
} |
|
d70ea7e…
|
drh
|
325 |
|
|
d70ea7e…
|
drh
|
326 |
/*
** Return a pointer to a static array whose first three bytes are the UTF-8
** byte-order-mark (0xef 0xbb 0xbf); the array is padded with zero bytes
** after the mark.  If pnByte is not NULL, the length of the mark in bytes
** (3) is written to *pnByte.
*/
const unsigned char *get_utf8_bom(int *pnByte){
  /* Six bytes in total; the elements not listed are zero-initialized. */
  static const unsigned char aUtf8Bom[6] = { 0xef, 0xbb, 0xbf };

  if( pnByte!=0 ){
    *pnByte = 3;
  }
  return aUtf8Bom;
}
|
d70ea7e…
|
drh
|
337 |
|
|
d70ea7e…
|
drh
|
338 |
/* |
|
d70ea7e…
|
drh
|
339 |
** This function returns non-zero if the blob starts with a UTF-8 |
|
d70ea7e…
|
drh
|
340 |
** byte-order-mark (BOM). |
|
d70ea7e…
|
drh
|
341 |
*/ |
|
d70ea7e…
|
drh
|
342 |
int starts_with_utf8_bom(const Blob *pContent, int *pnByte){ |
|
d70ea7e…
|
drh
|
343 |
const char *z = blob_buffer(pContent); |
|
d70ea7e…
|
drh
|
344 |
int bomSize = 0; |
|
d70ea7e…
|
drh
|
345 |
const unsigned char *bom = get_utf8_bom(&bomSize); |
|
d70ea7e…
|
drh
|
346 |
|
|
d70ea7e…
|
drh
|
347 |
if( pnByte ) *pnByte = bomSize; |
|
53db40e…
|
drh
|
348 |
if( (int)blob_size(pContent)<bomSize ) return 0; |
|
d70ea7e…
|
drh
|
349 |
return memcmp(z, bom, bomSize)==0; |
|
d70ea7e…
|
drh
|
350 |
} |
|
d70ea7e…
|
drh
|
351 |
|
|
d70ea7e…
|
drh
|
352 |
/* |
|
d70ea7e…
|
drh
|
353 |
** This function returns non-zero if the blob starts with a UTF-16 |
|
d70ea7e…
|
drh
|
354 |
** byte-order-mark (BOM), either in the endianness of the machine |
|
d70ea7e…
|
drh
|
355 |
** or in reversed byte order. The UTF-32 BOM is ruled out by checking |
|
d70ea7e…
|
drh
|
356 |
** if the UTF-16 BOM is not immediately followed by (utf16) 0. |
|
d70ea7e…
|
drh
|
357 |
** pnByte is only set when the function returns 1. |
|
d70ea7e…
|
drh
|
358 |
** |
|
d70ea7e…
|
drh
|
359 |
** pbReverse is always set, even when no BOM is found. Without a BOM, |
|
d70ea7e…
|
drh
|
360 |
** it is set to 1 on little-endian and 0 on big-endian platforms. See |
|
d70ea7e…
|
drh
|
361 |
** clause D98 of conformance (section 3.10) of the Unicode standard. |
|
d70ea7e…
|
drh
|
362 |
*/ |
|
d70ea7e…
|
drh
|
363 |
int starts_with_utf16_bom( |
|
d70ea7e…
|
drh
|
364 |
const Blob *pContent, /* IN: Blob content to perform BOM detection on. */ |
|
d70ea7e…
|
drh
|
365 |
int *pnByte, /* OUT: The number of bytes used for the BOM. */ |
|
d70ea7e…
|
drh
|
366 |
int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */ |
|
d70ea7e…
|
drh
|
367 |
){ |
|
f7c41be…
|
drh
|
368 |
const unsigned char *z = (unsigned char *)blob_buffer(pContent); |
|
d70ea7e…
|
drh
|
369 |
int bomSize = sizeof(unsigned short); |
|
d70ea7e…
|
drh
|
370 |
int size = blob_size(pContent); |
|
f7c41be…
|
drh
|
371 |
unsigned short i0; |
|
d70ea7e…
|
drh
|
372 |
|
|
d70ea7e…
|
drh
|
373 |
if( size<bomSize ) goto noBom; /* No: cannot read BOM. */ |
|
f7c41be…
|
drh
|
374 |
if( size>=(2*bomSize) && z[2]==0 && z[3]==0 ) goto noBom; |
|
f7c41be…
|
drh
|
375 |
memcpy(&i0, z, sizeof(i0)); |
|
f7c41be…
|
drh
|
376 |
if( i0==0xfeff ){ |
|
d70ea7e…
|
drh
|
377 |
if( pbReverse ) *pbReverse = 0; |
|
f7c41be…
|
drh
|
378 |
}else if( i0==0xfffe ){ |
|
d70ea7e…
|
drh
|
379 |
if( pbReverse ) *pbReverse = 1; |
|
d70ea7e…
|
drh
|
380 |
}else{ |
|
d70ea7e…
|
drh
|
381 |
static const int one = 1; |
|
d70ea7e…
|
drh
|
382 |
noBom: |
|
d70ea7e…
|
drh
|
383 |
if( pbReverse ) *pbReverse = *(char *) &one; |
|
d70ea7e…
|
drh
|
384 |
return 0; /* No: UTF-16 byte-order-mark not found. */ |
|
d70ea7e…
|
drh
|
385 |
} |
|
d70ea7e…
|
drh
|
386 |
if( pnByte ) *pnByte = bomSize; |
|
d70ea7e…
|
drh
|
387 |
return 1; /* Yes. */ |
|
d70ea7e…
|
drh
|
388 |
} |
|
d70ea7e…
|
drh
|
389 |
|
|
d70ea7e…
|
drh
|
390 |
/* |
|
d70ea7e…
|
drh
|
391 |
** Returns non-zero if the specified content could be valid UTF-16. |
|
d70ea7e…
|
drh
|
392 |
*/ |
|
d70ea7e…
|
drh
|
393 |
int could_be_utf16(const Blob *pContent, int *pbReverse){ |
|
d70ea7e…
|
drh
|
394 |
return (blob_size(pContent) % sizeof(WCHAR_T) == 0) ? |
|
d70ea7e…
|
drh
|
395 |
starts_with_utf16_bom(pContent, 0, pbReverse) : 0; |
|
d70ea7e…
|
drh
|
396 |
} |
|
d70ea7e…
|
drh
|
397 |
|
|
d70ea7e…
|
drh
|
398 |
|
|
d70ea7e…
|
drh
|
399 |
/* |
|
d70ea7e…
|
drh
|
400 |
** COMMAND: test-looks-like-utf |
|
d70ea7e…
|
drh
|
401 |
** |
|
d70ea7e…
|
drh
|
402 |
** Usage: %fossil test-looks-like-utf FILENAME |
|
d70ea7e…
|
drh
|
403 |
** |
|
d70ea7e…
|
drh
|
404 |
** Options: |
|
11384f1…
|
drh
|
405 |
** -n|--limit N Repeat looks-like function N times, for |
|
4cb50c4…
|
stephan
|
406 |
** performance measurement. Default = 1 |
|
d70ea7e…
|
drh
|
407 |
** --utf8 Ignoring BOM and file size, force UTF-8 checking |
|
d70ea7e…
|
drh
|
408 |
** --utf16 Ignoring BOM and file size, force UTF-16 checking |
|
d70ea7e…
|
drh
|
409 |
** |
|
d70ea7e…
|
drh
|
410 |
** FILENAME is the name of a file to check for textual content in the UTF-8 |
|
d70ea7e…
|
drh
|
411 |
** and/or UTF-16 encodings. |
|
d70ea7e…
|
drh
|
412 |
*/ |
|
d70ea7e…
|
drh
|
413 |
void looks_like_utf_test_cmd(void){ |
|
503482a…
|
jan.nijtmans
|
414 |
Blob blob; /* the contents of the specified file */ |
|
503482a…
|
jan.nijtmans
|
415 |
int fUtf8 = 0; /* return value of starts_with_utf8_bom() */ |
|
503482a…
|
jan.nijtmans
|
416 |
int fUtf16 = 0; /* return value of starts_with_utf16_bom() */ |
|
503482a…
|
jan.nijtmans
|
417 |
int fUnicode = 0; /* return value of could_be_utf16() */ |
|
503482a…
|
jan.nijtmans
|
418 |
int lookFlags = 0; /* output flags from looks_like_utf8/utf16() */ |
|
d70ea7e…
|
drh
|
419 |
int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */ |
|
d70ea7e…
|
drh
|
420 |
int fForceUtf8 = find_option("utf8",0,0)!=0; |
|
d70ea7e…
|
drh
|
421 |
int fForceUtf16 = find_option("utf16",0,0)!=0; |
|
5f24da1…
|
jan.nijtmans
|
422 |
const char *zCount = find_option("limit","n",1); |
|
5f24da1…
|
jan.nijtmans
|
423 |
int nRepeat = 1; |
|
5f24da1…
|
jan.nijtmans
|
424 |
|
|
d70ea7e…
|
drh
|
425 |
if( g.argc!=3 ) usage("FILENAME"); |
|
5f24da1…
|
jan.nijtmans
|
426 |
if( zCount ){ |
|
5f24da1…
|
jan.nijtmans
|
427 |
nRepeat = atoi(zCount); |
|
5f24da1…
|
jan.nijtmans
|
428 |
} |
|
1772357…
|
drh
|
429 |
blob_read_from_file(&blob, g.argv[2], ExtFILE); |
|
5f24da1…
|
jan.nijtmans
|
430 |
while( --nRepeat >= 0 ){ |
|
5f24da1…
|
jan.nijtmans
|
431 |
fUtf8 = starts_with_utf8_bom(&blob, 0); |
|
5f24da1…
|
jan.nijtmans
|
432 |
fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16); |
|
5f24da1…
|
jan.nijtmans
|
433 |
if( fForceUtf8 ){ |
|
5f24da1…
|
jan.nijtmans
|
434 |
fUnicode = 0; |
|
5f24da1…
|
jan.nijtmans
|
435 |
}else{ |
|
09f2386…
|
jan.nijtmans
|
436 |
fUnicode = could_be_utf16(&blob, 0) || fForceUtf16; |
|
5f24da1…
|
jan.nijtmans
|
437 |
} |
|
5f24da1…
|
jan.nijtmans
|
438 |
if( fUnicode ){ |
|
09f2386…
|
jan.nijtmans
|
439 |
lookFlags = looks_like_utf16(&blob, bRevUtf16, 0); |
|
5f24da1…
|
jan.nijtmans
|
440 |
}else{ |
|
60349a6…
|
jan.nijtmans
|
441 |
lookFlags = looks_like_utf8(&blob, 0) | invalid_utf8(&blob); |
|
5f24da1…
|
jan.nijtmans
|
442 |
} |
|
5f24da1…
|
jan.nijtmans
|
443 |
} |
|
d70ea7e…
|
drh
|
444 |
fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
|
d70ea7e…
|
drh
|
445 |
fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
|
d70ea7e…
|
drh
|
446 |
fossil_print("Starts with UTF-16 BOM: %s\n", |
|
d70ea7e…
|
drh
|
447 |
fUtf16?(bRevUtf16?"reversed":"yes"):"no"); |
|
d70ea7e…
|
drh
|
448 |
fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8", |
|
d70ea7e…
|
drh
|
449 |
(lookFlags&LOOK_BINARY)?"no":"yes"); |
|
d70ea7e…
|
drh
|
450 |
fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no"); |
|
d70ea7e…
|
drh
|
451 |
fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no"); |
|
d70ea7e…
|
drh
|
452 |
fossil_print("Has flag LOOK_LONE_CR: %s\n", |
|
d70ea7e…
|
drh
|
453 |
(lookFlags&LOOK_LONE_CR)?"yes":"no"); |
|
d70ea7e…
|
drh
|
454 |
fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no"); |
|
d70ea7e…
|
drh
|
455 |
fossil_print("Has flag LOOK_LONE_LF: %s\n", |
|
d70ea7e…
|
drh
|
456 |
(lookFlags&LOOK_LONE_LF)?"yes":"no"); |
|
d70ea7e…
|
drh
|
457 |
fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no"); |
|
d70ea7e…
|
drh
|
458 |
fossil_print("Has flag LOOK_LONG: %s\n",(lookFlags&LOOK_LONG)?"yes":"no"); |
|
d70ea7e…
|
drh
|
459 |
fossil_print("Has flag LOOK_INVALID: %s\n", |
|
d70ea7e…
|
drh
|
460 |
(lookFlags&LOOK_INVALID)?"yes":"no"); |
|
d70ea7e…
|
drh
|
461 |
fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no"); |
|
d70ea7e…
|
drh
|
462 |
fossil_print("Has flag LOOK_SHORT: %s\n",(lookFlags&LOOK_SHORT)?"yes":"no"); |
|
d70ea7e…
|
drh
|
463 |
blob_reset(&blob); |
|
534c10f…
|
stephan
|
464 |
} |
|
534c10f…
|
stephan
|
465 |
|
|
534c10f…
|
stephan
|
466 |
/*
** Return true if the n-byte word zWord occurs (ignoring case) at z[i] as a
** whole word in a context that might be an attempted SQL injection.
**
** The answer is false when:
**
**    *  i is zero, so that there is no preceding character, or
**    *  the text at z[i] does not match zWord, or
**    *  the character on either side of the word is alphanumeric, or
**    *  the preceding character is one of "-", ")" or "_", or
**    *  the following character is one of "(" or "_".
**
** Because strchr() also matches the terminating NUL of its search set, a
** word that ends exactly at the end of z is reported as false as well.
*/
static int isWholeWord(const char *z, unsigned int i, const char *zWord, int n){
  const char *zBefore;    /* Character just ahead of the candidate word */
  const char *zAfter;     /* Character just past the candidate word */

  if( i==0 ) return 0;
  if( sqlite3_strnicmp(&z[i], zWord, n)!=0 ) return 0;
  zBefore = &z[i-1];
  zAfter = &z[i+n];
  if( fossil_isalnum(*zBefore) || fossil_isalnum(*zAfter) ) return 0;
  if( strchr("-)_", *zBefore)!=0 || strchr("(_", *zAfter)!=0 ) return 0;
  return 1;
}
|
d3cb62f…
|
drh
|
479 |
|
|
d3cb62f…
|
drh
|
480 |
/*
** Returns true if the given text contains certain keywords or
** punctuation which indicate that it might be an SQL injection attempt
** or Cross-site scripting attempt or some other kind of mischief.
**
** This is not a primary defense against vulnerabilities in the Fossil
** code.  Rather, this is part of an effort to do early detection of malicious
** spiders to avoid them using up too many CPU cycles.  Or, this routine
** can also be thought of as a secondary layer of defense against attacks.
**
** Detection rules:
**
**    *  A NULL zTxt is never an attack.
**    *  Any '<', ';' or single-quote character yields an immediate
**       true result.
**    *  A '/' that is directly followed by the 20-byte text
**       "/wp-content/plugins/" or "/wp-admin/admin-ajax" is suspicious.
**    *  The whole words (as judged by isWholeWord()) "and", "null", "or",
**       "select" and "waitfor" are suspicious, as is "order" when it is
**       followed by whitespace.
**
** Suspicious (as opposed to immediate) results are suppressed when the
** PATH_INFO value names the test/markdown-test3.md document.
*/
int looks_like_attack(const char *zTxt){
  unsigned int i;
  int rc = 0;
  if( zTxt==0 ) return 0;
  for(i=0; zTxt[i]; i++){
    switch( zTxt[i] ){
      case '<':
      case ';':
      case '\'':
        /* Returned directly, so the document exemption below is bypassed */
        return 1;
      case '/':                   /* 0123456789 123456789 */
        /* The comparison starts at the byte after the current '/', so the
        ** text actually matched begins with two consecutive slashes. */
        if( strncmp(zTxt+i+1, "/wp-content/plugins/", 20)==0 ) rc = 1;
        if( strncmp(zTxt+i+1, "/wp-admin/admin-ajax", 20)==0 ) rc = 1;
        break;
      case 'a':
      case 'A':
        if( isWholeWord(zTxt, i, "and", 3) ) rc = 1;
        break;
      case 'n':
      case 'N':
        if( isWholeWord(zTxt, i, "null", 4) ) rc = 1;
        break;
      case 'o':
      case 'O':
        if( isWholeWord(zTxt, i, "order", 5) && fossil_isspace(zTxt[i+5]) ){
          rc = 1;
        }
        if( isWholeWord(zTxt, i, "or", 2) ) rc = 1;
        break;
      case 's':
      case 'S':
        if( isWholeWord(zTxt, i, "select", 6) ) rc = 1;
        break;
      case 'w':
      case 'W':
        if( isWholeWord(zTxt, i, "waitfor", 7) ) rc = 1;
        break;
      default:
        break;
    }
  }
  if( rc ){
    /* The test/markdown-test3.md document which is part of the Fossil source
    ** tree intentionally tries to fake an attack.  Do not report such
    ** errors.
    **
    ** The pointer is checked before use so that a NULL value is never
    ** handed to sqlite3_strglob(); a missing path cannot match the
    ** exemption, so rc is left unchanged in that case. */
    const char *zPathInfo = P("PATH_INFO");
    if( zPathInfo!=0
     && sqlite3_strglob("/doc/*/test/markdown-test3.md", zPathInfo)==0
    ){
      rc = 0;
    }
  }
  return rc;
}
|
d3cb62f…
|
drh
|
540 |
|
|
d3cb62f…
|
drh
|
541 |
/*
** This is a utility routine associated with the test-looks-like-attack
** command.
**
** Read input from zInFile and print only those lines for which
** looks_like_attack() returns true.  If zInFile is NULL or "-", input is
** read from standard input.
**
** Or if bInvert is true, then show the opposite - those lines for which
** looks_like_attack() returns false.
**
** If bDeHttpize is true, each line is passed through dehttpize() before
** it is examined and printed.
**
** Input is read with fgets() into a fixed 10000-byte buffer, so a longer
** line is examined in more than one piece.
*/
static void show_attack_lines(
  const char *zInFile,       /* Name of input file */
  int bInvert,               /* Invert the sense of the output (-v) */
  int bDeHttpize             /* De-httpize the inputs.  (-d) */
){
  FILE *in;
  char zLine[10000];
  if( zInFile==0 || strcmp(zInFile,"-")==0 ){
    in = stdin;
  }else{
    in = fopen(zInFile, "rb");
    if( in==0 ){
      fossil_fatal("cannot open \"%s\" for reading\n", zInFile);
    }
  }
  while( fgets(zLine, sizeof(zLine), in) ){
    /* Decode only on request; previously the flag was accepted but the
    ** decoding was applied unconditionally. */
    if( bDeHttpize ) dehttpize(zLine);
    /* Compare as booleans so any non-zero bInvert counts as "inverted" */
    if( (looks_like_attack(zLine)!=0) != (bInvert!=0) ){
      fossil_print("%s", zLine);
    }
  }
  if( in!=stdin ) fclose(in);
}
|
d3cb62f…
|
drh
|
574 |
|
|
d3cb62f…
|
drh
|
575 |
/* |
|
8612122…
|
drh
|
576 |
** COMMAND: test-looks-like-attack |
|
d3cb62f…
|
drh
|
577 |
** |
|
d3cb62f…
|
drh
|
578 |
** Read lines of input from files named as arguments (or from standard |
|
d3cb62f…
|
drh
|
579 |
** input if no arguments are provided) and print those that look like they |
|
d3cb62f…
|
drh
|
580 |
** might be part of an SQL injection attack. |
|
d3cb62f…
|
drh
|
581 |
** |
|
8612122…
|
drh
|
582 |
** Used to test the looks_like_attack() utility subroutine, possibly |
|
d3cb62f…
|
drh
|
583 |
** by piping in actual server log data. |
|
534c10f…
|
stephan
|
584 |
*/ |
|
8612122…
|
drh
|
585 |
void test_looks_like_attack(void){ |
|
d3cb62f…
|
drh
|
586 |
int i; |
|
d3cb62f…
|
drh
|
587 |
int bInvert = find_option("invert","v",0)!=0; |
|
d3cb62f…
|
drh
|
588 |
int bDeHttpize = find_option("dehttpize","d",0)!=0; |
|
d3cb62f…
|
drh
|
589 |
verify_all_options(); |
|
d3cb62f…
|
drh
|
590 |
if( g.argc==2 ){ |
|
8612122…
|
drh
|
591 |
show_attack_lines(0, bInvert, bDeHttpize); |
|
d3cb62f…
|
drh
|
592 |
} |
|
d3cb62f…
|
drh
|
593 |
for(i=2; i<g.argc; i++){ |
|
8612122…
|
drh
|
594 |
show_attack_lines(g.argv[i], bInvert, bDeHttpize); |
|
d3cb62f…
|
drh
|
595 |
} |
|
d70ea7e…
|
drh
|
596 |
} |