/*
** Copyright (c) 2013 D. Richard Hipp
**
** This program is free software; you can redistribute it and/or
** modify it under the terms of the Simplified BSD License (also
** known as the "2-Clause License" or "FreeBSD License".)

** This program is distributed in the hope that it will be useful,
** but without any warranty; without even the implied warranty of
** merchantability or fitness for a particular purpose.
**
** Author contact information:
**   [email protected]
**   http://www.hwaci.com/drh/
**
*******************************************************************************
**
** This file contains code used to try to guess if a particular file is
** text or binary, what types of line endings it uses, is it UTF8 or
** UTF16, etc.
*/
|
22
|
#include "config.h" |
|
23
|
#include "lookslike.h" |
|
24
|
#include <assert.h> |
|
25
|
|
|
26
|
|
|
27
|
#if INTERFACE |
|
28
|
|
|
29
|
/*
** Evaluates to non-zero when looks_like_utf8() raises at least one of the
** LOOK_BINARY flags for the blob, i.e. when the data MAY be binary in
** nature; evaluates to zero otherwise.  LOOK_BINARY is also passed as the
** stop mask, and the verbose argument is passed as 0.
*/
#define looks_like_binary(blob) \
   ((looks_like_utf8((blob), LOOK_BINARY, 0) & LOOK_BINARY) != LOOK_NONE)
|
35
|
|
|
36
|
/*
** Status bits produced by the looks_like_utf8() and looks_like_utf16()
** routines to describe what was seen in the blob content.  The two
** composite masks at the end group the bits that suggest binary content
** and the bits that describe line separators.
*/
#define LOOK_NONE    ((int)0x00000000) /* Nothing special was found. */
#define LOOK_NUL     ((int)0x00000001) /* One or more NUL chars were found. */
#define LOOK_CR      ((int)0x00000002) /* One or more CR chars were found. */
#define LOOK_LONE_CR ((int)0x00000004) /* An unpaired CR char was found. */
#define LOOK_LF      ((int)0x00000008) /* One or more LF chars were found. */
#define LOOK_LONE_LF ((int)0x00000010) /* An unpaired LF char was found. */
#define LOOK_CRLF    ((int)0x00000020) /* One or more CR/LF pairs were found. */
#define LOOK_LONG    ((int)0x00000040) /* An over length line was found. */
#define LOOK_ODD     ((int)0x00000080) /* An odd number of bytes was found. */
#define LOOK_SHORT   ((int)0x00000100) /* Unable to perform full check. */
#define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */
#define LOOK_BINARY  (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
#define LOOK_EOL     (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */
|
53
|
#endif /* INTERFACE */ |
|
54
|
|
|
55
|
/*
** Range definitions for the byte that follows a UTF-8 lead byte.  Each
** macro expands to two table bytes: the first accepted value, then the
** number of consecutive values accepted starting from there.  A size of
** zero accepts nothing.
*/
#define US2A  0x80, 0x01   /* for lead byte 0xC0 */
#define US2B  0x80, 0x40   /* for lead bytes 0xC2-0xDF */
#define US3A  0xA0, 0x20   /* for lead byte 0xE0 */
#define US3B  0x80, 0x40   /* for lead bytes 0xE1-0xEF */
#define US4A  0x90, 0x30   /* for lead byte 0xF0 */
#define US4B  0x80, 0x40   /* for lead bytes 0xF1-0xF3 */
#define US4C  0x80, 0x10   /* for lead byte 0xF4 */
#define US0A  0x00, 0x00   /* for any other lead byte */

/*
** Quick-lookup table with one (start, size) pair for every byte value
** 0x80 through 0xFF, in ascending order.  invalid_utf8() indexes it with
** the byte shifted left by one bit and truncated to eight bits, which
** yields the offset of that byte's pair.
*/
static const unsigned char lb_tab[] = {
  /* 0x80..0x87 */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0x88..0x8F */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0x90..0x97 */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0x98..0x9F */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xA0..0xA7 */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xA8..0xAF */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xB0..0xB7 */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xB8..0xBF */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  /* 0xC0..0xC7 */ US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
  /* 0xC8..0xCF */ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  /* 0xD0..0xD7 */ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  /* 0xD8..0xDF */ US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  /* 0xE0..0xE7 */ US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  /* 0xE8..0xEF */ US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  /* 0xF0..0xF7 */ US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
  /* 0xF8..0xFF */ US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
};
|
86
|
|
|
87
|
/* |
|
88
|
** This function attempts to scan each logical line within the blob to |
|
89
|
** determine the type of content it appears to contain. The return value |
|
90
|
** is a combination of one or more of the LOOK_XXX flags (see above): |
|
91
|
** |
|
92
|
** !LOOK_BINARY -- The content appears to consist entirely of text; however, |
|
93
|
** the encoding may not be UTF-8. |
|
94
|
** |
|
95
|
** LOOK_BINARY -- The content appears to be binary because it contains one |
|
96
|
** or more embedded NUL characters or an extremely long line. |
|
97
|
** Since this function does not understand UTF-16, it may |
|
98
|
** falsely consider UTF-16 text to be binary. |
|
99
|
** |
|
100
|
** Additional flags (i.e. those other than the ones included in LOOK_BINARY) |
|
101
|
** may be present in the result as well; however, they should not impact the |
|
102
|
** determination of text versus binary content. |
|
103
|
** |
|
104
|
************************************ WARNING ********************************** |
|
105
|
** |
|
106
|
** This function does not validate that the blob content is properly formed |
|
107
|
** UTF-8. It assumes that all code points are the same size. It does not |
|
108
|
** validate any code points. It makes no attempt to detect if any [invalid] |
|
109
|
** switches between UTF-8 and other encodings occur. |
|
110
|
** |
|
111
|
** The only code points that this function cares about are the NUL character, |
|
112
|
** carriage-return, and line-feed. |
|
113
|
** |
|
114
|
** This function examines the contents of the blob until one of the flags |
|
115
|
** specified in "stopFlags" is set. |
|
116
|
** |
|
117
|
************************************ WARNING ********************************** |
|
118
|
*/ |
|
119
|
int looks_like_utf8(const Blob *pContent, int stopFlags, int fVerbose){ |
|
120
|
const char *z = blob_buffer(pContent); |
|
121
|
unsigned int n = blob_size(pContent); |
|
122
|
int j, c, flags = LOOK_NONE; /* Assume UTF-8 text, prove otherwise */ |
|
123
|
int nLine = 1; |
|
124
|
|
|
125
|
if( n==0 ) return flags; /* Empty file -> text */ |
|
126
|
c = *z; |
|
127
|
if( c==0 ){ |
|
128
|
flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
|
129
|
if( fVerbose ) fossil_print("NUL at start\n"); |
|
130
|
}else if( c=='\r' ){ |
|
131
|
flags |= LOOK_CR; |
|
132
|
if( fVerbose ) fossil_print("CR at start\n"); |
|
133
|
if( n<=1 || z[1]!='\n' ){ |
|
134
|
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */ |
|
135
|
if( fVerbose ) fossil_print("Lone CR at start\n"); |
|
136
|
} |
|
137
|
} |
|
138
|
j = (c!='\n'); |
|
139
|
if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ |
|
140
|
while( !(flags&stopFlags) && --n>0 ){ |
|
141
|
int c2 = c; |
|
142
|
c = *++z; ++j; |
|
143
|
if( c==0 ){ |
|
144
|
if( fVerbose && !(flags&LOOK_NUL) ){ |
|
145
|
fossil_print("NUL on line %d\n", nLine); |
|
146
|
} |
|
147
|
flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
|
148
|
}else if( c=='\n' ){ |
|
149
|
flags |= LOOK_LF; |
|
150
|
if( c2=='\r' ){ |
|
151
|
if( fVerbose && !(flags&LOOK_CRLF) ){ |
|
152
|
fossil_print("CRLF on line %d\n", nLine); |
|
153
|
} |
|
154
|
flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */ |
|
155
|
}else{ |
|
156
|
if( fVerbose && !(flags&LOOK_LONE_LF) ){ |
|
157
|
fossil_print("Lone LF on line %d\n", nLine); |
|
158
|
} |
|
159
|
flags |= LOOK_LONE_LF; |
|
160
|
} |
|
161
|
if( j>LENGTH_MASK ){ |
|
162
|
if( fVerbose && !(flags&LOOK_LONG) ){ |
|
163
|
fossil_print("Line %d is longer than %d bytes\n", nLine, j); |
|
164
|
} |
|
165
|
flags |= LOOK_LONG; /* Very long line -> binary */ |
|
166
|
} |
|
167
|
++nLine; |
|
168
|
j = 0; |
|
169
|
}else if( c=='\r' ){ |
|
170
|
flags |= LOOK_CR; |
|
171
|
if( n<=1 || z[1]!='\n' ){ |
|
172
|
if( fVerbose && !(flags&LOOK_LONE_CR) ){ |
|
173
|
fossil_print("Lone CR on line %d\n", nLine); |
|
174
|
} |
|
175
|
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */ |
|
176
|
} |
|
177
|
} |
|
178
|
} |
|
179
|
if( n ){ |
|
180
|
flags |= LOOK_SHORT; /* The whole blob was not examined */ |
|
181
|
} |
|
182
|
if( j>LENGTH_MASK ){ |
|
183
|
flags |= LOOK_LONG; /* Very long line -> binary */ |
|
184
|
} |
|
185
|
return flags; |
|
186
|
} |
|
187
|
|
|
188
|
/* |
|
189
|
** Checks for proper UTF-8. It uses the method described in: |
|
190
|
** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences |
|
191
|
** except for the "overlong form" of \u0000 which is not considered |
|
192
|
** invalid here: Some languages like Java and Tcl use it. This function |
|
193
|
** also considers valid the derivatives CESU-8 & WTF-8 (as described in |
|
194
|
** the same wikipedia article referenced previously). For UTF-8 characters |
|
195
|
** > 0x7f, the variable 'c' not necessary means the real lead byte. |
|
196
|
** It's number of higher 1-bits indicate the number of continuation |
|
197
|
** bytes that are expected to be followed. E.g. when 'c' has a value |
|
198
|
** in the range 0xc0..0xdf it means that after 'c' a single continuation |
|
199
|
** byte is expected. A value 0xe0..0xef means that after 'c' two more |
|
200
|
** continuation bytes are expected. |
|
201
|
*/ |
|
202
|
|
|
203
|
int invalid_utf8( |
|
204
|
const Blob *pContent |
|
205
|
){ |
|
206
|
const unsigned char *z = (unsigned char *) blob_buffer(pContent); |
|
207
|
unsigned int n = blob_size(pContent); |
|
208
|
unsigned char c; /* lead byte to be handled. */ |
|
209
|
|
|
210
|
if( n==0 ) return 0; /* Empty file -> OK */ |
|
211
|
c = *z; |
|
212
|
while( --n>0 ){ |
|
213
|
if( c>=0x80 ){ |
|
214
|
const unsigned char *def; /* pointer to range table*/ |
|
215
|
|
|
216
|
c <<= 1; /* multiply by 2 and get rid of highest bit */ |
|
217
|
def = &lb_tab[c]; /* search fb's valid range in table */ |
|
218
|
if( (unsigned int)(*++z-def[0])>=def[1] ){ |
|
219
|
return LOOK_INVALID; /* Invalid UTF-8 */ |
|
220
|
} |
|
221
|
c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */ |
|
222
|
} else { |
|
223
|
c = *++z; |
|
224
|
} |
|
225
|
} |
|
226
|
return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */ |
|
227
|
} |
|
228
|
|
|
229
|
/*
** The type used to hold one UTF-16 code unit.  A definition supplied from
** outside takes precedence; otherwise wchar_t is used on Windows and
** unsigned short everywhere else.
*/
#if !defined(WCHAR_T)
#  if defined(_WIN32)
#    define WCHAR_T wchar_t
#  else
#    define WCHAR_T unsigned short
#  endif
#endif
|
239
|
|
|
240
|
/*
** Longest line, counted in UTF-16 characters, that is still treated as
** text.  The limit is derived from LENGTH_MASK_SZ by removing as many bits
** as a WCHAR_T is wider than a char, so that the number of bytes in such a
** line cannot exceed LENGTH_MASK, which is the line buffer size used by
** the diff engine.
**
** NOTE(review): the earlier comment quoted the value 4096 here; the real
** figure follows from LENGTH_MASK_SZ, which is defined outside this file.
*/
#define UTF16_LENGTH_MASK_SZ   (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
#define UTF16_LENGTH_MASK      ((1<<UTF16_LENGTH_MASK_SZ)-1)
|
247
|
|
|
248
|
/*
** Helpers for looks_like_utf16().  UTF16_SWAP exchanges the two low-order
** bytes of a UTF-16 character and discards everything above bit 15.
** UTF16_SWAP_IF does the same only when "expr" is true and otherwise
** yields "ch" unchanged.  Both evaluate "ch" more than once.
*/
#define UTF16_SWAP(ch)         ((((ch) >> 8) & 0xff) | (((ch) << 8) & 0xff00))
#define UTF16_SWAP_IF(expr,ch) ((expr) ? UTF16_SWAP((ch)) : (ch))
|
254
|
|
|
255
|
/* |
|
256
|
** This function attempts to scan each logical line within the blob to |
|
257
|
** determine the type of content it appears to contain. The return value |
|
258
|
** is a combination of one or more of the LOOK_XXX flags (see above): |
|
259
|
** |
|
260
|
** !LOOK_BINARY -- The content appears to consist entirely of text; however, |
|
261
|
** the encoding may not be UTF-16. |
|
262
|
** |
|
263
|
** LOOK_BINARY -- The content appears to be binary because it contains one |
|
264
|
** or more embedded NUL characters or an extremely long line. |
|
265
|
** Since this function does not understand UTF-8, it may |
|
266
|
** falsely consider UTF-8 text to be binary. |
|
267
|
** |
|
268
|
** Additional flags (i.e. those other than the ones included in LOOK_BINARY) |
|
269
|
** may be present in the result as well; however, they should not impact the |
|
270
|
** determination of text versus binary content. |
|
271
|
** |
|
272
|
************************************ WARNING ********************************** |
|
273
|
** |
|
274
|
** This function does not validate that the blob content is properly formed |
|
275
|
** UTF-16. It assumes that all code points are the same size. It does not |
|
276
|
** validate any code points. It makes no attempt to detect if any [invalid] |
|
277
|
** switches between the UTF-16be and UTF-16le encodings occur. |
|
278
|
** |
|
279
|
** The only code points that this function cares about are the NUL character, |
|
280
|
** carriage-return, and line-feed. |
|
281
|
** |
|
282
|
** This function examines the contents of the blob until one of the flags |
|
283
|
** specified in "stopFlags" is set. |
|
284
|
** |
|
285
|
************************************ WARNING ********************************** |
|
286
|
*/ |
|
287
|
int looks_like_utf16(const Blob *pContent, int bReverse, int stopFlags){ |
|
288
|
const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent); |
|
289
|
unsigned int n = blob_size(pContent); |
|
290
|
int j, c, flags = LOOK_NONE; /* Assume UTF-16 text, prove otherwise */ |
|
291
|
|
|
292
|
if( n%sizeof(WCHAR_T) ){ |
|
293
|
flags |= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */ |
|
294
|
} |
|
295
|
if( n<sizeof(WCHAR_T) ) return flags;/* Zero or One byte -> binary (UTF-8?) */ |
|
296
|
c = *z; |
|
297
|
if( bReverse ){ |
|
298
|
c = UTF16_SWAP(c); |
|
299
|
} |
|
300
|
if( c==0 ){ |
|
301
|
flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
|
302
|
}else if( c=='\r' ){ |
|
303
|
flags |= LOOK_CR; |
|
304
|
if( n<(2*sizeof(WCHAR_T)) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ |
|
305
|
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */ |
|
306
|
} |
|
307
|
} |
|
308
|
j = (c!='\n'); |
|
309
|
if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */ |
|
310
|
while( !(flags&stopFlags) && ((n-=sizeof(WCHAR_T))>=sizeof(WCHAR_T)) ){ |
|
311
|
int c2 = c; |
|
312
|
c = *++z; |
|
313
|
if( bReverse ){ |
|
314
|
c = UTF16_SWAP(c); |
|
315
|
} |
|
316
|
++j; |
|
317
|
if( c==0 ){ |
|
318
|
flags |= LOOK_NUL; /* NUL character in a file -> binary */ |
|
319
|
}else if( c=='\n' ){ |
|
320
|
flags |= LOOK_LF; |
|
321
|
if( c2=='\r' ){ |
|
322
|
flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */ |
|
323
|
}else{ |
|
324
|
flags |= LOOK_LONE_LF; |
|
325
|
} |
|
326
|
if( j>UTF16_LENGTH_MASK ){ |
|
327
|
flags |= LOOK_LONG; /* Very long line -> binary */ |
|
328
|
} |
|
329
|
j = 0; |
|
330
|
}else if( c=='\r' ){ |
|
331
|
flags |= LOOK_CR; |
|
332
|
if( n<(2*sizeof(WCHAR_T)) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){ |
|
333
|
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */ |
|
334
|
} |
|
335
|
} |
|
336
|
} |
|
337
|
if( n ){ |
|
338
|
flags |= LOOK_SHORT; /* The whole blob was not examined */ |
|
339
|
} |
|
340
|
if( j>UTF16_LENGTH_MASK ){ |
|
341
|
flags |= LOOK_LONG; /* Very long line -> binary */ |
|
342
|
} |
|
343
|
return flags; |
|
344
|
} |
|
345
|
|
|
346
|
/*
** Return a pointer to a static array whose first three bytes are the UTF-8
** byte-order-mark, followed by three zero bytes.  If pnByte is not NULL,
** the length of the mark (3) is written to *pnByte.
*/
const unsigned char *get_utf8_bom(int *pnByte){
  enum { UTF8_BOM_LEN = 3 };       /* Bytes that make up the mark itself */
  static const unsigned char aBom[] = {
    0xef, 0xbb, 0xbf, 0x00, 0x00, 0x00
  };

  if( pnByte!=0 ){
    *pnByte = UTF8_BOM_LEN;
  }
  return aBom;
}
|
357
|
|
|
358
|
/* |
|
359
|
** This function returns non-zero if the blob starts with a UTF-8 |
|
360
|
** byte-order-mark (BOM). |
|
361
|
*/ |
|
362
|
int starts_with_utf8_bom(const Blob *pContent, int *pnByte){ |
|
363
|
const char *z = blob_buffer(pContent); |
|
364
|
int bomSize = 0; |
|
365
|
const unsigned char *bom = get_utf8_bom(&bomSize); |
|
366
|
|
|
367
|
if( pnByte ) *pnByte = bomSize; |
|
368
|
if( (int)blob_size(pContent)<bomSize ) return 0; |
|
369
|
return memcmp(z, bom, bomSize)==0; |
|
370
|
} |
|
371
|
|
|
372
|
/* |
|
373
|
** This function returns non-zero if the blob starts with a UTF-16 |
|
374
|
** byte-order-mark (BOM), either in the endianness of the machine |
|
375
|
** or in reversed byte order. The UTF-32 BOM is ruled out by checking |
|
376
|
** if the UTF-16 BOM is not immediately followed by (utf16) 0. |
|
377
|
** pnByte is only set when the function returns 1. |
|
378
|
** |
|
379
|
** pbReverse is always set, even when no BOM is found. Without a BOM, |
|
380
|
** it is set to 1 on little-endian and 0 on big-endian platforms. See |
|
381
|
** clause D98 of conformance (section 3.10) of the Unicode standard. |
|
382
|
*/ |
|
383
|
int starts_with_utf16_bom( |
|
384
|
const Blob *pContent, /* IN: Blob content to perform BOM detection on. */ |
|
385
|
int *pnByte, /* OUT: The number of bytes used for the BOM. */ |
|
386
|
int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */ |
|
387
|
){ |
|
388
|
const unsigned char *z = (unsigned char *)blob_buffer(pContent); |
|
389
|
int bomSize = sizeof(unsigned short); |
|
390
|
int size = blob_size(pContent); |
|
391
|
unsigned short i0; |
|
392
|
|
|
393
|
if( size<bomSize ) goto noBom; /* No: cannot read BOM. */ |
|
394
|
if( size>=(2*bomSize) && z[2]==0 && z[3]==0 ) goto noBom; |
|
395
|
memcpy(&i0, z, sizeof(i0)); |
|
396
|
if( i0==0xfeff ){ |
|
397
|
if( pbReverse ) *pbReverse = 0; |
|
398
|
}else if( i0==0xfffe ){ |
|
399
|
if( pbReverse ) *pbReverse = 1; |
|
400
|
}else{ |
|
401
|
static const int one = 1; |
|
402
|
noBom: |
|
403
|
if( pbReverse ) *pbReverse = *(char *) &one; |
|
404
|
return 0; /* No: UTF-16 byte-order-mark not found. */ |
|
405
|
} |
|
406
|
if( pnByte ) *pnByte = bomSize; |
|
407
|
return 1; /* Yes. */ |
|
408
|
} |
|
409
|
|
|
410
|
/* |
|
411
|
** Returns non-zero if the specified content could be valid UTF-16. |
|
412
|
*/ |
|
413
|
int could_be_utf16(const Blob *pContent, int *pbReverse){ |
|
414
|
return (blob_size(pContent) % sizeof(WCHAR_T) == 0) ? |
|
415
|
starts_with_utf16_bom(pContent, 0, pbReverse) : 0; |
|
416
|
} |
|
417
|
|
|
418
|
|
|
419
|
/* |
|
420
|
** COMMAND: test-looks-like-utf |
|
421
|
** |
|
422
|
** Usage: %fossil test-looks-like-utf FILENAME |
|
423
|
** |
|
424
|
** Options: |
|
425
|
** -n|--limit N Repeat looks-like function N times, for |
|
426
|
** performance measurement. Default = 1 |
|
427
|
** --utf8 Ignoring BOM and file size, force UTF-8 checking |
|
428
|
** --utf16 Ignoring BOM and file size, force UTF-16 checking |
|
429
|
** -v|--verbose Report the line numbers where each flag is first set |
|
430
|
** |
|
431
|
** FILENAME is the name of a file to check for textual content in the UTF-8 |
|
432
|
** and/or UTF-16 encodings. |
|
433
|
*/ |
|
434
|
void looks_like_utf_test_cmd(void){ |
|
435
|
Blob blob; /* the contents of the specified file */ |
|
436
|
int fUtf8 = 0; /* return value of starts_with_utf8_bom() */ |
|
437
|
int fUtf16 = 0; /* return value of starts_with_utf16_bom() */ |
|
438
|
int fUnicode = 0; /* return value of could_be_utf16() */ |
|
439
|
int lookFlags = 0; /* output flags from looks_like_utf8/utf16() */ |
|
440
|
int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */ |
|
441
|
int fForceUtf8 = find_option("utf8",0,0)!=0; |
|
442
|
int fForceUtf16 = find_option("utf16",0,0)!=0; |
|
443
|
const char *zCount = find_option("limit","n",1); |
|
444
|
int fVerbose = find_option("verbose","v",0)!=0; |
|
445
|
int nRepeat = 1; |
|
446
|
|
|
447
|
if( g.argc!=3 ) usage("FILENAME"); |
|
448
|
if( zCount ){ |
|
449
|
nRepeat = atoi(zCount); |
|
450
|
} |
|
451
|
blob_read_from_file(&blob, g.argv[2], ExtFILE); |
|
452
|
while( --nRepeat >= 0 ){ |
|
453
|
fUtf8 = starts_with_utf8_bom(&blob, 0); |
|
454
|
fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16); |
|
455
|
if( fForceUtf8 ){ |
|
456
|
fUnicode = 0; |
|
457
|
}else{ |
|
458
|
fUnicode = could_be_utf16(&blob, 0) || fForceUtf16; |
|
459
|
} |
|
460
|
if( fUnicode ){ |
|
461
|
lookFlags = looks_like_utf16(&blob, bRevUtf16, 0); |
|
462
|
}else{ |
|
463
|
lookFlags = looks_like_utf8(&blob, 0, fVerbose) | invalid_utf8(&blob); |
|
464
|
} |
|
465
|
} |
|
466
|
fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob)); |
|
467
|
fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no"); |
|
468
|
fossil_print("Starts with UTF-16 BOM: %s\n", |
|
469
|
fUtf16?(bRevUtf16?"reversed":"yes"):"no"); |
|
470
|
fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8", |
|
471
|
(lookFlags&LOOK_BINARY)?"no":"yes"); |
|
472
|
fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no"); |
|
473
|
fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no"); |
|
474
|
fossil_print("Has flag LOOK_LONE_CR: %s\n", |
|
475
|
(lookFlags&LOOK_LONE_CR)?"yes":"no"); |
|
476
|
fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no"); |
|
477
|
fossil_print("Has flag LOOK_LONE_LF: %s\n", |
|
478
|
(lookFlags&LOOK_LONE_LF)?"yes":"no"); |
|
479
|
fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no"); |
|
480
|
fossil_print("Has flag LOOK_LONG: %s\n",(lookFlags&LOOK_LONG)?"yes":"no"); |
|
481
|
fossil_print("Has flag LOOK_INVALID: %s\n", |
|
482
|
(lookFlags&LOOK_INVALID)?"yes":"no"); |
|
483
|
fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no"); |
|
484
|
fossil_print("Has flag LOOK_SHORT: %s\n",(lookFlags&LOOK_SHORT)?"yes":"no"); |
|
485
|
blob_reset(&blob); |
|
486
|
} |
|
487
|
|
|
488
|
/*
** Return true if the n-character keyword zWord occurs in z at offset i,
** compared without regard to case, in a context that might be an attempted
** SQL injection.  All of the following must hold:
**
**    *  i is not 0, so the keyword has a character in front of it;
**    *  the character in front is neither alphanumeric nor one of "-)_";
**    *  the character behind is neither alphanumeric nor one of "(_";
**    *  the keyword is not the very last thing in z (a NUL right behind
**       the keyword rejects the match, as does a NUL right in front).
*/
static int isWholeWord(const char *z, unsigned int i, const char *zWord, int n){
  char cBefore;                    /* Character just ahead of the keyword */
  char cAfter;                     /* Character just past the keyword */

  if( i==0 ) return 0;
  if( sqlite3_strnicmp(z+i, zWord, n)!=0 ) return 0;
  cBefore = z[i-1];
  cAfter = z[i+n];
  if( fossil_isalnum(cBefore) || fossil_isalnum(cAfter) ){
    return 0;                      /* Part of a longer word or number */
  }
  if( cBefore==0 || cBefore=='-' || cBefore==')' || cBefore=='_' ){
    return 0;
  }
  if( cAfter==0 || cAfter=='(' || cAfter=='_' ){
    return 0;
  }
  return 1;
}
|
501
|
|
|
502
|
/*
** Returns true if the given text contains certain keywords or
** punctuation which indicate that it might be an SQL injection attempt
** or Cross-site scripting attempt or some other kind of mischief.
**
** This is not a primary defense against vulnerabilities in the Fossil
** code.  Rather, this is part of an effort to do early detection of malicious
** spiders to avoid them using up too many CPU cycles.  Or, this routine
** can also be thought of as a secondary layer of defense against attacks.
**
** Details:
**
**    *  A NULL zTxt is never an attack.
**    *  Any of the characters '<', ';' or '\'' makes the routine return 1
**       at once, without consulting the exemption described below.
**    *  The path fragments "/wp-content/plugins/" and
**       "/wp-admin/admin-ajax", and the keywords "and", "null", "or",
**       "select", "waitfor" and "order" (the latter only when followed by
**       whitespace) are suspicious.  Keywords count only when isWholeWord()
**       accepts their context.
**    *  A suspicious text is still reported as harmless when PATH_INFO
**       matches "/doc/<anything>/test/markdown-test3.md".
*/
int looks_like_attack(const char *zTxt){
  unsigned int i;
  int rc = 0;
  if( zTxt==0 ) return 0;
  for(i=0; zTxt[i]; i++){
    switch( zTxt[i] ){
      case '<':
      case ';':
      case '\'':
        return 1;
      case '/':
        /* Both patterns begin with the '/' that selected this case, so the
        ** comparison has to start at zTxt+i.  Starting one character later
        ** would only ever match text with a doubled slash. */
        /*                    0123456789 123456789 */
        if( strncmp(zTxt+i, "/wp-content/plugins/", 20)==0 ) rc = 1;
        if( strncmp(zTxt+i, "/wp-admin/admin-ajax", 20)==0 ) rc = 1;
        break;
      case 'a':
      case 'A':
        if( isWholeWord(zTxt, i, "and", 3) ) rc = 1;
        break;
      case 'n':
      case 'N':
        if( isWholeWord(zTxt, i, "null", 4) ) rc = 1;
        break;
      case 'o':
      case 'O':
        if( isWholeWord(zTxt, i, "order", 5) && fossil_isspace(zTxt[i+5]) ){
          rc = 1;
        }
        if( isWholeWord(zTxt, i, "or", 2) ) rc = 1;
        break;
      case 's':
      case 'S':
        if( isWholeWord(zTxt, i, "select", 6) ) rc = 1;
        break;
      case 'w':
      case 'W':
        if( isWholeWord(zTxt, i, "waitfor", 7) ) rc = 1;
        break;
      default:
        break;
    }
  }
  if( rc ){
    /* The test/markdown-test3.md document which is part of the Fossil source
    ** tree intentionally tries to fake an attack.  Do not report such
    ** errors. */
    const char *zPathInfo = P("PATH_INFO");
    /* NOTE(review): P() presumably yields NULL when PATH_INFO is not set
    ** (for instance when run from the command line); the guard keeps a NULL
    ** subject away from the glob matcher and leaves rc unchanged then. */
    if( zPathInfo!=0
     && sqlite3_strglob("/doc/*/test/markdown-test3.md", zPathInfo)==0
    ){
      rc = 0;
    }
  }
  return rc;
}
|
562
|
|
|
563
|
/*
** This is a utility routine associated with the test-looks-like-attack
** command.
**
** Read input from zInFile, one line at a time, and print only those lines
** for which looks_like_attack() returns true.  A NULL zInFile or the name
** "-" means standard input.
**
** If bInvert is true, show the opposite - those lines that do NOT look
** like an attack.
**
** If bDeHttpize is true, each line is passed through dehttpize() before it
** is examined (and printed); otherwise the line is examined as read.
**
** Lines are read into a fixed buffer of 10000 bytes, so a longer input
** line is examined and printed in several pieces.
*/
static void show_attack_lines(
  const char *zInFile,       /* Name of input file */
  int bInvert,               /* Invert the sense of the output (-v) */
  int bDeHttpize             /* De-httpize the inputs.  (-d) */
){
  FILE *in;
  char zLine[10000];
  int bWantSafe = (bInvert!=0);   /* Normalized so the XOR below is 0 or 1 */

  if( zInFile==0 || strcmp(zInFile,"-")==0 ){
    in = stdin;
  }else{
    in = fopen(zInFile, "rb");
    if( in==0 ){
      /* NOTE(review): fossil_fatal() is assumed not to return. */
      fossil_fatal("cannot open \"%s\" for reading\n", zInFile);
    }
  }
  while( fgets(zLine, sizeof(zLine), in) ){
    if( bDeHttpize ){
      dehttpize(zLine);           /* Decode only when the caller asked */
    }
    if( (looks_like_attack(zLine)!=0) ^ bWantSafe ){
      fossil_print("%s", zLine);
    }
  }
  if( in!=stdin ) fclose(in);
}
|
596
|
|
|
597
|
/* |
|
598
|
** COMMAND: test-looks-like-attack |
|
599
|
** |
|
600
|
** Read lines of input from files named as arguments (or from standard |
|
601
|
** input if no arguments are provided) and print those that look like they |
|
602
|
** might be part of an SQL injection attack. |
|
603
|
** |
|
604
|
** Used to test the looks_lile_attack() utility subroutine, possibly |
|
605
|
** by piping in actual server log data. |
|
606
|
*/ |
|
607
|
void test_looks_like_attack(void){ |
|
608
|
int i; |
|
609
|
int bInvert = find_option("invert","v",0)!=0; |
|
610
|
int bDeHttpize = find_option("dehttpize","d",0)!=0; |
|
611
|
verify_all_options(); |
|
612
|
if( g.argc==2 ){ |
|
613
|
show_attack_lines(0, bInvert, bDeHttpize); |
|
614
|
} |
|
615
|
for(i=2; i<g.argc; i++){ |
|
616
|
show_attack_lines(g.argv[i], bInvert, bDeHttpize); |
|
617
|
} |
|
618
|
} |
|
619
|
|