Fossil SCM

fossil-scm / src / lookslike.c
Blame History Raw 619 lines
1
/*
** Copyright (c) 2013 D. Richard Hipp
**
** This program is free software; you can redistribute it and/or
** modify it under the terms of the Simplified BSD License (also
** known as the "2-Clause License" or "FreeBSD License".)
**
** This program is distributed in the hope that it will be useful,
** but without any warranty; without even the implied warranty of
** merchantability or fitness for a particular purpose.
**
** Author contact information:
**   drh@hwaci.com
**   http://www.hwaci.com/drh/
**
*******************************************************************************
**
** This file contains code used to try to guess if a particular file is
** text or binary, what types of line endings it uses, is it UTF8 or
** UTF16, etc.
*/
22
#include "config.h"
23
#include "lookslike.h"
24
#include <assert.h>
25
26
27
#if INTERFACE

/*
** looks_like_binary(blob) evaluates to non-zero when a scan of the blob by
** looks_like_utf8() raises any of the LOOK_BINARY flags, meaning the content
** MAY be binary in nature.  Otherwise it evaluates to zero.  The scan is
** told to stop as soon as a LOOK_BINARY flag is raised and runs with
** verbose output turned off.
*/
#define looks_like_binary(blob) \
    ((looks_like_utf8((blob), LOOK_BINARY, 0) & LOOK_BINARY) != LOOK_NONE)

/*
** Result flags of the looks_like_utf8() and looks_like_utf16() routines.
** Each flag reports one property of the scanned blob content.  LOOK_INVALID
** is also the non-zero return value of invalid_utf8().
*/
#define LOOK_NONE    ((int)0x00000000) /* Nothing special was found. */
#define LOOK_NUL     ((int)0x00000001) /* One or more NUL chars were found. */
#define LOOK_CR      ((int)0x00000002) /* One or more CR chars were found. */
#define LOOK_LONE_CR ((int)0x00000004) /* An unpaired CR char was found. */
#define LOOK_LF      ((int)0x00000008) /* One or more LF chars were found. */
#define LOOK_LONE_LF ((int)0x00000010) /* An unpaired LF char was found. */
#define LOOK_CRLF    ((int)0x00000020) /* One or more CR/LF pairs were found. */
#define LOOK_LONG    ((int)0x00000040) /* An over length line was found. */
#define LOOK_ODD     ((int)0x00000080) /* An odd number of bytes was found. */
#define LOOK_SHORT   ((int)0x00000100) /* Unable to perform full check. */
#define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */

/* Composite masks built from the single-bit flags above. */
#define LOOK_BINARY  (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
#define LOOK_EOL     (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */

#endif /* INTERFACE */
54
55
/*
** Valid ranges for the byte that follows a UTF-8 lead byte.  Each definition
** expands to two values: the first valid value of the following byte and
** the number of valid values starting there.  A size of zero means that no
** following byte is acceptable.
*/
#define US2A  0x80, 0x01   /* for lead byte 0xC0 */
#define US2B  0x80, 0x40   /* for lead bytes 0xC2-0xDF */
#define US3A  0xA0, 0x20   /* for lead byte 0xE0 */
#define US3B  0x80, 0x40   /* for lead bytes 0xE1-0xEF */
#define US4A  0x90, 0x30   /* for lead byte 0xF0 */
#define US4B  0x80, 0x40   /* for lead bytes 0xF1-0xF3 */
#define US4C  0x80, 0x10   /* for lead byte 0xF4 */
#define US0A  0x00, 0x00   /* for any other lead byte */

/* Repeat one range definition for eight consecutive lead bytes. */
#define LB_ROW8(r)  r, r, r, r, r, r, r, r

/*
** Lookup table that maps a lead byte in the range 0x80..0xFF to its range
** definition.  Every lead byte owns two consecutive table bytes, so the
** entry for lead byte B starts at index 2*(B-0x80).  This is the same value
** as B shifted left by one bit and truncated to eight bits.
*/
static const unsigned char lb_tab[] = {
  LB_ROW8(US0A),                                   /* 0x80-0x87 */
  LB_ROW8(US0A),                                   /* 0x88-0x8F */
  LB_ROW8(US0A),                                   /* 0x90-0x97 */
  LB_ROW8(US0A),                                   /* 0x98-0x9F */
  LB_ROW8(US0A),                                   /* 0xA0-0xA7 */
  LB_ROW8(US0A),                                   /* 0xA8-0xAF */
  LB_ROW8(US0A),                                   /* 0xB0-0xB7 */
  LB_ROW8(US0A),                                   /* 0xB8-0xBF */
  US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,  /* 0xC0-0xC7 */
  LB_ROW8(US2B),                                   /* 0xC8-0xCF */
  LB_ROW8(US2B),                                   /* 0xD0-0xD7 */
  LB_ROW8(US2B),                                   /* 0xD8-0xDF */
  US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,  /* 0xE0-0xE7 */
  LB_ROW8(US3B),                                   /* 0xE8-0xEF */
  US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,  /* 0xF0-0xF7 */
  LB_ROW8(US0A)                                    /* 0xF8-0xFF */
};
86
87
/*
88
** This function attempts to scan each logical line within the blob to
89
** determine the type of content it appears to contain. The return value
90
** is a combination of one or more of the LOOK_XXX flags (see above):
91
**
92
** !LOOK_BINARY -- The content appears to consist entirely of text; however,
93
** the encoding may not be UTF-8.
94
**
95
** LOOK_BINARY -- The content appears to be binary because it contains one
96
** or more embedded NUL characters or an extremely long line.
97
** Since this function does not understand UTF-16, it may
98
** falsely consider UTF-16 text to be binary.
99
**
100
** Additional flags (i.e. those other than the ones included in LOOK_BINARY)
101
** may be present in the result as well; however, they should not impact the
102
** determination of text versus binary content.
103
**
104
************************************ WARNING **********************************
105
**
106
** This function does not validate that the blob content is properly formed
107
** UTF-8. It assumes that all code points are the same size. It does not
108
** validate any code points. It makes no attempt to detect if any [invalid]
109
** switches between UTF-8 and other encodings occur.
110
**
111
** The only code points that this function cares about are the NUL character,
112
** carriage-return, and line-feed.
113
**
114
** This function examines the contents of the blob until one of the flags
115
** specified in "stopFlags" is set.
116
**
117
************************************ WARNING **********************************
118
*/
119
int looks_like_utf8(const Blob *pContent, int stopFlags, int fVerbose){
120
const char *z = blob_buffer(pContent);
121
unsigned int n = blob_size(pContent);
122
int j, c, flags = LOOK_NONE; /* Assume UTF-8 text, prove otherwise */
123
int nLine = 1;
124
125
if( n==0 ) return flags; /* Empty file -> text */
126
c = *z;
127
if( c==0 ){
128
flags |= LOOK_NUL; /* NUL character in a file -> binary */
129
if( fVerbose ) fossil_print("NUL at start\n");
130
}else if( c=='\r' ){
131
flags |= LOOK_CR;
132
if( fVerbose ) fossil_print("CR at start\n");
133
if( n<=1 || z[1]!='\n' ){
134
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
135
if( fVerbose ) fossil_print("Lone CR at start\n");
136
}
137
}
138
j = (c!='\n');
139
if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */
140
while( !(flags&stopFlags) && --n>0 ){
141
int c2 = c;
142
c = *++z; ++j;
143
if( c==0 ){
144
if( fVerbose && !(flags&LOOK_NUL) ){
145
fossil_print("NUL on line %d\n", nLine);
146
}
147
flags |= LOOK_NUL; /* NUL character in a file -> binary */
148
}else if( c=='\n' ){
149
flags |= LOOK_LF;
150
if( c2=='\r' ){
151
if( fVerbose && !(flags&LOOK_CRLF) ){
152
fossil_print("CRLF on line %d\n", nLine);
153
}
154
flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
155
}else{
156
if( fVerbose && !(flags&LOOK_LONE_LF) ){
157
fossil_print("Lone LF on line %d\n", nLine);
158
}
159
flags |= LOOK_LONE_LF;
160
}
161
if( j>LENGTH_MASK ){
162
if( fVerbose && !(flags&LOOK_LONG) ){
163
fossil_print("Line %d is longer than %d bytes\n", nLine, j);
164
}
165
flags |= LOOK_LONG; /* Very long line -> binary */
166
}
167
++nLine;
168
j = 0;
169
}else if( c=='\r' ){
170
flags |= LOOK_CR;
171
if( n<=1 || z[1]!='\n' ){
172
if( fVerbose && !(flags&LOOK_LONE_CR) ){
173
fossil_print("Lone CR on line %d\n", nLine);
174
}
175
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
176
}
177
}
178
}
179
if( n ){
180
flags |= LOOK_SHORT; /* The whole blob was not examined */
181
}
182
if( j>LENGTH_MASK ){
183
flags |= LOOK_LONG; /* Very long line -> binary */
184
}
185
return flags;
186
}
187
188
/*
189
** Checks for proper UTF-8. It uses the method described in:
190
** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
191
** except for the "overlong form" of \u0000 which is not considered
192
** invalid here: Some languages like Java and Tcl use it. This function
193
** also considers valid the derivatives CESU-8 & WTF-8 (as described in
194
** the same wikipedia article referenced previously). For UTF-8 characters
195
** > 0x7f, the variable 'c' not necessary means the real lead byte.
196
** It's number of higher 1-bits indicate the number of continuation
197
** bytes that are expected to be followed. E.g. when 'c' has a value
198
** in the range 0xc0..0xdf it means that after 'c' a single continuation
199
** byte is expected. A value 0xe0..0xef means that after 'c' two more
200
** continuation bytes are expected.
201
*/
202
203
int invalid_utf8(
204
const Blob *pContent
205
){
206
const unsigned char *z = (unsigned char *) blob_buffer(pContent);
207
unsigned int n = blob_size(pContent);
208
unsigned char c; /* lead byte to be handled. */
209
210
if( n==0 ) return 0; /* Empty file -> OK */
211
c = *z;
212
while( --n>0 ){
213
if( c>=0x80 ){
214
const unsigned char *def; /* pointer to range table*/
215
216
c <<= 1; /* multiply by 2 and get rid of highest bit */
217
def = &lb_tab[c]; /* search fb's valid range in table */
218
if( (unsigned int)(*++z-def[0])>=def[1] ){
219
return LOOK_INVALID; /* Invalid UTF-8 */
220
}
221
c = (c>=0xC0) ? (c|3) : ' '; /* determine next lead byte */
222
} else {
223
c = *++z;
224
}
225
}
226
return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */
227
}
228
229
/*
** WCHAR_T is the type used to represent one UTF-16 code unit.  A definition
** supplied from outside takes precedence.  Otherwise wchar_t is used on
** Windows and unsigned short everywhere else.
*/
#if !defined(WCHAR_T)
#  if defined(_WIN32)
#    define WCHAR_T wchar_t
#  else
#    define WCHAR_T unsigned short
#  endif
#endif
239
240
/*
** Maximum length of a line in a text file, in UTF-16 characters.
**
** The number of bytes represented by this value cannot exceed LENGTH_MASK
** bytes, because that is the line buffer size used by the diff engine.  For
** that reason the bit count LENGTH_MASK_SZ is reduced by the number of
** bytes by which a WCHAR_T is wider than a char.
*/
#define UTF16_LENGTH_MASK_SZ  (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
#define UTF16_LENGTH_MASK     ((1<<UTF16_LENGTH_MASK_SZ)-1)
247
248
/*
** UTF16_SWAP(ch) exchanges the two low-order bytes of a UTF-16 character
** and clears all higher bits.  UTF16_SWAP_IF(expr,ch) does the same when
** expr is true and yields ch unchanged otherwise.  Both are used by the
** looks_like_utf16() function.  Note that ch is evaluated more than once.
*/
#define UTF16_SWAP(ch)          ((((ch) & 0xff) << 8) | (((ch) >> 8) & 0xff))
#define UTF16_SWAP_IF(expr,ch)  ((expr) ? UTF16_SWAP((ch)) : (ch))
254
255
/*
256
** This function attempts to scan each logical line within the blob to
257
** determine the type of content it appears to contain. The return value
258
** is a combination of one or more of the LOOK_XXX flags (see above):
259
**
260
** !LOOK_BINARY -- The content appears to consist entirely of text; however,
261
** the encoding may not be UTF-16.
262
**
263
** LOOK_BINARY -- The content appears to be binary because it contains one
264
** or more embedded NUL characters or an extremely long line.
265
** Since this function does not understand UTF-8, it may
266
** falsely consider UTF-8 text to be binary.
267
**
268
** Additional flags (i.e. those other than the ones included in LOOK_BINARY)
269
** may be present in the result as well; however, they should not impact the
270
** determination of text versus binary content.
271
**
272
************************************ WARNING **********************************
273
**
274
** This function does not validate that the blob content is properly formed
275
** UTF-16. It assumes that all code points are the same size. It does not
276
** validate any code points. It makes no attempt to detect if any [invalid]
277
** switches between the UTF-16be and UTF-16le encodings occur.
278
**
279
** The only code points that this function cares about are the NUL character,
280
** carriage-return, and line-feed.
281
**
282
** This function examines the contents of the blob until one of the flags
283
** specified in "stopFlags" is set.
284
**
285
************************************ WARNING **********************************
286
*/
287
int looks_like_utf16(const Blob *pContent, int bReverse, int stopFlags){
288
const WCHAR_T *z = (WCHAR_T *)blob_buffer(pContent);
289
unsigned int n = blob_size(pContent);
290
int j, c, flags = LOOK_NONE; /* Assume UTF-16 text, prove otherwise */
291
292
if( n%sizeof(WCHAR_T) ){
293
flags |= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */
294
}
295
if( n<sizeof(WCHAR_T) ) return flags;/* Zero or One byte -> binary (UTF-8?) */
296
c = *z;
297
if( bReverse ){
298
c = UTF16_SWAP(c);
299
}
300
if( c==0 ){
301
flags |= LOOK_NUL; /* NUL character in a file -> binary */
302
}else if( c=='\r' ){
303
flags |= LOOK_CR;
304
if( n<(2*sizeof(WCHAR_T)) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
305
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
306
}
307
}
308
j = (c!='\n');
309
if( !j ) flags |= (LOOK_LF | LOOK_LONE_LF); /* Found LF as first char */
310
while( !(flags&stopFlags) && ((n-=sizeof(WCHAR_T))>=sizeof(WCHAR_T)) ){
311
int c2 = c;
312
c = *++z;
313
if( bReverse ){
314
c = UTF16_SWAP(c);
315
}
316
++j;
317
if( c==0 ){
318
flags |= LOOK_NUL; /* NUL character in a file -> binary */
319
}else if( c=='\n' ){
320
flags |= LOOK_LF;
321
if( c2=='\r' ){
322
flags |= (LOOK_CR | LOOK_CRLF); /* Found LF preceded by CR */
323
}else{
324
flags |= LOOK_LONE_LF;
325
}
326
if( j>UTF16_LENGTH_MASK ){
327
flags |= LOOK_LONG; /* Very long line -> binary */
328
}
329
j = 0;
330
}else if( c=='\r' ){
331
flags |= LOOK_CR;
332
if( n<(2*sizeof(WCHAR_T)) || UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
333
flags |= LOOK_LONE_CR; /* Not enough chars or next char not LF */
334
}
335
}
336
}
337
if( n ){
338
flags |= LOOK_SHORT; /* The whole blob was not examined */
339
}
340
if( j>UTF16_LENGTH_MASK ){
341
flags |= LOOK_LONG; /* Very long line -> binary */
342
}
343
return flags;
344
}
345
346
/*
** Return a pointer to a static array of bytes holding the byte-order-mark
** for UTF-8 (0xef 0xbb 0xbf), followed by three zero bytes.  If pnByte is
** not NULL, *pnByte is set to the length of the mark itself, which is 3.
*/
const unsigned char *get_utf8_bom(int *pnByte){
  static const unsigned char aBom[] = { 0xef, 0xbb, 0xbf, 0x00, 0x00, 0x00 };
  if( pnByte!=0 ){
    *pnByte = 3;
  }
  return aBom;
}
357
358
/*
359
** This function returns non-zero if the blob starts with a UTF-8
360
** byte-order-mark (BOM).
361
*/
362
int starts_with_utf8_bom(const Blob *pContent, int *pnByte){
363
const char *z = blob_buffer(pContent);
364
int bomSize = 0;
365
const unsigned char *bom = get_utf8_bom(&bomSize);
366
367
if( pnByte ) *pnByte = bomSize;
368
if( (int)blob_size(pContent)<bomSize ) return 0;
369
return memcmp(z, bom, bomSize)==0;
370
}
371
372
/*
373
** This function returns non-zero if the blob starts with a UTF-16
374
** byte-order-mark (BOM), either in the endianness of the machine
375
** or in reversed byte order. The UTF-32 BOM is ruled out by checking
376
** if the UTF-16 BOM is not immediately followed by (utf16) 0.
377
** pnByte is only set when the function returns 1.
378
**
379
** pbReverse is always set, even when no BOM is found. Without a BOM,
380
** it is set to 1 on little-endian and 0 on big-endian platforms. See
381
** clause D98 of conformance (section 3.10) of the Unicode standard.
382
*/
383
int starts_with_utf16_bom(
384
const Blob *pContent, /* IN: Blob content to perform BOM detection on. */
385
int *pnByte, /* OUT: The number of bytes used for the BOM. */
386
int *pbReverse /* OUT: Non-zero for BOM in reverse byte-order. */
387
){
388
const unsigned char *z = (unsigned char *)blob_buffer(pContent);
389
int bomSize = sizeof(unsigned short);
390
int size = blob_size(pContent);
391
unsigned short i0;
392
393
if( size<bomSize ) goto noBom; /* No: cannot read BOM. */
394
if( size>=(2*bomSize) && z[2]==0 && z[3]==0 ) goto noBom;
395
memcpy(&i0, z, sizeof(i0));
396
if( i0==0xfeff ){
397
if( pbReverse ) *pbReverse = 0;
398
}else if( i0==0xfffe ){
399
if( pbReverse ) *pbReverse = 1;
400
}else{
401
static const int one = 1;
402
noBom:
403
if( pbReverse ) *pbReverse = *(char *) &one;
404
return 0; /* No: UTF-16 byte-order-mark not found. */
405
}
406
if( pnByte ) *pnByte = bomSize;
407
return 1; /* Yes. */
408
}
409
410
/*
411
** Returns non-zero if the specified content could be valid UTF-16.
412
*/
413
int could_be_utf16(const Blob *pContent, int *pbReverse){
414
return (blob_size(pContent) % sizeof(WCHAR_T) == 0) ?
415
starts_with_utf16_bom(pContent, 0, pbReverse) : 0;
416
}
417
418
419
/*
420
** COMMAND: test-looks-like-utf
421
**
422
** Usage: %fossil test-looks-like-utf FILENAME
423
**
424
** Options:
425
** -n|--limit N Repeat looks-like function N times, for
426
** performance measurement. Default = 1
427
** --utf8 Ignoring BOM and file size, force UTF-8 checking
428
** --utf16 Ignoring BOM and file size, force UTF-16 checking
429
** -v|--verbose Report the line numbers where each flag is first set
430
**
431
** FILENAME is the name of a file to check for textual content in the UTF-8
432
** and/or UTF-16 encodings.
433
*/
434
void looks_like_utf_test_cmd(void){
435
Blob blob; /* the contents of the specified file */
436
int fUtf8 = 0; /* return value of starts_with_utf8_bom() */
437
int fUtf16 = 0; /* return value of starts_with_utf16_bom() */
438
int fUnicode = 0; /* return value of could_be_utf16() */
439
int lookFlags = 0; /* output flags from looks_like_utf8/utf16() */
440
int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */
441
int fForceUtf8 = find_option("utf8",0,0)!=0;
442
int fForceUtf16 = find_option("utf16",0,0)!=0;
443
const char *zCount = find_option("limit","n",1);
444
int fVerbose = find_option("verbose","v",0)!=0;
445
int nRepeat = 1;
446
447
if( g.argc!=3 ) usage("FILENAME");
448
if( zCount ){
449
nRepeat = atoi(zCount);
450
}
451
blob_read_from_file(&blob, g.argv[2], ExtFILE);
452
while( --nRepeat >= 0 ){
453
fUtf8 = starts_with_utf8_bom(&blob, 0);
454
fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16);
455
if( fForceUtf8 ){
456
fUnicode = 0;
457
}else{
458
fUnicode = could_be_utf16(&blob, 0) || fForceUtf16;
459
}
460
if( fUnicode ){
461
lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
462
}else{
463
lookFlags = looks_like_utf8(&blob, 0, fVerbose) | invalid_utf8(&blob);
464
}
465
}
466
fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
467
fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
468
fossil_print("Starts with UTF-16 BOM: %s\n",
469
fUtf16?(bRevUtf16?"reversed":"yes"):"no");
470
fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",
471
(lookFlags&LOOK_BINARY)?"no":"yes");
472
fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
473
fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no");
474
fossil_print("Has flag LOOK_LONE_CR: %s\n",
475
(lookFlags&LOOK_LONE_CR)?"yes":"no");
476
fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no");
477
fossil_print("Has flag LOOK_LONE_LF: %s\n",
478
(lookFlags&LOOK_LONE_LF)?"yes":"no");
479
fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no");
480
fossil_print("Has flag LOOK_LONG: %s\n",(lookFlags&LOOK_LONG)?"yes":"no");
481
fossil_print("Has flag LOOK_INVALID: %s\n",
482
(lookFlags&LOOK_INVALID)?"yes":"no");
483
fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no");
484
fossil_print("Has flag LOOK_SHORT: %s\n",(lookFlags&LOOK_SHORT)?"yes":"no");
485
blob_reset(&blob);
486
}
487
488
/*
** Return true if the n-character word zWord occurs (ignoring case) at z[i]
** as a whole word, in a context that might be an attempted SQL injection.
**
** The word does not count when:
**
**    *  it is at the very start of z (i==0), or
**    *  the character before or after it is alphanumeric, or
**    *  the character before it is one of "-", ")" or "_", or
**    *  the character after it is one of "(" or "_".
**
** strchr() also matches the terminating NUL of its search string, so a word
** that is immediately followed by the end of z does not count either.
** NOTE(review): it is not clear from this file whether that last effect is
** intended.
*/
static int isWholeWord(const char *z, unsigned int i, const char *zWord, int n){
  char cBefore;   /* Character in front of the candidate word */
  char cAfter;    /* Character right behind the candidate word */

  if( i==0 ) return 0;
  if( sqlite3_strnicmp(z+i, zWord, n)!=0 ) return 0;
  cBefore = z[i-1];
  cAfter = z[i+n];
  if( fossil_isalnum(cBefore) || fossil_isalnum(cAfter) ) return 0;
  if( strchr("-)_", cBefore)!=0 || strchr("(_", cAfter)!=0 ) return 0;
  return 1;
}
501
502
/*
** Returns true if the given text contains certain keywords or
** punctuation which indicate that it might be an SQL injection attempt
** or Cross-site scripting attempt or some other kind of mischief.
**
** The characters "<", ";" and "'" cause an immediate return of 1.  The
** other triggers are:
**
**    *  a "/" immediately followed by "/wp-content/plugins/" or
**       "/wp-admin/admin-ajax",
**    *  the whole words (see isWholeWord()) "and", "null", "or", "select"
**       and "waitfor",
**    *  the whole word "order" followed by whitespace.
**
** A hit from this second group is discarded when PATH_INFO matches
** "/doc/*" "/test/markdown-test3.md".  A NULL zTxt yields 0.
**
** This is not a primary defense against vulnerabilities in the Fossil
** code.  Rather, this is part of an effort to do early detection of malicious
** spiders to avoid them using up too many CPU cycles.  Or, this routine
** can also be thought of as a secondary layer of defense against attacks.
*/
int looks_like_attack(const char *zTxt){
  unsigned int k;          /* Index of the character being examined */
  int bSuspect = 0;        /* True once a keyword trigger has been seen */
  const char *zPathInfo;   /* Value of the PATH_INFO parameter */

  if( zTxt==0 ) return 0;
  for(k=0; zTxt[k]!=0; k++){
    switch( zTxt[k] ){
      case '<':
      case ';':
      case '\'':
        return 1;
      case '/': {
        const char *zRest = &zTxt[k+1];  /* Text following the slash */
        /*                     0123456789 123456789 */
        if( strncmp(zRest, "/wp-content/plugins/", 20)==0 ) bSuspect = 1;
        if( strncmp(zRest, "/wp-admin/admin-ajax", 20)==0 ) bSuspect = 1;
        break;
      }
      case 'a':
      case 'A':
        if( isWholeWord(zTxt, k, "and", 3) ) bSuspect = 1;
        break;
      case 'n':
      case 'N':
        if( isWholeWord(zTxt, k, "null", 4) ) bSuspect = 1;
        break;
      case 'o':
      case 'O':
        if( isWholeWord(zTxt, k, "order", 5) && fossil_isspace(zTxt[k+5]) ){
          bSuspect = 1;
        }
        if( isWholeWord(zTxt, k, "or", 2) ) bSuspect = 1;
        break;
      case 's':
      case 'S':
        if( isWholeWord(zTxt, k, "select", 6) ) bSuspect = 1;
        break;
      case 'w':
      case 'W':
        if( isWholeWord(zTxt, k, "waitfor", 7) ) bSuspect = 1;
        break;
      default:
        break;
    }
  }
  if( !bSuspect ) return 0;

  /* The test/markdown-test3.md document which is part of the Fossil source
  ** tree intentionally tries to fake an attack.  Do not report such
  ** errors. */
  zPathInfo = P("PATH_INFO");
  if( sqlite3_strglob("/doc/*/test/markdown-test3.md", zPathInfo)==0 ){
    return 0;
  }
  return bSuspect;
}
562
563
/*
** This is a utility routine associated with the test-looks-like-attack
** command.
**
** Read input from zInFile line by line and print only those lines for
** which looks_like_attack() returns true.  If zInFile is NULL or "-",
** standard input is read instead.
**
** Or if bInvert is true, then show the opposite - those lines that do NOT
** look like an attack.
**
** If bDeHttpize is true, every line is passed through dehttpize() before
** it is examined and printed; otherwise lines are examined as read.
**
** Lines are read into a fixed buffer of 10000 bytes, so fgets() delivers a
** longer input line in several pieces that are examined separately.
*/
static void show_attack_lines(
  const char *zInFile,       /* Name of input file */
  int bInvert,               /* Invert the sense of the output (-v) */
  int bDeHttpize             /* De-httpize the inputs.  (-d) */
){
  FILE *in;
  char zLine[10000];

  if( zInFile==0 || strcmp(zInFile,"-")==0 ){
    in = stdin;
  }else{
    in = fopen(zInFile, "rb");
    if( in==0 ){
      /* presumably fossil_fatal() does not return -- verify */
      fossil_fatal("cannot open \"%s\" for reading\n", zInFile);
    }
  }
  while( fgets(zLine, sizeof(zLine), in) ){
    /* Decode only on request.  Previously the -d flag was ignored and
    ** every line was decoded unconditionally. */
    if( bDeHttpize ) dehttpize(zLine);
    /* Normalize both operands to 0/1 so the XOR is a clean logical test
    ** even if a caller passes a bInvert value other than 0 or 1. */
    if( (looks_like_attack(zLine)!=0) ^ (bInvert!=0) ){
      fossil_print("%s", zLine);
    }
  }
  if( in!=stdin ) fclose(in);
}
596
597
/*
598
** COMMAND: test-looks-like-attack
599
**
600
** Read lines of input from files named as arguments (or from standard
601
** input if no arguments are provided) and print those that look like they
602
** might be part of an SQL injection attack.
603
**
604
** Used to test the looks_lile_attack() utility subroutine, possibly
605
** by piping in actual server log data.
606
*/
607
void test_looks_like_attack(void){
608
int i;
609
int bInvert = find_option("invert","v",0)!=0;
610
int bDeHttpize = find_option("dehttpize","d",0)!=0;
611
verify_all_options();
612
if( g.argc==2 ){
613
show_attack_lines(0, bInvert, bDeHttpize);
614
}
615
for(i=2; i<g.argc; i++){
616
show_attack_lines(g.argv[i], bInvert, bDeHttpize);
617
}
618
}
619

Keyboard Shortcuts

Open search /
Next entry (timeline) j
Previous entry (timeline) k
Open focused entry Enter
Show this help ?
Toggle theme Top nav button