Fossil SCM

fossil-scm / src / lookslike.c

Blame History Raw 619 lines

1	`/*`
2	`** Copyright (c) 2013 D. Richard Hipp`
3	`**`
4	`** This program is free software; you can redistribute it and/or`
5	`** modify it under the terms of the Simplified BSD License (also`
6	`** known as the "2-Clause License" or "FreeBSD License".)`
7
8	`** This program is distributed in the hope that it will be useful,`
9	`** but without any warranty; without even the implied warranty of`
10	`** merchantability or fitness for a particular purpose.`
11	`**`
12	`** Author contact information:`
13	`** [email protected]`
14	`** http://www.hwaci.com/drh/`
15	`**`
16	`*******************************************************************************`
17	`**`
18	`** This file contains code used to try to guess if a particular file is`
19	`** text or binary, what types of line endings it uses, is it UTF8 or`
20	`** UTF16, etc.`
21	`*/`
22	`#include "config.h"`
23	`#include "lookslike.h"`
24	`#include <assert.h>`
25
26
27	`#if INTERFACE`
28
/*
** This macro is designed to return non-zero if the specified blob contains
** data that MAY be binary in nature; otherwise, zero will be returned.
**
** LOOK_BINARY is passed as the stop mask, so the scan performed by
** looks_like_utf8() ends as soon as any binary indicator is seen.
*/
#define looks_like_binary(blob) \
    ((looks_like_utf8((blob), LOOK_BINARY, 0) & LOOK_BINARY) != LOOK_NONE)

/*
** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
** to convey status information about the blob content.  Each flag is a
** distinct bit so that results can be OR-ed together and tested with a mask.
*/
#define LOOK_NONE    ((int)0x00000000) /* Nothing special was found. */
#define LOOK_NUL     ((int)0x00000001) /* One or more NUL chars were found. */
#define LOOK_CR      ((int)0x00000002) /* One or more CR chars were found. */
#define LOOK_LONE_CR ((int)0x00000004) /* An unpaired CR char was found. */
#define LOOK_LF      ((int)0x00000008) /* One or more LF chars were found. */
#define LOOK_LONE_LF ((int)0x00000010) /* An unpaired LF char was found. */
#define LOOK_CRLF    ((int)0x00000020) /* One or more CR/LF pairs were found. */
#define LOOK_LONG    ((int)0x00000040) /* An over length line was found. */
#define LOOK_ODD     ((int)0x00000080) /* An odd number of bytes was found. */
#define LOOK_SHORT   ((int)0x00000100) /* Unable to perform full check. */
#define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */
#define LOOK_BINARY  (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
#define LOOK_EOL     (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */
53	`#endif /* INTERFACE */`
54
/* Definitions for the various UTF-8 sequence lengths.  Each definition is a
 * pair: the first value allowed for the byte that follows a lead byte, and
 * the number of consecutive values that are allowed starting from there.
 * A pair of 0x00, 0x00 allows nothing, so the lead byte is always rejected. */
#define US2A  0x80, 0x01 /* for lead byte 0xC0 */
#define US2B  0x80, 0x40 /* for lead bytes 0xC2-0xDF */
#define US3A  0xA0, 0x20 /* for lead byte 0xE0 */
#define US3B  0x80, 0x40 /* for lead bytes 0xE1-0xEF */
#define US4A  0x90, 0x30 /* for lead byte 0xF0 */
#define US4B  0x80, 0x40 /* for lead bytes 0xF1-0xF3 */
#define US4C  0x80, 0x10 /* for lead byte 0xF4 */
#define US0A  0x00, 0x00 /* for any other lead byte */

/* A table used for quick lookup of the definition that goes with a
 * particular lead byte.  invalid_utf8() indexes it with the lead byte
 * shifted left by one bit and truncated to 8 bits, so entry pair k
 * (bytes 2k and 2k+1) belongs to lead byte 0x80+k.  The first 64 pairs
 * therefore cover 0x80-0xBF, which are never valid as lead bytes. */
static const unsigned char lb_tab[] = {
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
};
86
87	`/*`
88	`** This function attempts to scan each logical line within the blob to`
89	`** determine the type of content it appears to contain. The return value`
90	`** is a combination of one or more of the LOOK_XXX flags (see above):`
91	`**`
92	`** !LOOK_BINARY -- The content appears to consist entirely of text; however,`
93	`** the encoding may not be UTF-8.`
94	`**`
95	`** LOOK_BINARY -- The content appears to be binary because it contains one`
96	`** or more embedded NUL characters or an extremely long line.`
97	`** Since this function does not understand UTF-16, it may`
98	`** falsely consider UTF-16 text to be binary.`
99	`**`
100	`** Additional flags (i.e. those other than the ones included in LOOK_BINARY)`
101	`** may be present in the result as well; however, they should not impact the`
102	`** determination of text versus binary content.`
103	`**`
104	`********************************** WARNING ********************************`
105	`**`
106	`** This function does not validate that the blob content is properly formed`
107	`** UTF-8. It assumes that all code points are the same size. It does not`
108	`** validate any code points. It makes no attempt to detect if any [invalid]`
109	`** switches between UTF-8 and other encodings occur.`
110	`**`
111	`** The only code points that this function cares about are the NUL character,`
112	`** carriage-return, and line-feed.`
113	`**`
114	`** This function examines the contents of the blob until one of the flags`
115	`** specified in "stopFlags" is set.`
116	`**`
117	`********************************** WARNING ********************************`
118	`*/`
119	`int looks_like_utf8(const Blob *pContent, int stopFlags, int fVerbose){`
120	`const char *z = blob_buffer(pContent);`
121	`unsigned int n = blob_size(pContent);`
122	`int j, c, flags = LOOK_NONE; /* Assume UTF-8 text, prove otherwise */`
123	`int nLine = 1;`
124
125	`if( n==0 ) return flags; /* Empty file -> text */`
126	`c = *z;`
127	`if( c==0 ){`
128	`flags \|= LOOK_NUL; /* NUL character in a file -> binary */`
129	`if( fVerbose ) fossil_print("NUL at start\n");`
130	`}else if( c=='\r' ){`
131	`flags \|= LOOK_CR;`
132	`if( fVerbose ) fossil_print("CR at start\n");`
133	`if( n<=1 \|\| z[1]!='\n' ){`
134	`flags \|= LOOK_LONE_CR; /* Not enough chars or next char not LF */`
135	`if( fVerbose ) fossil_print("Lone CR at start\n");`
136	`}`
137	`}`
138	`j = (c!='\n');`
139	`if( !j ) flags \|= (LOOK_LF \| LOOK_LONE_LF); /* Found LF as first char */`
140	`while( !(flags&stopFlags) && --n>0 ){`
141	`int c2 = c;`
142	`c = *++z; ++j;`
143	`if( c==0 ){`
144	`if( fVerbose && !(flags&LOOK_NUL) ){`
145	`fossil_print("NUL on line %d\n", nLine);`
146	`}`
147	`flags \|= LOOK_NUL; /* NUL character in a file -> binary */`
148	`}else if( c=='\n' ){`
149	`flags \|= LOOK_LF;`
150	`if( c2=='\r' ){`
151	`if( fVerbose && !(flags&LOOK_CRLF) ){`
152	`fossil_print("CRLF on line %d\n", nLine);`
153	`}`
154	`flags \|= (LOOK_CR \| LOOK_CRLF); /* Found LF preceded by CR */`
155	`}else{`
156	`if( fVerbose && !(flags&LOOK_LONE_LF) ){`
157	`fossil_print("Lone LF on line %d\n", nLine);`
158	`}`
159	`flags \|= LOOK_LONE_LF;`
160	`}`
161	`if( j>LENGTH_MASK ){`
162	`if( fVerbose && !(flags&LOOK_LONG) ){`
163	`fossil_print("Line %d is longer than %d bytes\n", nLine, j);`
164	`}`
165	`flags \|= LOOK_LONG; /* Very long line -> binary */`
166	`}`
167	`++nLine;`
168	`j = 0;`
169	`}else if( c=='\r' ){`
170	`flags \|= LOOK_CR;`
171	`if( n<=1 \|\| z[1]!='\n' ){`
172	`if( fVerbose && !(flags&LOOK_LONE_CR) ){`
173	`fossil_print("Lone CR on line %d\n", nLine);`
174	`}`
175	`flags \|= LOOK_LONE_CR; /* Not enough chars or next char not LF */`
176	`}`
177	`}`
178	`}`
179	`if( n ){`
180	`flags \|= LOOK_SHORT; /* The whole blob was not examined */`
181	`}`
182	`if( j>LENGTH_MASK ){`
183	`flags \|= LOOK_LONG; /* Very long line -> binary */`
184	`}`
185	`return flags;`
186	`}`
187
188	`/*`
189	`** Checks for proper UTF-8. It uses the method described in:`
190	`** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences`
191	`** except for the "overlong form" of \u0000 which is not considered`
192	`** invalid here: Some languages like Java and Tcl use it. This function`
193	`** also considers valid the derivatives CESU-8 & WTF-8 (as described in`
194	`** the same wikipedia article referenced previously). For UTF-8 characters`
195	`** > 0x7f, the variable 'c' not necessary means the real lead byte.`
196	`** It's number of higher 1-bits indicate the number of continuation`
197	`** bytes that are expected to be followed. E.g. when 'c' has a value`
198	`** in the range 0xc0..0xdf it means that after 'c' a single continuation`
199	`** byte is expected. A value 0xe0..0xef means that after 'c' two more`
200	`** continuation bytes are expected.`
201	`*/`
202
203	`int invalid_utf8(`
204	`const Blob *pContent`
205	`){`
206	`const unsigned char z = (unsigned char ) blob_buffer(pContent);`
207	`unsigned int n = blob_size(pContent);`
208	`unsigned char c; /* lead byte to be handled. */`
209
210	`if( n==0 ) return 0; /* Empty file -> OK */`
211	`c = *z;`
212	`while( --n>0 ){`
213	`if( c>=0x80 ){`
214	`const unsigned char def; / pointer to range table*/`
215
216	`c <<= 1; /* multiply by 2 and get rid of highest bit */`
217	`def = &lb_tab[c]; /* search fb's valid range in table */`
218	`if( (unsigned int)(*++z-def[0])>=def[1] ){`
219	`return LOOK_INVALID; /* Invalid UTF-8 */`
220	`}`
221	`c = (c>=0xC0) ? (c\|3) : ' '; /* determine next lead byte */`
222	`} else {`
223	`c = *++z;`
224	`}`
225	`}`
226	`return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */`
227	`}`
228
/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32
#    define WCHAR_T wchar_t
#  else
#    define WCHAR_T unsigned short
#  endif
#endif

/*
** Maximum length of a line in a text file, in UTF-16 characters.  (4096)
** The number of bytes represented by this value cannot exceed LENGTH_MASK
** bytes, because that is the line buffer size used by the diff engine.
**
** The bit count is reduced by the size difference between WCHAR_T and char.
*/
#define UTF16_LENGTH_MASK_SZ   (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
#define UTF16_LENGTH_MASK      ((1<<UTF16_LENGTH_MASK_SZ)-1)

/*
** This macro is used to swap the byte order of a UTF-16 character in the
** looks_like_utf16() function.  UTF16_SWAP_IF() swaps only when "expr" is
** non-zero.  Both macros mention "ch" more than once in their expansion,
** so the argument should be free of side effects.
*/
#define UTF16_SWAP(ch)         ((((ch) << 8) & 0xff00) | (((ch) >> 8) & 0xff))
#define UTF16_SWAP_IF(expr,ch) ((expr) ? UTF16_SWAP((ch)) : (ch))
254
255	`/*`
256	`** This function attempts to scan each logical line within the blob to`
257	`** determine the type of content it appears to contain. The return value`
258	`** is a combination of one or more of the LOOK_XXX flags (see above):`
259	`**`
260	`** !LOOK_BINARY -- The content appears to consist entirely of text; however,`
261	`** the encoding may not be UTF-16.`
262	`**`
263	`** LOOK_BINARY -- The content appears to be binary because it contains one`
264	`** or more embedded NUL characters or an extremely long line.`
265	`** Since this function does not understand UTF-8, it may`
266	`** falsely consider UTF-8 text to be binary.`
267	`**`
268	`** Additional flags (i.e. those other than the ones included in LOOK_BINARY)`
269	`** may be present in the result as well; however, they should not impact the`
270	`** determination of text versus binary content.`
271	`**`
272	`********************************** WARNING ********************************`
273	`**`
274	`** This function does not validate that the blob content is properly formed`
275	`** UTF-16. It assumes that all code points are the same size. It does not`
276	`** validate any code points. It makes no attempt to detect if any [invalid]`
277	`** switches between the UTF-16be and UTF-16le encodings occur.`
278	`**`
279	`** The only code points that this function cares about are the NUL character,`
280	`** carriage-return, and line-feed.`
281	`**`
282	`** This function examines the contents of the blob until one of the flags`
283	`** specified in "stopFlags" is set.`
284	`**`
285	`********************************** WARNING ********************************`
286	`*/`
287	`int looks_like_utf16(const Blob *pContent, int bReverse, int stopFlags){`
288	`const WCHAR_T z = (WCHAR_T )blob_buffer(pContent);`
289	`unsigned int n = blob_size(pContent);`
290	`int j, c, flags = LOOK_NONE; /* Assume UTF-16 text, prove otherwise */`
291
292	`if( n%sizeof(WCHAR_T) ){`
293	`flags \|= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */`
294	`}`
295	`if( n<sizeof(WCHAR_T) ) return flags;/* Zero or One byte -> binary (UTF-8?) */`
296	`c = *z;`
297	`if( bReverse ){`
298	`c = UTF16_SWAP(c);`
299	`}`
300	`if( c==0 ){`
301	`flags \|= LOOK_NUL; /* NUL character in a file -> binary */`
302	`}else if( c=='\r' ){`
303	`flags \|= LOOK_CR;`
304	`if( n<(2*sizeof(WCHAR_T)) \|\| UTF16_SWAP_IF(bReverse, z[1])!='\n' ){`
305	`flags \|= LOOK_LONE_CR; /* Not enough chars or next char not LF */`
306	`}`
307	`}`
308	`j = (c!='\n');`
309	`if( !j ) flags \|= (LOOK_LF \| LOOK_LONE_LF); /* Found LF as first char */`
310	`while( !(flags&stopFlags) && ((n-=sizeof(WCHAR_T))>=sizeof(WCHAR_T)) ){`
311	`int c2 = c;`
312	`c = *++z;`
313	`if( bReverse ){`
314	`c = UTF16_SWAP(c);`
315	`}`
316	`++j;`
317	`if( c==0 ){`
318	`flags \|= LOOK_NUL; /* NUL character in a file -> binary */`
319	`}else if( c=='\n' ){`
320	`flags \|= LOOK_LF;`
321	`if( c2=='\r' ){`
322	`flags \|= (LOOK_CR \| LOOK_CRLF); /* Found LF preceded by CR */`
323	`}else{`
324	`flags \|= LOOK_LONE_LF;`
325	`}`
326	`if( j>UTF16_LENGTH_MASK ){`
327	`flags \|= LOOK_LONG; /* Very long line -> binary */`
328	`}`
329	`j = 0;`
330	`}else if( c=='\r' ){`
331	`flags \|= LOOK_CR;`
332	`if( n<(2*sizeof(WCHAR_T)) \|\| UTF16_SWAP_IF(bReverse, z[1])!='\n' ){`
333	`flags \|= LOOK_LONE_CR; /* Not enough chars or next char not LF */`
334	`}`
335	`}`
336	`}`
337	`if( n ){`
338	`flags \|= LOOK_SHORT; /* The whole blob was not examined */`
339	`}`
340	`if( j>UTF16_LENGTH_MASK ){`
341	`flags \|= LOOK_LONG; /* Very long line -> binary */`
342	`}`
343	`return flags;`
344	`}`
345
/*
** This function returns an array of bytes representing the byte-order-mark
** for UTF-8.
**
** If pnByte is not NULL, *pnByte is set to the length of the mark (3).
** The returned array is static, must not be modified or freed, and is
** followed by zero bytes.
*/
const unsigned char *get_utf8_bom(int *pnByte){
  static const unsigned char bom[] = {
    0xef, 0xbb, 0xbf, 0x00, 0x00, 0x00
  };
  if( pnByte ) *pnByte = 3;
  return bom;
}
357
358	`/*`
359	`** This function returns non-zero if the blob starts with a UTF-8`
360	`** byte-order-mark (BOM).`
361	`*/`
362	`int starts_with_utf8_bom(const Blob pContent, int pnByte){`
363	`const char *z = blob_buffer(pContent);`
364	`int bomSize = 0;`
365	`const unsigned char *bom = get_utf8_bom(&bomSize);`
366
367	`if( pnByte ) *pnByte = bomSize;`
368	`if( (int)blob_size(pContent)<bomSize ) return 0;`
369	`return memcmp(z, bom, bomSize)==0;`
370	`}`
371
372	`/*`
373	`** This function returns non-zero if the blob starts with a UTF-16`
374	`** byte-order-mark (BOM), either in the endianness of the machine`
375	`** or in reversed byte order. The UTF-32 BOM is ruled out by checking`
376	`** if the UTF-16 BOM is not immediately followed by (utf16) 0.`
377	`** pnByte is only set when the function returns 1.`
378	`**`
379	`** pbReverse is always set, even when no BOM is found. Without a BOM,`
380	`** it is set to 1 on little-endian and 0 on big-endian platforms. See`
381	`** clause D98 of conformance (section 3.10) of the Unicode standard.`
382	`*/`
383	`int starts_with_utf16_bom(`
384	`const Blob pContent, / IN: Blob content to perform BOM detection on. */`
385	`int pnByte, / OUT: The number of bytes used for the BOM. */`
386	`int pbReverse / OUT: Non-zero for BOM in reverse byte-order. */`
387	`){`
388	`const unsigned char z = (unsigned char )blob_buffer(pContent);`
389	`int bomSize = sizeof(unsigned short);`
390	`int size = blob_size(pContent);`
391	`unsigned short i0;`
392
393	`if( size<bomSize ) goto noBom; /* No: cannot read BOM. */`
394	`if( size>=(2*bomSize) && z[2]==0 && z[3]==0 ) goto noBom;`
395	`memcpy(&i0, z, sizeof(i0));`
396	`if( i0==0xfeff ){`
397	`if( pbReverse ) *pbReverse = 0;`
398	`}else if( i0==0xfffe ){`
399	`if( pbReverse ) *pbReverse = 1;`
400	`}else{`
401	`static const int one = 1;`
402	`noBom:`
403	`if( pbReverse ) pbReverse = (char *) &one;`
404	`return 0; /* No: UTF-16 byte-order-mark not found. */`
405	`}`
406	`if( pnByte ) *pnByte = bomSize;`
407	`return 1; /* Yes. */`
408	`}`
409
410	`/*`
411	`** Returns non-zero if the specified content could be valid UTF-16.`
412	`*/`
413	`int could_be_utf16(const Blob pContent, int pbReverse){`
414	`return (blob_size(pContent) % sizeof(WCHAR_T) == 0) ?`
415	`starts_with_utf16_bom(pContent, 0, pbReverse) : 0;`
416	`}`
417
418
419	`/*`
420	`** COMMAND: test-looks-like-utf`
421	`**`
422	`** Usage: %fossil test-looks-like-utf FILENAME`
423	`**`
424	`** Options:`
425	`** -n\|--limit N Repeat looks-like function N times, for`
426	`** performance measurement. Default = 1`
427	`** --utf8 Ignoring BOM and file size, force UTF-8 checking`
428	`** --utf16 Ignoring BOM and file size, force UTF-16 checking`
429	`** -v\|--verbose Report the line numbers where each flag is first set`
430	`**`
431	`** FILENAME is the name of a file to check for textual content in the UTF-8`
432	`** and/or UTF-16 encodings.`
433	`*/`
434	`void looks_like_utf_test_cmd(void){`
435	`Blob blob; /* the contents of the specified file */`
436	`int fUtf8 = 0; /* return value of starts_with_utf8_bom() */`
437	`int fUtf16 = 0; /* return value of starts_with_utf16_bom() */`
438	`int fUnicode = 0; /* return value of could_be_utf16() */`
439	`int lookFlags = 0; /* output flags from looks_like_utf8/utf16() */`
440	`int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */`
441	`int fForceUtf8 = find_option("utf8",0,0)!=0;`
442	`int fForceUtf16 = find_option("utf16",0,0)!=0;`
443	`const char *zCount = find_option("limit","n",1);`
444	`int fVerbose = find_option("verbose","v",0)!=0;`
445	`int nRepeat = 1;`
446
447	`if( g.argc!=3 ) usage("FILENAME");`
448	`if( zCount ){`
449	`nRepeat = atoi(zCount);`
450	`}`
451	`blob_read_from_file(&blob, g.argv[2], ExtFILE);`
452	`while( --nRepeat >= 0 ){`
453	`fUtf8 = starts_with_utf8_bom(&blob, 0);`
454	`fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16);`
455	`if( fForceUtf8 ){`
456	`fUnicode = 0;`
457	`}else{`
458	`fUnicode = could_be_utf16(&blob, 0) \|\| fForceUtf16;`
459	`}`
460	`if( fUnicode ){`
461	`lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);`
462	`}else{`
463	`lookFlags = looks_like_utf8(&blob, 0, fVerbose) \| invalid_utf8(&blob);`
464	`}`
465	`}`
466	`fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));`
467	`fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");`
468	`fossil_print("Starts with UTF-16 BOM: %s\n",`
469	`fUtf16?(bRevUtf16?"reversed":"yes"):"no");`
470	`fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",`
471	`(lookFlags&LOOK_BINARY)?"no":"yes");`
472	`fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");`
473	`fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no");`
474	`fossil_print("Has flag LOOK_LONE_CR: %s\n",`
475	`(lookFlags&LOOK_LONE_CR)?"yes":"no");`
476	`fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no");`
477	`fossil_print("Has flag LOOK_LONE_LF: %s\n",`
478	`(lookFlags&LOOK_LONE_LF)?"yes":"no");`
479	`fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no");`
480	`fossil_print("Has flag LOOK_LONG: %s\n",(lookFlags&LOOK_LONG)?"yes":"no");`
481	`fossil_print("Has flag LOOK_INVALID: %s\n",`
482	`(lookFlags&LOOK_INVALID)?"yes":"no");`
483	`fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no");`
484	`fossil_print("Has flag LOOK_SHORT: %s\n",(lookFlags&LOOK_SHORT)?"yes":"no");`
485	`blob_reset(&blob);`
486	`}`
487
/*
** Return true if z[i] is the whole word given by zWord in a context that
** might be an attempted SQL injection.
**
**   z      NUL-terminated text being examined.
**   i      Offset in z[] at which the candidate word starts.
**   zWord  The keyword to look for, compared without regard to case.
**   n      Number of characters in zWord.
**
** The word is rejected if it sits at the very start of z[], if it is
** directly preceded by an alphanumeric or one of "-)_", or if it is
** directly followed by an alphanumeric or one of "(_".
**
** NOTE: strchr() also matches the NUL terminator of the string it
** searches, so a word that ends exactly at the end of z[] is rejected too.
*/
static int isWholeWord(const char *z, unsigned int i, const char *zWord, int n){
  if( i==0 ) return 0;
  if( sqlite3_strnicmp(z+i, zWord, n)!=0 ) return 0;
  if( fossil_isalnum(z[i-1]) ) return 0;
  if( fossil_isalnum(z[i+n]) ) return 0;
  if( strchr("-)_", z[i-1])!=0 ) return 0;
  if( strchr("(_", z[i+n])!=0 ) return 0;
  return 1;
}
501
/*
** Returns true if the given text contains certain keywords or
** punctuation which indicate that it might be an SQL injection attempt
** or Cross-site scripting attempt or some other kind of mischief.
**
** This is not a primary defense against vulnerabilities in the Fossil
** code.  Rather, this is part of an effort to do early detection of malicious
** spiders to avoid them using up too many CPU cycles.  Or, this routine
** can also be thought of as a secondary layer of defense against attacks.
**
** A NULL zTxt is never an attack.  Any of the characters '<', ';' or '\''
** causes an immediate return of 1.  Keyword and path matches are merely
** suspicious: they are waived when the PATH_INFO parameter names the
** test/markdown-test3.md document.
*/
int looks_like_attack(const char *zTxt){
  unsigned int i;
  int rc = 0;
  if( zTxt==0 ) return 0;
  for(i=0; zTxt[i]; i++){
    switch( zTxt[i] ){
      case '<':
      case ';':
      case '\'':
        return 1;
      case '/':                          /* 0123456789 123456789 */
        /* The compared text starts right after the current '/' */
        if( strncmp(zTxt+i+1, "/wp-content/plugins/", 20)==0 ) rc = 1;
        if( strncmp(zTxt+i+1, "/wp-admin/admin-ajax", 20)==0 ) rc = 1;
        break;
      case 'a':
      case 'A':
        if( isWholeWord(zTxt, i, "and", 3) ) rc = 1;
        break;
      case 'n':
      case 'N':
        if( isWholeWord(zTxt, i, "null", 4) ) rc = 1;
        break;
      case 'o':
      case 'O':
        if( isWholeWord(zTxt, i, "order", 5) && fossil_isspace(zTxt[i+5]) ){
          rc = 1;
        }
        if( isWholeWord(zTxt, i, "or", 2) ) rc = 1;
        break;
      case 's':
      case 'S':
        if( isWholeWord(zTxt, i, "select", 6) ) rc = 1;
        break;
      case 'w':
      case 'W':
        if( isWholeWord(zTxt, i, "waitfor", 7) ) rc = 1;
        break;
    }
  }
  if( rc ){
    /* The test/markdown-test3.md document which is part of the Fossil source
    ** tree intentionally tries to fake an attack.  Do not report such
    ** errors. */
    const char *zPathInfo = P("PATH_INFO");
    /* PATH_INFO may be unset, for example when this routine is reached from
    ** the test-looks-like-attack command, so never hand a NULL string to
    ** sqlite3_strglob(). */
    if( zPathInfo!=0
     && sqlite3_strglob("/doc/*/test/markdown-test3.md", zPathInfo)==0
    ){
      rc = 0;
    }
  }
  return rc;
}
562
/*
** This is a utility routine associated with the test-looks-like-sql-injection
** command.
**
** Read input from zInFile and print only those lines that look like they
** might be SQL injection.  A zInFile of NULL or "-" means standard input.
**
** Or if bInvert is true, then show the opposite - those lines that do NOT
** look like SQL injection.
**
** Each line is passed through dehttpize() before it is checked only when
** bDeHttpize is true.  Input is read in pieces of at most sizeof(zLine)-1
** bytes, so a longer line is checked and printed piecewise.
*/
static void show_attack_lines(
  const char *zInFile,       /* Name of input file */
  int bInvert,               /* Invert the sense of the output (-v) */
  int bDeHttpize             /* De-httpize the inputs.  (-d) */
){
  FILE *in;
  char zLine[10000];
  if( zInFile==0 || strcmp(zInFile,"-")==0 ){
    in = stdin;
  }else{
    in = fopen(zInFile, "rb");
    if( in==0 ){
      fossil_fatal("cannot open \"%s\" for reading\n", zInFile);
    }
  }
  while( fgets(zLine, sizeof(zLine), in) ){
    if( bDeHttpize ) dehttpize(zLine);
    /* Compare as booleans so that any non-zero bInvert inverts the test */
    if( (looks_like_attack(zLine)!=0) != (bInvert!=0) ){
      fossil_print("%s", zLine);
    }
  }
  if( in!=stdin ) fclose(in);
}
596
597	`/*`
598	`** COMMAND: test-looks-like-attack`
599	`**`
600	`** Read lines of input from files named as arguments (or from standard`
601	`** input if no arguments are provided) and print those that look like they`
602	`** might be part of an SQL injection attack.`
603	`**`
604	`** Used to test the looks_lile_attack() utility subroutine, possibly`
605	`** by piping in actual server log data.`
606	`*/`
607	`void test_looks_like_attack(void){`
608	`int i;`
609	`int bInvert = find_option("invert","v",0)!=0;`
610	`int bDeHttpize = find_option("dehttpize","d",0)!=0;`
611	`verify_all_options();`
612	`if( g.argc==2 ){`
613	`show_attack_lines(0, bInvert, bDeHttpize);`
614	`}`
615	`for(i=2; i<g.argc; i++){`
616	`show_attack_lines(g.argv[i], bInvert, bDeHttpize);`
617	`}`
618	`}`
619

Fossil SCM

Keyboard Shortcuts