Fossil SCM

fossil-scm / src / lookslike.c

Source Blame History 596 lines

d70ea7e…	drh	1	/*
d70ea7e…	drh	2	** Copyright (c) 2013 D. Richard Hipp
d70ea7e…	drh	3	**
d70ea7e…	drh	4	** This program is free software; you can redistribute it and/or
d70ea7e…	drh	5	** modify it under the terms of the Simplified BSD License (also
d70ea7e…	drh	6	** known as the "2-Clause License" or "FreeBSD License".)
d70ea7e…	drh	7
d70ea7e…	drh	8	** This program is distributed in the hope that it will be useful,
d70ea7e…	drh	9	** but without any warranty; without even the implied warranty of
d70ea7e…	drh	10	** merchantability or fitness for a particular purpose.
d70ea7e…	drh	11	**
d70ea7e…	drh	12	** Author contact information:
d70ea7e…	drh	13	** drh@hwaci.com
d70ea7e…	drh	14	** http://www.hwaci.com/drh/
d70ea7e…	drh	15	**
d70ea7e…	drh	16	*******************************************************************************
d70ea7e…	drh	17	**
d70ea7e…	drh	18	** This file contains code used to try to guess if a particular file is
d70ea7e…	drh	19	** text or binary, what types of line endings it uses, is it UTF8 or
d70ea7e…	drh	20	** UTF16, etc.
d70ea7e…	drh	21	*/
d70ea7e…	drh	22	#include "config.h"
d70ea7e…	drh	23	#include "lookslike.h"
d70ea7e…	drh	24	#include <assert.h>
d70ea7e…	drh	25
d70ea7e…	drh	26
d70ea7e…	drh	27	#if INTERFACE
d70ea7e…	drh	28
/*
** This macro is designed to return non-zero if the specified blob contains
** data that MAY be binary in nature; otherwise, zero will be returned.
** The scan stops as soon as any of the LOOK_BINARY flags is raised.
*/
#define looks_like_binary(blob) \
   ((looks_like_utf8((blob), LOOK_BINARY) & LOOK_BINARY) != LOOK_NONE)

/*
** Output flags for the looks_like_utf8() and looks_like_utf16() routines used
** to convey status information about the blob content.  Each flag occupies
** its own bit so that they can be OR-ed together.
*/
#define LOOK_NONE    ((int)0x00000000) /* Nothing special was found. */
#define LOOK_NUL     ((int)0x00000001) /* One or more NUL chars were found. */
#define LOOK_CR      ((int)0x00000002) /* One or more CR chars were found. */
#define LOOK_LONE_CR ((int)0x00000004) /* An unpaired CR char was found. */
#define LOOK_LF      ((int)0x00000008) /* One or more LF chars were found. */
#define LOOK_LONE_LF ((int)0x00000010) /* An unpaired LF char was found. */
#define LOOK_CRLF    ((int)0x00000020) /* One or more CR/LF pairs were found. */
#define LOOK_LONG    ((int)0x00000040) /* An over length line was found. */
#define LOOK_ODD     ((int)0x00000080) /* An odd number of bytes was found. */
#define LOOK_SHORT   ((int)0x00000100) /* Unable to perform full check. */
#define LOOK_INVALID ((int)0x00000200) /* Invalid sequence was found. */
#define LOOK_BINARY  (LOOK_NUL | LOOK_LONG | LOOK_SHORT) /* May be binary. */
#define LOOK_EOL     (LOOK_LONE_CR | LOOK_LONE_LF | LOOK_CRLF) /* Line seps. */
d70ea7e…	drh	53	#endif /* INTERFACE */
d70ea7e…	drh	54
/* Definitions for the various UTF-8 sequence lengths.  Each definition is
** a pair of bytes: the first value of the valid range for the byte that
** follows a given lead byte, and the size of that range.  A size of zero
** means that no following byte is acceptable.
*/
#define US2A  0x80, 0x01 /* for lead byte 0xC0 */
#define US2B  0x80, 0x40 /* for lead bytes 0xC2-0xDF */
#define US3A  0xA0, 0x20 /* for lead byte 0xE0 */
#define US3B  0x80, 0x40 /* for lead bytes 0xE1-0xEF */
#define US4A  0x90, 0x30 /* for lead byte 0xF0 */
#define US4B  0x80, 0x40 /* for lead bytes 0xF1-0xF3 */
#define US4C  0x80, 0x10 /* for lead byte 0xF4 */
#define US0A  0x00, 0x00 /* for any other lead byte */

/* A table used for quick lookup of the definition that goes with a
** particular lead byte.  There is one two-byte entry for every byte value
** in the range 0x80..0xFF, so the entry for lead byte LB starts at index
** (LB<<1)&0xFF.
*/
static const unsigned char lb_tab[] = {
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A,
  US2A, US0A, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US2B, US2B, US2B, US2B, US2B, US2B, US2B, US2B,
  US3A, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US3B, US3B, US3B, US3B, US3B, US3B, US3B, US3B,
  US4A, US4B, US4B, US4B, US4C, US0A, US0A, US0A,
  US0A, US0A, US0A, US0A, US0A, US0A, US0A, US0A
};
d70ea7e…	drh	86
d70ea7e…	drh	87	/*
d70ea7e…	drh	88	** This function attempts to scan each logical line within the blob to
d70ea7e…	drh	89	** determine the type of content it appears to contain. The return value
d70ea7e…	drh	90	** is a combination of one or more of the LOOK_XXX flags (see above):
d70ea7e…	drh	91	**
d70ea7e…	drh	92	** !LOOK_BINARY -- The content appears to consist entirely of text; however,
d70ea7e…	drh	93	** the encoding may not be UTF-8.
d70ea7e…	drh	94	**
d70ea7e…	drh	95	** LOOK_BINARY -- The content appears to be binary because it contains one
d70ea7e…	drh	96	** or more embedded NUL characters or an extremely long line.
d70ea7e…	drh	97	** Since this function does not understand UTF-16, it may
d70ea7e…	drh	98	** falsely consider UTF-16 text to be binary.
d70ea7e…	drh	99	**
d70ea7e…	drh	100	** Additional flags (i.e. those other than the ones included in LOOK_BINARY)
d70ea7e…	drh	101	** may be present in the result as well; however, they should not impact the
d70ea7e…	drh	102	** determination of text versus binary content.
d70ea7e…	drh	103	**
d70ea7e…	drh	104	********************************** WARNING ********************************
d70ea7e…	drh	105	**
d70ea7e…	drh	106	** This function does not validate that the blob content is properly formed
d70ea7e…	drh	107	** UTF-8. It assumes that all code points are the same size. It does not
d70ea7e…	drh	108	** validate any code points. It makes no attempt to detect if any [invalid]
d70ea7e…	drh	109	** switches between UTF-8 and other encodings occur.
d70ea7e…	drh	110	**
d70ea7e…	drh	111	** The only code points that this function cares about are the NUL character,
d70ea7e…	drh	112	** carriage-return, and line-feed.
d70ea7e…	drh	113	**
d70ea7e…	drh	114	** This function examines the contents of the blob until one of the flags
d70ea7e…	drh	115	** specified in "stopFlags" is set.
d70ea7e…	drh	116	**
d70ea7e…	drh	117	********************************** WARNING ********************************
d70ea7e…	drh	118	*/
d70ea7e…	drh	119	int looks_like_utf8(const Blob *pContent, int stopFlags){
d70ea7e…	drh	120	const char *z = blob_buffer(pContent);
d70ea7e…	drh	121	unsigned int n = blob_size(pContent);
d70ea7e…	drh	122	int j, c, flags = LOOK_NONE; /* Assume UTF-8 text, prove otherwise */
d70ea7e…	drh	123
d70ea7e…	drh	124	if( n==0 ) return flags; /* Empty file -> text */
d70ea7e…	drh	125	c = *z;
d70ea7e…	drh	126	if( c==0 ){
d70ea7e…	drh	127	flags \|= LOOK_NUL; /* NUL character in a file -> binary */
d70ea7e…	drh	128	}else if( c=='\r' ){
3161968…	mistachkin	129	flags \|= LOOK_CR;
d70ea7e…	drh	130	if( n<=1 \|\| z[1]!='\n' ){
6e3fceb…	mistachkin	131	flags \|= LOOK_LONE_CR; /* Not enough chars or next char not LF */
d70ea7e…	drh	132	}
d70ea7e…	drh	133	}
d70ea7e…	drh	134	j = (c!='\n');
3161968…	mistachkin	135	if( !j ) flags \|= (LOOK_LF \| LOOK_LONE_LF); /* Found LF as first char */
d70ea7e…	drh	136	while( !(flags&stopFlags) && --n>0 ){
d70ea7e…	drh	137	int c2 = c;
d70ea7e…	drh	138	c = *++z; ++j;
d70ea7e…	drh	139	if( c==0 ){
d70ea7e…	drh	140	flags \|= LOOK_NUL; /* NUL character in a file -> binary */
d70ea7e…	drh	141	}else if( c=='\n' ){
3161968…	mistachkin	142	flags \|= LOOK_LF;
d70ea7e…	drh	143	if( c2=='\r' ){
3161968…	mistachkin	144	flags \|= (LOOK_CR \| LOOK_CRLF); /* Found LF preceded by CR */
d70ea7e…	drh	145	}else{
d70ea7e…	drh	146	flags \|= LOOK_LONE_LF;
d70ea7e…	drh	147	}
d70ea7e…	drh	148	if( j>LENGTH_MASK ){
d70ea7e…	drh	149	flags \|= LOOK_LONG; /* Very long line -> binary */
d70ea7e…	drh	150	}
d70ea7e…	drh	151	j = 0;
d70ea7e…	drh	152	}else if( c=='\r' ){
3161968…	mistachkin	153	flags \|= LOOK_CR;
d70ea7e…	drh	154	if( n<=1 \|\| z[1]!='\n' ){
6e3fceb…	mistachkin	155	flags \|= LOOK_LONE_CR; /* Not enough chars or next char not LF */
d70ea7e…	drh	156	}
d70ea7e…	drh	157	}
d70ea7e…	drh	158	}
d70ea7e…	drh	159	if( n ){
d70ea7e…	drh	160	flags \|= LOOK_SHORT; /* The whole blob was not examined */
d70ea7e…	drh	161	}
d70ea7e…	drh	162	if( j>LENGTH_MASK ){
d70ea7e…	drh	163	flags \|= LOOK_LONG; /* Very long line -> binary */
d70ea7e…	drh	164	}
d70ea7e…	drh	165	return flags;
d70ea7e…	drh	166	}
d70ea7e…	drh	167
5f24da1…	jan.nijtmans	168	/*
5f24da1…	jan.nijtmans	169	** Checks for proper UTF-8. It uses the method described in:
5f24da1…	jan.nijtmans	170	** http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
7c08a68…	jan.nijtmans	171	** except for the "overlong form" of \u0000 which is not considered
7c08a68…	jan.nijtmans	172	** invalid here: Some languages like Java and Tcl use it. This function
7c08a68…	jan.nijtmans	173	** also considers valid the derivatives CESU-8 & WTF-8 (as described in
7c08a68…	jan.nijtmans	174	** the same wikipedia article referenced previously). For UTF-8 characters
7c08a68…	jan.nijtmans	175	** > 0x7f, the variable 'c' not necessary means the real lead byte.
7c08a68…	jan.nijtmans	176	** It's number of higher 1-bits indicate the number of continuation
7c08a68…	jan.nijtmans	177	** bytes that are expected to be followed. E.g. when 'c' has a value
7c08a68…	jan.nijtmans	178	** in the range 0xc0..0xdf it means that after 'c' a single continuation
7c08a68…	jan.nijtmans	179	** byte is expected. A value 0xe0..0xef means that after 'c' two more
7c08a68…	jan.nijtmans	180	** continuation bytes are expected.
5f24da1…	jan.nijtmans	181	*/
5f24da1…	jan.nijtmans	182
60349a6…	jan.nijtmans	183	int invalid_utf8(
60349a6…	jan.nijtmans	184	const Blob *pContent
60349a6…	jan.nijtmans	185	){
5f24da1…	jan.nijtmans	186	const unsigned char z = (unsigned char ) blob_buffer(pContent);
5f24da1…	jan.nijtmans	187	unsigned int n = blob_size(pContent);
7c08a68…	jan.nijtmans	188	unsigned char c; /* lead byte to be handled. */
5f24da1…	jan.nijtmans	189
5f24da1…	jan.nijtmans	190	if( n==0 ) return 0; /* Empty file -> OK */
5f24da1…	jan.nijtmans	191	c = *z;
5f24da1…	jan.nijtmans	192	while( --n>0 ){
7c08a68…	jan.nijtmans	193	if( c>=0x80 ){
7c08a68…	jan.nijtmans	194	const unsigned char def; / pointer to range table*/
7c08a68…	jan.nijtmans	195
7c08a68…	jan.nijtmans	196	c <<= 1; /* multiply by 2 and get rid of highest bit */
7c08a68…	jan.nijtmans	197	def = &lb_tab[c]; /* search fb's valid range in table */
7c08a68…	jan.nijtmans	198	if( (unsigned int)(*++z-def[0])>=def[1] ){
5f24da1…	jan.nijtmans	199	return LOOK_INVALID; /* Invalid UTF-8 */
5f24da1…	jan.nijtmans	200	}
7c08a68…	jan.nijtmans	201	c = (c>=0xC0) ? (c\|3) : ' '; /* determine next lead byte */
7c08a68…	jan.nijtmans	202	} else {
7c08a68…	jan.nijtmans	203	c = *++z;
1ca5983…	jan.nijtmans	204	}
1ca5983…	jan.nijtmans	205	}
7c08a68…	jan.nijtmans	206	return (c>=0x80) ? LOOK_INVALID : 0; /* Final lead byte must be ASCII. */
1ca5983…	jan.nijtmans	207	}
d70ea7e…	drh	208
/*
** Define the type needed to represent a Unicode (UTF-16) character.
*/
#ifndef WCHAR_T
#  ifdef _WIN32
#    define WCHAR_T wchar_t
#  else
#    define WCHAR_T unsigned short
#  endif
#endif

/*
** Maximum length of a line in a text file, in UTF-16 characters.  (4096)
** The number of bytes represented by this value cannot exceed LENGTH_MASK
** bytes, because that is the line buffer size used by the diff engine.
*/
#define UTF16_LENGTH_MASK_SZ   (LENGTH_MASK_SZ-(sizeof(WCHAR_T)-sizeof(char)))
#define UTF16_LENGTH_MASK      ((1<<UTF16_LENGTH_MASK_SZ)-1)

/*
** These macros are used to swap the byte order of a UTF-16 character in the
** looks_like_utf16() function.  UTF16_SWAP() exchanges the two low-order
** bytes of "ch"; UTF16_SWAP_IF() does so only when "expr" is true.  Both
** evaluate "ch" more than once, so the argument must not have side effects.
*/
#define UTF16_SWAP(ch)         ((((ch) << 8) & 0xff00) | (((ch) >> 8) & 0xff))
#define UTF16_SWAP_IF(expr,ch) ((expr) ? UTF16_SWAP((ch)) : (ch))
d70ea7e…	drh	234
d70ea7e…	drh	235	/*
d70ea7e…	drh	236	** This function attempts to scan each logical line within the blob to
d70ea7e…	drh	237	** determine the type of content it appears to contain. The return value
d70ea7e…	drh	238	** is a combination of one or more of the LOOK_XXX flags (see above):
d70ea7e…	drh	239	**
d70ea7e…	drh	240	** !LOOK_BINARY -- The content appears to consist entirely of text; however,
d70ea7e…	drh	241	** the encoding may not be UTF-16.
d70ea7e…	drh	242	**
d70ea7e…	drh	243	** LOOK_BINARY -- The content appears to be binary because it contains one
d70ea7e…	drh	244	** or more embedded NUL characters or an extremely long line.
d70ea7e…	drh	245	** Since this function does not understand UTF-8, it may
d70ea7e…	drh	246	** falsely consider UTF-8 text to be binary.
d70ea7e…	drh	247	**
d70ea7e…	drh	248	** Additional flags (i.e. those other than the ones included in LOOK_BINARY)
d70ea7e…	drh	249	** may be present in the result as well; however, they should not impact the
d70ea7e…	drh	250	** determination of text versus binary content.
d70ea7e…	drh	251	**
d70ea7e…	drh	252	********************************** WARNING ********************************
d70ea7e…	drh	253	**
d70ea7e…	drh	254	** This function does not validate that the blob content is properly formed
d70ea7e…	drh	255	** UTF-16. It assumes that all code points are the same size. It does not
d70ea7e…	drh	256	** validate any code points. It makes no attempt to detect if any [invalid]
d70ea7e…	drh	257	** switches between the UTF-16be and UTF-16le encodings occur.
d70ea7e…	drh	258	**
d70ea7e…	drh	259	** The only code points that this function cares about are the NUL character,
d70ea7e…	drh	260	** carriage-return, and line-feed.
d70ea7e…	drh	261	**
d70ea7e…	drh	262	** This function examines the contents of the blob until one of the flags
d70ea7e…	drh	263	** specified in "stopFlags" is set.
d70ea7e…	drh	264	**
d70ea7e…	drh	265	********************************** WARNING ********************************
d70ea7e…	drh	266	*/
d70ea7e…	drh	267	int looks_like_utf16(const Blob *pContent, int bReverse, int stopFlags){
d70ea7e…	drh	268	const WCHAR_T z = (WCHAR_T )blob_buffer(pContent);
d70ea7e…	drh	269	unsigned int n = blob_size(pContent);
d70ea7e…	drh	270	int j, c, flags = LOOK_NONE; /* Assume UTF-16 text, prove otherwise */
d70ea7e…	drh	271
d70ea7e…	drh	272	if( n%sizeof(WCHAR_T) ){
d70ea7e…	drh	273	flags \|= LOOK_ODD; /* Odd number of bytes -> binary (UTF-8?) */
d70ea7e…	drh	274	}
275da70…	danield	275	if( n<sizeof(WCHAR_T) ) return flags;/* Zero or One byte -> binary (UTF-8?) */
d70ea7e…	drh	276	c = *z;
d70ea7e…	drh	277	if( bReverse ){
d70ea7e…	drh	278	c = UTF16_SWAP(c);
d70ea7e…	drh	279	}
d70ea7e…	drh	280	if( c==0 ){
d70ea7e…	drh	281	flags \|= LOOK_NUL; /* NUL character in a file -> binary */
d70ea7e…	drh	282	}else if( c=='\r' ){
3161968…	mistachkin	283	flags \|= LOOK_CR;
d70ea7e…	drh	284	if( n<(2*sizeof(WCHAR_T)) \|\| UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
6e3fceb…	mistachkin	285	flags \|= LOOK_LONE_CR; /* Not enough chars or next char not LF */
d70ea7e…	drh	286	}
d70ea7e…	drh	287	}
d70ea7e…	drh	288	j = (c!='\n');
3161968…	mistachkin	289	if( !j ) flags \|= (LOOK_LF \| LOOK_LONE_LF); /* Found LF as first char */
7458a18…	jan.nijtmans	290	while( !(flags&stopFlags) && ((n-=sizeof(WCHAR_T))>=sizeof(WCHAR_T)) ){
d70ea7e…	drh	291	int c2 = c;
d70ea7e…	drh	292	c = *++z;
d70ea7e…	drh	293	if( bReverse ){
d70ea7e…	drh	294	c = UTF16_SWAP(c);
d70ea7e…	drh	295	}
d70ea7e…	drh	296	++j;
d70ea7e…	drh	297	if( c==0 ){
d70ea7e…	drh	298	flags \|= LOOK_NUL; /* NUL character in a file -> binary */
d70ea7e…	drh	299	}else if( c=='\n' ){
3161968…	mistachkin	300	flags \|= LOOK_LF;
d70ea7e…	drh	301	if( c2=='\r' ){
3161968…	mistachkin	302	flags \|= (LOOK_CR \| LOOK_CRLF); /* Found LF preceded by CR */
d70ea7e…	drh	303	}else{
d70ea7e…	drh	304	flags \|= LOOK_LONE_LF;
d70ea7e…	drh	305	}
d70ea7e…	drh	306	if( j>UTF16_LENGTH_MASK ){
d70ea7e…	drh	307	flags \|= LOOK_LONG; /* Very long line -> binary */
d70ea7e…	drh	308	}
d70ea7e…	drh	309	j = 0;
d70ea7e…	drh	310	}else if( c=='\r' ){
3161968…	mistachkin	311	flags \|= LOOK_CR;
d70ea7e…	drh	312	if( n<(2*sizeof(WCHAR_T)) \|\| UTF16_SWAP_IF(bReverse, z[1])!='\n' ){
6e3fceb…	mistachkin	313	flags \|= LOOK_LONE_CR; /* Not enough chars or next char not LF */
d70ea7e…	drh	314	}
d70ea7e…	drh	315	}
d70ea7e…	drh	316	}
d70ea7e…	drh	317	if( n ){
d70ea7e…	drh	318	flags \|= LOOK_SHORT; /* The whole blob was not examined */
d70ea7e…	drh	319	}
d70ea7e…	drh	320	if( j>UTF16_LENGTH_MASK ){
d70ea7e…	drh	321	flags \|= LOOK_LONG; /* Very long line -> binary */
d70ea7e…	drh	322	}
d70ea7e…	drh	323	return flags;
d70ea7e…	drh	324	}
d70ea7e…	drh	325
/*
** This function returns an array of bytes representing the byte-order-mark
** for UTF-8.  The returned pointer refers to static storage: the three BOM
** bytes followed by three zero bytes.
**
** If pnByte is not NULL, *pnByte is set to the length of the BOM (3).
*/
const unsigned char *get_utf8_bom(int *pnByte){
  static const unsigned char bom[] = {
    0xef, 0xbb, 0xbf, 0x00, 0x00, 0x00
  };
  if( pnByte ) *pnByte = 3;
  return bom;
}
d70ea7e…	drh	337
d70ea7e…	drh	338	/*
d70ea7e…	drh	339	** This function returns non-zero if the blob starts with a UTF-8
d70ea7e…	drh	340	** byte-order-mark (BOM).
d70ea7e…	drh	341	*/
d70ea7e…	drh	342	int starts_with_utf8_bom(const Blob pContent, int pnByte){
d70ea7e…	drh	343	const char *z = blob_buffer(pContent);
d70ea7e…	drh	344	int bomSize = 0;
d70ea7e…	drh	345	const unsigned char *bom = get_utf8_bom(&bomSize);
d70ea7e…	drh	346
d70ea7e…	drh	347	if( pnByte ) *pnByte = bomSize;
53db40e…	drh	348	if( (int)blob_size(pContent)<bomSize ) return 0;
d70ea7e…	drh	349	return memcmp(z, bom, bomSize)==0;
d70ea7e…	drh	350	}
d70ea7e…	drh	351
d70ea7e…	drh	352	/*
d70ea7e…	drh	353	** This function returns non-zero if the blob starts with a UTF-16
d70ea7e…	drh	354	** byte-order-mark (BOM), either in the endianness of the machine
d70ea7e…	drh	355	** or in reversed byte order. The UTF-32 BOM is ruled out by checking
d70ea7e…	drh	356	** if the UTF-16 BOM is not immediately followed by (utf16) 0.
d70ea7e…	drh	357	** pnByte is only set when the function returns 1.
d70ea7e…	drh	358	**
d70ea7e…	drh	359	** pbReverse is always set, even when no BOM is found. Without a BOM,
d70ea7e…	drh	360	** it is set to 1 on little-endian and 0 on big-endian platforms. See
d70ea7e…	drh	361	** clause D98 of conformance (section 3.10) of the Unicode standard.
d70ea7e…	drh	362	*/
d70ea7e…	drh	363	int starts_with_utf16_bom(
d70ea7e…	drh	364	const Blob pContent, / IN: Blob content to perform BOM detection on. */
d70ea7e…	drh	365	int pnByte, / OUT: The number of bytes used for the BOM. */
d70ea7e…	drh	366	int pbReverse / OUT: Non-zero for BOM in reverse byte-order. */
d70ea7e…	drh	367	){
f7c41be…	drh	368	const unsigned char z = (unsigned char )blob_buffer(pContent);
d70ea7e…	drh	369	int bomSize = sizeof(unsigned short);
d70ea7e…	drh	370	int size = blob_size(pContent);
f7c41be…	drh	371	unsigned short i0;
d70ea7e…	drh	372
d70ea7e…	drh	373	if( size<bomSize ) goto noBom; /* No: cannot read BOM. */
f7c41be…	drh	374	if( size>=(2*bomSize) && z[2]==0 && z[3]==0 ) goto noBom;
f7c41be…	drh	375	memcpy(&i0, z, sizeof(i0));
f7c41be…	drh	376	if( i0==0xfeff ){
d70ea7e…	drh	377	if( pbReverse ) *pbReverse = 0;
f7c41be…	drh	378	}else if( i0==0xfffe ){
d70ea7e…	drh	379	if( pbReverse ) *pbReverse = 1;
d70ea7e…	drh	380	}else{
d70ea7e…	drh	381	static const int one = 1;
d70ea7e…	drh	382	noBom:
d70ea7e…	drh	383	if( pbReverse ) pbReverse = (char *) &one;
d70ea7e…	drh	384	return 0; /* No: UTF-16 byte-order-mark not found. */
d70ea7e…	drh	385	}
d70ea7e…	drh	386	if( pnByte ) *pnByte = bomSize;
d70ea7e…	drh	387	return 1; /* Yes. */
d70ea7e…	drh	388	}
d70ea7e…	drh	389
d70ea7e…	drh	390	/*
d70ea7e…	drh	391	** Returns non-zero if the specified content could be valid UTF-16.
d70ea7e…	drh	392	*/
d70ea7e…	drh	393	int could_be_utf16(const Blob pContent, int pbReverse){
d70ea7e…	drh	394	return (blob_size(pContent) % sizeof(WCHAR_T) == 0) ?
d70ea7e…	drh	395	starts_with_utf16_bom(pContent, 0, pbReverse) : 0;
d70ea7e…	drh	396	}
d70ea7e…	drh	397
d70ea7e…	drh	398
d70ea7e…	drh	399	/*
d70ea7e…	drh	400	** COMMAND: test-looks-like-utf
d70ea7e…	drh	401	**
d70ea7e…	drh	402	** Usage: %fossil test-looks-like-utf FILENAME
d70ea7e…	drh	403	**
d70ea7e…	drh	404	** Options:
11384f1…	drh	405	** -n\|--limit N Repeat looks-like function N times, for
4cb50c4…	stephan	406	** performance measurement. Default = 1
d70ea7e…	drh	407	** --utf8 Ignoring BOM and file size, force UTF-8 checking
d70ea7e…	drh	408	** --utf16 Ignoring BOM and file size, force UTF-16 checking
d70ea7e…	drh	409	**
d70ea7e…	drh	410	** FILENAME is the name of a file to check for textual content in the UTF-8
d70ea7e…	drh	411	** and/or UTF-16 encodings.
d70ea7e…	drh	412	*/
d70ea7e…	drh	413	void looks_like_utf_test_cmd(void){
503482a…	jan.nijtmans	414	Blob blob; /* the contents of the specified file */
503482a…	jan.nijtmans	415	int fUtf8 = 0; /* return value of starts_with_utf8_bom() */
503482a…	jan.nijtmans	416	int fUtf16 = 0; /* return value of starts_with_utf16_bom() */
503482a…	jan.nijtmans	417	int fUnicode = 0; /* return value of could_be_utf16() */
503482a…	jan.nijtmans	418	int lookFlags = 0; /* output flags from looks_like_utf8/utf16() */
d70ea7e…	drh	419	int bRevUtf16 = 0; /* non-zero -> UTF-16 byte order reversed */
d70ea7e…	drh	420	int fForceUtf8 = find_option("utf8",0,0)!=0;
d70ea7e…	drh	421	int fForceUtf16 = find_option("utf16",0,0)!=0;
5f24da1…	jan.nijtmans	422	const char *zCount = find_option("limit","n",1);
5f24da1…	jan.nijtmans	423	int nRepeat = 1;
5f24da1…	jan.nijtmans	424
d70ea7e…	drh	425	if( g.argc!=3 ) usage("FILENAME");
5f24da1…	jan.nijtmans	426	if( zCount ){
5f24da1…	jan.nijtmans	427	nRepeat = atoi(zCount);
5f24da1…	jan.nijtmans	428	}
1772357…	drh	429	blob_read_from_file(&blob, g.argv[2], ExtFILE);
5f24da1…	jan.nijtmans	430	while( --nRepeat >= 0 ){
5f24da1…	jan.nijtmans	431	fUtf8 = starts_with_utf8_bom(&blob, 0);
5f24da1…	jan.nijtmans	432	fUtf16 = starts_with_utf16_bom(&blob, 0, &bRevUtf16);
5f24da1…	jan.nijtmans	433	if( fForceUtf8 ){
5f24da1…	jan.nijtmans	434	fUnicode = 0;
5f24da1…	jan.nijtmans	435	}else{
09f2386…	jan.nijtmans	436	fUnicode = could_be_utf16(&blob, 0) \|\| fForceUtf16;
5f24da1…	jan.nijtmans	437	}
5f24da1…	jan.nijtmans	438	if( fUnicode ){
09f2386…	jan.nijtmans	439	lookFlags = looks_like_utf16(&blob, bRevUtf16, 0);
5f24da1…	jan.nijtmans	440	}else{
60349a6…	jan.nijtmans	441	lookFlags = looks_like_utf8(&blob, 0) \| invalid_utf8(&blob);
5f24da1…	jan.nijtmans	442	}
5f24da1…	jan.nijtmans	443	}
d70ea7e…	drh	444	fossil_print("File \"%s\" has %d bytes.\n",g.argv[2],blob_size(&blob));
d70ea7e…	drh	445	fossil_print("Starts with UTF-8 BOM: %s\n",fUtf8?"yes":"no");
d70ea7e…	drh	446	fossil_print("Starts with UTF-16 BOM: %s\n",
d70ea7e…	drh	447	fUtf16?(bRevUtf16?"reversed":"yes"):"no");
d70ea7e…	drh	448	fossil_print("Looks like UTF-%s: %s\n",fUnicode?"16":"8",
d70ea7e…	drh	449	(lookFlags&LOOK_BINARY)?"no":"yes");
d70ea7e…	drh	450	fossil_print("Has flag LOOK_NUL: %s\n",(lookFlags&LOOK_NUL)?"yes":"no");
d70ea7e…	drh	451	fossil_print("Has flag LOOK_CR: %s\n",(lookFlags&LOOK_CR)?"yes":"no");
d70ea7e…	drh	452	fossil_print("Has flag LOOK_LONE_CR: %s\n",
d70ea7e…	drh	453	(lookFlags&LOOK_LONE_CR)?"yes":"no");
d70ea7e…	drh	454	fossil_print("Has flag LOOK_LF: %s\n",(lookFlags&LOOK_LF)?"yes":"no");
d70ea7e…	drh	455	fossil_print("Has flag LOOK_LONE_LF: %s\n",
d70ea7e…	drh	456	(lookFlags&LOOK_LONE_LF)?"yes":"no");
d70ea7e…	drh	457	fossil_print("Has flag LOOK_CRLF: %s\n",(lookFlags&LOOK_CRLF)?"yes":"no");
d70ea7e…	drh	458	fossil_print("Has flag LOOK_LONG: %s\n",(lookFlags&LOOK_LONG)?"yes":"no");
d70ea7e…	drh	459	fossil_print("Has flag LOOK_INVALID: %s\n",
d70ea7e…	drh	460	(lookFlags&LOOK_INVALID)?"yes":"no");
d70ea7e…	drh	461	fossil_print("Has flag LOOK_ODD: %s\n",(lookFlags&LOOK_ODD)?"yes":"no");
d70ea7e…	drh	462	fossil_print("Has flag LOOK_SHORT: %s\n",(lookFlags&LOOK_SHORT)?"yes":"no");
d70ea7e…	drh	463	blob_reset(&blob);
534c10f…	stephan	464	}
534c10f…	stephan	465
/*
** Return true if z[i] is the whole word given by zWord in a context that
** might be an attempted SQL injection.
**
** z is the text being scanned, i is the offset within z at which to look,
** and n is the length of zWord.  The comparison ignores case.  The word is
** rejected when:
**
**   *  it sits at the very start of z (i==0),
**   *  the character before or after it is alphanumeric,
**   *  the character before it is one of "-)_", or
**   *  the character after it is one of "(_" or is the string terminator
**      (strchr() also matches the terminating NUL of its first argument).
*/
static int isWholeWord(const char *z, unsigned int i, const char *zWord, int n){
  if( i==0 ) return 0;
  if( sqlite3_strnicmp(z+i, zWord, n)!=0 ) return 0;
  if( fossil_isalnum(z[i-1]) ) return 0;
  if( fossil_isalnum(z[i+n]) ) return 0;
  if( strchr("-)_", z[i-1])!=0 ) return 0;
  if( strchr("(_", z[i+n])!=0 ) return 0;
  return 1;
}
d3cb62f…	drh	479
/*
** Returns true if the given text contains certain keywords or
** punctuation which indicate that it might be an SQL injection attempt
** or Cross-site scripting attempt or some other kind of mischief.
**
** This is not a primary defense against vulnerabilities in the Fossil
** code.  Rather, this is part of an effort to do early detection of malicious
** spiders to avoid them using up too many CPU cycles.  Or, this routine
** can also be thought of as a secondary layer of defense against attacks.
**
** A NULL zTxt is never considered an attack.  The characters '<', ';' and
** '\'' cause an immediate true result.  Every other trigger only marks the
** text as suspicious; that verdict is withdrawn again if the PATH_INFO of
** the current request names the test/markdown-test3.md document.
*/
int looks_like_attack(const char *zTxt){
  unsigned int i;
  int rc = 0;
  if( zTxt==0 ) return 0;
  for(i=0; zTxt[i]; i++){
    switch( zTxt[i] ){
      case '<':
      case ';':
      case '\'':
        return 1;
      case '/':         /* 0123456789 123456789 */
        /* NOTE(review): the comparison starts one character past the
        ** current '/', yet both patterns begin with '/', so only a doubled
        ** slash can match here -- confirm that this is intended. */
        if( strncmp(zTxt+i+1, "/wp-content/plugins/", 20)==0 ) rc = 1;
        if( strncmp(zTxt+i+1, "/wp-admin/admin-ajax", 20)==0 ) rc = 1;
        break;
      case 'a':
      case 'A':
        if( isWholeWord(zTxt, i, "and", 3) ) rc = 1;
        break;
      case 'n':
      case 'N':
        if( isWholeWord(zTxt, i, "null", 4) ) rc = 1;
        break;
      case 'o':
      case 'O':
        if( isWholeWord(zTxt, i, "order", 5) && fossil_isspace(zTxt[i+5]) ){
          rc = 1;
        }
        if( isWholeWord(zTxt, i, "or", 2) ) rc = 1;
        break;
      case 's':
      case 'S':
        if( isWholeWord(zTxt, i, "select", 6) ) rc = 1;
        break;
      case 'w':
      case 'W':
        if( isWholeWord(zTxt, i, "waitfor", 7) ) rc = 1;
        break;
      default:
        break;
    }
  }
  if( rc ){
    /* The test/markdown-test3.md document which is part of the Fossil source
    ** tree intentionally tries to fake an attack.  Do not report such
    ** errors.  The lookup result is checked for NULL first so that a NULL
    ** pointer is never handed to sqlite3_strglob(). */
    const char *zPathInfo = P("PATH_INFO");
    if( zPathInfo!=0
     && sqlite3_strglob("/doc/*/test/markdown-test3.md", zPathInfo)==0
    ){
      rc = 0;
    }
  }
  return rc;
}
d3cb62f…	drh	540
/*
** This is a utility routine associated with the test-looks-like-attack
** command.
**
** Read input from zInFile and print only those lines that look like they
** might be an attack.  If zInFile is NULL or "-", read standard input.
**
** Or if bInvert is true, then show the opposite - those lines that do NOT
** look like an attack.
**
** If bDeHttpize is true, each line is de-httpized before it is tested and
** printed.  Lines are read into a fixed 10000-byte buffer, so longer input
** lines are processed in pieces.
*/
static void show_attack_lines(
  const char *zInFile,       /* Name of input file */
  int bInvert,               /* Invert the sense of the output (-v) */
  int bDeHttpize             /* De-httpize the inputs.  (-d) */
){
  FILE *in;
  char zLine[10000];
  if( zInFile==0 || strcmp(zInFile,"-")==0 ){
    in = stdin;
  }else{
    in = fopen(zInFile, "rb");
    if( in==0 ){
      fossil_fatal("cannot open \"%s\" for reading\n", zInFile);
    }
  }
  while( fgets(zLine, sizeof(zLine), in) ){
    /* Honor the -d flag rather than decoding unconditionally */
    if( bDeHttpize ) dehttpize(zLine);
    if( (looks_like_attack(zLine)!=0) ^ bInvert ){
      fossil_print("%s", zLine);
    }
  }
  if( in!=stdin ) fclose(in);
}
d3cb62f…	drh	574
d3cb62f…	drh	575	/*
8612122…	drh	576	** COMMAND: test-looks-like-attack
d3cb62f…	drh	577	**
d3cb62f…	drh	578	** Read lines of input from files named as arguments (or from standard
d3cb62f…	drh	579	** input if no arguments are provided) and print those that look like they
d3cb62f…	drh	580	** might be part of an SQL injection attack.
d3cb62f…	drh	581	**
8612122…	drh	582	** Used to test the looks_lile_attack() utility subroutine, possibly
d3cb62f…	drh	583	** by piping in actual server log data.
534c10f…	stephan	584	*/
8612122…	drh	585	void test_looks_like_attack(void){
d3cb62f…	drh	586	int i;
d3cb62f…	drh	587	int bInvert = find_option("invert","v",0)!=0;
d3cb62f…	drh	588	int bDeHttpize = find_option("dehttpize","d",0)!=0;
d3cb62f…	drh	589	verify_all_options();
d3cb62f…	drh	590	if( g.argc==2 ){
8612122…	drh	591	show_attack_lines(0, bInvert, bDeHttpize);
d3cb62f…	drh	592	}
d3cb62f…	drh	593	for(i=2; i<g.argc; i++){
8612122…	drh	594	show_attack_lines(g.argv[i], bInvert, bDeHttpize);
d3cb62f…	drh	595	}
d70ea7e…	drh	596	}

Fossil SCM

Keyboard Shortcuts